This is the mail archive of the libc-alpha@sources.redhat.com mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

gawk fixes for regex library


Greetings.

In a fit of <whatever>, I decided to sync the gawk regex routines with
those in the glibc CVS.  I managed to find the ones on which my current
code was based, went through them, extracted out the important changes,
and put them into the current code.

I had at one point also made changes to make the code compile under K&R C,
but I decided that this just isn't worth it anymore, so I didn't bother
forward-porting that code.  (Yay!)

A suggestion. In my humble opinion, it'd be better to change
RE_TRANSLATE_TYPE to `unsigned char *' at the definition, and then
remove the extra `unsigned' casts and declarations throughout the code.
But I'll leave that up to you folks.

Paolo says the current code is faster than the old code.  This seems to
be true.  But it's still nowhere near as fast as the dfa code in GNU grep.
Thus, for the eventual 3.1.4 release, I've reinstated the old code that
uses dfa.c for ``does it match?'' cases, with the full regex only for
cases where I need the ``where does it match?'' information.

Anyway, here are my diffs.  I hope this is useful.

Thanks,

Arnold Robbins
The gawk guy
-----------------------
2004-January-7		Arnold D. Robbins	<arnold@skeeve.com>

	* regcomp.c (re_compile_fastmap_iter): Fix `icase' assignment if not
	RE_ENABLE_I18N.
	(parse_expression): Similar for dfa->has_mb_node.
	(build_range_exp): Fix `cmp_buf' contents to be correct if we don't have
	valid multibyte characters.

	* regex.c: Include config.h if HAVE_CONFIG_H. For MSC, include <stdio.h>.
	Include <limits.h> here, before include of regex.h, so that regex.h
	can fix the value of `RE_DUP_MAX' correctly.

	* regex.h: If __APPLE_CC__ is defined, give empty definition for `__restrict'.

	* regex_internal.c (re_string_allocate): Only test `dfa->mb_cur_max' if
	`RE_ENABLE_I18N' is defined.

	* regex_internal.h: Don't include <limits.h> here, added comment about it.
	For other GNU distributions, make sure that `ENABLE_NLS' is defined before
	including <libintl.h>.
	(BE): Add definition if needed.

	* regexec.c (re_search_internal): Set `sb' correctly if `RE_ENABLE_I18N' isn't
	defined.

--- /usr/local/src/Gnu/libc/posix/regcomp.c	2004-01-06 23:59:24.000000000 +0200
+++ regcomp.c	2004-01-07 15:43:20.000000000 +0200
@@ -324,7 +324,11 @@
 {
   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
   int node_cnt;
+#ifdef RE_ENABLE_I18N
   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
+#else
+  int icase = (bufp->syntax & RE_ICASE);
+#endif
   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
     {
       int node = init_state->nodes.elems[node_cnt];
@@ -2233,8 +2237,10 @@
 	  *err = REG_ESPACE;
 	  return NULL;
 	}
+#ifdef RE_ENABLE_I18N
       if (dfa->mb_cur_max > 1)
 	dfa->has_mb_node = 1;
+#endif
       break;
     case OP_WORD:
       tree = build_charclass_op (dfa, regexp->trans, "alnum", "_", 0, err);
@@ -2558,8 +2564,8 @@
 		? __btowc (start_ch) : start_elem->opr.wch);
     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
 	      ? __btowc (end_ch) : end_elem->opr.wch);
-    cmp_buf[0] = start_wc;
-    cmp_buf[4] = end_wc;
+    cmp_buf[0] = start_wc != WEOF ? start_wc : start_ch;
+    cmp_buf[4] = end_wc != WEOF ? end_wc : end_ch;
     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
       return REG_ERANGE;
 
--- /usr/local/src/Gnu/libc/posix/regex.c	2003-11-16 09:03:23.000000000 +0200
+++ regex.c	2004-01-07 15:48:01.000000000 +0200
@@ -43,9 +43,19 @@
 # include "../locale/localeinfo.h"
 #endif
 
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
 /* POSIX says that <sys/types.h> must be included (by the caller) before
    <regex.h>.  */
 #include <sys/types.h>
+
+#if defined (_MSC_VER)
+#include <stdio.h> /* for size_t */
+#endif
+
+#include <limits.h>
 #include <regex.h>
 #include "regex_internal.h"
 
--- /usr/local/src/Gnu/libc/posix/regex.h	2003-11-13 07:58:44.000000000 +0200
+++ regex.h	2004-01-07 15:45:34.000000000 +0200
@@ -543,6 +543,8 @@
 #  else
 #   define __restrict
 #  endif
+# elif defined __APPLE_CC__
+#  define __restrict
 # endif
 #endif
 /* gcc 3.1 and up support the [restrict] syntax.  */
--- /usr/local/src/Gnu/libc/posix/regex_internal.c	2004-01-07 00:00:16.000000000 +0200
+++ regex_internal.c	2004-01-07 15:47:42.000000000 +0200
@@ -70,7 +70,11 @@
   pstr->word_char = dfa->word_char;
   pstr->word_ops_used = dfa->word_ops_used;
   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
-  pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
+  pstr->valid_len = (pstr->mbs_allocated
+#ifdef RE_ENABLE_I18N
+		     || dfa->mb_cur_max > 1
+#endif
+		     ) ? pstr->valid_len : len;
   pstr->valid_raw_len = pstr->valid_len;
   return REG_NOERROR;
 }
--- /usr/local/src/Gnu/libc/posix/regex_internal.h	2004-01-06 23:59:50.000000000 +0200
+++ regex_internal.h	2004-01-07 15:58:52.000000000 +0200
@@ -27,7 +27,14 @@
 
 #include <assert.h>
 #include <ctype.h>
+#if 0
+/* Don't include this here. On some systems it sets RE_DUP_MAX to a
+ * lower value than GNU regex allows.  Instead, include it in
+ * regex.c, before include of <regex.h>, which correctly
+ * #undefs RE_DUP_MAX and sets it to the right value.
+ */
 #include <limits.h>
+#endif
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -57,7 +64,7 @@
 #endif
 
 /* This is for other GNU distributions with internationalized messages.  */
-#if HAVE_LIBINTL_H || defined _LIBC
+#if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC
 # include <libintl.h>
 # ifdef _LIBC
 #  undef gettext
@@ -112,6 +119,13 @@
 # define __attribute(arg)
 #endif
 
+#if _LIBC || __GNUC__ >= 3
+# define BE(expr, val) __builtin_expect (expr, val)
+#else
+# define BE(expr, val) (expr)
+# define inline
+#endif
+
 extern const char __re_error_msgid[] attribute_hidden;
 extern const size_t __re_error_msgid_idx[] attribute_hidden;
 
--- /usr/local/src/Gnu/libc/posix/regexec.c	2004-01-03 06:06:39.000000000 +0200
+++ regexec.c	2004-01-07 15:52:20.000000000 +0200
@@ -646,7 +646,11 @@
   incr = (range < 0) ? -1 : 1;
   left_lim = (range < 0) ? start + range : start;
   right_lim = (range < 0) ? start : start + range;
+#ifdef RE_ENABLE_I18N
   sb = dfa->mb_cur_max == 1;
+#else
+  sb = 1;
+#endif
   fast_translate = sb || !(preg->syntax & RE_ICASE || preg->translate);
 
   for (;;)


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]