This is the mail archive of the libc-hacker@sources.redhat.com mailing list for the glibc project.

Note that libc-hacker is a closed list. You may look at the archives of this list, but subscription and posting are not open.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] Speed up regex UTF-8 (and easily other mb charset) handling


Hi!

The following patch speeds up UTF-8 handling in regex
(and potentially in other multi-byte charsets where the start of a
multi-byte character can be determined, though I haven't implemented
the hooks for those yet).

The extended tst-regex test can run for several hours without this
patch and finishes within seconds with it.
From real-world tests, e.g.:
time LC_ALL=en_US.UTF-8 LD_LIBRARY_PATH=/usr/src/libc.old/obj /bin/sed 's/./x/g' /etc/termcap > /dev/null

real    0m8.884s
user    0m8.880s
sys     0m0.010s
time LC_ALL=en_US.UTF-8 LD_LIBRARY_PATH=/usr/src/libc/obj /bin/sed 's/./x/g' /etc/termcap > /dev/null

real    0m3.121s
user    0m3.100s
sys     0m0.020s

(where the only difference between those 2 libcs is this patch).

2003-11-11  Jakub Jelinek  <jakub@redhat.com>

	* iconv/gconv.h (__gconv_prevmb_fct): New typedef.
	(struct __gconv_step): New field __prevmb_fct.
	* iconv/gconv_int.h (__gconv_prevmb_ascii): New declaration.
	* iconv/gconv_simple.c (BUILTIN_TRANSFORMATION): Add PrevMbFct
	argument.
	(__gconv_prevmb_ascii): New function.
	* iconv/gconv_builtin.h: Add PrevMbFct argument to all
	BUILTIN_TRANSFORMATION invocations.
	* iconv/gconv_conf.c (BUILTIN_TRANSFORMATION): Add PrevMbFct
	argument.
	* iconv/iconvconfig.c (BUILTIN_TRANSFORMATION): Likewise.
	* iconv/gconv_builtin.c (map): New field prevmb_fct.
	(BUILTIN_TRANSFORMATION): Add PrevMbFct argument.  Use it to
	initialize prevmb_fct field.
	(__gconv_get_builtin_trans): Initialize __prevmb_fct field.
	* iconv/gconv_cache.c (find_module): Initialize __prevmb_fct field.
	* iconv/gconv_db.c (gen_steps, increment_counter): Likewise.
	* iconv/skeleton.c: Document FROM_PREVMB.
	(gconv_init): Initialize __prevmb_fct field.
	Undefine FROM_PREVMB at the end.
	* iconv/loop.c: Document PREVMB_BODY.
	(gconv_prevmb, FROM_PREVMB): Define if PREVMB_BODY is defined.
	Undefine PREVMB_BODY at the end.
	* posix/regex_internal.c [_LIBC]: Include wcsmbs/wcsmbsload.h
	and dlfcn.h.
	(re_string_reconstruct) [_LIBC]: Use __prevmb_fct if available.
	* posix/tst-regex.c (umemlen): New variable.
	(test_expr): Add expectedicase argument.  Test case insensitive
	searches as well as backwards searches (case sensitive and
	insensitive) too.
	(run_test): Add icase argument.  Use it to compute regcomp flags.
	(run_test_backwards): New function.
	(main): Cast read to size_t to avoid warning.  Set umemlen.
	Add expectedicase arguments to test_expr.

--- libc/iconv/gconv_conf.c.jj	2003-09-14 20:13:39.000000000 +0200
+++ libc/iconv/gconv_conf.c	2003-11-11 11:44:23.000000000 +0100
@@ -62,7 +62,7 @@ static const char gconv_module_ext[] = M
 static struct gconv_module builtin_modules[] =
 {
 #define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \
-			       MinF, MaxF, MinT, MaxT) \
+			       PrevMbFct, MinF, MaxF, MinT, MaxT) \
   {									      \
     .from_string = From,						      \
     .to_string = To,							      \
@@ -81,7 +81,7 @@ static struct gconv_module builtin_modul
 static const char *builtin_aliases[] =
 {
 #define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \
-			       MinF, MaxF, MinT, MaxT)
+			       PrevMbFct, MinF, MaxF, MinT, MaxT)
 #define BUILTIN_ALIAS(From, To) From " " To,
 
 #include "gconv_builtin.h"
--- libc/iconv/gconv.h.jj	2002-12-02 22:44:26.000000000 +0100
+++ libc/iconv/gconv.h	2003-11-11 12:05:21.000000000 +0100
@@ -74,6 +74,13 @@ typedef int (*__gconv_fct) (struct __gco
 /* Type of a specialized conversion function for a single byte to INTERNAL.  */
 typedef wint_t (*__gconv_btowc_fct) (struct __gconv_step *, unsigned char);
 
+/* Type of a specialized function to return the starting byte of a
+   multi-byte character.  Searching starts at ptr-1 and proceeds backwards.
+   If no starting byte is found down to and including the byte pointed to
+   by first, the function returns NULL.  */
+typedef __const unsigned char *(*__gconv_prevmb_fct) (__const unsigned char *,
+						      __const unsigned char *);
+
 /* Constructor and destructor for local data for conversion step.  */
 typedef int (*__gconv_init_fct) (struct __gconv_step *);
 typedef void (*__gconv_end_fct) (struct __gconv_step *);
@@ -124,6 +131,7 @@ struct __gconv_step
 
   __gconv_fct __fct;
   __gconv_btowc_fct __btowc_fct;
+  __gconv_prevmb_fct __prevmb_fct;
   __gconv_init_fct __init_fct;
   __gconv_end_fct __end_fct;
 
--- libc/iconv/gconv_builtin.c.jj	2002-12-02 22:48:08.000000000 +0100
+++ libc/iconv/gconv_builtin.c	2003-11-11 11:43:00.000000000 +0100
@@ -31,6 +31,7 @@ static struct builtin_map
 {
   const char *name;
   __gconv_fct fct;
+  __gconv_prevmb_fct prevmb_fct;
   __gconv_btowc_fct btowc_fct;
 
   int min_needed_from;
@@ -41,11 +42,12 @@ static struct builtin_map
 } map[] =
 {
 #define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \
-			       MinF, MaxF, MinT, MaxT) \
+			       PrevMbFct, MinF, MaxF, MinT, MaxT) \
   {									      \
     .name = Name,							      \
     .fct = Fct,								      \
     .btowc_fct = BtowcFct,						      \
+    .prevmb_fct = PrevMbFct,						      \
 									      \
     .min_needed_from = MinF,						      \
     .max_needed_from = MaxF,						      \
@@ -72,6 +74,7 @@ __gconv_get_builtin_trans (const char *n
 
   step->__fct = map[cnt].fct;
   step->__btowc_fct = map[cnt].btowc_fct;
+  step->__prevmb_fct = map[cnt].prevmb_fct;
   step->__init_fct = NULL;
   step->__end_fct = NULL;
   step->__shlib_handle = NULL;
--- libc/iconv/gconv_int.h.jj	2003-06-11 23:33:21.000000000 +0200
+++ libc/iconv/gconv_int.h	2003-11-11 12:14:58.000000000 +0100
@@ -297,6 +297,14 @@ __BUILTIN_TRANSFORM (__gconv_transform_u
    only ASCII characters.  */
 extern wint_t __gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c);
 
+/* Specialized function to find the start of the previous character; it
+   accepts only ASCII bytes (below 0x80) as starting bytes.  Searching
+   starts at ptr-1 and proceeds backwards.  If no such byte is found down
+   to and including the byte pointed to by first, the function returns
+   NULL.  */
+extern const unsigned char *__gconv_prevmb_ascii (const unsigned char *ptr,
+						  const unsigned char *first);
+
 #endif
 
 __END_DECLS
--- libc/iconv/gconv_simple.c.jj	2003-06-11 23:36:37.000000000 +0200
+++ libc/iconv/gconv_simple.c	2003-11-11 12:14:37.000000000 +0100
@@ -32,7 +32,7 @@
 
 #define BUILTIN_ALIAS(s1, s2) /* nothing */
 #define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \
-			       MinF, MaxF, MinT, MaxT) \
+			       PrevMbFct, MinF, MaxF, MinT, MaxT) \
   extern int Fct (struct __gconv_step *, struct __gconv_step_data *,	      \
 		  __const unsigned char **, __const unsigned char *,	      \
 		  unsigned char **, size_t *, int, int);
@@ -56,6 +56,22 @@ __gconv_btwoc_ascii (struct __gconv_step
 }
 
 
+/* Specialized __gconv_prevmb_fct: scan backwards from PTR - 1 down to
+   FIRST (inclusive) and return a pointer to the nearest byte below 0x80,
+   i.e. the nearest ASCII byte.  Return NULL if that range holds no such
+   byte.  NOTE(review): the last --ptr yields FIRST - 1 before the test
+   fails; confirm callers never pass the start of an object as FIRST.  */
+const unsigned char *
+__gconv_prevmb_ascii (const unsigned char *ptr,
+		      const unsigned char *first)
+{
+  while (--ptr >= first)
+    if (*ptr < 0x80)
+      return ptr;
+  return NULL;
+}
+
+
 /* Transform from the internal, UCS4-like format, to UCS4.  The
    difference between the internal ucs4 format and the real UCS4
    format is, if any, the endianess.  The Unicode/ISO 10646 says that
--- libc/iconv/iconvconfig.c.jj	2003-06-11 23:38:47.000000000 +0200
+++ libc/iconv/iconvconfig.c	2003-11-11 11:45:22.000000000 +0100
@@ -202,7 +202,7 @@ static struct
 #define BUILTIN_ALIAS(alias, real) \
     { .from = alias, .to = real },
 #define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \
-			       MinF, MaxF, MinT, MaxT)
+			       PrevMbFct, MinF, MaxF, MinT, MaxT)
 #include <gconv_builtin.h>
   };
 #undef BUILTIN_ALIAS
@@ -219,7 +219,7 @@ static struct
   {
 #define BUILTIN_ALIAS(alias, real)
 #define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \
-			       MinF, MaxF, MinT, MaxT) \
+			       PrevMbFct, MinF, MaxF, MinT, MaxT) \
     { .from = From, .to = To, .module = Name, .cost = Cost },
 #include <gconv_builtin.h>
   };
--- libc/iconv/gconv_builtin.h.jj	2002-12-02 22:46:00.000000000 +0100
+++ libc/iconv/gconv_builtin.h	2003-11-11 12:15:19.000000000 +0100
@@ -30,14 +30,18 @@ BUILTIN_ALIAS ("OSF00010105//", "ISO-106
 BUILTIN_ALIAS ("OSF00010106//", "ISO-10646/UCS4/") /* level 3 */
 
 BUILTIN_TRANSFORMATION ("INTERNAL", "ISO-10646/UCS4/", 1, "=INTERNAL->ucs4",
-			__gconv_transform_internal_ucs4, NULL, 4, 4, 4, 4)
+			__gconv_transform_internal_ucs4, NULL, NULL,
+			4, 4, 4, 4)
 BUILTIN_TRANSFORMATION ("ISO-10646/UCS4/", "INTERNAL", 1, "=ucs4->INTERNAL",
-			__gconv_transform_ucs4_internal, NULL, 4, 4, 4, 4)
+			__gconv_transform_ucs4_internal, NULL, NULL,
+			4, 4, 4, 4)
 
 BUILTIN_TRANSFORMATION ("INTERNAL", "UCS-4LE//", 1, "=INTERNAL->ucs4le",
-			__gconv_transform_internal_ucs4le, NULL, 4, 4, 4, 4)
+			__gconv_transform_internal_ucs4le, NULL, NULL,
+			4, 4, 4, 4)
 BUILTIN_TRANSFORMATION ("UCS-4LE//", "INTERNAL", 1, "=ucs4le->INTERNAL",
-			__gconv_transform_ucs4le_internal, NULL, 4, 4, 4, 4)
+			__gconv_transform_ucs4le_internal, NULL, NULL,
+			4, 4, 4, 4)
 
 BUILTIN_ALIAS ("WCHAR_T//", "INTERNAL")
 
@@ -48,11 +52,12 @@ BUILTIN_ALIAS ("OSF05010001//", "ISO-106
 BUILTIN_ALIAS ("ISO-10646/UTF-8/", "ISO-10646/UTF8/")
 
 BUILTIN_TRANSFORMATION ("INTERNAL", "ISO-10646/UTF8/", 1, "=INTERNAL->utf8",
-			__gconv_transform_internal_utf8, NULL, 4, 4, 1, 6)
+			__gconv_transform_internal_utf8, NULL, NULL,
+			4, 4, 1, 6)
 
 BUILTIN_TRANSFORMATION ("ISO-10646/UTF8/", "INTERNAL", 1, "=utf8->INTERNAL",
 			__gconv_transform_utf8_internal, __gconv_btwoc_ascii,
-			1, 6, 4, 4)
+			__gconv_prevmb_ascii, 1, 6, 4, 4)
 
 BUILTIN_ALIAS ("UCS2//", "ISO-10646/UCS2/")
 BUILTIN_ALIAS ("UCS-2//", "ISO-10646/UCS2/")
@@ -61,10 +66,12 @@ BUILTIN_ALIAS ("OSF00010101//", "ISO-106
 BUILTIN_ALIAS ("OSF00010102//", "ISO-10646/UCS2/") /* level 3 */
 
 BUILTIN_TRANSFORMATION ("ISO-10646/UCS2/", "INTERNAL", 1, "=ucs2->INTERNAL",
-			__gconv_transform_ucs2_internal, NULL, 2, 2, 4, 4)
+			__gconv_transform_ucs2_internal, NULL, NULL,
+			2, 2, 4, 4)
 
 BUILTIN_TRANSFORMATION ("INTERNAL", "ISO-10646/UCS2/", 1, "=INTERNAL->ucs2",
-			__gconv_transform_internal_ucs2, NULL, 4, 4, 2, 2)
+			__gconv_transform_internal_ucs2, NULL, NULL,
+			4, 4, 2, 2)
 
 
 BUILTIN_ALIAS ("ANSI_X3.4//", "ANSI_X3.4-1968//")
@@ -82,10 +89,11 @@ BUILTIN_ALIAS ("OSF00010020//", "ANSI_X3
 
 BUILTIN_TRANSFORMATION ("ANSI_X3.4-1968//", "INTERNAL", 1, "=ascii->INTERNAL",
 			__gconv_transform_ascii_internal, __gconv_btwoc_ascii,
-			4, 4, 1, 1)
+			NULL, 4, 4, 1, 1)
 
 BUILTIN_TRANSFORMATION ("INTERNAL", "ANSI_X3.4-1968//", 1, "=INTERNAL->ascii",
-			__gconv_transform_internal_ascii, NULL, 4, 4, 1, 1)
+			__gconv_transform_internal_ascii, NULL, NULL,
+			4, 4, 1, 1)
 
 
 #if BYTE_ORDER == BIG_ENDIAN
@@ -96,12 +104,12 @@ BUILTIN_ALIAS ("UCS-2LE//", "UNICODELITT
 
 BUILTIN_TRANSFORMATION ("UNICODELITTLE//", "INTERNAL", 1,
 			"=ucs2reverse->INTERNAL",
-			__gconv_transform_ucs2reverse_internal, NULL,
+			__gconv_transform_ucs2reverse_internal, NULL, NULL,
 			2, 2, 4, 4)
 
 BUILTIN_TRANSFORMATION ("INTERNAL", "UNICODELITTLE//", 1,
 			"=INTERNAL->ucs2reverse",
-			__gconv_transform_internal_ucs2reverse, NULL,
+			__gconv_transform_internal_ucs2reverse, NULL, NULL,
 			4, 4, 2, 2)
 #else
 BUILTIN_ALIAS ("UNICODELITTLE//", "ISO-10646/UCS2/")
@@ -111,11 +119,11 @@ BUILTIN_ALIAS ("UCS-2BE//", "UNICODEBIG/
 
 BUILTIN_TRANSFORMATION ("UNICODEBIG//", "INTERNAL", 1,
 			"=ucs2reverse->INTERNAL",
-			__gconv_transform_ucs2reverse_internal, NULL,
+			__gconv_transform_ucs2reverse_internal, NULL, NULL,
 			2, 2, 4, 4)
 
 BUILTIN_TRANSFORMATION ("INTERNAL", "UNICODEBIG//", 1,
 			"=INTERNAL->ucs2reverse",
-			__gconv_transform_internal_ucs2reverse, NULL,
+			__gconv_transform_internal_ucs2reverse, NULL, NULL,
 			4, 4, 2, 2)
 #endif
--- libc/iconv/skeleton.c.jj	2002-12-02 22:49:35.000000000 +0100
+++ libc/iconv/skeleton.c	2003-11-11 11:56:37.000000000 +0100
@@ -339,6 +339,9 @@ gconv_init (struct __gconv_step *step)
 #ifdef FROM_ONEBYTE
       step->__btowc_fct = FROM_ONEBYTE;
 #endif
+#ifdef FROM_PREVMB
+      step->__prevmb_fct = FROM_PREVMB;
+#endif
     }
   else if (__builtin_expect (strcmp (step->__to_name, CHARSET_NAME), 0) == 0)
     {
--- libc/iconv/loop.c.jj	2003-06-11 23:38:13.000000000 +0200
+++ libc/iconv/loop.c	2003-11-11 12:05:59.000000000 +0100
@@ -46,6 +46,8 @@
 
      ONEBYTE_BODY	body of the specialized conversion function for a
 			single byte from the current character set to INTERNAL.
+     PREVMB_BODY	body of the specialized function for searching backwards
+			for the start of a multi-byte character.
 */
 
 #include <assert.h>
@@ -471,6 +473,14 @@ gconv_btowc (struct __gconv_step *step, 
 #endif
 
 
+#ifdef PREVMB_BODY
+static const unsigned char *
+gconv_prevmb (const unsigned char *ptr, const unsigned char *first)
+  PREVMB_BODY
+# define FROM_PREVMB gconv_prevmb
+#endif
+
+
 /* We remove the macro definitions so that we can include this file again
    for the definition of another function.  */
 #undef MIN_NEEDED_INPUT
@@ -484,6 +494,7 @@ gconv_btowc (struct __gconv_step *step, 
 #undef INIT_PARAMS
 #undef UPDATE_PARAMS
 #undef ONEBYTE_BODY
+#undef PREVMB_BODY
 #undef UNPACK_BYTES
 #undef CLEAR_STATE
 #undef LOOP_NEED_STATE
--- libc/iconv/gconv_cache.c.jj	2003-06-11 23:38:47.000000000 +0200
+++ libc/iconv/gconv_cache.c	2003-11-11 19:52:25.000000000 +0100
@@ -205,6 +205,7 @@ find_module (const char *directory, cons
 
       /* These settings can be overridden by the init function.  */
       result->__btowc_fct = NULL;
+      result->__prevmb_fct = NULL;
       result->__data = NULL;
 
       /* Call the init function.  */
--- libc/iconv/gconv_db.c.jj	2003-06-11 23:31:59.000000000 +0200
+++ libc/iconv/gconv_db.c	2003-11-11 19:53:39.000000000 +0100
@@ -269,6 +269,7 @@ gen_steps (struct derivation_step *best,
 
 	      /* These settings can be overridden by the init function.  */
 	      result[step_cnt].__btowc_fct = NULL;
+	      result[step_cnt].__prevmb_fct = NULL;
 
 	      /* Call the init function.  */
 	      if (result[step_cnt].__init_fct != NULL)
@@ -358,6 +359,7 @@ increment_counter (struct __gconv_step *
 
 	      /* These settings can be overridden by the init function.  */
 	      step->__btowc_fct = NULL;
+	      step->__prevmb_fct = NULL;
 	    }
 
 	  /* Call the init function.  */
--- libc/posix/tst-regex.c.jj	2001-07-06 06:55:38.000000000 +0200
+++ libc/posix/tst-regex.c	2003-11-11 18:57:30.000000000 +0100
@@ -1,4 +1,4 @@
-/* Copyright (C) 2001 Free Software Foundation, Inc.
+/* Copyright (C) 2001, 2003 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -44,10 +44,13 @@ static iconv_t cd;
 static char *mem;
 static char *umem;
 static size_t memlen;
+static size_t umemlen;
 
-static int test_expr (const char *expr, int expected);
+static int test_expr (const char *expr, int expected, int expectedicase);
 static int run_test (const char *expr, const char *mem, size_t memlen,
-		     int expected);
+		     int icase, int expected);
+static int run_test_backwards (const char *expr, const char *mem,
+			       size_t memlen, int icase, int expected);
 
 
 int
@@ -78,7 +81,7 @@ main (void)
   if (mem == NULL)
     error (EXIT_FAILURE, errno, "while allocating buffer");
 
-  if (read (fd, mem, memlen) != memlen)
+  if ((size_t) read (fd, mem, memlen) != memlen)
     error (EXIT_FAILURE, 0, "cannot read entire file");
   mem[memlen] = '\0';
 
@@ -102,6 +105,7 @@ main (void)
   outmem = umem;
   outlen = 2 * memlen - 1;
   iconv (cd, &inmem, &inlen, &outmem, &outlen);
+  umemlen = outmem - umem;
   if (inlen != 0)
     error (EXIT_FAILURE, errno, "cannot convert buffer");
 
@@ -116,11 +120,11 @@ main (void)
 
   /* Run the actual tests.  All tests are run in a single-byte and a
      multi-byte locale.  */
-  result = test_expr ("[äáàâéèêíìîñöóòôüúùû]", 2);
-  result |= test_expr ("G.ran", 2);
-  result |= test_expr ("G.\\{1\\}ran", 2);
-  result |= test_expr ("G.*ran", 3);
-  result |= test_expr ("[äáàâ]", 0);
+  result = test_expr ("[äáàâéèêíìîñöóòôüúùû]", 2, 2);
+  result |= test_expr ("G.ran", 2, 3);
+  result |= test_expr ("G.\\{1\\}ran", 2, 3);
+  result |= test_expr ("G.*ran", 3, 44);
+  result |= test_expr ("[äáàâ]", 0, 0);
 
   /* Free the resources.  */
   free (umem);
@@ -132,7 +136,7 @@ main (void)
 
 
 static int
-test_expr (const char *expr, int expected)
+test_expr (const char *expr, int expected, int expectedicase)
 {
   int result;
   char *inmem;
@@ -146,7 +150,14 @@ test_expr (const char *expr, int expecte
     error (EXIT_FAILURE, 0, "cannot set locale de_DE.ISO-8859-1");
 
   printf ("\nTest \"%s\" with 8-bit locale\n", expr);
-  result = run_test (expr, mem, memlen, expected);
+  result = run_test (expr, mem, memlen, 0, expected);
+  printf ("\nTest \"%s\" with 8-bit locale, case insensitive\n", expr);
+  result |= run_test (expr, mem, memlen, 1, expectedicase);
+  printf ("\nTest \"%s\" backwards with 8-bit locale\n", expr);
+  result |= run_test_backwards (expr, mem, memlen, 0, expected);
+  printf ("\nTest \"%s\" backwards with 8-bit locale, case insensitive\n",
+	  expr);
+  result |= run_test_backwards (expr, mem, memlen, 1, expectedicase);
 
   /* Second test: search with an UTF-8 locale.  */
   if (setlocale (LC_ALL, "de_DE.UTF-8") == NULL)
@@ -163,14 +174,22 @@ test_expr (const char *expr, int expecte
 
   /* Run the tests.  */
   printf ("\nTest \"%s\" with multi-byte locale\n", expr);
-  result |= run_test (uexpr, umem, 2 * memlen - outlen, expected);
+  result |= run_test (uexpr, umem, umemlen, 0, expected);
+  printf ("\nTest \"%s\" with multi-byte locale, case insensitive\n", expr);
+  result |= run_test (uexpr, umem, umemlen, 1, expectedicase);
+  printf ("\nTest \"%s\" backwards with multi-byte locale\n", expr);
+  result |= run_test_backwards (uexpr, umem, umemlen, 0, expected);
+  printf ("\nTest \"%s\" backwards with multi-byte locale, case insensitive\n",
+	  expr);
+  result |= run_test_backwards (uexpr, umem, umemlen, 1, expectedicase);
 
   return result;
 }
 
 
 static int
-run_test (const char *expr, const char *mem, size_t memlen, int expected)
+run_test (const char *expr, const char *mem, size_t memlen, int icase,
+	  int expected)
 {
 #ifdef _POSIX_CPUTIME
   struct timespec start;
@@ -186,7 +205,7 @@ run_test (const char *expr, const char *
     use_clock = clock_gettime (cl, &start) == 0;
 #endif
 
-  err = regcomp (&re, expr, REG_NEWLINE);
+  err = regcomp (&re, expr, REG_NEWLINE | (icase ? REG_ICASE : 0));
   if (err != REG_NOERROR)
     {
       char buf[200];
@@ -257,3 +276,97 @@ run_test (const char *expr, const char *
      expect.  */
   return cnt != expected;
 }
+
+/* Search MEM backwards for EXPR; nonzero if match count != EXPECTED.  */
+static int
+run_test_backwards (const char *expr, const char *mem, size_t memlen,
+		    int icase, int expected)
+{
+#ifdef _POSIX_CPUTIME
+  struct timespec start;
+  struct timespec finish;
+#endif
+  struct re_pattern_buffer re;
+  const char *err;
+  size_t offset;  /* where the next backward search begins */
+  int cnt;  /* matches found so far */
+
+#ifdef _POSIX_CPUTIME
+  if (use_clock)
+    use_clock = clock_gettime (cl, &start) == 0;
+#endif
+
+  re_set_syntax ((RE_SYNTAX_POSIX_BASIC & ~RE_DOT_NEWLINE)
+		 | RE_HAT_LISTS_NOT_NEWLINE
+		 | (icase ? RE_ICASE : 0));
+
+  memset (&re, 0, sizeof (re));
+  re.fastmap = malloc (256);
+  if (re.fastmap == NULL)
+    error (EXIT_FAILURE, errno, "cannot allocate fastmap");
+
+  err = re_compile_pattern (expr, strlen (expr), &re);
+  if (err != NULL)
+    error (EXIT_FAILURE, 0, "cannot compile expression: %s", err);
+
+  if (re_compile_fastmap (&re))
+    error (EXIT_FAILURE, 0, "couldn't compile fastmap");
+
+  cnt = 0;
+  offset = memlen;  /* begin at the end of the buffer */
+  assert (mem[memlen] == '\0');  /* the EP scan below stops on this NUL */
+  while (offset <= memlen)  /* false once OFFSET has wrapped around */
+    {
+      int start;  /* match offset; shadows the outer timespec `start' */
+      const char *sp;
+      const char *ep;
+      /* A negative range makes re_search scan backwards from OFFSET.  */
+      start = re_search (&re, mem, memlen, offset, -offset, NULL);
+      if (start == -1)
+	break;  /* nothing (more) matched */
+
+      if (start == -2)
+	error (EXIT_FAILURE, 0, "internal error in re_search");
+      /* Widen the match position to the whole line that contains it.  */
+      sp = mem + start;
+      while (sp > mem && sp[-1] != '\n')
+	--sp;
+
+      ep = mem + start;
+      while (*ep != '\0' && *ep != '\n')
+	++ep;
+
+      printf ("match %d: \"%.*s\"\n", ++cnt, (int) (ep - sp), sp);
+      /* Continue before this line; the value wraps when SP == MEM.  */
+      offset = sp - 1 - mem;
+    }
+
+  regfree (&re);
+
+#ifdef _POSIX_CPUTIME
+  if (use_clock)
+    {
+      use_clock = clock_gettime (cl, &finish) == 0;
+      if (use_clock)
+	{
+	  if (finish.tv_nsec < start.tv_nsec)
+	    {
+	      finish.tv_nsec -= start.tv_nsec - 1000000000;  /* borrow 1 s */
+	      finish.tv_sec -= 1 + start.tv_sec;
+	    }
+	  else
+	    {
+	      finish.tv_nsec -= start.tv_nsec;
+	      finish.tv_sec -= start.tv_sec;
+	    }
+
+	  printf ("elapsed time: %ld.%09ld sec\n",
+		  finish.tv_sec, finish.tv_nsec);
+	}
+    }
+#endif
+
+  /* Return nonzero if the number of matches found is not what we
+     expect.  */
+  return cnt != expected;
+}
--- libc/posix/regex_internal.c.jj	2003-11-11 17:35:49.000000000 +0100
+++ libc/posix/regex_internal.c	2003-11-11 19:29:19.000000000 +0100
@@ -18,6 +18,11 @@
    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
    02111-1307 USA.  */
 
+#ifdef _LIBC
+# include <wcsmbs/wcsmbsload.h>
+# include <dlfcn.h>
+#endif
+
 static void re_string_construct_common (const char *str, int len,
 					re_string_t *pstr,
 					RE_TRANSLATE_TYPE trans, int icase);
@@ -432,10 +437,42 @@ re_string_reconstruct (pstr, idx, eflags
 	  if (MB_CUR_MAX > 1)
 	    {
 	      int wcs_idx;
-	      wint_t wc;
-	      pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
-	      for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
-		pstr->wcs[wcs_idx] = WEOF;
+	      wint_t wc = WEOF;
+# ifdef _LIBC
+	      const struct gconv_fcts *fcts;
+  
+	      /* Get the conversion functions.  */
+	      fcts = get_gconv_fcts (_NL_CURRENT_DATA (LC_CTYPE));
+
+	      if (__builtin_expect (fcts->towc_nsteps == 1, 1)
+		  && __builtin_expect (fcts->towc->__prevmb_fct != NULL, 1))
+		{
+		  /* Use the shortcut function.  */
+		  const char *prev, *raw;
+		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
+		  prev = DL_CALL_FCT (fcts->towc->__prevmb_fct,
+				      (raw + offset, raw + pstr->valid_len));
+		  if (prev != NULL)
+		    {
+		      mbstate_t cur_state;
+		      wchar_t wc2;
+
+		      memset (&cur_state, 0, sizeof (cur_state));
+		      if (mbrtowc (&wc2, prev, raw + offset - prev, &cur_state)
+			  == raw + offset - prev)
+			{
+			  memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
+			  wc = wc2;
+			}
+		    }
+		}
+# endif
+	      if (wc == WEOF)
+		{
+		  pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
+		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
+		    pstr->wcs[wcs_idx] = WEOF;
+		}
 	      if (pstr->trans && wc <= 0xff)
 		wc = pstr->trans[wc];
 	      pstr->tip_context = (IS_WIDE_WORD_CHAR (wc) ? CONTEXT_WORD

	Jakub


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]