This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] Remove use of SSE4.2 functions for strstr on i686


On Mon, Dec 16, 2013 at 02:04:44PM +1000, Allan McRae wrote:
> From: Andreas Schwab <schwab@suse.de>
> 
> The SSE4.2 have been removed from x86_64 by commit 584b18eb.  This patch
> fixes the build on i686, which attempts to use the removed files.
> ---
> 
> This patch is focused on fixing the i686 build.  We probably want to
> eventually use the SSE2 implementations that were added.
> 
> This patch turned out to be exactly the one used on openSUSE to work
> around the issue with unaligned input [1].  Looking at that, I guessed
> that Andreas Schwab was the author.  Andreas, can you confirm?
> 
Ok, with that

For optimized implementation there could be various levels of
optimization. I modified a template which was used for 64bit version to 32 bit one.

There is some work to do, modifying ifunc and adding strstr_string
fallback.

Do we need assembly on i686 or will this be enough?

A template is here:

#include <emmintrin.h>
#include <stdint.h>
#include <stdlib.h>

typedef __m128i tp_vector;
typedef uint32_t tp_mask;

#define LOAD(x) _mm_load_si128 ((tp_vector *) (x))
#define LOADU(x) _mm_loadu_si128 ((tp_vector *) (x))
#define MIN _mm_min_epu8

#define EQ  _mm_cmpeq_epi8

#define OR  _mm_or_si128
#define XOR  _mm_xor_si128

#define BROADCAST(x) _mm_set1_epi8 (x)
#define get_mask(x) ((uint32_t) _mm_movemask_epi8 (x))

/* Align VALUE down by ALIGN bytes.  */
#define ALIGN_DOWN(value, align) \
       ALIGN_DOWN_M1(value, align - 1)
/* Align VALUE down by ALIGN_M1 + 1 bytes.
   Useful if you have precomputed ALIGN - 1.  */
#define ALIGN_DOWN_M1(value, align_m1) \
       (void *)((uintptr_t)(value) \
                & ~(uintptr_t)(align_m1))

/* Align VALUE up by ALIGN bytes.  */
#define ALIGN_UP(value, align) \
       ALIGN_UP_M1(value, align - 1)
/* Align VALUE up by ALIGN_M1 + 1 bytes.
   Useful if you have precomputed ALIGN - 1.  */
#define ALIGN_UP_M1(value, align_m1) \
       (void *)(((uintptr_t)(value) + (uintptr_t)(align_m1)) \
                & ~(uintptr_t)(align_m1))


#define PREVCHAR(x,y,z) LOADU(x)

static inline tp_mask
first_bit (tp_mask x)
{
  return __builtin_ctzl (x);
}

static inline tp_mask
forget_first_bit (tp_mask x)
{
  return x & (x - 1);
}

static inline int
strcmp2 (char *x, char *y, long *buy)
{
  int i = 0;
  while (1)
    {
      if (!y[i])
	return 0;

      if (x[i] != y[i])
	{
	  buy += i; return 1;
	}
      i++;
    }
}
char *strchr (char *s, int n);

char *strstr_string (char *a, char *b);

unsigned char *
strstr_new (unsigned char *s, unsigned char *n)
{
  long i;
  tp_mask m;

  unsigned char *s_or, *s_al;
  if (__builtin_expect (!n[0], 0))
    return s;

  if (!n[1])
    return strchr (s, n[0]);

  tp_vector v0 = BROADCAST (n[0]), v1 = BROADCAST (n[1]), vz = BROADCAST (0);
  tp_vector vs0, vs1, vs2, vs3, vsb;
  long buy;
  long unused;
  if (__builtin_expect ((((size_t) s) & 4095) <= 4096 - 33, 1))
    {
      vs0 = LOADU (s);
      vs1 = LOADU (s + 16);
      vs1 = OR (MIN (EQ (vs0, v0), EQ (v1, LOADU (s + 16 + 1))), EQ (vs1, vz));
      vs1 = OR (MIN (EQ (vs1, v0), EQ (v1, LOADU (s + 16 + 1))), EQ (vs1, vz));
      m = get_mask (vs0) | (get_mask (vs1) << 16);

      while (m)
	{
	  unsigned char *r = s + first_bit (m);
	  if (!*r)
	    return NULL;

	  if (!strcmp2 (r + 2, n + 2, &unused))
	    return r;

	  m = forget_first_bit (m);
	}
    }
  else
    {
      s_al = ALIGN_DOWN (s, 32);

      vs0 = LOAD (s_al);
      vs1 = LOAD (s_al + 16);
      vs0 = OR (MIN (EQ (vs0, v1), EQ (v0, LOADU (s_al - 1))), EQ (vs0, vz));
      vs1 = OR (MIN (EQ (vs1, v1), EQ (v0, LOADU (s_al + 16 - 1))), EQ (vs1, vz));
      tp_mask m = get_mask (vs0) | (get_mask (vs1) << 16);
      m = m >> (s - s_al);
      while (m)
	{
	  unsigned char *r = s + first_bit (m);
	  if (!*r)
	    return NULL;

	  if (r != s && !strcmp2 (r + 2, n + 2, &unused))
	    return r;

	  m = forget_first_bit (m);
	}
    }

  buy = -512;
  s_or = s;
  s = ALIGN_DOWN (s + 32, 32);

  while (1)
    {
      vs0 = LOAD (s);
      vs1 = LOAD (s + 16);
      tp_vector ret = vs0, ret2;
      ret = MIN (ret, vs1);
      vs0 = OR (XOR (PREVCHAR (s - 1, vs3, vs0), v0), XOR (vs0, v1));
      vs1 = OR (XOR (PREVCHAR (s - 1 + 16, vs0, vs1), v0), XOR (vs1, v1));

      ret2 = MIN (vs0, vs1);
      ret = MIN (ret, ret2);

      if (get_mask (EQ (ret, vz)))
	{
	  tp_mask m = get_mask (EQ(vs0, vz)) | (get_mask (EQ(vs1, vz)) << 16);
	  while (m)
	    {
	      unsigned char *r = s + first_bit (m);
	      if (!*r)
		return NULL;

	      if (!strcmp2 (r + 1, n + 2, &buy))
		return r - 1;

	      if (buy > s - s_or)
		return strstr_string (s, n);

	      m = forget_first_bit (m);
	    }
	}
      s += 32;
    }
}


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]