This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



Re: [PATCH] powerpc: strstr optimization


On Fri, Apr 17, 2015 at 02:09:44AM -0500, Rajalakshmi Srinivasaraghavan wrote:
> This patch optimizes strstr function for power >= 7 systems.  Performance
> gain is obtained using doubleword aligned memory access and usage of cmpb
> instruction for quicker comparison.  The average improvement of this
> optimization is ~40%.

As I did the same for x64 I will suggest some improvements; I didn't
look at the patch in detail.

The first problem is that there looks to be a quadratic worst case.
That is solved by the buy-or-rent technique: you count the number of
comparisons, and if that exceeds C*N, where N is the number of haystack
bytes read so far and C is a suitable constant, you switch to a slower
generic algorithm.
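
A minimal scalar sketch of that accounting (the constant C and the
fallback are placeholders rather than anything from the patch; a real
fallback would be a linear-worst-case algorithm such as Two-Way):

#include <string.h>

/* Naive quadratic scan with buy-or-rent accounting: once the comparison
   count exceeds C times the number of haystack positions consumed, give
   up and "rent" a generic algorithm.  Plain strstr stands in here for a
   linear-worst-case routine.  */
static char *
strstr_buy_or_rent (const char *h, const char *n)
{
  const size_t C = 4;		/* placeholder constant */
  size_t comparisons = 0;

  if (n[0] == '\0')
    return (char *) h;

  for (size_t pos = 0; h[pos] != '\0'; pos++)
    {
      size_t i = 0;
      while (n[i] != '\0' && h[pos + i] == n[i])
	i++;
      comparisons += i + 1;
      if (n[i] == '\0')
	return (char *) (h + pos);
      if (comparisons > C * (pos + 1))
	return strstr (h + pos, n);	/* switch to the generic algorithm */
    }
  return NULL;
}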

The main improvement in the x64 case was checking for a pair of
characters. A pair of characters is less likely to occur than a single
character, so even a vectorized branch is rarely triggered, which saves
the misprediction cost that dominates performance.

That improved performance by about 25 times.


To check for the pair with arithmetic only, you look for zero bytes in
the following expression:

(LOAD (haystack + n) ^ BROADCAST(needle[0])) | (LOAD (haystack + n + 1) ^ BROADCAST(needle[1]))
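
On powerpc the cmpb instruction gives you the per-byte equality mask
directly; in portable C on 64-bit doublewords the same check could look
like the sketch below (the names are illustrative only, and memcpy
stands in for the real aligned/unaligned loads):

#include <stdint.h>
#include <string.h>

#define broadcast_byte(c) (0x0101010101010101ULL * (unsigned char) (c))

/* Classic test for a zero byte inside a 64-bit word.  */
static inline uint64_t
has_zero_byte (uint64_t x)
{
  return (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
}

/* Is there any position i in haystack[n .. n+7] where the pair
   needle[0], needle[1] starts?  memcpy keeps the loads well defined;
   real code would use doubleword loads and cmpb.  */
static inline int
pair_candidate (const unsigned char *haystack, size_t n,
                const unsigned char *needle)
{
  uint64_t w0, w1;
  memcpy (&w0, haystack + n, sizeof w0);
  memcpy (&w1, haystack + n + 1, sizeof w1);
  return has_zero_byte ((w0 ^ broadcast_byte (needle[0]))
                        | (w1 ^ broadcast_byte (needle[1]))) != 0;
}

A zero byte at position i means haystack[n + i] == needle[0] and
haystack[n + i + 1] == needle[1], a candidate that still has to be
verified against the rest of the needle.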

Then there are some technical improvements to make.

One is unrolling to test 64 bytes at once; you could keep that at 32/16
if that improves performance. The hot portion is the header and the loop
is rarely used, so you should check the first 64 bytes without going
into the main loop.

Next, you need to load each vector twice, once for the first character
and then for the second character. The loads corresponding to the
second character need to be aligned. For the first character you could
use unaligned loads if they are fast, or get that data by bit fiddling
from the second-character loads. This way you don't have to worry about
crossing page boundaries.
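
For the doubleword case the bit fiddling amounts to the sketch below
(little-endian byte order assumed; the vector equivalent is the
PREVCHAR/CONCAT trick in the template that follows):

#include <stdint.h>

/* Given the aligned doublewords prev = *(uint64_t *) (p - 8) and
   cur = *(uint64_t *) p, build the word covering p - 1 .. p + 6
   without any load crossing the alignment (and hence page) boundary.
   Little-endian byte order assumed.  */
static inline uint64_t
prev_char_word (uint64_t prev, uint64_t cur)
{
  return (cur << 8) | (prev >> 56);
}

The second-character compare then works on cur directly while the
first-character compare works on prev_char_word (prev, cur), mirroring
what XOR (PREVCHAR (...), v0) | XOR (vs, v1) does below.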

Here is the template that I used to generate the x64 version; you could
use it with modifications. The main problem is mapping it to suitable
intrinsics or replacing them with the corresponding powerpc
instructions.


#include <emmintrin.h>
#include <stdint.h>
#include <stdlib.h>
#include "../header.h"

#ifdef AS_SSSE3
# define PREVCHAR(s, va, vb) CONCAT (va, vb, 15)
#else
# define PREVCHAR(s, va, vb) LOADU (s)
#endif

static inline tp_mask
forget_first_bit (tp_mask x)
{
  return x & (x - 1);
}

/* Compare x against the needle tail y; return 0 on a complete match and
   1 on a mismatch.  On a mismatch add the number of matched characters
   to *buy for the buy-or-rent accounting.  */
static inline int
strcmp2 (char *x, char *y, long *buy)
{
  int i = 0;
  while (1)
    {
      if (!y[i])
	return 0;

      if (x[i] != y[i])
	{
	  *buy += i;
	  return 1;
	}
      i++;
    }
}
char *strchr (char *s, int n);

char *strstr_string (char *a, char *b);

unsigned char *
strstr_new (unsigned char *s, unsigned char *n)
{
  long i;
  unsigned char *s_or, *s_al;
  if (__builtin_expect (!n[0], 0))
    return s;

  if (!n[1])
    return strchr (s, n[0]);

  tp_vector v0 = BROADCAST (n[0]), v1 = BROADCAST (n[1]), vz = BROADCAST (0);
  tp_vector vs0, vs1, vs2, vs3, vsb;
  long buy;
  long unused;
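  /* The 65 bytes touched below stay within one page, so the unaligned
     loads here are safe.  */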
  if ((((size_t) s) & 4095) <= 4096 - 65)
    {
      vs0 = LOADU (s);
      /*or test three chars. */
      vs0 = OR (MIN (EQ (vs0, v0), EQ (v1, LOADU (s + 1))), EQ (vs0, vz));
      tp_mask m = get_mask (vs0);    //| (get_mask(vs1) << 16) | (get_mask(vs2) <<32) | (get_mask(vs3)<<48);

      while (m)
	{
	  unsigned char *r = s + first_bit (m);
	  if (!*r)
	    return NULL;

	  if (!strcmp2 (r + 2, n + 2, &unused))
	    return r;

	  m = forget_first_bit (m);
	}

      vs1 = LOADU (s + 16);
      vs2 = LOADU (s + 32);
      vs3 = LOADU (s + 48);
      /*or test three chars. */
      vs1 = OR (MIN (EQ (vs1, v0), EQ (v1, LOADU (s + 16 + 1))), EQ (vs1, vz));
      vs2 = OR (MIN (EQ (vs2, v0), EQ (v1, LOADU (s + 32 + 1))), EQ (vs2, vz));
      vs3 = OR (MIN (EQ (vs3, v0), EQ (v1, LOADU (s + 48 + 1))), EQ (vs3, vz));
      m = (get_mask (vs1) << 16) | (get_mask (vs2) << 32) | (get_mask (vs3) << 48);

      while (m)
	{
	  unsigned char *r = s + first_bit (m);
	  if (!*r)
	    return NULL;

	  if (!strcmp2 (r + 2, n + 2, &unused))
	    return r;

	  m = forget_first_bit (m);
	}
    }
  else
    {
      s_al = ALIGN_DOWN (s, 64);

      vs0 = LOAD (s_al);
      vs1 = LOAD (s_al + 16);
      vs2 = LOAD (s_al + 32);
      vs3 = LOAD (s_al + 48);
      /*or test three chars. */
      vs0 = OR (MIN (EQ (vs0, v1), EQ (v0, LOADU (s_al - 1))), EQ (vs0, vz));
      vs1 = OR (MIN (EQ (vs1, v1), EQ (v0, LOADU (s_al + 16 - 1))), EQ (vs1, vz));
      vs2 = OR (MIN (EQ (vs2, v1), EQ (v0, LOADU (s_al + 32 - 1))), EQ (vs2, vz));
      vs3 = OR (MIN (EQ (vs3, v1), EQ (v0, LOADU (s_al + 48 - 1))), EQ (vs3, vz));
      tp_mask m = get_mask (vs0) | (get_mask (vs1) << 16) | (get_mask (vs2) << 32) | (get_mask (vs3) << 48);
      m = m >> (s - s_al);
      while (m)
	{
	  unsigned char *r = s + first_bit (m);
	  if (!*r)
	    return NULL;

	  /* Here a set bit marks the second character, so the match
	     starts at r - 1, which must not lie before the haystack.  */
	  if (r != s && !strcmp2 (r + 1, n + 2, &unused))
	    return r - 1;

	  m = forget_first_bit (m);
	}
    }

  buy = -512;			/* initial credit before the buy-or-rent cutoff */
  s_or = s;
  s = ALIGN_DOWN (s + 64, 64);
#ifdef AS_SSSE3
  vsb = LOAD (s - 16);		/* previous 16 bytes, feeds PREVCHAR in the first iteration */
#endif
  while (1)
    {
      vs0 = LOAD (s);
      vs1 = LOAD (s + 16);
      vs2 = LOAD (s + 32);
      vs3 = LOAD (s + 48);
      /*or test three chars. */
      tp_vector ret = vs0;
      ret = MIN (ret, vs1);
      ret = MIN (ret, vs2);
      ret = MIN (ret, vs3);
      tp_vector ret2 = OR (XOR (PREVCHAR (s - 1, vsb, vs0), v0), XOR (vs0, v1));
      ret2 = MIN (ret2, OR (XOR (PREVCHAR (s - 1 + 16, vs0, vs1), v0), XOR (vs1, v1)));
      ret2 = MIN (ret2, OR (XOR (PREVCHAR (s - 1 + 32, vs1, vs2), v0), XOR (vs2, v1)));
      ret2 = MIN (ret2, OR (XOR (PREVCHAR (s - 1 + 48, vs2, vs3), v0), XOR (vs3, v1)));
      ret = MIN (ret, ret2);
#ifdef AS_SSSE3
      vsb = vs3;		/* keep the last 16 bytes before vs3 is clobbered below */
#endif
      if (get_mask (EQ (ret, vz)))
	{
	  s_al = ALIGN_DOWN (s, 64);

	  vs0 = LOAD (s_al);
	  vs1 = LOAD (s_al + 16);
	  vs2 = LOAD (s_al + 32);
	  vs3 = LOAD (s_al + 48);
	  /*or test three chars. */
	  vs0 = OR (MIN (EQ (vs0, v1), EQ (v0, LOADU (s_al - 1))), EQ (vs0, vz)); /* todo last three nondestructive */
	  vs1 = OR (MIN (EQ (vs1, v1), EQ (v0, LOADU (s_al + 16 - 1))), EQ (vs1, vz));
	  vs2 = OR (MIN (EQ (vs2, v1), EQ (v0, LOADU (s_al + 32 - 1))), EQ (vs2, vz));
	  vs3 = OR (MIN (EQ (vs3, v1), EQ (v0, LOADU (s_al + 48 - 1))), EQ (vs3, vz));
	  tp_mask m = get_mask (vs0) | (get_mask (vs1) << 16) | (get_mask (vs2) << 32) | (get_mask (vs3) << 48);
	  while (m)
	    {
	      unsigned char *r = s + first_bit (m);
	      if (!*r)
		return NULL;

	      if (!strcmp2 (r + 1, n + 2, &buy))
		return r - 1;

	      if (buy > s - s_or)
		return strstr_string (s, n);

	      m = forget_first_bit (m);
	    }
	}
      s += 64;
    }
}

