This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH] powerpc: strstr optimization
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: Rajalakshmi Srinivasaraghavan <raji at linux dot vnet dot ibm dot com>
- Cc: libc-alpha at sourceware dot org, sjmunroe at us dot ibm dot com
- Date: Fri, 1 May 2015 14:05:10 +0200
- Subject: Re: [PATCH] powerpc: strstr optimization
- Authentication-results: sourceware.org; auth=none
- References: <1429254584-124997-1-git-send-email-raji at linux dot vnet dot ibm dot com>
On Fri, Apr 17, 2015 at 02:09:44AM -0500, Rajalakshmi Srinivasaraghavan wrote:
> This patch optimizes strstr function for power >= 7 systems. Performance
> gain is obtained using doubleword aligned memory access and usage of cmpb
> instruction for quicker comparison. The average improvement of this
> optimization is ~40%.
As I did the same for x64 I will add some improvements; I didn't look at the
patch in detail.
The first problem is that there looks to be a quadratic worst case. That is
solved by the buy-or-rent technique: you count the number of comparisons,
and if that exceeds C*N — where N is the number of haystack bytes read so far
and C is a suitable constant — then you switch to a slower generic algorithm.
The main improvement in the x64 case was checking for a pair of characters.
A pair of characters is less likely than a single character, so even a
vectorized branch would rarely be triggered, saving the misprediction cost
that dominates performance.
That improved performance about 25 times.
To check for a pair with arithmetic only, you look for zero bytes in the
following expression:
(LOAD (haystack + n) ^ BROADCAST(needle[0])) | (LOAD (haystack + n + 1) ^ BROADCAST(needle[1]))
Then you need to do technical improvements.
One is unrolling to test 64 bytes at once; you could reduce that to 32/16
if it improves performance. The hot portion is the header — the loop is
rarely used — so you should check the first 64 bytes without going to the
main loop.
Next is that you need to load vector twice, once for first character and
then for second character. You need to have loads corresponding to
second character aligned. For first character you could use unaligned
loads if they are fast or get these by bit fiddling from second
character loads. This way you don't have to worry about crossing page
boundaries.
Here is a template that I used to generate the x64 version; you could use it
with modifications. The main problem is mapping suitable intrinsics, or
replacing these with corresponding powerpc instructions.
#include <emmintrin.h>
#include <stdint.h>
#include <stdlib.h>
#include "../header.h"
/* PREVCHAR (s, va, vb) yields the bytes one position before those of the
   vector vb.  With SSSE3 it is formed registers-only from the previous
   chunk va and vb (presumably a palignr-style byte concatenation with
   shift 15 — TODO confirm CONCAT's semantics in ../header.h); otherwise
   it falls back to an unaligned load at s.  */
#ifdef AS_SSSE3
# define PREVCHAR(s, va, vb) CONCAT (va, vb, 15)
#else
# define PREVCHAR(s, va, vb) LOADU (s)
#endif
/* Drop the lowest set bit of MASK so iteration advances to the next
   candidate position (the classic x & (x - 1) trick).  */
static inline tp_mask
forget_first_bit (tp_mask mask)
{
  return mask & (mask - 1);
}
/* Verify that the needle tail Y (up to its terminating NUL) is a prefix
   of X.  Returns 0 on a full match, 1 on the first mismatch.  On a
   mismatch the number of characters that did match is added to *BUY so
   the caller can charge the comparison cost against its buy-or-rent
   budget and bail out of a quadratic worst case.  */
static inline int
strcmp2 (char *x, char *y, long *buy)
{
  int i = 0;
  while (1)
    {
      if (!y[i])
        return 0;
      if (x[i] != y[i])
        {
          /* Fix: the original wrote `buy += i`, which advanced the local
             pointer instead of the counter it points to, so the caller's
             buy-or-rent cutoff never accumulated any cost.  */
          *buy += i;
          return 1;
        }
      i++;
    }
}
/* External fallbacks: strchr handles one-byte needles, strstr_string is
   the generic algorithm used once the buy-or-rent budget is exhausted.
   NOTE(review): this strchr prototype drops const relative to the
   standard declaration — fine for a standalone template, but it would
   conflict if <string.h> were included.  */
char *strchr (char *s, int n);
char *strstr_string (char *a, char *b);
/* Template for a vectorized strstr: scan haystack S for needle N by
   matching the first two needle bytes (and the NUL terminator) with
   16-byte vector compares, then verifying candidates with strcmp2.
   A buy-or-rent counter (buy) bounds verification work so the search
   falls back to strstr_string before behavior becomes quadratic.
   NOTE(review): tp_vector/tp_mask and LOAD/LOADU/BROADCAST/EQ/MIN/OR/
   XOR/get_mask/first_bit/ALIGN_DOWN/PREVCHAR come from ../header.h;
   SSE2-style 16-byte semantics are assumed — confirm when porting.  */
unsigned char *
strstr_new (unsigned char *s, unsigned char *n)
{
long i;
/* NOTE(review): i and vsb below appear unused in this template.  */
unsigned char *s_or, *s_al;
/* Empty needle matches at the start of the haystack.  */
if (__builtin_expect (!n[0], 0))
return s;
/* Single-character needle: plain strchr is the right tool.  */
if (!n[1])
return strchr (s, n[0]);
/* v0/v1 broadcast the first two needle bytes; vz is all-zero for
   detecting the haystack terminator.  */
tp_vector v0 = BROADCAST (n[0]), v1 = BROADCAST (n[1]), vz = BROADCAST (0);
tp_vector vs0, vs1, vs2, vs3, vsb;
long buy;
long unused;
/* Header fast path: if reading 65 bytes from s cannot cross a page
   boundary, probe the first 64 bytes with unaligned loads before
   entering the aligned main loop.  */
if ((((size_t) s) & 4095) <= 4096 - 65)
{
vs0 = LOADU (s);
/*or test three chars. */
/* Mark bytes where (s[k]==n[0] && s[k+1]==n[1]) || s[k]==0; EQ yields
   all-ones on equality, so MIN of the two EQs acts as their AND.  */
vs0 = OR (MIN (EQ (vs0, v0), EQ (v1, LOADU (s + 1))), EQ (vs0, vz));
tp_mask m = get_mask (vs0); //| (get_mask(vs1) << 16) | (get_mask(vs2) <<32) | (get_mask(vs3)<<48);
/* Each set bit is either a pair match or a NUL; verify in order.  */
while (m)
{
unsigned char *r = s + first_bit (m);
if (!*r)
return NULL;
if (!strcmp2 (r + 2, n + 2, &unused))
return r;
m = forget_first_bit (m);
}
/* Same probe for header bytes 16..63.  */
vs1 = LOADU (s + 16);
vs2 = LOADU (s + 32);
vs3 = LOADU (s + 48);
/*or test three chars. */
vs1 = OR (MIN (EQ (vs1, v0), EQ (v1, LOADU (s + 16 + 1))), EQ (vs1, vz));
vs2 = OR (MIN (EQ (vs2, v0), EQ (v1, LOADU (s + 32 + 1))), EQ (vs2, vz));
vs3 = OR (MIN (EQ (vs3, v0), EQ (v1, LOADU (s + 48 + 1))), EQ (vs3, vz));
m = (get_mask (vs1) << 16) | (get_mask (vs2) << 32) | (get_mask (vs3) << 48);
while (m)
{
unsigned char *r = s + first_bit (m);
if (!*r)
return NULL;
if (!strcmp2 (r + 2, n + 2, &unused))
return r;
m = forget_first_bit (m);
}
}
else
{
/* Near a page end: use only aligned loads from the enclosing 64-byte
   block, anchoring the pair test on the SECOND character (EQ with v1)
   and fetching the previous byte for n[0]; bits before s are shifted
   away below.
   NOTE(review): here a set bit r marks the second character, yet the
   code verifies r + 2 and returns r, unlike the main loop which uses
   r + 1 / r - 1 — the offsets look inconsistent; verify before
   reusing this template.  */
s_al = ALIGN_DOWN (s, 64);
vs0 = LOAD (s_al);
vs1 = LOAD (s_al + 16);
vs2 = LOAD (s_al + 32);
vs3 = LOAD (s_al + 48);
/*or test three chars. */
vs0 = OR (MIN (EQ (vs0, v1), EQ (v0, LOADU (s_al - 1))), EQ (vs0, vz));
vs1 = OR (MIN (EQ (vs1, v1), EQ (v0, LOADU (s_al + 16 - 1))), EQ (vs1, vz));
vs2 = OR (MIN (EQ (vs2, v1), EQ (v0, LOADU (s_al + 32 - 1))), EQ (vs2, vz));
vs3 = OR (MIN (EQ (vs3, v1), EQ (v0, LOADU (s_al + 48 - 1))), EQ (vs3, vz));
tp_mask m = get_mask (vs0) | (get_mask (vs1) << 16) | (get_mask (vs2) << 32) | (get_mask (vs3) << 48);
/* Discard candidate bits that precede the true start s.  */
m = m >> (s - s_al);
while (m)
{
unsigned char *r = s + first_bit (m);
if (!*r)
return NULL;
if (r != s && !strcmp2 (r + 2, n + 2, &unused))
return r;
m = forget_first_bit (m);
}
}
/* Main loop setup: start the buy-or-rent budget with 512 units of
   credit and continue from the next 64-byte-aligned block.  */
buy = -512;
s_or = s;
s = ALIGN_DOWN (s + 64, 64);
#ifdef AS_SSSE3
/* Seed vs3 so PREVCHAR can concatenate with the preceding chunk.
   NOTE(review): inside the loop vs3 is reloaded from the CURRENT block
   before PREVCHAR (s - 1, vs3, vs0) consumes it, so under AS_SSSE3 the
   "previous" bytes come from the wrong chunk — confirm load ordering
   when porting.  */
vs3 = LOAD (s - 16);
#endif
while (1)
{
vs0 = LOAD (s);
vs1 = LOAD (s + 16);
vs2 = LOAD (s + 32);
vs3 = LOAD (s + 48);
/*or test three chars. */
/* ret: byte-wise MIN over the 64-byte block — it contains a zero byte
   iff the block holds a NUL terminator.  */
tp_vector ret = vs0;
ret = MIN (ret, vs1);
ret = MIN (ret, vs2);
ret = MIN (ret, vs3);
/* ret2: zero at positions where the previous byte equals n[0] and the
   current byte equals n[1] (XOR is zero on equality; OR combines the
   two conditions).  */
tp_vector ret2 = OR (XOR (PREVCHAR (s - 1, vs3, vs0), v0), XOR (vs0, v1));
ret2 = MIN (ret2, OR (XOR (PREVCHAR (s - 1 + 16, vs0, vs1), v0), XOR (vs1, v1)));
ret2 = MIN (ret2, OR (XOR (PREVCHAR (s - 1 + 32, vs1, vs2), v0), XOR (vs2, v1)));
ret2 = MIN (ret2, OR (XOR (PREVCHAR (s - 1 + 48, vs2, vs3), v0), XOR (vs3, v1)));
ret = MIN (ret, ret2);
/* Any zero byte in ret means a candidate pair or end of string:
   recompute the precise candidate masks for this block.  */
if (get_mask (EQ (ret, vz)))
{
/* s is already 64-aligned inside the loop, so s_al == s here.  */
s_al = ALIGN_DOWN (s, 64);
vs0 = LOAD (s_al);
vs1 = LOAD (s_al + 16);
vs2 = LOAD (s_al + 32);
vs3 = LOAD (s_al + 48);
/*or test three chars. */
vs0 = OR (MIN (EQ (vs0, v1), EQ (v0, LOADU (s_al - 1))), EQ (vs0, vz)); /* todo last three nondestructive */
vs1 = OR (MIN (EQ (vs1, v1), EQ (v0, LOADU (s_al + 16 - 1))), EQ (vs1, vz));
vs2 = OR (MIN (EQ (vs2, v1), EQ (v0, LOADU (s_al + 32 - 1))), EQ (vs2, vz));
vs3 = OR (MIN (EQ (vs3, v1), EQ (v0, LOADU (s_al + 48 - 1))), EQ (vs3, vz));
tp_mask m = get_mask (vs0) | (get_mask (vs1) << 16) | (get_mask (vs2) << 32) | (get_mask (vs3) << 48);
while (m)
{
/* r points at the matched SECOND character (or a NUL byte).  */
unsigned char *r = s + first_bit (m);
if (!*r)
return NULL;
/* Verify the needle tail; the match starts one byte earlier.  */
if (!strcmp2 (r + 1, n + 2, &buy))
return r - 1;
/* Buy-or-rent cutoff: verification cost exceeded bytes scanned,
   so switch to the generic algorithm to stay linear.  */
if (buy > s - s_or)
return strstr_string (s, n);
m = forget_first_bit (m);
}
}
s += 64;
}
}
}