This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH] Remove use of SSE4.2 functions for strstr on i686
- From: OndÅej BÃlka <neleai at seznam dot cz>
- To: Allan McRae <allan at archlinux dot org>
- Cc: libc-alpha at sourceware dot org, schwab at suse dot de
- Date: Mon, 16 Dec 2013 12:28:54 +0100
- Subject: Re: [PATCH] Remove use of SSE4.2 functions for strstr on i686
- Authentication-results: sourceware.org; auth=none
- References: <1387166684-14151-1-git-send-email-allan at archlinux dot org>
On Mon, Dec 16, 2013 at 02:04:44PM +1000, Allan McRae wrote:
> From: Andreas Schwab <schwab@suse.de>
>
> The SSE4.2 have been removed from x86_64 by commit 584b18eb. This patch
> fixes the build on i686, which attempts to use the removed files.
> ---
>
> This patch is focused on fixing the i686 build. We probably want to
> eventually use the SSE2 implementations that were added.
>
> This patch turned out to be exactly the one used on openSUSE to work
> around the issue with unaligned input [1]. Looking at that, I guessed
> that Andreas Schwab was the author. Andreas, can you confirm?
>
Ok, with that
For optimized implementation there could be various levels of
optimization. I modified a template which was used for 64bit version to 32 bit one.
There is some work to do, modifying ifunc and adding strstr_string
fallback.
Do we need assembly on i686 or will this be enough?
A template is here:
#include <emmintrin.h>
#include <stdint.h>
#include <stdlib.h>
typedef __m128i tp_vector;
typedef uint32_t tp_mask;
#define LOAD(x) _mm_load_si128 ((tp_vector *) (x))
#define LOADU(x) _mm_loadu_si128 ((tp_vector *) (x))
#define MIN _mm_min_epu8
#define EQ _mm_cmpeq_epi8
#define OR _mm_or_si128
#define XOR _mm_xor_si128
#define BROADCAST(x) _mm_set1_epi8 (x)
#define get_mask(x) ((uint32_t) _mm_movemask_epi8 (x))
/* Align VALUE down by ALIGN bytes. */
#define ALIGN_DOWN(value, align) \
ALIGN_DOWN_M1(value, align - 1)
/* Align VALUE down by ALIGN_M1 + 1 bytes.
Useful if you have precomputed ALIGN - 1. */
#define ALIGN_DOWN_M1(value, align_m1) \
(void *)((uintptr_t)(value) \
& ~(uintptr_t)(align_m1))
/* Align VALUE up by ALIGN bytes. */
#define ALIGN_UP(value, align) \
ALIGN_UP_M1(value, align - 1)
/* Align VALUE up by ALIGN_M1 + 1 bytes.
Useful if you have precomputed ALIGN - 1. */
#define ALIGN_UP_M1(value, align_m1) \
(void *)(((uintptr_t)(value) + (uintptr_t)(align_m1)) \
& ~(uintptr_t)(align_m1))
#define PREVCHAR(x,y,z) LOADU(x)
static inline tp_mask
first_bit (tp_mask x)
{
return __builtin_ctzl (x);
}
static inline tp_mask
forget_first_bit (tp_mask x)
{
return x & (x - 1);
}
static inline int
strcmp2 (char *x, char *y, long *buy)
{
int i = 0;
while (1)
{
if (!y[i])
return 0;
if (x[i] != y[i])
{
buy += i; return 1;
}
i++;
}
}
char *strchr (char *s, int n);
char *strstr_string (char *a, char *b);
unsigned char *
strstr_new (unsigned char *s, unsigned char *n)
{
long i;
tp_mask m;
unsigned char *s_or, *s_al;
if (__builtin_expect (!n[0], 0))
return s;
if (!n[1])
return strchr (s, n[0]);
tp_vector v0 = BROADCAST (n[0]), v1 = BROADCAST (n[1]), vz = BROADCAST (0);
tp_vector vs0, vs1, vs2, vs3, vsb;
long buy;
long unused;
if (__builtin_expect ((((size_t) s) & 4095) <= 4096 - 33, 1))
{
vs0 = LOADU (s);
vs1 = LOADU (s + 16);
vs1 = OR (MIN (EQ (vs0, v0), EQ (v1, LOADU (s + 16 + 1))), EQ (vs1, vz));
vs1 = OR (MIN (EQ (vs1, v0), EQ (v1, LOADU (s + 16 + 1))), EQ (vs1, vz));
m = get_mask (vs0) | (get_mask (vs1) << 16);
while (m)
{
unsigned char *r = s + first_bit (m);
if (!*r)
return NULL;
if (!strcmp2 (r + 2, n + 2, &unused))
return r;
m = forget_first_bit (m);
}
}
else
{
s_al = ALIGN_DOWN (s, 32);
vs0 = LOAD (s_al);
vs1 = LOAD (s_al + 16);
vs0 = OR (MIN (EQ (vs0, v1), EQ (v0, LOADU (s_al - 1))), EQ (vs0, vz));
vs1 = OR (MIN (EQ (vs1, v1), EQ (v0, LOADU (s_al + 16 - 1))), EQ (vs1, vz));
tp_mask m = get_mask (vs0) | (get_mask (vs1) << 16);
m = m >> (s - s_al);
while (m)
{
unsigned char *r = s + first_bit (m);
if (!*r)
return NULL;
if (r != s && !strcmp2 (r + 2, n + 2, &unused))
return r;
m = forget_first_bit (m);
}
}
buy = -512;
s_or = s;
s = ALIGN_DOWN (s + 32, 32);
while (1)
{
vs0 = LOAD (s);
vs1 = LOAD (s + 16);
tp_vector ret = vs0, ret2;
ret = MIN (ret, vs1);
vs0 = OR (XOR (PREVCHAR (s - 1, vs3, vs0), v0), XOR (vs0, v1));
vs1 = OR (XOR (PREVCHAR (s - 1 + 16, vs0, vs1), v0), XOR (vs1, v1));
ret2 = MIN (vs0, vs1);
ret = MIN (ret, ret2);
if (get_mask (EQ (ret, vz)))
{
tp_mask m = get_mask (EQ(vs0, vz)) | (get_mask (EQ(vs1, vz)) << 16);
while (m)
{
unsigned char *r = s + first_bit (m);
if (!*r)
return NULL;
if (!strcmp2 (r + 1, n + 2, &buy))
return r - 1;
if (buy > s - s_or)
return strstr_string (s, n);
m = forget_first_bit (m);
}
}
s += 32;
}
}