This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [RFC] Clean up SSE variable shifts
- From: Ulrich Drepper <drepper at redhat dot com>
- To: Richard Henderson <rth at twiddle dot net>
- Cc: libc-alpha at sourceware dot org, Hongjiu Lu <hongjiu dot lu at intel dot com>
- Date: Mon, 23 Aug 2010 13:07:36 -0400 (EDT)
- Subject: Re: [RFC] Clean up SSE variable shifts
----- "Richard Henderson" <rth@twiddle.net> wrote:
> And for good measure, here's a third patch which implements a
> variable shift via pshufb.
As a fourth variant try this patch with the following change. Considering the reduced memory footprint (one cache line instead of four) and the improved capabilities for unaligned SSE memory operations in modern CPUs it might be even faster.
diff -u b/sysdeps/x86_64/multiarch/varshift.S b/sysdeps/x86_64/multiarch/varshift.S
--- b/sysdeps/x86_64/multiarch/varshift.S
+++ b/sysdeps/x86_64/multiarch/varshift.S
@@ -26,20 +26,5 @@
- .size ___m128i_shift_right, 256
+ .size ___m128i_shift_right, 31
- .balign 16
___m128i_shift_right:
.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
- .byte 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1
- .byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1
- .byte 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1
- .byte 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1
- .byte 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1
- .byte 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1
- .byte 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1
- .byte 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1
- .byte 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1
- .byte 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
- .byte 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
- .byte 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
- .byte 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
- .byte 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
- .byte 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+ .byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
diff -u b/sysdeps/x86_64/multiarch/varshift.h b/sysdeps/x86_64/multiarch/varshift.h
--- b/sysdeps/x86_64/multiarch/varshift.h
+++ b/sysdeps/x86_64/multiarch/varshift.h
@@ -20,8 +20,8 @@
-extern __m128i ___m128i_shift_right[16] __attribute__((visibility("hidden")));
+extern char ___m128i_shift_right[31] __attribute__((visibility("hidden")));
static __inline__ __m128i
__m128i_shift_right (__m128i value, unsigned long offset)
{
- return _mm_shuffle_epi8 (value, ___m128i_shift_right[offset]);
+ return _mm_shuffle_epi8 (value, _mm_loadu_si128 ((__m128 *) (___m128i_shift_right + offset)));
}
--
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ➧