This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH] powerpc64: strcpy optimization for unaligned string
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: Rajalakshmi Srinivasaraghavan <raji at linux dot vnet dot ibm dot com>
- Cc: libc-alpha at sourceware dot org
- Date: Thu, 18 Dec 2014 22:13:48 +0100
- Subject: Re: [PATCH] powerpc64: strcpy optimization for unaligned string
- Authentication-results: sourceware.org; auth=none
- References: <1418832071-93495-1-git-send-email-raji at linux dot vnet dot ibm dot com> <5491A9A5 dot 2000400 at linux dot vnet dot ibm dot com>
On Wed, Dec 17, 2014 at 09:34:53PM +0530, Rajalakshmi Srinivasaraghavan wrote:
>
>
> This patch optimizes strcpy for ppc64 for unaligned source or
> destination address. The source or destination address is aligned
> to doubleword and data is shifted based on the alignment and
> added with the previous loaded data to be written as a doubleword.
> For each load, cmpb instruction is used for faster null check.
>
> More combinations of unaligned inputs are also added in benchtest
> to measure the improvement. The new optimization shows 2 to 80% of
> performance improvement for longer string though it does not show
> big difference on string size less than 16 due to additional checks.
>
> This patch is tested on powerpc64 BE and LE and I have also attached
> the benchtest result.
>
As I wrote, the benchtests are suspect. First, retest what happens if you
do not always call strcpy with the same input and output buffers. What
difference does that make in the benchmark?
diff --git a/benchtests/bench-strcpy.c b/benchtests/bench-strcpy.c
index c3ab4cf..0329f60 100644
--- a/benchtests/bench-strcpy.c
+++ b/benchtests/bench-strcpy.c
@@ -71,25 +71,25 @@ SIMPLE_STRCPY (CHAR *dst, const CHAR *src)
typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
static void
-do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
+do_one_test (impl_t *impl, CHAR **dst, CHAR **src,
size_t len __attribute__((unused)))
{
size_t i, iters = INNER_LOOP_ITERS;
timing_t start, stop, cur;
- if (CALL (impl, dst, src) != STRCPY_RESULT (dst, len))
+ if (CALL (impl, dst[0], src[0]) != STRCPY_RESULT (dst[0], len[0]))
{
error (0, 0, "Wrong result in function %s %p %p", impl->name,
- CALL (impl, dst, src), STRCPY_RESULT (dst, len));
+ CALL (impl, dst[0], src[0]), STRCPY_RESULT (dst[0], len));
ret = 1;
return;
}
- if (STRCMP (dst, src) != 0)
+ if (STRCMP (dst[0], src[0]) != 0)
{
error (0, 0,
"Wrong result in function %s dst \"%" sfmt "\" src \"%" sfmt "\"",
- impl->name, dst, src);
+ impl->name, dst[0], src[0]);
ret = 1;
return;
}
@@ -97,7 +97,7 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
TIMING_NOW (start);
for (i = 0; i < iters; ++i)
{
- CALL (impl, dst, src);
+ CALL (impl, dst[i % 16], src[i % 16]);
}
TIMING_NOW (stop);
@@ -109,8 +109,8 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
static void
do_test (size_t align1, size_t align2, size_t len, int max_char)
{
- size_t i;
- CHAR *s1, *s2;
+ size_t i, j;
+ CHAR **s1, **s2;
/* For wcscpy: align1 and align2 here mean alignment not in bytes,
but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
len for wcschr here isn't in bytes but it's number of wchar_t symbols. */
@@ -122,12 +122,17 @@ do_test (size_t align1, size_t align2, size_t len, int max_char)
if ((align2 + len) * sizeof(CHAR) >= page_size)
return;
- s1 = (CHAR *) (buf1) + align1;
- s2 = (CHAR *) (buf2) + align2;
+ s1 = calloc (sizeof (char *), 16);
+ s2 = calloc (sizeof (char *), 16);
+ for (j = 0; j < 16; j++)
+ {
+ s1[j] = ((CHAR *) calloc (align1 + len + 1, sizeof (CHAR))) + align1;
+ s2[j] = ((CHAR *) calloc (align2 + len + 1, sizeof (CHAR))) + align2;
- for (i = 0; i < len; i++)
- s1[i] = 32 + 23 * i % (max_char - 32);
- s1[len] = 0;
+ for (i = 0; i < len; i++)
+ s1[j][i] = 32 + 23 * i % (max_char - 32);
+ s1[j][len] = 0;
+ }
printf ("Length %4zd, alignments in bytes %2zd/%2zd:", len, align1 * sizeof(CHAR), align2 * sizeof(CHAR));