This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [RFC] strcpy optimizations
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Thu, 14 Feb 2013 10:26:42 +0100
- Subject: Re: [RFC] strcpy optimizations
- References: <20130129201542.GA23023@domone.kolej.mff.cuni.cz>
On Tue, Jan 29, 2013 at 09:15:42PM +0100, Ondřej Bílka wrote:
> Hi, I have several questions about strcpy.
>
> Second question:
> Related is that I looked at strcpy implementations and possible improvement
> is handle differently copying 1-15 bytes. I put microbenchmark at
>
Thanks to feedback, I now have a strcpy version that is faster for small
strings and takes 1/3 of the space of strcpy_sse2_unaligned. The header is
probably best for all architectures except Atom. The loop is the same as in
strcpy_sse2_unaligned.
I tried to filter outliers, as they measured the number of pages where copy-on-write took place.
I used the same profiler as for strlen; the results are here.
http://kam.mff.cuni.cz/~ondra/benchmark_string/strcpy_profile.html
I now use C intrinsics to generate strcpy/stpcpy. It will be converted to assembly,
as gcc still has bug 29776. A second reason for assembly is that gcc
pushes and pops rbp and rbx just to store the constants 64 and 63, which are used
only once.
For strncpy I added code for tail optimization, but nothing
else.
http://www.sourceware.org/ml/libc-alpha/2013-02/msg00213.html
Comments?
/* _STR_STP (x, y) selects between the strcpy-style result X (the original
   DEST) and the stpcpy-style result Y (a pointer to the terminating NUL),
   depending on which entry point is being built.  */
#ifdef AS_STPCPY
# define _STR_STP(x, y) y
#else
# define _STR_STP(x, y) x
#endif
#include <emmintrin.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
/* Vector and mask types for the SSE2 implementation: a 16-byte vector and a
   64-bit mask wide enough to describe zero bytes in a 64-byte chunk.  */
typedef __m128i tp_vector;
typedef uint64_t tp_mask;
/* Thin wrappers over the SSE2 intrinsics used below.  */
#define LOAD(x) _mm_load_si128 ((tp_vector *) (x))
#define LOADU(x) _mm_loadu_si128 ((tp_vector *) (x))
#define STORE(x,y) _mm_store_si128 ((tp_vector *) (x), (y))
#define STOREU(x,y) _mm_storeu_si128 ((tp_vector *) (x), (y))
#define MIN _mm_min_epu8
#define EQ _mm_cmpeq_epi8
#define OR _mm_or_si128
#define BROADCAST(x) _mm_set1_epi8 (x)
/* Byte mask of X: bit i is set iff byte i of X has its top bit set.  */
#define get_mask(x) ((uint64_t) _mm_movemask_epi8 (x))

/* Index of the least significant set bit of X; X must be nonzero.
   Use __builtin_ctzll rather than __builtin_ctzl: its operand type
   (unsigned long long) matches the 64-bit tp_mask on every target,
   whereas unsigned long is only 32 bits on ILP32/LLP64 targets and
   would silently truncate the mask.  */
static inline tp_mask first_bit (tp_mask x)
{
  return __builtin_ctzll (x);
}
/* Copies NO bytes (at most 16) from SRC to DEST and returns RET;
   defined at the end of this file.  */
static char *memcpy_small_tail (char *dest, char *src, size_t no, char *ret);
/* RETURN (x) produces the function result: RET for strcpy, X for stpcpy,
   selected by _STR_STP.  The strncpy variant additionally zero-fills the
   remainder of the buffer.
   NOTE(review): memset_tail and endp are referenced here but are not
   defined in this excerpt -- presumably endp is dest + n; verify against
   the strncpy wrapper.  */
#ifdef AS_STRNCPY
# define RETURN(x) return memset_tail (x, 0, endp - (x), \
_STR_STP (ret, x));
#else
# define RETURN(x) return _STR_STP (ret, x)
#endif
/* SSE2 strcpy/stpcpy core.  When AS_STPCPY is defined, the body below is
   the stpcpy entry point itself; otherwise strcpy is a thin wrapper and
   the body belongs to strcpy_tail, which carries the original DEST in RET.
   NOTE(review): the STRCPY name macro is defined outside this excerpt.  */
#ifdef AS_STPCPY
char *STRCPY (char *dest, char *src)
#else
char *STRCPY (char *dest, char *src)
{
return strcpy_tail (dest, src, dest);
}
char *strcpy_tail (char *dest, char *src, char *ret)
#endif
{
/* NOTE(review): s, d and m2 are declared but never used in this version.  */
char *s, *d;
tp_vector vz = BROADCAST (0);
tp_vector v0, v1, v2, v3;
tp_mask m, m2;
int i, no, offset;
page_cont:
;
/* Test if reading the next 64 bytes from SRC would cross a page boundary
   (page size assumed to be 4096 bytes).  */
if ((((uint64_t) src) & (4096 - 1)) > 4096 - 64)
{
/* Near a page end: copy byte by byte until SRC reaches a 64-byte
   boundary, returning early if the terminator is found.  */
offset = ((uint64_t) src) % 64;
for (i = offset; i < 64; i++)
{
dest[i - offset] = src[i - offset];
if (!src[i - offset])
RETURN (dest + i - offset);
}
goto aligned;
}
/* Handle strings up to 16 bytes first.  */
v0 = LOADU (src);
m = get_mask (EQ (v0, vz));
if (m)
{
no = first_bit (m);
#ifdef AS_STRNCPY
/* NOTE(review): endp is not defined in this excerpt; see RETURN above.  */
memset(dest + no, 0, endp - (dest + no) );
#endif
return memcpy_small_tail (dest, src, no + 1, _STR_STP (ret, dest + no));
}
STOREU ( dest, v0);
/* Handle the remaining 48 bytes of the 64-byte header.  */
v1 = LOADU (src + 16);
v2 = LOADU (src + 32);
v3 = LOADU (src + 48);
/* Build a 64-bit mask of zero-byte positions; set the highest bit as a
   sentinel so first_bit always finds a set bit and copying proceeds in
   all cases.  */
m = (get_mask (EQ (v1, vz)) << 16)
| (get_mask (EQ (v2, vz)) << 32)
|( (get_mask (EQ (v3, vz)) | (1 << 15)) << 48);
no = first_bit (m);
/* Store only the 16-byte chunks that lie entirely before the zero byte,
   then one final unaligned 16-byte copy ending exactly at the NUL.  */
if (no + 1 >= 32)
STOREU (dest + 16, v1);
if (no + 1 >= 48)
STOREU (dest + 32, v2);
STOREU (dest + no + 1 - 16, LOADU(src + no + 1 - 16));
/* NO == 63 may be the sentinel rather than a real terminator; only
   return if byte 63 really is zero, otherwise fall into the loop.  */
if (no != 63 || src[63] == 0)
{
RETURN (dest + no);
}
aligned:
;
/* Align loads to 64 bytes; the header above already copied the
   64 - offset bytes that this adjustment skips.  */
offset = ((uint64_t) src) % 64;
dest = dest + 64 - offset;
src = src + 64 - offset;
v0 = LOAD (src + 0);
v1 = LOAD (src + 16);
v2 = LOAD (src + 32);
v3 = LOAD (src + 48);
/* We use aligned loads and unaligned stores.
   This is faster than unaligned loads and aligned stores because a
   test for the load crossing a page is not necessary.  */
while (1)
{
/* If any of the 64 bytes is zero, jump back and let the header code
   handle the tail (SRC is now 64-aligned, so no page test triggers).  */
if (get_mask (EQ (MIN (v0, MIN (v1, MIN (v2, v3))), vz))) goto page_cont;
STOREU (dest + 0, v0);
v0 = LOAD (src + 64);
STOREU (dest + 16, v1);
v1 = LOAD (src + 16 + 64);
STOREU (dest + 32, v2);
v2 = LOAD (src + 32 + 64);
STOREU (dest + 48, v3);
v3 = LOAD (src + 48 + 64);
src += 64;
dest += 64;
}
}
/* Copies NO bytes (1 <= NO <= 16) from SRC to DEST and returns RET.
   The size class is branched on and the copy is done with two fixed-size,
   possibly overlapping copies.  memcpy is used instead of casting to
   wider integer pointers: the casts in the original were undefined
   behavior under strict aliasing and could fault on alignment-checking
   targets, while fixed-size memcpy compiles to the same word moves.
   A possible optimization is to test 1 << no to defer the need of
   first_bit.  */
static char *memcpy_small_tail (char *dest, char *src, size_t no, char *ret)
{
  if (no & (8 + 16))
    {
      /* 8 <= NO <= 16: two 8-byte copies overlapping in the middle.  */
      memcpy (dest, src, 8);
      memcpy (dest + no - 8, src + no - 8, 8);
      return ret;
    }
  if (no & 4)
    {
      /* 4 <= NO <= 7: two overlapping 4-byte copies.  */
      memcpy (dest, src, 4);
      memcpy (dest + no - 4, src + no - 4, 4);
      return ret;
    }
  /* 1 <= NO <= 3: first byte, then an overlapping 2-byte copy.  */
  dest[0] = src[0];
  if (no & 2)
    memcpy (dest + no - 2, src + no - 2, 2);
  return ret;
}