
[RFC] improve generic strlen.


Hi

As I promised earlier, here is a working version of the improved strlen.
For long strings on x64 it is twice as fast as the current C version, or
50% slower when each byte has a 1/128 probability of being character 128
(which makes the cheap check in the main loop report false positives).

For simplicity it assumes unaligned loads are available. For targets that
only allow aligned loads I would need to add a define that masks off the
first bytes of an aligned load.
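
As a rough sketch of that idea (reusing LOAD, ONES and HIGH_BITS from the
code below; little-endian only; off and mfirst are made-up names), the
aligned variant of the first check could look something like this:

  /* Sketch: align down and force the OFF bytes before S to 0xff so
     they cannot look like the terminating zero.  */
  uintptr_t off = ((uintptr_t) s) % VEC_SIZE;
  vec_t first = LOAD (s - off);
  first |= ((vec_t) 1 << (8 * off)) - 1;
  vec_t mfirst = (first - ONES) & (~first) & HIGH_BITS;
  if (mfirst)
    return ffsl (mfirst) / 8 - 1 - off;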

Could you try this on ARM or another architecture to see what improvement
is possible?
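
For a quick comparison against the libc strlen, a throwaway harness along
these lines should do; the buffer size, iteration count and the name
new_strlen are only placeholders (build the code below with strlen renamed
to new_strlen):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

size_t new_strlen (const char *);  /* the function below, renamed */

int
main (void)
{
  size_t len = 1 << 20;
  char *buf = malloc (len + 1);
  memset (buf, 'a', len);
  buf[len] = '\0';

  struct timespec t0, t1;
  size_t r = 0;

  clock_gettime (CLOCK_MONOTONIC, &t0);
  for (int i = 0; i < 1000; i++)
    r += new_strlen (buf);
  clock_gettime (CLOCK_MONOTONIC, &t1);

  double ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
  printf ("%zu bytes: %.2f bytes/ns (checksum %zu)\n",
          len, 1000.0 * len / ns, r);
  return 0;
}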

Also, while the header is the most important part, it still needs
platform-specific optimizations, so if this turns out to be an improvement
the next step is to take the generated assembly and fix gcc's mistakes.

#include <string.h>
#undef strlen
#include <stdint.h>

/* ffsl is a GNU extension; a standalone build may need _GNU_SOURCE
   defined before the includes for <string.h> to declare it.  */

typedef uint64_t vec_t;
#define VEC_SIZE sizeof (vec_t)
#define ITER_SIZE (4 * VEC_SIZE)

#define LOAD(x) (*((vec_t *) (x)))

#define ONES (((vec_t) ~0) / 255)  /* 0x0101...01.  */
#define HIGH_BITS (128 * ONES)     /* 0x8080...80.  */
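
/* A word S contains a zero byte iff (S - ONES) & ~S & HIGH_BITS is
   nonzero; on little-endian the lowest set bit of that value falls in
   the first zero byte, so its index is ffsl () / 8 - 1.  */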

size_t
strlen (const char *s)
{
  vec_t m0, m1, m2, m3;
  vec_t s0, s1, s2, s3;
  vec_t m;

  const char *s_start = s;
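  /* The unaligned header below may read up to ITER_SIZE bytes starting
     at S.  Only do that when the read stays within the current page
     (assuming a minimum page size of 4096); otherwise check the first
     ITER_SIZE bytes one at a time.  */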
  if (((uintptr_t) s) % 4096 < 4096 - ITER_SIZE)
    {
      s0 = LOAD (s);
      m0 = (s0 - ONES) & (~s0) & HIGH_BITS;
      if (m0)
	return s - s_start - 1 + ffsl (m0) / 8;

      s1 = LOAD (s + VEC_SIZE);
      m1 = (s1 - ONES) & (~s1) & HIGH_BITS;
      if (m1)
	return s - s_start + VEC_SIZE - 1 + ffsl (m1) / 8;

      s2 = LOAD (s + 2 * VEC_SIZE);
      m2 = (s2 - ONES) & (~s2) & HIGH_BITS;
      if (m2)
	return s - s_start + 2 * VEC_SIZE - 1 + ffsl (m2) / 8;

      s3 = LOAD (s + 3 * VEC_SIZE);
      m3 = (s3 - ONES) & (~s3) & HIGH_BITS;
      if (m3)
	return s - s_start + 3 * VEC_SIZE - 1 + ffsl (m3) / 8;
    }
  else
    {
      int i;
      for (i = 0; i < ITER_SIZE; i++)
	if (!s[i])
	  return i;
    }


  /* Round S down to an ITER_SIZE boundary.  The first ITER_SIZE bytes
     were already checked above, so the first loop iteration may re-check
     some of them but never skips any.  */
  s = (const char *) (((uintptr_t) s) & (~(ITER_SIZE - 1)));

  while (1)
    {
      s += ITER_SIZE;

      s0 = LOAD (s);
      s1 = LOAD (s + VEC_SIZE);
      s2 = LOAD (s + 2 * VEC_SIZE);
      s3 = LOAD (s + 3 * VEC_SIZE);
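      /* Cheaper combined test; S ^ (S - ONES) also fires on 0x80
         bytes, so a hit is re-checked below with the exact test.  */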
      m0 = s0 ^ (s0 - ONES);
      m1 = s1 ^ (s1 - ONES);
      m2 = s2 ^ (s2 - ONES);
      m3 = s3 ^ (s3 - ONES);
      m = (m0 | m1 | m2 | m3) & HIGH_BITS;

      if (m)
	{
	  /* Without this barrier gcc messes up the loop: it spills
	     registers to keep intermediate values, easily wasting 99
	     cycles over 99 iterations just to save a few cycles in the
	     last one.  */
	  asm volatile ("" : : : "memory");
	  s0 = LOAD (s);
	  m0 = (s0 - ONES) & (~s0) & HIGH_BITS;
	  if (m0)
	    return s - s_start - 1 + ffsl (m0) / 8;

	  s1 = LOAD (s + VEC_SIZE);
	  m1 = (s1 - ONES) & (~s1) & HIGH_BITS;
	  if (m1)
	    return s - s_start + VEC_SIZE - 1 + ffsl (m1) / 8;

	  s2 = LOAD (s + 2 * VEC_SIZE);
	  m2 = (s2 - ONES) & (~s2) & HIGH_BITS;
	  if (m2)
	    return s - s_start + 2 * VEC_SIZE - 1 + ffsl (m2) / 8;

	  s3 = LOAD (s + 3 * VEC_SIZE);
	  m3 = (s3 - ONES) & (~s3) & HIGH_BITS;
	  if (m3)
	    return s - s_start + 3 * VEC_SIZE - 1 + ffsl (m3) / 8;
	}
    }
}

