[PATCH v9 5/6] nss: Optimize nss_hash in nss_hash.c

Siddhesh Poyarekar siddhesh@gotplt.org
Tue May 17 05:11:20 GMT 2022


On 17/05/2022 02:00, Noah Goldstein via Libc-alpha wrote:
> The prior unrolling didn't really do much as it left the dependency
> chain between iterations. Unrolled the loop for 4 so 4x multiplies
> could be pipelined in out-of-order machines.
> 
> Results for __nss_hash
> Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
> 
> Time as Geometric Mean of N=25 runs
> Geometric of all benchmark New / Old: 0.845
>    type, length, New Time, Old Time, New Time / Old Time
>   fixed,      0,    4.019,    3.729,               1.078
>   fixed,      1,     4.95,    5.707,               0.867
>   fixed,      2,    5.152,    5.657,               0.911
>   fixed,      3,    4.641,    5.721,               0.811
>   fixed,      4,    5.551,     5.81,               0.955
>   fixed,      5,    6.525,    6.552,               0.996
>   fixed,      6,    6.711,    6.561,               1.023
>   fixed,      7,    6.715,    6.767,               0.992
>   fixed,      8,    7.874,    7.915,               0.995
>   fixed,      9,    8.888,    9.767,                0.91
>   fixed,     10,    8.959,    9.762,               0.918
>   fixed,     11,    9.188,    9.987,                0.92
>   fixed,     12,    9.708,   10.618,               0.914
>   fixed,     13,   10.393,    11.14,               0.933
>   fixed,     14,   10.628,   12.097,               0.879
>   fixed,     15,   10.982,   12.965,               0.847
>   fixed,     16,   11.851,   14.429,               0.821
>   fixed,     32,   24.334,   34.414,               0.707
>   fixed,     64,   55.618,   86.688,               0.642
>   fixed,    128,  118.261,   224.36,               0.527
>   fixed,    256,  256.183,  538.629,               0.476
> random,      2,   11.194,   11.556,               0.969
> random,      4,   17.516,   17.205,               1.018
> random,      8,   23.501,   20.985,                1.12
> random,     16,   28.131,   29.212,               0.963
> random,     32,   35.436,   38.662,               0.917
> random,     64,    45.74,   58.868,               0.777
> random,    128,   75.394,  121.963,               0.618
> random,    256,  139.524,  260.726,               0.535
> ---
>   nss/nss_hash.c | 79 +++++++++++++++++++++++++++-----------------------
>   1 file changed, 42 insertions(+), 37 deletions(-)
> 
> diff --git a/nss/nss_hash.c b/nss/nss_hash.c
> index 27a348ea9b..c6a375f386 100644
> --- a/nss/nss_hash.c
> +++ b/nss/nss_hash.c
> @@ -19,58 +19,63 @@
>   
>   /* This is from libc/db/hash/hash_func.c, hash3 is static there */
>   /*
> - * This is INCREDIBLY ugly, but fast.  We break the string up into 8 byte
> + * This is INCREDIBLY ugly, but fast.  We break the string up into 4 byte
>    * units.  On the first time through the loop we get the "leftover bytes"
> - * (strlen % 8).  On every other iteration, we perform 8 HASHC's so we handle
> - * all 8 bytes.  Essentially, this saves us 7 cmp & branch instructions.  If
> - * this routine is heavily used enough, it's worth the ugly coding.
> + * (len % 4).  On every other iteration, we perform a 4x unrolled version
> + * HASHC. Further unrolling does not appear to help.
>    *
>    * OZ's original sdbm hash
>    */
>   uint32_t
>   __nss_hash (const void *keyarg, size_t len)
>   {
> +  enum
> +  {
> +    HASH_CONST_P0 = 1,	       /* (uint32_t)(65599 ^ 0).  */
> +    HASH_CONST_P1 = 65599,     /* (uint32_t)(65599 ^ 1).  */
> +    HASH_CONST_P2 = 8261505,   /* (uint32_t)(65599 ^ 2).  */
> +    HASH_CONST_P3 = 780587199, /* (uint32_t)(65599 ^ 3).  */
> +    HASH_CONST_P4 = 1139564289 /* (uint32_t)(65599 ^ 4).  */
> +  };
> +
>     const unsigned char *key;
> -  size_t loop;
>     uint32_t h;
>   
> -#define HASHC   h = *key++ + 65599 * h
> +#define HASHC	h = *key++ + HASH_CONST_P1 * h
>   
>     h = 0;
>     key = keyarg;
>     if (len > 0)
>       {
> -      loop = (len + 8 - 1) >> 3;
> -      switch (len & (8 - 1))
> -        {
> -        case 0:
> -          do
> -            {
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 7:
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 6:
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 5:
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 4:
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 3:
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 2:
> -              HASHC;
> -              /* FALLTHROUGH */
> -            case 1:
> -              HASHC;
> -            }
> -	  while (--loop);
> -        }
> +      switch ((len & (4 - 1)))
> +	{
> +	case 0:
> +	  /* h starts out as zero so no need to include the multiply. */
> +	  h = *key++;
> +	  /* FALLTHROUGH */
> +	case 3:
> +	  HASHC;
> +	  /* FALLTHROUGH */
> +	case 2:
> +	  HASHC;
> +	  /* FALLTHROUGH */
> +	case 1:
> +	  HASHC;
> +	  /* FALLTHROUGH */
> +	}

The switch consumes the first (len % 4) bytes — and, via case 0, a full 4 — so it alone suffices for len <= 4.  OK.

> +
> +      uint32_t c0, c1, c2, c3;
> +      for (--len; len >= 4; len -= 4)
> +	{
> +	  c0 = (unsigned char) *(key + 0);
> +	  c1 = (unsigned char) *(key + 1);
> +	  c2 = (unsigned char) *(key + 2);
> +	  c3 = (unsigned char) *(key + 3);
> +	  h = HASH_CONST_P4 * h + HASH_CONST_P3 * c0 + HASH_CONST_P2 * c1
> +	      + HASH_CONST_P1 * c2 + HASH_CONST_P0 * c3;
> +
> +	  key += 4;
> +	}

The unrolled loop then handles the remaining bytes for larger lengths, four at a time.  OK.

>       }
>     return h;
>   }

TBH this wins primarily on the grounds of the code being easier to
understand.  The fact that it is also faster in most cases is a bonus :)

LGTM.

Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>


More information about the Libc-alpha mailing list