This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] Add a new macro to mask a float


LGTM.  I assume you have measured some performance gains on POWER8; it
would be good if we could also add some asinf/powf/tanf synthetic
benchmarks to benchtests.

On 28/06/2016 16:27, Tulio Magno Quites Machado Filho wrote:
> Defining a new macro allows architectures to provide more efficient
> implementations than using a GET_FLOAT_WORD/SET_FLOAT_WORD pair.
> As an example, POWER8 is able to mask the float directly in a VSX
> register without copying the data to a GPR and copying it back.
> 
> This patch introduces the new macro MASK_FLOAT.  The generic
> implementation remains unchanged.
> 
> Tested on x86_64, ppc, ppc64, ppc64le and s390x.
> 
> 2016-06-28  Tulio Magno Quites Machado Filho  <tuliom@linux.vnet.ibm.com>
> 
> 	* sysdeps/generic/math_private.h (MASK_FLOAT): New macro.
> 	* sysdeps/ieee754/flt-32/e_acosf.c (__ieee754_acosf): Replace
> 	SET_FLOAT_WORD and GET_FLOAT_WORD sequence by MASK_FLOAT.
> 	* sysdeps/ieee754/flt-32/e_asinf.c (__ieee754_asinf): Likewise.
> 	* sysdeps/ieee754/flt-32/e_powf.c (__ieee754_powf): Likewise.
> 	* sysdeps/ieee754/flt-32/k_tanf.c (__kernel_tanf): Likewise.
> 	* sysdeps/ieee754/flt-32/s_modff.c (__modff): Likewise.
> 	* sysdeps/powerpc/powerpc64/power8/fpu/math_private.h: New file.
> ---
>  sysdeps/generic/math_private.h                     | 14 +++++++
>  sysdeps/ieee754/flt-32/e_acosf.c                   |  4 +-
>  sysdeps/ieee754/flt-32/e_asinf.c                   |  4 +-
>  sysdeps/ieee754/flt-32/e_powf.c                    | 18 +++------
>  sysdeps/ieee754/flt-32/k_tanf.c                    |  7 +---
>  sysdeps/ieee754/flt-32/s_modff.c                   |  4 +-
>  .../powerpc/powerpc64/power8/fpu/math_private.h    | 47 ++++++++++++++++++++++
>  7 files changed, 72 insertions(+), 26 deletions(-)
>  create mode 100644 sysdeps/powerpc/powerpc64/power8/fpu/math_private.h
> 
> diff --git a/sysdeps/generic/math_private.h b/sysdeps/generic/math_private.h
> index cf1865d..21e272c 100644
> --- a/sysdeps/generic/math_private.h
> +++ b/sysdeps/generic/math_private.h
> @@ -181,6 +181,20 @@ do {								\
>  } while (0)
>  #endif
>  
> +/* Apply an integer mask on a float.
> +
> +   The default implementation invokes the GET_FLOAT_WORD/SET_FLOAT_WORD
> +   macro pair.  Note that this macro can only be used to apply an AND
> +   mask supplied directly as a parameter.  */
> +#ifndef MASK_FLOAT
> +# define MASK_FLOAT(f,mask)		\
> +do {					\
> +  u_int32_t __tmp;			\
> +  GET_FLOAT_WORD(__tmp, f);		\
> +  SET_FLOAT_WORD(f, __tmp&mask);	\
> +} while (0)
> +#endif
> +
>  /* Get long double macros from a separate header.  */
>  #include <math_ldbl.h>
>  
> diff --git a/sysdeps/ieee754/flt-32/e_acosf.c b/sysdeps/ieee754/flt-32/e_acosf.c
> index 6f792f6..8b29e53 100644
> --- a/sysdeps/ieee754/flt-32/e_acosf.c
> +++ b/sysdeps/ieee754/flt-32/e_acosf.c
> @@ -61,12 +61,10 @@ __ieee754_acosf(float x)
>  	    w = r*s-pio2_lo;
>  	    return pi - (float)2.0*(s+w);
>  	} else {			/* x > 0.5 */
> -	    int32_t idf;
>  	    z = (one-x)*(float)0.5;
>  	    s = __ieee754_sqrtf(z);
>  	    df = s;
> -	    GET_FLOAT_WORD(idf,df);
> -	    SET_FLOAT_WORD(df,idf&0xfffff000);
> +	    MASK_FLOAT(df,0xfffff000);
>  	    c  = (z-df*df)/(s+df);
>  	    p = z*(pS0+z*(pS1+z*(pS2+z*(pS3+z*(pS4+z*pS5)))));
>  	    q = one+z*(qS1+z*(qS2+z*(qS3+z*qS4)));
> diff --git a/sysdeps/ieee754/flt-32/e_asinf.c b/sysdeps/ieee754/flt-32/e_asinf.c
> index 2ca2dbc..95e0a79 100644
> --- a/sysdeps/ieee754/flt-32/e_asinf.c
> +++ b/sysdeps/ieee754/flt-32/e_asinf.c
> @@ -89,10 +89,8 @@ float __ieee754_asinf(float x)
>  	if(ix>=0x3F79999A) {	/* if |x| > 0.975 */
>  	    t = pio2_hi-(2.0f*(s+s*p)-pio2_lo);
>  	} else {
> -	    int32_t iw;
>  	    w  = s;
> -	    GET_FLOAT_WORD(iw,w);
> -	    SET_FLOAT_WORD(w,iw&0xfffff000);
> +	    MASK_FLOAT(w,0xfffff000);
>  	    c  = (t-w*w)/(s+w);
>  	    r  = p;
>  	    p  = 2.0f*s*r-(pio2_lo-2.0f*c);
> diff --git a/sysdeps/ieee754/flt-32/e_powf.c b/sysdeps/ieee754/flt-32/e_powf.c
> index c72fe37..d62e877 100644
> --- a/sysdeps/ieee754/flt-32/e_powf.c
> +++ b/sysdeps/ieee754/flt-32/e_powf.c
> @@ -136,8 +136,7 @@ __ieee754_powf(float x, float y)
>  	    u = ivln2_h*t;	/* ivln2_h has 16 sig. bits */
>  	    v = t*ivln2_l-w*ivln2;
>  	    t1 = u+v;
> -	    GET_FLOAT_WORD(is,t1);
> -	    SET_FLOAT_WORD(t1,is&0xfffff000);
> +	    MASK_FLOAT(t1,0xfffff000);
>  	    t2 = v-(t1-u);
>  	} else {
>  	    float s2,s_h,s_l,t_h,t_l;
> @@ -163,8 +162,7 @@ __ieee754_powf(float x, float y)
>  	    v = one/(ax+bp[k]);
>  	    s = u*v;
>  	    s_h = s;
> -	    GET_FLOAT_WORD(is,s_h);
> -	    SET_FLOAT_WORD(s_h,is&0xfffff000);
> +	    MASK_FLOAT(s_h,0xfffff000);
>  	/* t_h=ax+bp[k] High */
>  	    SET_FLOAT_WORD (t_h,
>  			    ((((ix>>1)|0x20000000)+0x00400000+(k<<21))
> @@ -177,24 +175,21 @@ __ieee754_powf(float x, float y)
>  	    r += s_l*(s_h+s);
>  	    s2  = s_h*s_h;
>  	    t_h = (float)3.0+s2+r;
> -	    GET_FLOAT_WORD(is,t_h);
> -	    SET_FLOAT_WORD(t_h,is&0xfffff000);
> +	    MASK_FLOAT(t_h,0xfffff000);
>  	    t_l = r-((t_h-(float)3.0)-s2);
>  	/* u+v = s*(1+...) */
>  	    u = s_h*t_h;
>  	    v = s_l*t_h+t_l*s;
>  	/* 2/(3log2)*(s+...) */
>  	    p_h = u+v;
> -	    GET_FLOAT_WORD(is,p_h);
> -	    SET_FLOAT_WORD(p_h,is&0xfffff000);
> +	    MASK_FLOAT(p_h,0xfffff000);
>  	    p_l = v-(p_h-u);
>  	    z_h = cp_h*p_h;		/* cp_h+cp_l = 2/(3*log2) */
>  	    z_l = cp_l*p_h+p_l*cp+dp_l[k];
>  	/* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
>  	    t = (float)n;
>  	    t1 = (((z_h+z_l)+dp_h[k])+t);
> -	    GET_FLOAT_WORD(is,t1);
> -	    SET_FLOAT_WORD(t1,is&0xfffff000);
> +	    MASK_FLOAT(t1,0xfffff000);
>  	    t2 = z_l-(((t1-t)-dp_h[k])-z_h);
>  	}
>  
> @@ -234,8 +229,7 @@ __ieee754_powf(float x, float y)
>  	    p_h -= t;
>  	}
>  	t = p_l+p_h;
> -	GET_FLOAT_WORD(is,t);
> -	SET_FLOAT_WORD(t,is&0xfffff000);
> +	MASK_FLOAT(t,0xfffff000);
>  	u = t*lg2_h;
>  	v = (p_l-(t-p_h))*lg2+t*lg2_l;
>  	z = u+v;
> diff --git a/sysdeps/ieee754/flt-32/k_tanf.c b/sysdeps/ieee754/flt-32/k_tanf.c
> index 9f0e558..d805816 100644
> --- a/sysdeps/ieee754/flt-32/k_tanf.c
> +++ b/sysdeps/ieee754/flt-32/k_tanf.c
> @@ -87,14 +87,11 @@ float __kernel_tanf(float x, float y, int iy)
>  			   simply return -1.0/(x+r) here */
>       /*  compute -1.0/(x+r) accurately */
>  	    float a,t;
> -	    int32_t i;
>  	    z  = w;
> -	    GET_FLOAT_WORD(i,z);
> -	    SET_FLOAT_WORD(z,i&0xfffff000);
> +	    MASK_FLOAT(z, 0xfffff000);
>  	    v  = r-(z - x); 	/* z+v = r+x */
>  	    t = a  = -(float)1.0/w;	/* a = -1.0/w */
> -	    GET_FLOAT_WORD(i,t);
> -	    SET_FLOAT_WORD(t,i&0xfffff000);
> +	    MASK_FLOAT(t, 0xfffff000);
>  	    s  = (float)1.0+t*z;
>  	    return t+a*(s+t*v);
>  	}
> diff --git a/sysdeps/ieee754/flt-32/s_modff.c b/sysdeps/ieee754/flt-32/s_modff.c
> index 23f6a90..491f50f 100644
> --- a/sysdeps/ieee754/flt-32/s_modff.c
> +++ b/sysdeps/ieee754/flt-32/s_modff.c
> @@ -32,10 +32,8 @@ __modff(float x, float *iptr)
>  	    } else {
>  		i = (0x007fffff)>>j0;
>  		if((i0&i)==0) {			/* x is integral */
> -		    u_int32_t ix;
>  		    *iptr = x;
> -		    GET_FLOAT_WORD(ix,x);
> -		    SET_FLOAT_WORD(x,ix&0x80000000);	/* return +-0 */
> +		    MASK_FLOAT(x,0x80000000);	/* return +-0 */
>  		    return x;
>  		} else {
>  		    SET_FLOAT_WORD(*iptr,i0&(~i));
> diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/math_private.h b/sysdeps/powerpc/powerpc64/power8/fpu/math_private.h
> new file mode 100644
> index 0000000..700e410
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/power8/fpu/math_private.h
> @@ -0,0 +1,47 @@
> +/* Private inline math functions for POWER8.
> +   Copyright (C) 2016 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +/* Faster to do an in-place masking of the float number in the VSR
> +   than move to GPR for the masking and back.  maskl, maskr, and maski
> +   are used to convert the 32-bit "mask" parameter to a 64-bit mask
> +   suitable for the internal representation of a scalar
> +   single-precision floating point number in the Power8 processor.
> +   Note: before applying the mask, xvmovdp is used to ensure f is
> +   normalized.  */
> +#define MASK_FLOAT(f, mask)					\
> +  do {								\
> +    long tmpmask = mask;					\
> +    float tmpf = f;						\
> +    long maskl = 0xc000000000000000;				\
> +    long maskr = 0x3fffffffffffffff;				\
> +    long maski = 0x3800000000000000;				\
> +    tmpmask = tmpmask << 32;					\
> +    tmpmask = ((tmpmask&maskl) | ((tmpmask&maskr)>>3) | maski);	\
> +    union {							\
> +      long l;							\
> +      double d;							\
> +    } umask = {.l = tmpmask};					\
> +    __asm__ ("xvmovdp %x2, %x2\n\t"				\
> +	     "xxland %x0, %x2, %1\n\t"				\
> +	     : "=wa" (tmpf)					\
> +	     : "d" (umask.d),					\
> +	       "wa" (tmpf) );					\
> +    f = tmpf;							\
> +  } while(0)
> +
> +#include_next <math_private.h>
> 


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]