This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH] Add a new macro to mask a float
- From: Adhemerval Zanella <adhemerval dot zanella at linaro dot org>
- To: libc-alpha at sourceware dot org
- Date: Wed, 29 Jun 2016 13:41:53 -0300
- Subject: Re: [PATCH] Add a new macro to mask a float
- Authentication-results: sourceware.org; auth=none
- References: <1467142073-13886-1-git-send-email-tuliom at linux dot vnet dot ibm dot com>
LGTM. I assume you have measured some performance gains on POWER8; it
would also be good if we could add some asinf/powf/tanf synthetic
benchmarks to benchtests.
On 28/06/2016 16:27, Tulio Magno Quites Machado Filho wrote:
> Defining a new macro allows architectures to provide more efficient
> implementations than using a GET_FLOAT_WORD/SET_FLOAT_WORD pair.
> As an example, POWER8 is able to mask the float directly in a VSX
> register without copying the data to a GPR and copying it back.
>
> This patch introduces the new macro MASK_FLOAT. The generic
> implementation remains unchanged.
>
> Tested on x86_64, ppc, ppc64, ppc64le and s390x.
>
> 2016-06-28 Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
>
> * sysdeps/generic/math_private.h (MASK_FLOAT): New macro.
> * sysdeps/ieee754/flt-32/e_acosf.c (__ieee754_acosf): Replace
> SET_FLOAT_WORD and GET_FLOAT_WORD sequence by MASK_FLOAT.
> * sysdeps/ieee754/flt-32/e_asinf.c (__ieee754_asinf): Likewise.
> * sysdeps/ieee754/flt-32/e_powf.c (__ieee754_powf): Likewise.
> * sysdeps/ieee754/flt-32/k_tanf.c (__kernel_tanf): Likewise.
> * sysdeps/ieee754/flt-32/s_modff.c (__modff): Likewise.
> * sysdeps/powerpc/powerpc64/power8/fpu/math_private.h: New file.
> ---
> sysdeps/generic/math_private.h | 14 +++++++
> sysdeps/ieee754/flt-32/e_acosf.c | 4 +-
> sysdeps/ieee754/flt-32/e_asinf.c | 4 +-
> sysdeps/ieee754/flt-32/e_powf.c | 18 +++------
> sysdeps/ieee754/flt-32/k_tanf.c | 7 +---
> sysdeps/ieee754/flt-32/s_modff.c | 4 +-
> .../powerpc/powerpc64/power8/fpu/math_private.h | 47 ++++++++++++++++++++++
> 7 files changed, 72 insertions(+), 26 deletions(-)
> create mode 100644 sysdeps/powerpc/powerpc64/power8/fpu/math_private.h
>
> diff --git a/sysdeps/generic/math_private.h b/sysdeps/generic/math_private.h
> index cf1865d..21e272c 100644
> --- a/sysdeps/generic/math_private.h
> +++ b/sysdeps/generic/math_private.h
> @@ -181,6 +181,20 @@ do { \
> } while (0)
> #endif
>
> +/* Apply an integer mask on a float.
> +
> + The default implementation invokes the GET_FLOAT_WORD/SET_FLOAT_WORD
> + macro pair. Note that this macro can only be used to apply an AND
> + mask supplied directly as a parameter. */
> +#ifndef MASK_FLOAT
> +# define MASK_FLOAT(f,mask) \
> +do { \
> + u_int32_t __tmp; \
> + GET_FLOAT_WORD(__tmp, f); \
> + SET_FLOAT_WORD(f, __tmp&mask); \
> +} while (0)
> +#endif
> +
> /* Get long double macros from a separate header. */
> #include <math_ldbl.h>
>
> diff --git a/sysdeps/ieee754/flt-32/e_acosf.c b/sysdeps/ieee754/flt-32/e_acosf.c
> index 6f792f6..8b29e53 100644
> --- a/sysdeps/ieee754/flt-32/e_acosf.c
> +++ b/sysdeps/ieee754/flt-32/e_acosf.c
> @@ -61,12 +61,10 @@ __ieee754_acosf(float x)
> w = r*s-pio2_lo;
> return pi - (float)2.0*(s+w);
> } else { /* x > 0.5 */
> - int32_t idf;
> z = (one-x)*(float)0.5;
> s = __ieee754_sqrtf(z);
> df = s;
> - GET_FLOAT_WORD(idf,df);
> - SET_FLOAT_WORD(df,idf&0xfffff000);
> + MASK_FLOAT(df,0xfffff000);
> c = (z-df*df)/(s+df);
> p = z*(pS0+z*(pS1+z*(pS2+z*(pS3+z*(pS4+z*pS5)))));
> q = one+z*(qS1+z*(qS2+z*(qS3+z*qS4)));
> diff --git a/sysdeps/ieee754/flt-32/e_asinf.c b/sysdeps/ieee754/flt-32/e_asinf.c
> index 2ca2dbc..95e0a79 100644
> --- a/sysdeps/ieee754/flt-32/e_asinf.c
> +++ b/sysdeps/ieee754/flt-32/e_asinf.c
> @@ -89,10 +89,8 @@ float __ieee754_asinf(float x)
> if(ix>=0x3F79999A) { /* if |x| > 0.975 */
> t = pio2_hi-(2.0f*(s+s*p)-pio2_lo);
> } else {
> - int32_t iw;
> w = s;
> - GET_FLOAT_WORD(iw,w);
> - SET_FLOAT_WORD(w,iw&0xfffff000);
> + MASK_FLOAT(w,0xfffff000);
> c = (t-w*w)/(s+w);
> r = p;
> p = 2.0f*s*r-(pio2_lo-2.0f*c);
> diff --git a/sysdeps/ieee754/flt-32/e_powf.c b/sysdeps/ieee754/flt-32/e_powf.c
> index c72fe37..d62e877 100644
> --- a/sysdeps/ieee754/flt-32/e_powf.c
> +++ b/sysdeps/ieee754/flt-32/e_powf.c
> @@ -136,8 +136,7 @@ __ieee754_powf(float x, float y)
> u = ivln2_h*t; /* ivln2_h has 16 sig. bits */
> v = t*ivln2_l-w*ivln2;
> t1 = u+v;
> - GET_FLOAT_WORD(is,t1);
> - SET_FLOAT_WORD(t1,is&0xfffff000);
> + MASK_FLOAT(t1,0xfffff000);
> t2 = v-(t1-u);
> } else {
> float s2,s_h,s_l,t_h,t_l;
> @@ -163,8 +162,7 @@ __ieee754_powf(float x, float y)
> v = one/(ax+bp[k]);
> s = u*v;
> s_h = s;
> - GET_FLOAT_WORD(is,s_h);
> - SET_FLOAT_WORD(s_h,is&0xfffff000);
> + MASK_FLOAT(s_h,0xfffff000);
> /* t_h=ax+bp[k] High */
> SET_FLOAT_WORD (t_h,
> ((((ix>>1)|0x20000000)+0x00400000+(k<<21))
> @@ -177,24 +175,21 @@ __ieee754_powf(float x, float y)
> r += s_l*(s_h+s);
> s2 = s_h*s_h;
> t_h = (float)3.0+s2+r;
> - GET_FLOAT_WORD(is,t_h);
> - SET_FLOAT_WORD(t_h,is&0xfffff000);
> + MASK_FLOAT(t_h,0xfffff000);
> t_l = r-((t_h-(float)3.0)-s2);
> /* u+v = s*(1+...) */
> u = s_h*t_h;
> v = s_l*t_h+t_l*s;
> /* 2/(3log2)*(s+...) */
> p_h = u+v;
> - GET_FLOAT_WORD(is,p_h);
> - SET_FLOAT_WORD(p_h,is&0xfffff000);
> + MASK_FLOAT(p_h,0xfffff000);
> p_l = v-(p_h-u);
> z_h = cp_h*p_h; /* cp_h+cp_l = 2/(3*log2) */
> z_l = cp_l*p_h+p_l*cp+dp_l[k];
> /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
> t = (float)n;
> t1 = (((z_h+z_l)+dp_h[k])+t);
> - GET_FLOAT_WORD(is,t1);
> - SET_FLOAT_WORD(t1,is&0xfffff000);
> + MASK_FLOAT(t1,0xfffff000);
> t2 = z_l-(((t1-t)-dp_h[k])-z_h);
> }
>
> @@ -234,8 +229,7 @@ __ieee754_powf(float x, float y)
> p_h -= t;
> }
> t = p_l+p_h;
> - GET_FLOAT_WORD(is,t);
> - SET_FLOAT_WORD(t,is&0xfffff000);
> + MASK_FLOAT(t,0xfffff000);
> u = t*lg2_h;
> v = (p_l-(t-p_h))*lg2+t*lg2_l;
> z = u+v;
> diff --git a/sysdeps/ieee754/flt-32/k_tanf.c b/sysdeps/ieee754/flt-32/k_tanf.c
> index 9f0e558..d805816 100644
> --- a/sysdeps/ieee754/flt-32/k_tanf.c
> +++ b/sysdeps/ieee754/flt-32/k_tanf.c
> @@ -87,14 +87,11 @@ float __kernel_tanf(float x, float y, int iy)
> simply return -1.0/(x+r) here */
> /* compute -1.0/(x+r) accurately */
> float a,t;
> - int32_t i;
> z = w;
> - GET_FLOAT_WORD(i,z);
> - SET_FLOAT_WORD(z,i&0xfffff000);
> + MASK_FLOAT(z, 0xfffff000);
> v = r-(z - x); /* z+v = r+x */
> t = a = -(float)1.0/w; /* a = -1.0/w */
> - GET_FLOAT_WORD(i,t);
> - SET_FLOAT_WORD(t,i&0xfffff000);
> + MASK_FLOAT(t, 0xfffff000);
> s = (float)1.0+t*z;
> return t+a*(s+t*v);
> }
> diff --git a/sysdeps/ieee754/flt-32/s_modff.c b/sysdeps/ieee754/flt-32/s_modff.c
> index 23f6a90..491f50f 100644
> --- a/sysdeps/ieee754/flt-32/s_modff.c
> +++ b/sysdeps/ieee754/flt-32/s_modff.c
> @@ -32,10 +32,8 @@ __modff(float x, float *iptr)
> } else {
> i = (0x007fffff)>>j0;
> if((i0&i)==0) { /* x is integral */
> - u_int32_t ix;
> *iptr = x;
> - GET_FLOAT_WORD(ix,x);
> - SET_FLOAT_WORD(x,ix&0x80000000); /* return +-0 */
> + MASK_FLOAT(x,0x80000000); /* return +-0 */
> return x;
> } else {
> SET_FLOAT_WORD(*iptr,i0&(~i));
> diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/math_private.h b/sysdeps/powerpc/powerpc64/power8/fpu/math_private.h
> new file mode 100644
> index 0000000..700e410
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/power8/fpu/math_private.h
> @@ -0,0 +1,47 @@
> +/* Private inline math functions for POWER8.
> + Copyright (C) 2016 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +/* Faster to do an in-place masking of the float number in the VSR
> + than move to GPR for the masking and back. maskl, maskr, and maski
> + are used to convert the 32-bit "mask" parameter to a 64-bit mask
> + suitable for the internal representation of a scalar
> + single-precision floating point number in the Power8 processor.
> + Note: before applying the mask, xvmovdp is used to ensure f is
> + normalized. */
> +#define MASK_FLOAT(f, mask) \
> + do { \
> + long tmpmask = mask; \
> + float tmpf = f; \
> + long maskl = 0xc000000000000000; \
> + long maskr = 0x3fffffffffffffff; \
> + long maski = 0x3800000000000000; \
> + tmpmask = tmpmask << 32; \
> + tmpmask = ((tmpmask&maskl) | ((tmpmask&maskr)>>3) | maski); \
> + union { \
> + long l; \
> + double d; \
> + } umask = {.l = tmpmask}; \
> + __asm__ ("xvmovdp %x2, %x2\n\t" \
> + "xxland %x0, %x2, %1\n\t" \
> + : "=wa" (tmpf) \
> + : "d" (umask.d), \
> + "wa" (tmpf) ); \
> + f = tmpf; \
> + } while(0)
> +
> +#include_next <math_private.h>
>