This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] powerpc: Add a POWER8-optimized version of powf()



On 25/05/2017 14:47, Paul Clarke wrote:
> This implementation is heavily based on sysdeps/ieee754/flt-32/e_powf.c.
> Most significant changes are code simplification and use of doubles for
> intermediate values.  Also, some rearrangement to move early
> non-dependent code later, out of the faster paths.
> 
> 2017-05-25  Paul A. Clarke  <pc@us.ibm.com>
> 
> 	* sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
> 	[$(subdir) = math] (libm-sysdep_routines): Add e_powf-power8 and
> 	e_powf-ppc64.
> 	* sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf-power8.c: New file.
> 	* sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf-ppc64.c: Likewise.
> 	* sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf.c: Likewise.
> 	* sysdeps/powerpc/powerpc64/power8/fpu/e_powf.c: Likewise.

This changes seems to be arch independent and I would like to avoid adding
even more arch specific.  Is there any reason why this can't be used as
the default implementation?  Do you have number on different architecture
for it? 

> ---
>  sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile   |   3 +-
>  .../multiarch/{s_cosf-ppc64.c => e_powf-power8.c}  |  12 +-
>  .../powerpc64/fpu/multiarch/e_powf-ppc64.c}        |   9 +-
>  .../powerpc64/fpu/multiarch/{s_cosf.c => e_powf.c} |  16 +--
>  .../powerpc64/power8/fpu}/e_powf.c                 | 140 +++++++--------------
>  5 files changed, 69 insertions(+), 111 deletions(-)
>  copy sysdeps/powerpc/powerpc64/fpu/multiarch/{s_cosf-ppc64.c => e_powf-power8.c} (80%)
>  copy sysdeps/{i386/symbol-hacks.h => powerpc/powerpc64/fpu/multiarch/e_powf-ppc64.c} (82%)
>  copy sysdeps/powerpc/powerpc64/fpu/multiarch/{s_cosf.c => e_powf.c} (71%)
>  copy sysdeps/{ieee754/flt-32 => powerpc/powerpc64/power8/fpu}/e_powf.c (69%)
> 
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
> index 317a988..dc7422e 100644
> --- a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
> @@ -27,7 +27,8 @@ libm-sysdep_routines += s_isnan-power7 s_isnan-power6x s_isnan-power6 \
>  			s_llrint-power8 s_llround-power8 \
>  			e_expf-power8 e_expf-ppc64 \
>  			s_sinf-ppc64 s_sinf-power8 \
> -			s_cosf-ppc64 s_cosf-power8
> +			s_cosf-ppc64 s_cosf-power8 \
> +			e_powf-ppc64 e_powf-power8
>  
>  CFLAGS-s_logbf-power7.c = -mcpu=power7
>  CFLAGS-s_logbl-power7.c = -mcpu=power7
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf-ppc64.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf-power8.c
> similarity index 80%
> copy from sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf-ppc64.c
> copy to sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf-power8.c
> index 635624c..5dfd969 100644
> --- a/sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf-ppc64.c
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf-power8.c
> @@ -1,4 +1,4 @@
> -/* cosf function.  PowerPC64 default version.
> +/* __ieee754_powf() POWER8 version.
>     Copyright (C) 2017 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>  
> @@ -16,11 +16,9 @@
>     License along with the GNU C Library; if not, see
>     <http://www.gnu.org/licenses/>.  */
>  
> -#include <sysdep.h>
> +#undef strong_alias
> +#define strong_alias(a, b)
>  
> -#undef weak_alias
> -#define weak_alias(a, b)
> +#define __ieee754_powf __ieee754_powf_power8
>  
> -#define __cosf __cosf_ppc64
> -
> -#include <sysdeps/powerpc/fpu/s_cosf.c>
> +#include <sysdeps/powerpc/powerpc64/power8/fpu/e_powf.c>
> diff --git a/sysdeps/i386/symbol-hacks.h b/sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf-ppc64.c
> similarity index 82%
> copy from sysdeps/i386/symbol-hacks.h
> copy to sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf-ppc64.c
> index 36a13c8..43b466d 100644
> --- a/sysdeps/i386/symbol-hacks.h
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf-ppc64.c
> @@ -1,4 +1,4 @@
> -/* Hacks needed for symbol manipulation.  i386 version.
> +/* __ieee_powf() PowerPC64 version.
>     Copyright (C) 2017 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>  
> @@ -16,6 +16,9 @@
>     License along with the GNU C Library; if not, see
>     <http://www.gnu.org/licenses/>.  */
>  
> -#include <sysdeps/wordsize-32/divdi3-symbol-hacks.h>
> +#undef strong_alias
> +#define strong_alias(a, b)
>  
> -#include_next "symbol-hacks.h"
> +#define __ieee754_powf __ieee754_powf_ppc64
> +
> +#include <sysdeps/ieee754/flt-32/e_powf.c>
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf.c
> similarity index 71%
> copy from sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf.c
> copy to sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf.c
> index acf2a59..325d166 100644
> --- a/sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf.c
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf.c
> @@ -1,4 +1,4 @@
> -/* Multiple versions of cosf.
> +/* Multiple versions of ieee754_powf.
>     Copyright (C) 2017 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>  
> @@ -17,15 +17,15 @@
>     <http://www.gnu.org/licenses/>.  */
>  
>  #include <math.h>
> -#include <shlib-compat.h>
> +#include <math_private.h>
>  #include "init-arch.h"
>  
> -extern __typeof (__cosf) __cosf_ppc64 attribute_hidden;
> -extern __typeof (__cosf) __cosf_power8 attribute_hidden;
> +extern __typeof (__ieee754_powf) __ieee754_powf_ppc64 attribute_hidden;
> +extern __typeof (__ieee754_powf) __ieee754_powf_power8 attribute_hidden;
>  
> -libc_ifunc (__cosf,
> +libc_ifunc (__ieee754_powf,
>  	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
> -	    ? __cosf_power8
> -	    : __cosf_ppc64);
> +	    ? __ieee754_powf_power8
> +	    : __ieee754_powf_ppc64);
>  
> -weak_alias (__cosf, cosf)
> +strong_alias (__ieee754_powf, __powf_finite)
> diff --git a/sysdeps/ieee754/flt-32/e_powf.c b/sysdeps/powerpc/powerpc64/power8/fpu/e_powf.c
> similarity index 69%
> copy from sysdeps/ieee754/flt-32/e_powf.c
> copy to sysdeps/powerpc/powerpc64/power8/fpu/e_powf.c
> index 13b49de..628232a 100644
> --- a/sysdeps/ieee754/flt-32/e_powf.c
> +++ b/sysdeps/powerpc/powerpc64/power8/fpu/e_powf.c
> @@ -4,7 +4,7 @@
>  
>  /*
>   * ====================================================
> - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
> + * Copyright (C) 1993-2017 by Sun Microsystems, Inc. All rights reserved.
>   *
>   * Developed at SunPro, a Sun Microsystems, Inc. business.
>   * Permission to use, copy, modify, and distribute this
> @@ -38,11 +38,9 @@ P2   = -2.7777778450e-03, /* 0xbb360b61 */
>  P3   =  6.6137559770e-05, /* 0x388ab355 */
>  P4   = -1.6533901999e-06, /* 0xb5ddea0e */
>  P5   =  4.1381369442e-08, /* 0x3331bb4c */
> -lg2  =  6.9314718246e-01, /* 0x3f317218 */
>  lg2_h  =  6.93145752e-01, /* 0x3f317200 */
>  lg2_l  =  1.42860654e-06, /* 0x35bfbe8c */
>  ovt =  4.2995665694e-08, /* -(128-log2(ovfl+.5ulp)) */
> -cp    =  9.6179670095e-01, /* 0x3f76384f =2/(3ln2) */
>  cp_h  =  0xf.64p-4, /* cp high 12 bits.  */
>  cp_l  =  -0x7.b11e3p-16, /* 2/(3ln2) - cp_h.  */
>  ivln2    =  1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
> @@ -52,14 +50,13 @@ ivln2_l  =  7.0526075433e-06; /* 0x36eca570 =1/ln2 tail*/
>  float
>  __ieee754_powf(float x, float y)
>  {
> -	float z,ax,z_h,z_l,p_h,p_l;
> -	float y1,t1,t2,r,s,t,u,v,w;
> +	float z, ax, s;
> +	double d1, d2;
>  	int32_t i,j,k,yisint,n;
> -	int32_t hx,hy,ix,iy,is;
> +	int32_t hx,hy,ix,iy;
>  
> -	GET_FLOAT_WORD(hx,x);
>  	GET_FLOAT_WORD(hy,y);
> -	ix = hx&0x7fffffff;  iy = hy&0x7fffffff;
> +	iy = hy&0x7fffffff;
>  
>      /* y==zero: x**0 = 1 */
>  	if(iy==0 && !issignaling (x)) return one;
> @@ -68,26 +65,14 @@ __ieee754_powf(float x, float y)
>  	if(x == 1.0 && !issignaling (y)) return one;
>  	if(x == -1.0 && isinf(y)) return one;
>  
> +	GET_FLOAT_WORD(hx,x);
> +	ix = hx&0x7fffffff;
> +
>      /* +-NaN return x+y */
>  	if(__builtin_expect(ix > 0x7f800000 ||
>  			    iy > 0x7f800000, 0))
>  		return x+y;
>  
> -    /* determine if y is an odd int when x < 0
> -     * yisint = 0	... y is not an integer
> -     * yisint = 1	... y is an odd int
> -     * yisint = 2	... y is an even int
> -     */
> -	yisint  = 0;
> -	if(hx<0) {
> -	    if(iy>=0x4b800000) yisint = 2; /* even integer y */
> -	    else if(iy>=0x3f800000) {
> -		k = (iy>>23)-0x7f;	   /* exponent */
> -		j = iy>>(23-k);
> -		if((j<<(23-k))==iy) yisint = 2-(j&1);
> -	    }
> -	}
> -
>      /* special value of y */
>  	if (__builtin_expect(iy==0x7f800000, 0)) {	/* y is +-inf */
>  	    if (ix==0x3f800000)
> @@ -106,6 +91,21 @@ __ieee754_powf(float x, float y)
>  	    return __ieee754_sqrtf(x);
>  	}
>  
> +    /* determine if y is an odd int when x < 0
> +     * yisint = 0	... y is not an integer
> +     * yisint = 1	... y is an odd int
> +     * yisint = 2	... y is an even int
> +     */
> +	yisint  = 0;
> +	if(hx<0) {
> +	    if(iy>=0x4b800000) yisint = 2; /* even integer y */
> +	    else if(iy>=0x3f800000) {
> +		k = (iy>>23)-0x7f;	   /* exponent */
> +		j = iy>>(23-k);
> +		if((j<<(23-k))==iy) yisint = 2-(j&1);
> +	    }
> +	}
> +
>  	ax   = fabsf(x);
>      /* special value of x */
>  	if(__builtin_expect(ix==0x7f800000||ix==0||ix==0x3f800000, 0)){
> @@ -131,16 +131,10 @@ __ieee754_powf(float x, float y)
>  	    if(ix>0x3f800007) return (hy>0)? huge*huge:tiny*tiny;
>  	/* now |1-x| is tiny <= 2**-20, suffice to compute
>  	   log(x) by x-x^2/2+x^3/3-x^4/4 */
> -	    t = ax-1;		/* t has 20 trailing zeros */
> -	    w = (t*t)*((float)0.5-t*((float)0.333333333333-t*(float)0.25));
> -	    u = ivln2_h*t;	/* ivln2_h has 16 sig. bits */
> -	    v = t*ivln2_l-w*ivln2;
> -	    t1 = u+v;
> -	    GET_FLOAT_WORD(is,t1);
> -	    SET_FLOAT_WORD(t1,is&0xfffff000);
> -	    t2 = v-(t1-u);
> +	    d2 = ax-1;		/* d2 has 20 trailing zeros.  */
> +	    d2 = d2 * (ivln2_l + (double)ivln2_h) -
> +		 (d2 * d2) * (0.5 - d2 * (0.333333333333 - d2 * 0.25)) * ivln2;
>  	} else {
> -	    float s2,s_h,s_l,t_h,t_l;
>  	    /* Avoid internal underflow for tiny y.  The exact value
>  	       of y does not matter if |y| <= 2**-32.  */
>  	    if (iy < 0x2f800000)
> @@ -158,69 +152,37 @@ __ieee754_powf(float x, float y)
>  	    else {k=0;n+=1;ix -= 0x00800000;}
>  	    SET_FLOAT_WORD(ax,ix);
>  
> -	/* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
> -	    u = ax-bp[k];		/* bp[0]=1.0, bp[1]=1.5 */
> -	    v = one/(ax+bp[k]);
> -	    s = u*v;
> -	    s_h = s;
> -	    GET_FLOAT_WORD(is,s_h);
> -	    SET_FLOAT_WORD(s_h,is&0xfffff000);
> -	/* t_h=ax+bp[k] High */
> -	    SET_FLOAT_WORD (t_h,
> -			    ((((ix>>1)|0x20000000)+0x00400000+(k<<21))
> -			     & 0xfffff000));
> -	    t_l = ax - (t_h-bp[k]);
> -	    s_l = v*((u-s_h*t_h)-s_h*t_l);
> -	/* compute log(ax) */
> -	    s2 = s*s;
> -	    r = s2*s2*(L1+s2*(L2+s2*(L3+s2*(L4+s2*(L5+s2*L6)))));
> -	    r += s_l*(s_h+s);
> -	    s2  = s_h*s_h;
> -	    t_h = (float)3.0+s2+r;
> -	    GET_FLOAT_WORD(is,t_h);
> -	    SET_FLOAT_WORD(t_h,is&0xfffff000);
> -	    t_l = r-((t_h-(float)3.0)-s2);
> -	/* u+v = s*(1+...) */
> -	    u = s_h*t_h;
> -	    v = s_l*t_h+t_l*s;
> -	/* 2/(3log2)*(s+...) */
> -	    p_h = u+v;
> -	    GET_FLOAT_WORD(is,p_h);
> -	    SET_FLOAT_WORD(p_h,is&0xfffff000);
> -	    p_l = v-(p_h-u);
> -	    z_h = cp_h*p_h;		/* cp_h+cp_l = 2/(3*log2) */
> -	    z_l = cp_l*p_h+p_l*cp+dp_l[k];
> -	/* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
> -	    t = (float)n;
> -	    t1 = (((z_h+z_l)+dp_h[k])+t);
> -	    GET_FLOAT_WORD(is,t1);
> -	    SET_FLOAT_WORD(t1,is&0xfffff000);
> -	    t2 = z_l-(((t1-t)-dp_h[k])-z_h);
> +	/* compute d1 = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
> +	    d1 = (ax-(double)bp[k])/(ax+(double)bp[k]);
> +	/* compute d2 = log(ax) */
> +	    d2 = d1 * d1;
> +	    d2 = 3.0 + d2 + d2*d2*(L1+d2*(L2+d2*(L3+d2*(L4+d2*(L5+d2*L6)))));
> +	/* 2/(3log2)*(d2+...) */
> +	    d2 = d1*d2*((double)cp_h + cp_l);
> +	/* log2(ax) = (d2+..)*2/(3*log2) */
> +	    d2 = d2+((double)dp_h[k]+dp_l[k])+(double)n;
>  	}
>  
>  	s = one; /* s (sign of result -ve**odd) = -1 else = 1 */
>  	if(((((u_int32_t)hx>>31)-1)|(yisint-1))==0)
>  	    s = -one;	/* (-ve)**(odd int) */
>  
> -    /* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */
> -	GET_FLOAT_WORD(is,y);
> -	SET_FLOAT_WORD(y1,is&0xfffff000);
> -	p_l = (y-y1)*t1+y*t2;
> -	p_h = y1*t1;
> -	z = p_l+p_h;
> +    /* compute y * d2 */
> +	d1 = y * d2;
> +	z = d1;
>  	GET_FLOAT_WORD(j,z);
>  	if (__builtin_expect(j>0x43000000, 0))		/* if z > 128 */
>  	    return s*huge*huge;				/* overflow */
>  	else if (__builtin_expect(j==0x43000000, 0)) {	/* if z == 128 */
> -	    if(p_l+ovt>z-p_h) return s*huge*huge;	/* overflow */
> +	    if(ovt>(z-d1)) return s*huge*huge;	/* overflow */
>  	}
>  	else if (__builtin_expect((j&0x7fffffff)>0x43160000, 0))/* z <= -150 */
>  	    return s*tiny*tiny;				/* underflow */
>  	else if (__builtin_expect((u_int32_t) j==0xc3160000, 0)){/* z == -150*/
> -	    if(p_l<=z-p_h) return s*tiny*tiny;		/* underflow */
> +	    if(0.0<=(z-d1)) return s*tiny*tiny;		/* underflow */
>  	}
>      /*
> -     * compute 2**(p_h+p_l)
> +     * compute 2**d1
>       */
>  	i = j&0x7fffffff;
>  	k = (i>>23)-0x7f;
> @@ -228,22 +190,16 @@ __ieee754_powf(float x, float y)
>  	if(i>0x3f000000) {		/* if |z| > 0.5, set n = [z+0.5] */
>  	    n = j+(0x00800000>>(k+1));
>  	    k = ((n&0x7fffffff)>>23)-0x7f;	/* new k for n */
> -	    SET_FLOAT_WORD(t,n&~(0x007fffff>>k));
> +	    SET_FLOAT_WORD(z,n&~(0x007fffff>>k));
>  	    n = ((n&0x007fffff)|0x00800000)>>(23-k);
>  	    if(j<0) n = -n;
> -	    p_h -= t;
> +	    d1 -= z;
>  	}
> -	t = p_l+p_h;
> -	GET_FLOAT_WORD(is,t);
> -	SET_FLOAT_WORD(t,is&0xfffff000);
> -	u = t*lg2_h;
> -	v = (p_l-(t-p_h))*lg2+t*lg2_l;
> -	z = u+v;
> -	w = v-(z-u);
> -	t  = z*z;
> -	t1  = z - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
> -	r  = (z*t1)/(t1-two)-(w+z*w);
> -	z  = one-(r-z);
> +	d1 = d1 * ((double)lg2_h + lg2_l);
> +	d2 = d1*d1;
> +	d2 = d1 - d2*(P1+d2*(P2+d2*(P3+d2*(P4+d2*P5))));
> +	d2 = (d1*d2)/(d2-two);
> +	z = one - (d2-d1);
>  	GET_FLOAT_WORD(j,z);
>  	j += (n<<23);
>  	if((j>>23)<=0)	/* subnormal output */
> 


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]