This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
[PATCH v2 1/1] aarch64: Add optimized version of sinf and cosf

From: Ashwin Sekhar T K <ashwin dot sekhar at caviumnetworks dot com>
To: libc-alpha at sourceware dot org
Cc: Ashwin Sekhar T K <ashwin dot sekhar at caviumnetworks dot com>
Date: Thu, 6 Jul 2017 02:55:16 -0700
Subject: [PATCH v2 1/1] aarch64: Add optimized version of sinf and cosf
Authentication-results: sourceware.org; auth=none
Authentication-results: sourceware.org; dkim=none (message not signed) header.d=none;sourceware.org; dmarc=none action=none header.from=caviumnetworks.com;
References: <20170613071707.43396-1-ashwin.sekhar@caviumnetworks.com> <20170706095516.34985-1-ashwin.sekhar@caviumnetworks.com>
Spamdiagnosticmetadata: NSPM
Spamdiagnosticoutput: 1:99
The algorithm is based on the sinf and cosf implementations
for powerpc in ./powerpc/powerpc64/power8/fpu and that for
x86 in ./x86_64/fpu.

	* sysdeps/aarch64/fpu/chebyshev.h: New file.
	* sysdeps/aarch64/fpu/reduce.h: Likewise.
	* sysdeps/aarch64/fpu/s_cosf.c: Likewise.
	* sysdeps/aarch64/fpu/s_sinf.c: Likewise.
---
 sysdeps/aarch64/fpu/chebyshev.h |  84 ++++++++++++++++++++++++
 sysdeps/aarch64/fpu/reduce.h    | 142 ++++++++++++++++++++++++++++++++++++++++
 sysdeps/aarch64/fpu/s_cosf.c    | 110 +++++++++++++++++++++++++++++++
 sysdeps/aarch64/fpu/s_sinf.c    | 120 +++++++++++++++++++++++++++++++++
 4 files changed, 456 insertions(+)
 create mode 100644 sysdeps/aarch64/fpu/chebyshev.h
 create mode 100644 sysdeps/aarch64/fpu/reduce.h
 create mode 100644 sysdeps/aarch64/fpu/s_cosf.c
 create mode 100644 sysdeps/aarch64/fpu/s_sinf.c

diff --git a/sysdeps/aarch64/fpu/chebyshev.h b/sysdeps/aarch64/fpu/chebyshev.h
new file mode 100644
index 0000000000..4c694cf37f
--- /dev/null
+++ b/sysdeps/aarch64/fpu/chebyshev.h
@@ -0,0 +1,84 @@
+/* Functions for evaluating chebysev polynomials for sin and cos
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef __CHEBYSHEV_H__
+#define __CHEBYSHEV_H__
+
+#include <errno.h>
+#include <math.h>
+#include <math_private.h>
+
+/* sin (x) = x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))),
+   where 2^-5<=|x|<Pi/4.  */
+static inline double
+chebyshev_sin_polynomial_1 (double x)
+{
+	double S0, S1, S2, S3, S4, y;
+
+	S0 = -1.66666666666265311791406134034332e-01;
+	S1 = 8.33333332439092043519845987020744e-03;
+	S2 = -1.98412633515623692105969699817081e-04;
+	S3 = 2.75552591873811586009688362475245e-06;
+	S4 = -2.47545996176987174320511533257699e-08;
+	y = x * x;
+	return  x * (1.0 + y * (S0 + y * (S1 + y * (S2 + y * (S3 + y * S4)))));
+}
+
+/* cos (x) = 1+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))),
+   where 2^-5<=|x|<Pi/4.  */
+static inline double
+chebyshev_cos_polynomial_1 (double x)
+{
+	double C0, C1, C2, C3, C4, y;
+
+	C0 = -4.99999999994893751242841517523630e-01;
+	C1 = 4.16666665534264832326805105822132e-02;
+	C2 = -1.38888806593809050610177635576292e-03;
+	C3 = 2.47989607241011055654977823792251e-05;
+	C4 = -2.71747891329266278019784327732444e-07;
+	y = x * x;
+	return  1.0 + y * (C0 + y * (C1 + y * (C2 + y * (C3 + y * C4))));
+}
+
+/* sin (x) = x+x^3*(SS0+x^2*SS1),
+   where 2^-27<=|x|<2^-5.  */
+static inline double
+chebyshev_sin_polynomial_2 (double x)
+{
+	double SS0, SS1, y;
+
+	SS0 = -1.66666666634829235826842364076583e-01;
+	SS1 = 8.33312019844746100505350483445000e-03;
+	y = x * x;
+	return x * (1.0 + y * (SS0 + y * SS1));
+}
+
+/* cos (x) = 1+x^2*(CC0+x^2*CC1),
+   where 2^-27<=|x|<2^-5.  */
+static inline double
+chebyshev_cos_polynomial_2 (double x)
+{
+	double CC0, CC1, y;
+
+	CC0 = -4.99999999406199269191830580894020e-01;
+	CC1 = 4.16647402420742621331761768033175e-02;
+	y = x * x;
+	return 1.0 + y * (CC0 + y * CC1);
+}
+
+#endif
diff --git a/sysdeps/aarch64/fpu/reduce.h b/sysdeps/aarch64/fpu/reduce.h
new file mode 100644
index 0000000000..fc6e8c8b41
--- /dev/null
+++ b/sysdeps/aarch64/fpu/reduce.h
@@ -0,0 +1,142 @@
+/* Range reduction functions for sin and cos inputs
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef __REDUCE_H__
+#define __REDUCE_H__
+
+#include <errno.h>
+#include <math.h>
+#include <math_private.h>
+
+#define I_SPABS_MASK	0x7fffffff
+#define I_SPABS_2POWN27	0x32000000
+#define I_SPABS_2POWN5	0x3d000000
+#define I_SPABS_PIO4	0x3f490fdb
+#define I_SPABS_9PIO4	0x40e231d6
+#define I_SPABS_2POW23	0x4b000000
+#define I_SPABS_INF	0x7f800000
+
+/* 4/Pi broken into sum of positive DP values.  */
+static double INVPIO4_TABLE[25] = {
+	0.00000000000000000000000000000000e+00,
+	1.27323953807353973388671875000000e+00,
+	6.66162294771233121082332218065858e-09,
+	4.55202009562027200395410188316081e-18,
+	5.05365203780056430379174094616698e-26,
+	3.33657353140390501092355175630980e-34,
+	2.31774265657771014271759915567725e-43,
+	1.41079183488085906188916890478151e-51,
+	1.78201357714620429447917285584916e-59,
+	6.45440934111020426845713721490652e-68,
+	2.96289605657163538186678664280422e-77,
+	2.34290278673081796193027756956770e-85,
+	6.89165747744598758512920848666612e-94,
+	2.61827895738527799147711877424548e-102,
+	5.22516501694879285329083609946870e-111,
+	2.31723558129677581012126324525784e-119,
+	5.36762980505877895920512518100769e-128,
+	2.49914273179058265651805723374739e-135,
+	3.32028340088181692034775714274009e-144,
+	1.98261407071607720791943444409774e-152,
+	1.34423330943468258263688817309715e-162,
+	2.61360711509865349546753597258351e-169,
+	2.26640747502086603170125306310953e-178,
+	2.39096791372273354372455558796085e-186,
+	2.14336400443697767564443045577643e-194
+};
+
+/* Range Reduction for Pi/4<=|x|<Inf
+
+   Returns n and y, where
+    y = |x| - j * Pi/4,
+    j = (k + 1) & 0xfffffffe,
+    n = (k + 1) & 0x7,
+    k = trunc (|x| / (Pi / 4)).
+
+   In other words,
+    n is the pi/4 octant that x falls into in a trignometric plane.
+    y is the difference between x and the multiple of pi/2 closest to x.  */
+
+static int
+reducef (float x, double *y)
+{
+	double INVPIO4, PIO4, PIO4HI, PIO4LO;
+	uint64_t n, ix;
+	double t;
+
+	INVPIO4 = 1.27323954473516276486577680771006e+00;
+	PIO4 = 7.85398163397448278999490867136046e-01;
+	PIO4HI = -7.85398162901401519775390625000000e-01;
+	PIO4LO = -4.96046789840270212596747252887163e-10;
+
+	GET_FLOAT_WORD (ix, x);
+	ix = ix & I_SPABS_MASK;
+
+	if (ix < I_SPABS_9PIO4)
+	{
+		/* Here if Pi/4<=|x|<9*Pi/4.  */
+		double j;
+
+		t = fabs (x);
+		n = t * INVPIO4 + 1.0;
+		j = n & 0x0e;
+		t = t - j * PIO4;
+	}
+	else if (ix < I_SPABS_2POW23)
+	{
+		/* Here if 9*Pi/4<=|x|<2^23.  */
+		double j;
+
+		t = fabs (x);
+		n = t * INVPIO4 + 1.0;
+		j = n & 0xfffffffe;
+		t = t + j * PIO4HI + j * PIO4LO;
+	}
+	else
+	{
+		/* Here if 2^23<=|x|<=Inf.  */
+		double tmp0, tmp1, tmp2, tmp3;
+		uint64_t bitpos, j;
+
+		bitpos = (ix >> 23) - 0x7f + 59;
+		t = fabs (x);
+		j = (bitpos * ((0x100000000 / 28) + 1)) >> 32;
+		tmp0 = INVPIO4_TABLE[j - 2] * t;
+		tmp1 = INVPIO4_TABLE[j - 1] * t;
+		tmp2 = INVPIO4_TABLE[j] * t;
+		tmp3 = INVPIO4_TABLE[j + 1] * t;
+		tmp0 = tmp0 - ((uint64_t)tmp0 & ~0x7);
+		n = tmp0 + tmp1;
+		tmp0 = tmp0 - n;
+		t = tmp0 + tmp1 + tmp2 + tmp3;
+		if (n & 0x1)
+			t = t - 1.0;
+		else if (t > 1.0)
+		{
+			t = t - 2.0;
+			n = n + 1;
+		}
+		n = n + 1;
+		t = t * PIO4;
+	}
+	*y = t;
+
+	return n & 0x7;
+}
+
+#endif
diff --git a/sysdeps/aarch64/fpu/s_cosf.c b/sysdeps/aarch64/fpu/s_cosf.c
new file mode 100644
index 0000000000..1b2c34195b
--- /dev/null
+++ b/sysdeps/aarch64/fpu/s_cosf.c
@@ -0,0 +1,110 @@
+/* Optimized version of cosf
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <math.h>
+#include <math_private.h>
+#include "chebyshev.h"
+#include "reduce.h"
+
+/* Short algorithm description:
+
+   1) if |x| <  2^-27: return 1.0-|x|.
+   2) if |x| <  2^-5 : return 1.0+x^2*DP_COS2_0+x^5*DP_COS2_1.
+   3) if |x| <   Pi/4: return 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).
+   4) if |x| <    Inf:
+	5.1) Range reduction: k=trunc (|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1,
+	     t=|x|-j*Pi/4.
+	5.2) Reconstruction:
+	     s = (-1.0)^(((n+2)>>2)&1)
+	     if ((n+2)&2 != 0)
+	     {
+		using cos (t) polynomial for |t|<Pi/4, result is
+		s     * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))).
+	     }
+	     else
+	     {
+		using sin (t) polynomial for |t|<Pi/4, result is
+		s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))).
+	     }
+   5) if x is Inf    : return x-x, and set errno=EDOM.
+   6) if x is NaN    : return x-x.
+
+   Special cases:
+   cos (+-0) = 1 not raising inexact,
+   cos (subnormal) raises inexact,
+   cos (min_normalized) raises inexact,
+   cos (normalized) raises inexact,
+   cos (Inf) = NaN, raises invalid, sets errno to EDOM,
+   cos (NaN) = NaN.  */
+
+float
+__cosf (float x)
+{
+	uint32_t ix;
+
+	GET_FLOAT_WORD (ix, x);
+	ix &= I_SPABS_MASK;
+
+	if (ix >= I_SPABS_PIO4)
+		goto large_args;
+
+	if (ix < I_SPABS_2POWN5)
+		goto small_args;
+
+	/* Here if 2^-5<=|x|<Pi/4.  */
+	return chebyshev_cos_polynomial_1 (x);
+
+large_args:
+	if (ix < I_SPABS_INF)
+	{
+		/* Here if Pi/4<=|x|<Inf.  */
+		double t, sign, val;
+		int n;
+
+		n = reducef (x, &t);
+
+		if (((n + 2) >> 2) & 0x1)
+			sign = -1.0;
+		else
+			sign = 1.0;
+
+		if ((n + 2) & 0x2)
+			val = chebyshev_cos_polynomial_1 (t);
+		else
+			val = chebyshev_sin_polynomial_1 (t);
+
+		return sign * val;
+	}
+	/* Here if x is Inf or Nan.  */
+	if (ix == I_SPABS_INF)
+		__set_errno (EDOM);
+	return x - x;
+
+small_args:
+	if (ix >= I_SPABS_2POWN27)
+	{
+		/* Here if 2^-27<=|x|<2^-5.  */
+		return chebyshev_cos_polynomial_2 (x);
+	}
+
+	/* Here if |x| < 2^-27.  */
+	return 1.0 - fabsf (x);
+}
+
+weak_alias (__cosf, cosf);
diff --git a/sysdeps/aarch64/fpu/s_sinf.c b/sysdeps/aarch64/fpu/s_sinf.c
new file mode 100644
index 0000000000..c99253be87
--- /dev/null
+++ b/sysdeps/aarch64/fpu/s_sinf.c
@@ -0,0 +1,120 @@
+/* Optimized version of sinf
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <math.h>
+#include <math_private.h>
+#include "chebyshev.h"
+#include "reduce.h"
+
+/* Short algorithm description:
+
+   1) if |x| == 0    : return x.
+   2) if |x| <  2^-27: return x-x*DP_SMALL, raise underflow only when needed.
+   3) if |x| <  2^-5 : return x+x^3*DP_SIN2_0+x^5*DP_SIN2_1.
+   4) if |x| <   Pi/4: return x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))).
+   5) if |x| <    Inf:
+	5.1) Range reduction: k=trunc (|x|/(Pi/4)), j=(k+1)&0x0ffffffe, n=k+1,
+	     t=|x|-j*Pi/4.
+	5.2) Reconstruction:
+	     s = sign (x) * (-1.0)^((n>>2)&1)
+	     if (n&2 != 0)
+	     {
+		using cos (t) polynomial for |t|<Pi/4, result is
+		s     * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))).
+	     }
+	     else
+	     {
+		using sin (t) polynomial for |t|<Pi/4, result is
+		s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))).
+	     }
+   6) if x is Inf    : return x-x, and set errno=EDOM.
+   7) if x is NaN    : return x-x.
+
+   Special cases:
+   sin (+-0) = +-0 not raising inexact/underflow,
+   sin (subnormal) raises inexact/underflow,
+   sin (min_normalized) raises inexact/underflow,
+   sin (normalized) raises inexact,
+   sin (Inf) = NaN, raises invalid, sets errno to EDOM,
+   sin (NaN) = NaN.  */
+
+float
+__sinf (float x)
+{
+	uint32_t ix, sign_x;
+
+	GET_FLOAT_WORD (ix, x);
+	sign_x = ix >> 31;
+	ix &= I_SPABS_MASK;
+
+	if (ix >= I_SPABS_PIO4)
+		goto large_args;
+
+	if (ix < I_SPABS_2POWN5)
+		goto small_args;
+
+	/* Here if 2^-5<=|x|<Pi/4.  */
+	return chebyshev_sin_polynomial_1 (x);
+
+large_args:
+	if (ix < I_SPABS_INF)
+	{
+		/* Here if Pi/4<=|x|<Inf.  */
+		double t, sign, val;
+		int n;
+
+		n = reducef (x, &t);
+
+		if ((sign_x ^ (n >> 2)) & 0x1)
+			sign = -1.0;
+		else
+			sign = 1.0;
+
+		if (n & 0x2)
+			val = chebyshev_cos_polynomial_1 (t);
+		else
+			val = chebyshev_sin_polynomial_1 (t);
+
+		return sign * val;
+	}
+
+	/* Here if x is Inf or Nan.  */
+	if (ix == I_SPABS_INF)
+		__set_errno (EDOM);
+	return x - x;
+
+small_args:
+	if (ix >= I_SPABS_2POWN27)
+	{
+		/* Here if 2^-27<=|x|<2^-5.  */
+		return chebyshev_sin_polynomial_2 (x);
+	}
+	else if (ix > 0)
+	{
+		/* Here if 0<=|x|<2^-27.  */
+		double SMALL = 8.88178419700125232338905334472656e-16;
+
+		return x - x * SMALL;
+	}
+
+	/* Here if |x| == 0.  */
+	return x;
+}
+
+weak_alias (__sinf, sinf);
-- 
2.12.2
Follow-Ups:
- Re: [PATCH v2 1/1] aarch64: Add optimized version of sinf and cosf
  - From: Joseph Myers
- Re: [PATCH v2 1/1] aarch64: Add optimized version of sinf and cosf
  - From: Szabolcs Nagy
References:
- [PATCH v2 0/1] aarch64: Add optimized version of sinf and cosf
  - From: Ashwin Sekhar T K
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]