This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

RFC: Proposal for implementing libmvec on aarch64

From: Steve Ellcey <sellcey at cavium dot com>
To: libc-alpha <libc-alpha at sourceware dot org>
Date: Tue, 06 Mar 2018 13:33:08 -0800
Subject: RFC: Proposal for implementing libmvec on aarch64
Authentication-results: sourceware.org; auth=none
Authentication-results: spf=none (sender IP is ) smtp.mailfrom=Steve dot Ellcey at cavium dot com;
Reply-to: sellcey at cavium dot com
Spamdiagnosticmetadata: NSPM
Spamdiagnosticoutput: 1:99

This is an RFC for an approach to implementing libmvec vector math routines
for aarch64.  There are three patches attached to this email, one is a GCC
patch that allows aarch64 to generate calls to vector functions that are
marked with __attribute__ ((simd)), the second is the infrastructure part
of glibc changes that mark sin and cos with this attribute in the header files,
and the third is an implementation of vector sin and cos by adding ifdefs to
the existing scalar sin/cos functions in sysdeps/ieee754/dbl-64/s_sin.c.

I have several questions that I would like to get some feedback on:

Does the approach of modifying the existing scalar math routines to also
work on vector types, as opposed to having completely seperate vector
routines sound like a good approach?  I like it because it reduced the
amount of duplicated code between scalar and vector functions and should
keep the accuracy of scalar and vector routines in sync.  And since it is
written in C it can also be shared among multiple platforms.  The main
downside right now is the slow paths are still mostly handled serially
and so I am only seeing about a 5% improvement in a test that does sin
calls on an array of doubles.  I think more changes could be done to improve
the amount of parallelism in the vector versions but this is a start.
Perhaps we could remove some of the slow paths in the vector versions like
we do with the sincos function.

The current implementation does not build because when linking libmvec
it gets undefined externals for __branred, __docos, __dubsin, __mpcos,
__mpsin, and __sincostab.  How should this be handled?  Should they be
exported from libm so that libmvec can call them or should they be 
copied into libmvec so that they don't have to be externally visible?
Are there any issues with making libmvec depend on libm?

The vector ABI for aarch64 is not yet defined, I believe ARM is working
on that.  As you can see from the GCC patch I am just using the same
approach as x86 and just changed the vecsize_mangle variable to 'n' for
128 bit vectors on Aarch64.

Comments?

Steve Ellcey
sellcey@cavium.com

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 2f98a21..c4f4e11 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -40,6 +40,7 @@
 #include "regs.h"
 #include "emit-rtl.h"
 #include "recog.h"
+#include "cgraph.h"
 #include "diagnostic.h"
 #include "insn-attr.h"
 #include "alias.h"
@@ -17375,6 +17376,131 @@ aarch64_select_early_remat_modes (sbitmap modes)
     }
 }
 
+/* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
+   CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
+   CLONEI->simdlen.  Return 0 if SIMD clones shouldn't be emitted,
+   or number of vecsize_mangle variants that should be emitted.  */
+
+static int
+aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
+					struct cgraph_simd_clone *clonei,
+					tree base_type,
+					int num ATTRIBUTE_UNUSED)
+{
+  int ret = 0;
+
+  if (clonei->simdlen
+      && (clonei->simdlen < 2
+	  || clonei->simdlen > 1024
+	  || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
+    {
+      warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
+		  "unsupported simdlen %d", clonei->simdlen);
+      return 0;
+    }
+
+  tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
+  if (TREE_CODE (ret_type) != VOID_TYPE)
+    switch (TYPE_MODE (ret_type))
+      {
+      case E_QImode:
+      case E_HImode:
+      case E_SImode:
+      case E_DImode:
+      case E_SFmode:
+      case E_DFmode:
+      /* case E_SCmode: */
+      /* case E_DCmode: */
+	break;
+      default:
+	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
+		    "unsupported return type %qT for simd\n", ret_type);
+	return 0;
+      }
+
+  tree t;
+  for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
+    /* FIXME: Shouldn't we allow such arguments if they are uniform?  */
+    switch (TYPE_MODE (TREE_TYPE (t)))
+      {
+      case E_QImode:
+      case E_HImode:
+      case E_SImode:
+      case E_DImode:
+      case E_SFmode:
+      case E_DFmode:
+      /* case E_SCmode: */
+      /* case E_DCmode: */
+	break;
+      default:
+	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
+		    "unsupported argument type %qT for simd\n", TREE_TYPE (t));
+	return 0;
+      }
+
+  if (TARGET_SIMD)
+    {
+    clonei->vecsize_mangle = 'n';
+    clonei->mask_mode = VOIDmode;
+    clonei->vecsize_int = 128;
+    clonei->vecsize_float = 128;
+
+    if (clonei->simdlen == 0)
+      {
+      if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
+	clonei->simdlen = clonei->vecsize_int;
+      else
+	clonei->simdlen = clonei->vecsize_float;
+      clonei->simdlen /= GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
+      }
+    else if (clonei->simdlen > 16)
+      {
+      /* If it is possible for given SIMDLEN to pass CTYPE value in
+	 registers (v0-v7) accept that SIMDLEN, otherwise warn and don't
+	 emit corresponding clone.  */
+      int cnt = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type)) * clonei->simdlen;
+      if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
+	cnt /= clonei->vecsize_int;
+      else
+	cnt /= clonei->vecsize_float;
+      if (cnt > 8)
+	{
+	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
+		    "unsupported simdlen %d", clonei->simdlen);
+	return 0;
+	}
+      }
+      ret = 1;
+    }
+  return ret;
+}
+
+/* Add target attribute to SIMD clone NODE if needed.  */
+
+static void
+aarch64_simd_clone_adjust (struct cgraph_node *node ATTRIBUTE_UNUSED)
+{
+}
+
+/* If SIMD clone NODE can't be used in a vectorized loop
+   in current function, return -1, otherwise return a badness of using it
+   (0 if it is most desirable from vecsize_mangle point of view, 1
+   slightly less desirable, etc.).  */
+
+static int
+aarch64_simd_clone_usable (struct cgraph_node *node)
+{
+  switch (node->simdclone->vecsize_mangle)
+    {
+    case 'n':
+      if (!TARGET_SIMD)
+	return -1;
+      return 0;
+    default:
+      gcc_unreachable ();
+    }
+}
+
 /* Target-specific selftests.  */
 
 #if CHECKING_P
@@ -17844,6 +17970,16 @@ aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_SELECT_EARLY_REMAT_MODES
 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
 
+#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
+#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
+  aarch64_simd_clone_compute_vecsize_and_simdlen
+
+#undef TARGET_SIMD_CLONE_ADJUST
+#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
+
+#undef TARGET_SIMD_CLONE_USABLE
+#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
+
 #if CHECKING_P
 #undef TARGET_RUN_TARGET_SELFTESTS
 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests

diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index 4a182bd..e07eeb2 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -12,3 +12,14 @@ CFLAGS-s_fmaxf.c += -ffinite-math-only
 CFLAGS-s_fmin.c += -ffinite-math-only
 CFLAGS-s_fminf.c += -ffinite-math-only
 endif
+
+ifeq ($(subdir),mathvec)
+libmvec-support += libmvec_d_sincos2
+endif
+
+ifeq ($(subdir),math)
+ifeq ($(build-mathvec),yes)
+libmvec-tests += double-vlen2
+double-vlen2-funcs = sin cos
+endif
+endif
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index e69de29..808fd35 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -0,0 +1,43 @@
+/* Platform-specific SIMD declarations of math functions.
+   Copyright (C) 2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _MATH_H
+# error "Never include <bits/math-vector.h> directly;\
+ include <math.h> instead."
+#endif
+
+/* Get default empty definitions for simd declarations.  */
+#include <bits/libm-simd-decl-stubs.h>
+
+#if defined __FAST_MATH__
+# if defined _OPENMP && _OPENMP >= 201307
+/* OpenMP case.  */
+#  define __DECL_SIMD_AARCH64 _Pragma ("omp declare simd notinbranch")
+# elif __GNUC_PREREQ (6,0)
+/* W/o OpenMP use GCC 6.* __attribute__ ((__simd__)).  */
+#  define __DECL_SIMD_AARCH64 __attribute__ ((__simd__ ("notinbranch")))
+# endif
+
+# ifdef __DECL_SIMD_AARCH64
+#  undef __DECL_SIMD_sin
+#  define __DECL_SIMD_sin __DECL_SIMD_AARCH64
+#  undef __DECL_SIMD_cos
+#  define __DECL_SIMD_cos __DECL_SIMD_AARCH64
+# endif
+
+#endif
diff --git a/sysdeps/aarch64/fpu/test-double-vlen2-wrappers.c b/sysdeps/aarch64/fpu/test-double-vlen2-wrappers.c
index e69de29..ee07a47 100644
--- a/sysdeps/aarch64/fpu/test-double-vlen2-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-vlen2-wrappers.c
@@ -0,0 +1,24 @@
+/* Tests for aarch64 vector math routines.
+   Copyright (C) 2014-2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "test-double-vlen2.h"
+
+#define VEC_TYPE __Float64x2_t
+
+VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVnN2v_sin)
+VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVnN2v_cos)

diff --git a/sysdeps/aarch64/fpu/libmvec_d_sincos2.c b/sysdeps/aarch64/fpu/libmvec_d_sincos2.c
index e69de29..f92291f 100644
--- a/sysdeps/aarch64/fpu/libmvec_d_sincos2.c
+++ b/sysdeps/aarch64/fpu/libmvec_d_sincos2.c
@@ -0,0 +1,21 @@
+
+/* We need the includes before the defines so that the define of __sin
+   and __cos do not mess up the bits/mathcalls.h declaration of __sin
+   and __cos.  */
+
+#include <float.h>
+#include "endian.h"
+#include "mydefs.h"
+#include "usncs.h"
+#include "MathLib.h"
+#include <math.h>
+#include <math_private.h>
+#include <libm-alias-double.h>
+#include <fenv.h>
+
+#define VEC_SIZE 2
+#define OP_TYPE __Float64x2_t
+#define __sin _ZGVnN2v_sin
+#define __cos _ZGVnN2v_cos
+
+#include "sysdeps/ieee754/dbl-64/s_sin.c"
diff --git a/sysdeps/ieee754/dbl-64/s_sin.c b/sysdeps/ieee754/dbl-64/s_sin.c
index 8c589cb..e1db599 100644
--- a/sysdeps/ieee754/dbl-64/s_sin.c
+++ b/sysdeps/ieee754/dbl-64/s_sin.c
@@ -55,6 +55,18 @@
 #include <libm-alias-double.h>
 #include <fenv.h>
 
+#ifndef VEC_SIZE
+# define VEC_SIZE 1
+#endif
+
+#if VEC_SIZE != 1 && VEC_SIZE != 2
+# error "This file must be updated to support other vector lengths."
+#endif
+
+#ifndef OP_TYPE
+# define OP_TYPE double
+#endif
+
 /* Helper macros to compute sin of the input values.  */
 #define POLYNOMIAL2(xx) ((((s5 * (xx) + s4) * (xx) + s3) * (xx) + s2) * (xx))
 
@@ -67,13 +79,25 @@
 
    The constants s1, s2, s3, etc. are pre-computed values of 1/3!, 1/5! and so
    on.  The result is returned to LHS and correction in COR.  */
-#define TAYLOR_SIN(xx, a, da, cor) \
-({									      \
-  double t = ((POLYNOMIAL (xx)  * (a) - 0.5 * (da))  * (xx) + (da));	      \
-  double res = (a) + t;							      \
-  (cor) = ((a) - res) + t;						      \
-  res;									      \
-})
+static inline double
+__always_inline
+taylor_sin_scalar (double xx, double a, double da, double *cor)
+{
+  double t = (POLYNOMIAL (xx) * a - 0.5 * da)  * xx + da;
+  double res = a + t;
+  *cor = (a - res) + t;
+  return res;
+}
+
+static inline OP_TYPE
+__always_inline
+taylor_sin_vector (OP_TYPE xx, OP_TYPE a, OP_TYPE da, OP_TYPE *cor)
+{
+  OP_TYPE t = (POLYNOMIAL (xx) * a - 0.5 * da)  * xx + da;
+  OP_TYPE res = a + t;
+  *cor = (a - res) + t;
+  return res;
+}
 
 /* This is again a variation of the Taylor series expansion with the term
    x^3/3! expanded into the following for better accuracy:
@@ -126,10 +150,11 @@ static const double
 
 static const double t22 = 0x1.8p22;
 
-void __dubsin (double x, double dx, double w[]);
-void __docos (double x, double dx, double w[]);
-double __mpsin (double x, double dx, bool reduce_range);
-double __mpcos (double x, double dx, bool reduce_range);
+extern void __dubsin (double x, double dx, double w[]);
+extern void __docos (double x, double dx, double w[]);
+extern double __mpsin (double x, double dx, bool reduce_range);
+extern double __mpcos (double x, double dx, bool reduce_range);
+
 static double slow (double x);
 static double slow1 (double x);
 static double slow2 (double x);
@@ -148,7 +173,7 @@ static double cslow2 (double x);
    get the result in RES and a correction value in COR.  */
 static inline double
 __always_inline
-do_cos (double x, double dx, double *corp)
+do_cos_scalar (double x, double dx, double *corp)
 {
   mynumber u;
 
@@ -170,6 +195,45 @@ do_cos (double x, double dx, double *corp)
   return res;
 }
 
+static inline OP_TYPE
+__always_inline
+do_cos_vector (OP_TYPE x, OP_TYPE dx, OP_TYPE *corp)
+{
+#if VEC_SIZE == 1
+  mynumber u;
+  if (x < 0)
+    dx = -dx;
+  u.x = big + fabs (x);
+  x = fabs (x) - (u.x - big) + dx;
+#else
+  mynumber u0, u1;
+  if (x[0] < 0)
+    dx[0] = -dx[0];
+  if (x[1] < 0)
+    dx[1] = -dx[1];
+  u0.x = big + fabs (x[0]);
+  u1.x = big + fabs (x[1]);
+  x[0] = fabs (x[0]) - (u0.x - big) + dx[0];
+  x[1] = fabs (x[1]) - (u1.x - big) + dx[1];
+#endif
+
+  OP_TYPE xx, s, sn, ssn, c, cs, ccs, res, cor;
+  xx = x * x;
+  s = x + x * xx * (sn3 + xx * sn5);
+  c = xx * (cs2 + xx * (cs4 + xx * cs6));
+#if VEC_SIZE == 1
+  SINCOS_TABLE_LOOKUP (u, sn, ssn, cs, ccs);
+#else
+  SINCOS_TABLE_LOOKUP (u0, sn[0], ssn[0], cs[0], ccs[0]);
+  SINCOS_TABLE_LOOKUP (u1, sn[1], ssn[1], cs[1], ccs[1]);
+#endif
+  cor = (ccs - s * ssn - cs * c) - sn * s;
+  res = cs + cor;
+  cor = (cs - res) + cor;
+  *corp = cor;
+  return res;
+}
+
 /* A more precise variant of DO_COS.  EPS is the adjustment to the correction
    COR.  */
 static inline double
@@ -210,7 +274,7 @@ do_cos_slow (double x, double dx, double eps, double *corp)
    the result in RES and a correction value in COR.  */
 static inline double
 __always_inline
-do_sin (double x, double dx, double *corp)
+do_sin_scalar (double x, double dx, double *corp)
 {
   mynumber u;
 
@@ -231,6 +295,45 @@ do_sin (double x, double dx, double *corp)
   return res;
 }
 
+static inline OP_TYPE
+__always_inline
+do_sin_vector (OP_TYPE x, OP_TYPE dx, OP_TYPE *corp)
+{
+#if VEC_SIZE == 1
+  mynumber u;
+  if (x <= 0)
+    dx = -dx;
+  u.x = big + fabs (x);
+  x = fabs (x) - (u.x - big);
+#else
+  mynumber u0, u1;
+  if (x[0] <= 0)
+    dx[0] = -dx[0];
+  if (x[1] <= 0)
+    dx[1] = -dx[1];
+  u0.x = big + fabs (x[0]);
+  u1.x = big + fabs (x[1]);
+  x[0] = fabs (x[0]) - (u0.x - big);
+  x[1] = fabs (x[1]) - (u1.x - big);
+#endif
+
+  OP_TYPE xx, s, sn, ssn, c, cs, ccs, cor, res;
+  xx = x * x;
+  s = x + (dx + x * xx * (sn3 + xx * sn5));
+  c = x * dx + xx * (cs2 + xx * (cs4 + xx * cs6));
+#if VEC_SIZE == 1
+  SINCOS_TABLE_LOOKUP (u, sn, ssn, cs, ccs);
+#else
+  SINCOS_TABLE_LOOKUP (u0, sn[0], ssn[0], cs[0], ccs[0]);
+  SINCOS_TABLE_LOOKUP (u1, sn[1], ssn[1], cs[1], ccs[1]);
+#endif
+  cor = (ssn + s * ccs - sn * c) + cs * s;
+  res = sn + cor;
+  cor = (sn - res) + cor;
+  *corp = cor;
+  return res;
+}
+
 /* A more precise variant of DO_SIN.  EPS is the adjustment to the correction
    COR.  */
 static inline double
@@ -337,13 +440,13 @@ do_sincos_1 (double a, double da, double x, int4 n, bool shift_quadrant)
       if (xx < 0.01588)
 	{
 	  /* Taylor series.  */
-	  res = TAYLOR_SIN (xx, a, da, cor);
+	  res = taylor_sin_scalar (xx, a, da, &cor);
 	  cor = 1.02 * cor + __copysign (eps, cor);
 	  retval = (res == res + cor) ? res : sloww (a, da, x, shift_quadrant);
 	}
       else
 	{
-	  res = do_sin (a, da, &cor);
+	  res = do_sin_scalar (a, da, &cor);
 	  cor = 1.035 * cor + __copysign (eps, cor);
 	  retval = ((res == res + cor) ? __copysign (res, a)
 		    : sloww1 (a, da, x, shift_quadrant));
@@ -352,7 +455,7 @@ do_sincos_1 (double a, double da, double x, int4 n, bool shift_quadrant)
 
     case 1:
     case 3:
-      res = do_cos (a, da, &cor);
+      res = do_cos_scalar (a, da, &cor);
       cor = 1.025 * cor + __copysign (eps, cor);
       retval = ((res == res + cor) ? ((n & 2) ? -res : res)
 		: sloww2 (a, da, x, n));
@@ -411,13 +514,13 @@ do_sincos_2 (double a, double da, double x, int4 n, bool shift_quadrant)
       if (xx < 0.01588)
 	{
 	  /* Taylor series.  */
-	  res = TAYLOR_SIN (xx, a, da, cor);
+	  res = taylor_sin_scalar (xx, a, da, &cor);
 	  cor = 1.02 * cor + __copysign (eps, cor);
 	  retval = (res == res + cor) ? res : bsloww (a, da, x, n);
 	}
       else
 	{
-	  res = do_sin (a, da, &cor);
+	  res = do_sin_scalar (a, da, &cor);
 	  cor = 1.035 * cor + __copysign (eps, cor);
 	  retval = ((res == res + cor) ? __copysign (res, a)
 		    : bsloww1 (a, da, x, n));
@@ -426,7 +529,7 @@ do_sincos_2 (double a, double da, double x, int4 n, bool shift_quadrant)
 
     case 1:
     case 3:
-      res = do_cos (a, da, &cor);
+      res = do_cos_scalar (a, da, &cor);
       cor = 1.025 * cor + __copysign (eps, cor);
       retval = ((res == res + cor) ? ((n & 2) ? -res : res)
 		: bsloww2 (a, da, x, n));
@@ -436,33 +539,84 @@ do_sincos_2 (double a, double da, double x, int4 n, bool shift_quadrant)
   return retval;
 }
 
+static double
+__always_inline
+do_sincos_tail (double x, int4 k, int4 kl, bool shift_quadrant)
+{
+  int4 n;
+  double retval;
+  if (k < 0x419921FB)
+    {
+      double a, da;
+      n = reduce_sincos_1 (x, &a, &da);
+      retval = do_sincos_1 (a, da, x, n, shift_quadrant);
+    }
+  else if (k < 0x42F00000)
+    {
+      double a, da;
+      n = reduce_sincos_2 (x, &a, &da);
+      retval = do_sincos_2 (a, da, x, n, shift_quadrant);
+    }
+  else if (k < 0x7ff00000)
+    {
+      retval = reduce_and_compute (x, shift_quadrant);
+    }
+  else
+    {
+      if (k == 0x7ff00000 && kl == 0)
+	__set_errno (EDOM);
+      retval = x / x;
+    }
+  return retval;
+}
+
 /*******************************************************************/
 /* An ultimate sin routine. Given an IEEE double machine number x   */
 /* it computes the correctly rounded (to nearest) value of sin(x)  */
 /*******************************************************************/
 #ifdef IN_SINCOS
-static double
+static OP_TYPE
 #else
-double
+OP_TYPE
 SECTION
 #endif
-__sin (double x)
+__sin (OP_TYPE x)
 {
-  double xx, res, t, cor;
-  mynumber u;
-  int4 k, m;
-  double retval = 0;
+  OP_TYPE xx, res, res2, t, cor;
+  OP_TYPE retval;
 
 #ifndef IN_SINCOS
   SET_RESTORE_ROUND_53BIT (FE_TONEAREST);
 #endif
 
+#if VEC_SIZE == 1
+  mynumber u;
+  int4 m, k;
+  OP_TYPE zero = 0.0;
   u.x = x;
   m = u.i[HIGH_HALF];
   k = 0x7fffffff & m;		/* no sign           */
+#else /* VEC_SIZE == 2 */
+  mynumber u0, u1;
+  int4 m0, m1, k0, k1, k;
+  OP_TYPE zero = {0.0, 0.0};
+  u0.x = x[0];
+  u1.x = x[1];
+  m0 = u0.i[HIGH_HALF];
+  m1 = u1.i[HIGH_HALF];
+  k0 = 0x7fffffff & m0;
+  k1 = 0x7fffffff & m1;
+  k = k0 > k1 ? k0 : k1;
+#endif
+
   if (k < 0x3e500000)		/* if x->0 =>sin(x)=x */
     {
+#if VEC_SIZE == 1
       math_check_force_underflow (x);
+#else
+      math_check_force_underflow (x[0]);
+      math_check_force_underflow (x[1]);
+#endif
       retval = x;
     }
  /*---------------------------- 2^-26 < |x|< 0.25 ----------------------*/
@@ -473,54 +627,64 @@ __sin (double x)
       t = POLYNOMIAL (xx) * (xx * x);
       res = x + t;
       cor = (x - res) + t;
-      retval = (res == res + 1.07 * cor) ? res : slow (x);
+      res2 = res + 1.07 * cor;
+#if VEC_SIZE == 1
+      retval = (res == res2) ? res : slow (x);
+#else
+      retval[0] = (res[0] == res2[0]) ? res[0] : slow (x[0]);
+      retval[1] = (res[1] == res2[1]) ? res[1] : slow (x[1]);
+#endif
     }				/*  else  if (k < 0x3fd00000)    */
+
 /*---------------------------- 0.25<|x|< 0.855469---------------------- */
   else if (k < 0x3feb6000)
     {
-      res = do_sin (x, 0, &cor);
-      retval = (res == res + 1.096 * cor) ? res : slow1 (x);
+      res = do_sin_vector (x, zero, &cor);
+      res2 = res + 1.096 * cor;
+#if VEC_SIZE == 1
+      retval = (res == res2) ? res : slow1 (x);
       retval = __copysign (retval, x);
+#else
+      retval[0] = (res[0] == res2[0]) ? res[0] : slow1 (x[0]);
+      retval[1] = (res[1] == res2[1]) ? res[1] : slow1 (x[1]);
+      retval[0] = __copysign (retval[0], x[0]);
+      retval[1] = __copysign (retval[1], x[1]);
+#endif
     }				/*   else  if (k < 0x3feb6000)    */
 
 /*----------------------- 0.855469  <|x|<2.426265  ----------------------*/
   else if (k < 0x400368fd)
     {
-
+#if VEC_SIZE == 1
       t = hp0 - fabs (x);
-      res = do_cos (t, hp1, &cor);
-      retval = (res == res + 1.020 * cor) ? res : slow2 (x);
+      res = do_cos_vector (t, hp1, &cor);
+      res2 = res + 1.020 * cor;
+      retval = (res == res2) ? res : slow2 (x);
       retval = __copysign (retval, x);
+#else
+      OP_TYPE hp1_vec;
+      t[0] = hp0 - fabs (x[0]);
+      t[1] = hp0 - fabs (x[1]);
+      hp1_vec[0] = hp1_vec[1] = hp1;
+      res = do_cos_vector (t, hp1_vec, &cor);
+      res2 = res + 1.020 * cor;
+      retval[0] = (res[0] == res2[0]) ? res[0] : slow2 (x[0]);
+      retval[1] = (res[1] == res2[1]) ? res[1] : slow2 (x[1]);
+      retval[0] = __copysign (retval[0], x[0]);
+      retval[1] = __copysign (retval[1], x[1]);
+#endif
     }				/*   else  if (k < 0x400368fd)    */
 
 #ifndef IN_SINCOS
 /*-------------------------- 2.426265<|x|< 105414350 ----------------------*/
-  else if (k < 0x419921FB)
-    {
-      double a, da;
-      int4 n = reduce_sincos_1 (x, &a, &da);
-      retval = do_sincos_1 (a, da, x, n, false);
-    }				/*   else  if (k <  0x419921FB )    */
-
-/*---------------------105414350 <|x|< 281474976710656 --------------------*/
-  else if (k < 0x42F00000)
-    {
-      double a, da;
-
-      int4 n = reduce_sincos_2 (x, &a, &da);
-      retval = do_sincos_2 (a, da, x, n, false);
-    }				/*   else  if (k <  0x42F00000 )   */
-
-/* -----------------281474976710656 <|x| <2^1024----------------------------*/
-  else if (k < 0x7ff00000)
-    retval = reduce_and_compute (x, false);
-
-/*--------------------- |x| > 2^1024 ----------------------------------*/
   else
     {
-      if (k == 0x7ff00000 && u.i[LOW_HALF] == 0)
-	__set_errno (EDOM);
-      retval = x / x;
+#if VEC_SIZE == 1
+      retval = do_sincos_tail (x, k, u.i[LOW_HALF], false);
+#else
+      retval[0] = do_sincos_tail (x[0], k0, u0.i[LOW_HALF], false);
+      retval[1] = do_sincos_tail (x[1], k1, u1.i[LOW_HALF], false);
+#endif
     }
 #endif
 
@@ -534,85 +698,122 @@ __sin (double x)
 /*******************************************************************/
 
 #ifdef IN_SINCOS
-static double
+static OP_TYPE
 #else
-double
+OP_TYPE
 SECTION
 #endif
-__cos (double x)
+__cos (OP_TYPE x)
 {
-  double y, xx, res, cor, a, da;
-  mynumber u;
-  int4 k, m;
-
-  double retval = 0;
+  OP_TYPE y, xx, res, res2, a, da, cor;
+  OP_TYPE retval;
 
 #ifndef IN_SINCOS
   SET_RESTORE_ROUND_53BIT (FE_TONEAREST);
 #endif
 
+#if VEC_SIZE == 1
+  mynumber u;
+  int4 m, k;
+  OP_TYPE zero = 0.0;
+  OP_TYPE one = 1.0;
   u.x = x;
   m = u.i[HIGH_HALF];
-  k = 0x7fffffff & m;
+  k = 0x7fffffff & m;           /* no sign           */
+#else /* VEC_SIZE == 2 */
+  mynumber u0, u1;
+  int4 m0, m1, k0, k1, k;
+  OP_TYPE zero = {0.0, 0.0};
+  OP_TYPE one = {1.0, 1.0};
+  u0.x = x[0];
+  u1.x = x[1];
+  m0 = u0.i[HIGH_HALF];
+  m1 = u1.i[HIGH_HALF];
+  k0 = 0x7fffffff & m0;
+  k1 = 0x7fffffff & m1;
+  k = k0 > k1 ? k0 : k1;
+#endif
 
   /* |x|<2^-27 => cos(x)=1 */
   if (k < 0x3e400000)
-    retval = 1.0;
+      retval = one;
 
   else if (k < 0x3feb6000)
     {				/* 2^-27 < |x| < 0.855469 */
-      res = do_cos (x, 0, &cor);
-      retval = (res == res + 1.020 * cor) ? res : cslow2 (x);
+      res = do_cos_vector(x, zero, &cor);
+      res2 = res + 1.020 * cor;
+#if VEC_SIZE == 1
+      retval = (res == res2) ? res : cslow2 (x);
+#else
+      retval[0] = (res[0] == res2[0]) ? res[0] : cslow2 (x[0]);
+      retval[1] = (res[1] == res2[1]) ? res[1] : cslow2 (x[1]);
+#endif
     }				/*   else  if (k < 0x3feb6000)    */
 
   else if (k < 0x400368fd)
     { /* 0.855469  <|x|<2.426265  */ ;
+#if VEC_SIZE == 1
       y = hp0 - fabs (x);
       a = y + hp1;
       da = (y - a) + hp1;
       xx = a * a;
       if (xx < 0.01588)
 	{
-	  res = TAYLOR_SIN (xx, a, da, cor);
+	  res = taylor_sin_scalar (xx, a, da, &cor);
 	  cor = 1.02 * cor + __copysign (1.0e-31, cor);
 	  retval = (res == res + cor) ? res : sloww (a, da, x, true);
 	}
       else
 	{
-	  res = do_sin (a, da, &cor);
+	  res = do_sin_scalar (a, da, &cor);
 	  cor = 1.035 * cor + __copysign (1.0e-31, cor);
 	  retval = ((res == res + cor) ? __copysign (res, a)
 		    : sloww1 (a, da, x, true));
 	}
-
+#else
+      OP_TYPE hp1_vec, cors;
+      hp1_vec[0] = hp1_vec[1] = hp1;
+      y[0] = hp0 - fabs (x[0]);
+      y[1] = hp0 - fabs (x[1]);
+      a = y + hp1_vec;
+      da = (y - a) + hp1_vec;
+      xx = a * a;
+      if (xx[0] < 0.01588 && x[1] < 0.01588)
+	{
+	  res = taylor_sin_vector (xx, a, da, &cor);
+          cors[0] = __copysign (1.0e-31, cor[0]);
+          cors[1] = __copysign (1.0e-31, cor[1]);
+	  cor = 1.02 * cor + cors;
+          res2 = res + cor;
+	  retval[0] = (res[0] == res2[0]) ? res[0]
+					  : sloww (a[0], da[0], x[0], true);
+	  retval[1] = (res[1] == res2[1]) ? res[1]
+					  : sloww (a[1], da[1], x[1], true);
+	}
+      else
+	{
+	  res = do_sin_vector (a, da, &cor);
+	  cors[0] = __copysign (1.0e-31, cor[0]);
+	  cors[1] = __copysign (1.0e-31, cor[1]);
+	  cor = 1.035 * cor + cors;
+	  res2 = res + cor;
+          retval[0] = __copysign (res[0], a[0]);
+          retval[1] = __copysign (res[1], a[1]);
+	  retval[0] = (res[0] == res2[0]) ? retval[0] : sloww1 (a[0], da[0], x[0], true);
+	  retval[1] = (res[1] == res2[1]) ? retval[1] : sloww1 (a[1], da[1], x[1], true);
+	}
+#endif
     }				/*   else  if (k < 0x400368fd)    */
 
-
 #ifndef IN_SINCOS
-  else if (k < 0x419921FB)
-    {				/* 2.426265<|x|< 105414350 */
-      double a, da;
-      int4 n = reduce_sincos_1 (x, &a, &da);
-      retval = do_sincos_1 (a, da, x, n, true);
-    }				/*   else  if (k <  0x419921FB )    */
-
-  else if (k < 0x42F00000)
-    {
-      double a, da;
-
-      int4 n = reduce_sincos_2 (x, &a, &da);
-      retval = do_sincos_2 (a, da, x, n, true);
-    }				/*   else  if (k <  0x42F00000 )    */
-
-  /* 281474976710656 <|x| <2^1024 */
-  else if (k < 0x7ff00000)
-    retval = reduce_and_compute (x, true);
-
   else
     {
-      if (k == 0x7ff00000 && u.i[LOW_HALF] == 0)
-	__set_errno (EDOM);
-      retval = x / x;		/* |x| > 2^1024 */
+#if VEC_SIZE == 1
+      retval = do_sincos_tail (x, k, u.i[LOW_HALF], true);
+#else
+      retval[0] = do_sincos_tail (x[0], k0, u0.i[LOW_HALF], true);
+      retval[1] = do_sincos_tail (x[1], k1, u1.i[LOW_HALF], true);
+#endif
     }
 #endif

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]