This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch siddhesh/libm-mpa updated. glibc-2.17-174-g21b0d2d
- From: siddhesh at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 12 Feb 2013 14:17:45 -0000
- Subject: GNU C Library master sources branch siddhesh/libm-mpa updated. glibc-2.17-174-g21b0d2d
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, siddhesh/libm-mpa has been updated
via 21b0d2d025b3949198a246ac183bfcc1b9fcb5bb (commit)
via 3bb4b7f80010b0d8237c887e681c1e7a3e6ebe35 (commit)
via f32ec4787e50436065f150b7aca181e576a2dcc5 (commit)
via ffde6a00cac4320960dd81afd5970403b890ab88 (commit)
via fffdf8b075f7e62f8d2dc999e9a290963838803f (commit)
via 26126c66e09412bef9b3782a0aab137905bd52dd (commit)
from 7974e4817608adf4e1e3e9132309a90335928557 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=21b0d2d025b3949198a246ac183bfcc1b9fcb5bb
commit 21b0d2d025b3949198a246ac183bfcc1b9fcb5bb
Author: Siddhesh Poyarekar <siddhesh@redhat.com>
Date: Tue Feb 12 19:37:41 2013 +0530
Clean up add_magnitudes and sub_magnitudes
diff --git a/sysdeps/ieee754/dbl-64/mpa.c b/sysdeps/ieee754/dbl-64/mpa.c
index d170248..f9e2488 100644
--- a/sysdeps/ieee754/dbl-64/mpa.c
+++ b/sysdeps/ieee754/dbl-64/mpa.c
@@ -335,7 +335,7 @@ static void
SECTION
add_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p2)
{
- long i, j, k, p = p2;
+ long i, j, k, p = p2, zk;
EZ = EX;
@@ -343,35 +343,38 @@ add_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p2)
j = p + EY - EX;
k = p + 1;
- if (j < 1)
+ if (__glibc_unlikely (j < 1))
{
__cpy (x, z, p);
return;
}
- else
- Z[k] = 0;
+
+ zk = 0;
for (; j > 0; i--, j--)
{
- long tmp = Z[k] + X[i] + Y[j];
- Z[k] = tmp % I_RADIX;
- Z[--k] = tmp / I_RADIX;
+ zk += X[i] + Y[j];
+ Z[k--] = zk % I_RADIX;
+ zk /= I_RADIX;
}
for (; i > 0; i--)
{
- long tmp = Z[k] + X[i];
- Z[k] = tmp % I_RADIX;
- Z[--k] = tmp / I_RADIX;
+ zk += X[i];
+ Z[k--] = zk % I_RADIX;
+ zk /= I_RADIX;
}
- if (Z[1] == 0)
+ if (zk == 0)
{
for (i = 1; i <= p; i++)
Z[i] = Z[i + 1];
}
else
- EZ++;
+ {
+ Z[1] = zk;
+ EZ++;
+ }
}
/* Subtract the magnitudes of *X and *Y assuming that abs (*x) > abs (*y) > 0.
@@ -382,52 +385,46 @@ static void
SECTION
sub_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p2)
{
- long i, j, k, p = p2;
+ long i, j, k, p = p2, zk;
EZ = EX;
- if (EX == EY)
+ i = p;
+ j = p + EY - EX;
+ k = p;
+
+ /* Y is too small compared to X, copy X over to the result. */
+ if (__glibc_unlikely (j < 1))
{
- i = j = k = p;
- Z[k] = Z[k + 1] = 0;
+ __cpy (x, z, p);
+ return;
}
- else
- {
- j = EX - EY;
- if (j > p)
- {
- __cpy (x, z, p);
- return;
- }
- else
- {
- long tmp;
- i = p;
- j = p + 1 - j;
- k = p;
-
- tmp = I_RADIX - Y[j];
- Z[k + 1] = tmp % I_RADIX;
- Z[k] = tmp / I_RADIX - 1;
- j--;
- }
+ /* The relevant least significant digit in Y is non-zero, so we factor it in
+ to enhance accuracy. */
+ if (j < p && Y[j + 1] > 0)
+ {
+ Z[k + 1] = RADIX - Y[j + 1];
+ zk = -1;
}
+ else
+ zk = Z[k + 1] = ZERO;
+ /* Subtract and borrow. */
for (; j > 0; i--, j--)
{
- long tmp = I_RADIX + Z[k] + X[i] - Y[j];
+ zk += I_RADIX + X[i] - Y[j];
- Z[k] = tmp % I_RADIX;
- Z[--k] = tmp / I_RADIX - 1;
+ Z[k--] = zk % I_RADIX;
+ zk = zk / I_RADIX - 1;
}
for (; i > 0; i--)
{
- long tmp = I_RADIX + Z[k] + X[i];
+ zk += I_RADIX + X[i];
- Z[k] = tmp % I_RADIX;
- Z[--k] = tmp / I_RADIX - 1;
+ Z[k--] = zk % I_RADIX;
+ zk = zk / I_RADIX - 1;
}
for (i = 1; Z[i] == 0; i++);
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=3bb4b7f80010b0d8237c887e681c1e7a3e6ebe35
commit 3bb4b7f80010b0d8237c887e681c1e7a3e6ebe35
Author: Siddhesh Poyarekar <siddhesh@redhat.com>
Date: Tue Feb 12 19:21:53 2013 +0530
Sync up latest patches
Port __sqr to this branch and also fix up the broken parts of __mul.
diff --git a/sysdeps/ieee754/dbl-64/mpa.c b/sysdeps/ieee754/dbl-64/mpa.c
index 967921a..d170248 100644
--- a/sysdeps/ieee754/dbl-64/mpa.c
+++ b/sysdeps/ieee754/dbl-64/mpa.c
@@ -552,6 +552,7 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p2)
long i, j, k, ip, ip2, p = p2;
int64_t zk;
int64_t *diag;
+ const mp_no *a;
/* Is z=0? */
if (__glibc_unlikely (X[0] * Y[0] == 0))
@@ -566,9 +567,14 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p2)
if (X[ip2] | Y[ip2])
break;
+ if (X[ip2])
+ a = y;
+ else
+ a = x;
+
/* ... and here, at least one of them is still zero. */
for (ip = ip2; ip > 0; ip--)
- if (X[ip] * Y[ip])
+ if (a->d[ip])
break;
/* Multiply, add and carry. */
@@ -581,19 +587,19 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p2)
while (k > ip + ip2 + 1)
Z[k--] = 0;
- zk = Z[k] = 0;
+ zk = 0;
/* Precompute sums of diagonal elements so that we can directly use them
later. See the next comment to know we why need them. */
diag = alloca (k * sizeof (int64_t));
- diag[0] = 0;
+ int64_t d = ZERO;
for (i = 1; i <= ip; i++)
{
- diag[i] = X[i] * Y[i];
- diag[i] += diag[i - 1];
+ d += X[i] * Y[i];
+ diag[i] = d;
}
while (i < k)
- diag[i++] = diag[ip];
+ diag[i++] = d;
/* This gives us additional precision if required. This is only executed
when P < IP1 + IP2 + 1, i.e. at least one of the numbers have precision
@@ -677,31 +683,31 @@ SECTION
__sqr (const mp_no *x, mp_no *y, int p)
{
long i, j, k, ip;
- double u, yk;
+ int64_t yk;
/* Is z=0? */
- if (__glibc_unlikely (X[0] == ZERO))
+ if (__glibc_unlikely (X[0] == 0))
{
- Y[0] = ZERO;
+ Y[0] = 0;
return;
}
/* We need not iterate through all X's since it's pointless to
multiply zeroes. */
for (ip = p; ip > 0; ip--)
- if (X[ip] != ZERO)
+ if (X[ip])
break;
k = (__glibc_unlikely (p < 3)) ? p + p : p + 3;
while (k > 2 * ip + 1)
- Y[k--] = ZERO;
+ Y[k--] = 0;
- yk = ZERO;
+ yk = 0;
while (k > p)
{
- double yk2 = 0.0;
+ int64_t yk2 = 0;
long lim = k / 2;
if (k % 2 == 0)
@@ -713,21 +719,15 @@ __sqr (const mp_no *x, mp_no *y, int p)
for (i = k - p, j = p; i <= lim; i++, j--)
yk2 += X[i] * X[j];
- yk += 2.0 * yk2;
-
- if (i == j)
- yk += X[i] * X[i];
+ yk += yk2 * 2;
- u = (yk + CUTTER) - CUTTER;
- if (u > yk)
- u -= RADIX;
- Y[k--] = yk - u;
- yk = u * RADIXI;
+ Y[k--] = yk % I_RADIX;
+ yk /= I_RADIX;
}
while (k > 1)
{
- double yk2 = 0.0;
+ int64_t yk2 = 0;
long lim = k / 2;
if (k % 2 == 0)
@@ -739,22 +739,19 @@ __sqr (const mp_no *x, mp_no *y, int p)
for (i = 1, j = k - 1; i <= lim; i++, j--)
yk2 += X[i] * X[j];
- yk += 2.0 * yk2;
+ yk += yk2 * 2;
- u = (yk + CUTTER) - CUTTER;
- if (u > yk)
- u -= RADIX;
- Y[k--] = yk - u;
- yk = u * RADIXI;
+ Y[k--] = yk % I_RADIX;
+ yk /= I_RADIX;
}
Y[k] = yk;
/* Squares are always positive. */
- Y[0] = 1.0;
+ Y[0] = 1;
EY = 2 * EX;
/* Is there a carry beyond the most significant digit? */
- if (__glibc_unlikely (Y[1] == ZERO))
+ if (__glibc_unlikely (Y[1] == 0))
{
for (i = 1; i <= p; i++)
Y[i] = Y[i + 1];
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f32ec4787e50436065f150b7aca181e576a2dcc5
commit f32ec4787e50436065f150b7aca181e576a2dcc5
Author: Siddhesh Poyarekar <siddhesh@redhat.com>
Date: Tue Feb 12 18:54:23 2013 +0530
Use reduced mantissa calculation for the truncated mantissa as well
diff --git a/sysdeps/ieee754/dbl-64/mpa.c b/sysdeps/ieee754/dbl-64/mpa.c
index 064953b..967921a 100644
--- a/sysdeps/ieee754/dbl-64/mpa.c
+++ b/sysdeps/ieee754/dbl-64/mpa.c
@@ -583,20 +583,6 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p2)
zk = Z[k] = 0;
- /* This gives us additional precision if required. This is only executed
- when P < IP1 + IP2 + 1, i.e. at least one of the numbers have precision
- of greater than or equal to half of what's required (P). Anything less
- and we're just wasting our time since we'll be spinning around
- multiplying zeroes. */
- while (k > p)
- {
- for (i = k - p, j = p; i < p + 1; i++, j--)
- zk += (int64_t) X[i] * Y[j];
-
- Z[k--] = zk % I_RADIX;
- zk /= I_RADIX;
- }
-
/* Precompute sums of diagonal elements so that we can directly use them
later. See the next comment to know we why need them. */
diag = alloca (k * sizeof (int64_t));
@@ -609,6 +595,32 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p2)
while (i < k)
diag[i++] = diag[ip];
+ /* This gives us additional precision if required. This is only executed
+ when P < IP1 + IP2 + 1, i.e. at least one of the numbers have precision
+ of greater than or equal to half of what's required (P). Anything less
+ and we're just wasting our time since we'll be spinning around
+ multiplying zeroes. */
+ while (k > p)
+ {
+ long lim = k / 2;
+
+ if (k % 2 == 0)
+ {
+ /* We want to add this only once, but since we subtract it in the sum
+ of products above, we add twice. */
+ zk += 2 * X[lim] * Y[lim];
+ lim--;
+ }
+
+ for (i = k - p, j = p; i <= lim; i++, j--)
+ zk += (int64_t) (X[i] + X[j]) * (Y[i] + Y[j]);
+
+ zk -= diag[k - 1];
+
+ Z[k--] = zk % I_RADIX;
+ zk /= I_RADIX;
+ }
+
/* The real deal. Mantissa digit Z[k] is the sum of all X[i] * Y[j] where i
goes from 1 -> k - 1 and j goes the same range in reverse. To reduce the
number of multiplications, we halve the range and if k is an even number,
@@ -624,7 +636,7 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p2)
used in the loop below. */
while (k > 1)
{
- int lim = k / 2;
+ long lim = k / 2;
if (k % 2 == 0)
{
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=ffde6a00cac4320960dd81afd5970403b890ab88
commit ffde6a00cac4320960dd81afd5970403b890ab88
Author: Andreas Schwab <schwab@linux-m68k.org>
Date: Sun Jan 20 02:03:42 2013 +0100
Remove use of mpa2.h
diff --git a/ChangeLog b/ChangeLog
index 647ce1d..5e2ace9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2013-01-20 Andreas Schwab <schwab@linux-m68k.org>
+
+ * sysdeps/powerpc/powerpc32/power4/fpu/mpa.c: Don't include
+ "mpa2.h".
+ * sysdeps/powerpc/powerpc64/power4/fpu/mpa.c: Likewise.
+
2013-01-18 Siddhesh Poyarekar <siddhesh@redhat.com>
[BZ #14496]
diff --git a/sysdeps/powerpc/powerpc32/power4/fpu/mpa.c b/sysdeps/powerpc/powerpc32/power4/fpu/mpa.c
index e83943c..ef7a57f 100644
--- a/sysdeps/powerpc/powerpc32/power4/fpu/mpa.c
+++ b/sysdeps/powerpc/powerpc32/power4/fpu/mpa.c
@@ -43,7 +43,6 @@
#include "endian.h"
#include "mpa.h"
-#include "mpa2.h"
#include <sys/param.h>
const mp_no mpone = {1, {1.0, 1.0}};
diff --git a/sysdeps/powerpc/powerpc64/power4/fpu/mpa.c b/sysdeps/powerpc/powerpc64/power4/fpu/mpa.c
index e83943c..ef7a57f 100644
--- a/sysdeps/powerpc/powerpc64/power4/fpu/mpa.c
+++ b/sysdeps/powerpc/powerpc64/power4/fpu/mpa.c
@@ -43,7 +43,6 @@
#include "endian.h"
#include "mpa.h"
-#include "mpa2.h"
#include <sys/param.h>
const mp_no mpone = {1, {1.0, 1.0}};
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=fffdf8b075f7e62f8d2dc999e9a290963838803f
commit fffdf8b075f7e62f8d2dc999e9a290963838803f
Author: Siddhesh Poyarekar <siddhesh@redhat.com>
Date: Tue Feb 12 12:49:39 2013 +0530
New __sqr function as a faster special case of __mul
diff --git a/sysdeps/ieee754/dbl-64/mpa.c b/sysdeps/ieee754/dbl-64/mpa.c
index 830e157..064953b 100644
--- a/sysdeps/ieee754/dbl-64/mpa.c
+++ b/sysdeps/ieee754/dbl-64/mpa.c
@@ -656,6 +656,100 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p2)
Z[0] = X[0] * Y[0];
}
+/* Square *X and store result in *Y. X and Y may not overlap. For P in
+ [1, 2, 3], the exact result is truncated to P digits. In case P > 3 the
+ error is bounded by 1.001 ULP. This is a faster special case of
+ multiplication. */
+void
+SECTION
+__sqr (const mp_no *x, mp_no *y, int p)
+{
+ long i, j, k, ip;
+ double u, yk;
+
+ /* Is z=0? */
+ if (__glibc_unlikely (X[0] == ZERO))
+ {
+ Y[0] = ZERO;
+ return;
+ }
+
+ /* We need not iterate through all X's since it's pointless to
+ multiply zeroes. */
+ for (ip = p; ip > 0; ip--)
+ if (X[ip] != ZERO)
+ break;
+
+ k = (__glibc_unlikely (p < 3)) ? p + p : p + 3;
+
+ while (k > 2 * ip + 1)
+ Y[k--] = ZERO;
+
+ yk = ZERO;
+
+ while (k > p)
+ {
+ double yk2 = 0.0;
+ long lim = k / 2;
+
+ if (k % 2 == 0)
+ {
+ yk += X[lim] * X[lim];
+ lim--;
+ }
+
+ for (i = k - p, j = p; i <= lim; i++, j--)
+ yk2 += X[i] * X[j];
+
+ yk += 2.0 * yk2;
+
+ if (i == j)
+ yk += X[i] * X[i];
+
+ u = (yk + CUTTER) - CUTTER;
+ if (u > yk)
+ u -= RADIX;
+ Y[k--] = yk - u;
+ yk = u * RADIXI;
+ }
+
+ while (k > 1)
+ {
+ double yk2 = 0.0;
+ long lim = k / 2;
+
+ if (k % 2 == 0)
+ {
+ yk += X[lim] * X[lim];
+ lim--;
+ }
+
+ for (i = 1, j = k - 1; i <= lim; i++, j--)
+ yk2 += X[i] * X[j];
+
+ yk += 2.0 * yk2;
+
+ u = (yk + CUTTER) - CUTTER;
+ if (u > yk)
+ u -= RADIX;
+ Y[k--] = yk - u;
+ yk = u * RADIXI;
+ }
+ Y[k] = yk;
+
+ /* Squares are always positive. */
+ Y[0] = 1.0;
+
+ EY = 2 * EX;
+ /* Is there a carry beyond the most significant digit? */
+ if (__glibc_unlikely (Y[1] == ZERO))
+ {
+ for (i = 1; i <= p; i++)
+ Y[i] = Y[i + 1];
+ EY--;
+ }
+}
+
/* Invert *X and store in *Y. Relative error bound:
- For P = 2: 1.001 * R ^ (1 - P)
- For P = 3: 1.063 * R ^ (1 - P)
diff --git a/sysdeps/ieee754/dbl-64/mpa.h b/sysdeps/ieee754/dbl-64/mpa.h
index 3185299..581c401 100644
--- a/sysdeps/ieee754/dbl-64/mpa.h
+++ b/sysdeps/ieee754/dbl-64/mpa.h
@@ -118,6 +118,7 @@ void __dbl_mp (double, mp_no *, int);
void __add (const mp_no *, const mp_no *, mp_no *, int);
void __sub (const mp_no *, const mp_no *, mp_no *, int);
void __mul (const mp_no *, const mp_no *, mp_no *, int);
+void __sqr (const mp_no *, mp_no *, int);
void __dvd (const mp_no *, const mp_no *, mp_no *, int);
extern void __mpatan (mp_no *, mp_no *, int);
diff --git a/sysdeps/ieee754/dbl-64/mpexp.c b/sysdeps/ieee754/dbl-64/mpexp.c
index 35c4e80..2db4536 100644
--- a/sysdeps/ieee754/dbl-64/mpexp.c
+++ b/sysdeps/ieee754/dbl-64/mpexp.c
@@ -152,14 +152,14 @@ __mpexp (mp_no *x, mp_no *y, int p)
/* Raise polynomial value to the power of 2**m. Put result in y. */
for (k = 0, j = 0; k < m;)
{
- __mul (&mpt2, &mpt2, &mpt1, p);
+ __sqr (&mpt2, &mpt1, p);
k++;
if (k == m)
{
j = 1;
break;
}
- __mul (&mpt1, &mpt1, &mpt2, p);
+ __sqr (&mpt1, &mpt2, p);
k++;
}
if (j)
diff --git a/sysdeps/powerpc/powerpc32/power4/fpu/mpa.c b/sysdeps/powerpc/powerpc32/power4/fpu/mpa.c
index 58c0c54..e83943c 100644
--- a/sysdeps/powerpc/powerpc32/power4/fpu/mpa.c
+++ b/sysdeps/powerpc/powerpc32/power4/fpu/mpa.c
@@ -688,6 +688,99 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
return;
}
+/* Square *X and store result in *Y. X and Y may not overlap. For P in
+ [1, 2, 3], the exact result is truncated to P digits. In case P > 3 the
+ error is bounded by 1.001 ULP. This is a faster special case of
+ multiplication. */
+void
+__sqr (const mp_no *x, mp_no *y, int p)
+{
+ long i, j, k, ip;
+ double u, yk;
+
+ /* Is z=0? */
+ if (__glibc_unlikely (X[0] == ZERO))
+ {
+ Y[0] = ZERO;
+ return;
+ }
+
+ /* We need not iterate through all X's since it's pointless to
+ multiply zeroes. */
+ for (ip = p; ip > 0; ip--)
+ if (X[ip] != ZERO)
+ break;
+
+ k = (__glibc_unlikely (p < 3)) ? p + p : p + 3;
+
+ while (k > 2 * ip + 1)
+ Y[k--] = ZERO;
+
+ yk = ZERO;
+
+ while (k > p)
+ {
+ double yk2 = 0.0;
+ long lim = k / 2;
+
+ if (k % 2 == 0)
+ {
+ yk += X[lim] * X[lim];
+ lim--;
+ }
+
+ for (i = k - p, j = p; i <= lim; i++, j--)
+ yk2 += X[i] * X[j];
+
+ yk += 2.0 * yk2;
+
+ if (i == j)
+ yk += X[i] * X[i];
+
+ u = (yk + CUTTER) - CUTTER;
+ if (u > yk)
+ u -= RADIX;
+ Y[k--] = yk - u;
+ yk = u * RADIXI;
+ }
+
+ while (k > 1)
+ {
+ double yk2 = 0.0;
+ long lim = k / 2;
+
+ if (k % 2 == 0)
+ {
+ yk += X[lim] * X[lim];
+ lim--;
+ }
+
+ for (i = 1, j = k - 1; i <= lim; i++, j--)
+ yk2 += X[i] * X[j];
+
+ yk += 2.0 * yk2;
+
+ u = (yk + CUTTER) - CUTTER;
+ if (u > yk)
+ u -= RADIX;
+ Y[k--] = yk - u;
+ yk = u * RADIXI;
+ }
+ Y[k] = yk;
+
+ /* Squares are always positive. */
+ Y[0] = 1.0;
+
+ EY = 2 * EX;
+ /* Is there a carry beyond the most significant digit? */
+ if (__glibc_unlikely (Y[1] == ZERO))
+ {
+ for (i = 1; i <= p; i++)
+ Y[i] = Y[i + 1];
+ EY--;
+ }
+}
+
/* Invert *X and store in *Y. Relative error bound:
- For P = 2: 1.001 * R ^ (1 - P)
- For P = 3: 1.063 * R ^ (1 - P)
diff --git a/sysdeps/powerpc/powerpc64/power4/fpu/mpa.c b/sysdeps/powerpc/powerpc64/power4/fpu/mpa.c
index 58c0c54..e83943c 100644
--- a/sysdeps/powerpc/powerpc64/power4/fpu/mpa.c
+++ b/sysdeps/powerpc/powerpc64/power4/fpu/mpa.c
@@ -688,6 +688,99 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
return;
}
+/* Square *X and store result in *Y. X and Y may not overlap. For P in
+ [1, 2, 3], the exact result is truncated to P digits. In case P > 3 the
+ error is bounded by 1.001 ULP. This is a faster special case of
+ multiplication. */
+void
+__sqr (const mp_no *x, mp_no *y, int p)
+{
+ long i, j, k, ip;
+ double u, yk;
+
+ /* Is z=0? */
+ if (__glibc_unlikely (X[0] == ZERO))
+ {
+ Y[0] = ZERO;
+ return;
+ }
+
+ /* We need not iterate through all X's since it's pointless to
+ multiply zeroes. */
+ for (ip = p; ip > 0; ip--)
+ if (X[ip] != ZERO)
+ break;
+
+ k = (__glibc_unlikely (p < 3)) ? p + p : p + 3;
+
+ while (k > 2 * ip + 1)
+ Y[k--] = ZERO;
+
+ yk = ZERO;
+
+ while (k > p)
+ {
+ double yk2 = 0.0;
+ long lim = k / 2;
+
+ if (k % 2 == 0)
+ {
+ yk += X[lim] * X[lim];
+ lim--;
+ }
+
+ for (i = k - p, j = p; i <= lim; i++, j--)
+ yk2 += X[i] * X[j];
+
+ yk += 2.0 * yk2;
+
+ if (i == j)
+ yk += X[i] * X[i];
+
+ u = (yk + CUTTER) - CUTTER;
+ if (u > yk)
+ u -= RADIX;
+ Y[k--] = yk - u;
+ yk = u * RADIXI;
+ }
+
+ while (k > 1)
+ {
+ double yk2 = 0.0;
+ long lim = k / 2;
+
+ if (k % 2 == 0)
+ {
+ yk += X[lim] * X[lim];
+ lim--;
+ }
+
+ for (i = 1, j = k - 1; i <= lim; i++, j--)
+ yk2 += X[i] * X[j];
+
+ yk += 2.0 * yk2;
+
+ u = (yk + CUTTER) - CUTTER;
+ if (u > yk)
+ u -= RADIX;
+ Y[k--] = yk - u;
+ yk = u * RADIXI;
+ }
+ Y[k] = yk;
+
+ /* Squares are always positive. */
+ Y[0] = 1.0;
+
+ EY = 2 * EX;
+ /* Is there a carry beyond the most significant digit? */
+ if (__glibc_unlikely (Y[1] == ZERO))
+ {
+ for (i = 1; i <= p; i++)
+ Y[i] = Y[i + 1];
+ EY--;
+ }
+}
+
/* Invert *X and store in *Y. Relative error bound:
- For P = 2: 1.001 * R ^ (1 - P)
- For P = 3: 1.063 * R ^ (1 - P)
diff --git a/sysdeps/x86_64/fpu/multiarch/mpa-avx.c b/sysdeps/x86_64/fpu/multiarch/mpa-avx.c
index d3f4d7a..366b0b7 100644
--- a/sysdeps/x86_64/fpu/multiarch/mpa-avx.c
+++ b/sysdeps/x86_64/fpu/multiarch/mpa-avx.c
@@ -1,5 +1,6 @@
#define __add __add_avx
#define __mul __mul_avx
+#define __sqr __sqr_avx
#define __sub __sub_avx
#define __dbl_mp __dbl_mp_avx
#define __dvd __dvd_avx
diff --git a/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c b/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c
index 6abb671..a4a7594 100644
--- a/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c
+++ b/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c
@@ -1,5 +1,6 @@
#define __add __add_fma4
#define __mul __mul_fma4
+#define __sqr __sqr_fma4
#define __sub __sub_fma4
#define __dbl_mp __dbl_mp_fma4
#define __dvd __dvd_fma4
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=26126c66e09412bef9b3782a0aab137905bd52dd
commit 26126c66e09412bef9b3782a0aab137905bd52dd
Author: Siddhesh Poyarekar <siddhesh@redhat.com>
Date: Mon Feb 11 19:45:03 2013 +0530
Better exp polynomial
diff --git a/sysdeps/ieee754/dbl-64/mpexp.c b/sysdeps/ieee754/dbl-64/mpexp.c
index 8d288ff..35c4e80 100644
--- a/sysdeps/ieee754/dbl-64/mpexp.c
+++ b/sysdeps/ieee754/dbl-64/mpexp.c
@@ -49,6 +49,15 @@ __mpexp (mp_no *x, mp_no *y, int p)
0, 0, 0, 0, 3, 3, 4, 4, 5, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8
};
+
+ /* Factorials for the values of np above. */
+ static const double nfa[33] =
+ {
+ 1.0, 1.0, 1.0, 1.0, 6.0, 6.0, 24.0, 24.0, 120.0, 24.0, 24.0,
+ 120.0, 120.0, 120.0, 720.0, 720.0, 720.0, 720.0, 720.0, 720.0,
+ 720.0, 720.0, 720.0, 720.0, 5040.0, 5040.0, 5040.0, 5040.0,
+ 40320.0, 40320.0, 40320.0, 40320.0, 40320.0
+ };
static const int m1p[33] =
{
0, 0, 0, 0,
@@ -71,16 +80,7 @@ __mpexp (mp_no *x, mp_no *y, int p)
{0, 0, 0, 0, 0, 0, 0, 0, 27, 0, 0, 39, 43, 47, 51, 55, 59, 63},
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 43, 47, 50, 54}
};
- mp_no mpk =
- {
- 0,
- {
- 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
- 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
- 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
- }
- };
- mp_no mps, mpak, mpt1, mpt2;
+ mp_no mps, mpk, mpt1, mpt2;
/* Choose m,n and compute a=2**(-m). */
n = np[p];
@@ -115,24 +115,38 @@ __mpexp (mp_no *x, mp_no *y, int p)
break;
}
- /* Compute s=x*2**(-m). Put result in mps. */
+ /* Compute s=x*2**(-m). Put result in mps. This is the range-reduced input
+ that we will use to compute e^s. For the final result, simply raise it
+ to 2^m. */
__pow_mp (-m, &mpt1, p);
__mul (x, &mpt1, &mps, p);
- /* Evaluate the polynomial. Put result in mpt2. */
- mpk.e = 1;
- mpk.d[0] = ONE;
- mpk.d[1] = n;
- __dvd (&mps, &mpk, &mpt1, p);
- __add (&mpone, &mpt1, &mpak, p);
- for (k = n - 1; k > 1; k--)
+ /* Compute the Taylor series for e^s:
+
+ 1 + x/1! + x^2/2! + x^3/3! ...
+
+ for N iterations. We compute this as:
+
+ e^x = 1 + (x * n!/1! + x^2 * n!/2! + x^3 * n!/3!) / n!
+ = 1 + (x * (n!/1! + x * (n!/2! + x * (n!/3! + x ...)))) / n!
+
+ n! is pre-computed and saved while k! is computed on the fly. */
+ __cpy (&mps, &mpt2, p);
+
+ double kf = 1.0;
+
+ /* Evaluate the rest. The result will be in mpt2. */
+ for (k = n - 1; k > 0; k--)
{
- __mul (&mps, &mpak, &mpt1, p);
- mpk.d[1] = k;
- __dvd (&mpt1, &mpk, &mpt2, p);
- __add (&mpone, &mpt2, &mpak, p);
+ /* n! / k! = n * (n - 1) ... * (n - k + 1) */
+ kf *= k + 1;
+
+ __dbl_mp (kf, &mpk, p);
+ __add (&mpt2, &mpk, &mpt1, p);
+ __mul (&mps, &mpt1, &mpt2, p);
}
- __mul (&mps, &mpak, &mpt1, p);
+ __dbl_mp (nfa[p], &mpk, p);
+ __dvd (&mpt2, &mpk, &mpt1, p);
__add (&mpone, &mpt1, &mpt2, p);
/* Raise polynomial value to the power of 2**m. Put result in y. */
-----------------------------------------------------------------------
Summary of changes:
ChangeLog | 6 +
sysdeps/ieee754/dbl-64/mpa.c | 218 ++++++++++++++++++++--------
sysdeps/ieee754/dbl-64/mpa.h | 1 +
sysdeps/ieee754/dbl-64/mpexp.c | 64 +++++---
sysdeps/powerpc/powerpc32/power4/fpu/mpa.c | 94 ++++++++++++-
sysdeps/powerpc/powerpc64/power4/fpu/mpa.c | 94 ++++++++++++-
sysdeps/x86_64/fpu/multiarch/mpa-avx.c | 1 +
sysdeps/x86_64/fpu/multiarch/mpa-fma4.c | 1 +
8 files changed, 393 insertions(+), 86 deletions(-)
hooks/post-receive
--
GNU C Library master sources