This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH] PPC64 performance improvements for sqrt
- From: Steven Munroe <munroesj at us dot ibm dot com>
- To: libc-alpha at sources dot redhat dot com, Ryan Arnold <rsa at us dot ibm dot com>
- Date: Thu, 01 Nov 2007 17:11:03 -0500
- Subject: [PATCH] PPC64 performance improvements for sqrt
For POWER4 and later, the performance of sqrt()/sqrtf() is limited by
the overhead of stacking a frame, addressing the GOT, testing for NaN
etc., which is only needed in case we need to report EDOM via
__kernel_standard(). For PowerPC the fsqrt[s] instruction does all the
work except for setting errno to EDOM for non-zero negative values.
So the attached patch simulates prologue shrink-wrapping such that the
fsqrt calculation and basic error checking occur before the prologue.
If there are no errors the result is returned directly. Otherwise it
stacks a frame and reports errors if needed via __kernel_standard().
This patch only impacts builds using:
--with-cpu=[power4,970,power5,power5+,power6,power6x]
the code for power4/970 is slightly different from that for power5 and
later due to increased latency accessing the FPSCR. For micro benchmarks
I see the following improvement:
1.74 X on ppc970 2.0GHz
1.71 X on power4 1.0GHz
2.60 X on power5 1.9GHz
1.55 X on power6 4.2GHz
This patch will also be released with powerpc-cpu-V0.07.
2007-10-26 Steven Munroe <sjmunroe@us.ibm.com>
* sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.c (__sqrt): Make __sqrt
a leaf routine with tail call to __w_sqrt. Error path code moved to.
(__w_sqrt): Here
* sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.c (__sqrtf): Make
__sqrtf a leaf routine with tail call to __w_sqrtf. Error path code
moved to.
(__w_sqrtf): Here
* sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.c: New File.
* sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.c: New File.
* sysdeps/powerpc/powerpc64/power4/fpu/w_sqrt.c (__sqrt): Make __sqrt
a leaf routine with tail call to __w_sqrt. Error path code moved to.
(__w_sqrt): Here
* sysdeps/powerpc/powerpc64/power4/fpu/w_sqrtf.c (__sqrtf): Make
__sqrtf a leaf routine with tail call to __w_sqrtf. Error path code
moved to.
(__w_sqrtf): Here
* sysdeps/powerpc/powerpc64/power5/fpu/w_sqrt.c: New File.
* sysdeps/powerpc/powerpc64/power5/fpu/w_sqrtf.c: New File.
diff -urN libc25-cvstip-20070919/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.c libc25/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.c
--- libc25-cvstip-20070919/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.c 2007-06-03 15:51:32.000000000 -0500
+++ libc25/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.c 2007-10-26 11:14:06.881065568 -0500
@@ -32,11 +32,47 @@
#endif
{
double z;
-/* Power4 (ISA V2.0) and above implement sqrt in hardware. */
+/* Power4 (ISA V2.0) and above implement sqrt in hardware (not optional).
+ The fsqrt instruction generates the correct value for all inputs and
+ sets the appropriate floating point exceptions. Extended checking is
+ only needed to set errno (via __kernel_standard) if the input value
+ is negative.
+
+ The fsqrt will set FPCC and FU (Floating Point Unordered or NaN)
+ to indicate that the input value was negative or NaN. Use Move to
+ Condition Register from FPSCR to copy the FPCC field to cr1. The
+ branch on summary overflow transfers control to __w_sqrt to process
+ any error conditions. Otherwise we can return the result directly.
+
+ This function looks like a leaf routine, so gcc will not stack a
+ frame or generate prologue/epilogue code. This means it is safe to
+ transfer directly to __w_sqrt as long as the input value (f1) is
+ preserved. Putting the sqrt result into f2 (float parameter 2)
+ allows passing both the input value and sqrt result into the extended
+ wrapper so there is no need to recompute.
+
+ This tactic avoids the overhead of stacking a frame for the normal
+ (non-error) case. Until gcc supports prologue shrink-wrapping
+ this is the best we can do. */
__asm __volatile (
- " fsqrt %0,%1\n"
+ " fsqrt 2,%1\n"
+ " mcrfs cr1,4\n"
+ " bso- cr1,__w_sqrt\n"
+ " fmr %0,2\n"
: "=f" (z)
- : "f" (x));
+ : "f" (x)
+ : "cr1", "fr2");
+
+ return z;
+}
+
+
+/* This code gets control from the __sqrt wrapper only if there are
+ errors that need to be reported. For example negative input values
+ or NANs. */
+double
+__w_sqrt (double x, double z) /* wrapper sqrt errors */
+{
#ifdef _IEEE_LIBM
return z;
#else
@@ -45,7 +81,7 @@
if (__builtin_expect (x != x, 0))
return z;
-
+
if (__builtin_expect (x < 0.0, 0))
return __kernel_standard (x, x, 26); /* sqrt(negative) */
else
diff -urN libc25-cvstip-20070919/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.c libc25/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.c
--- libc25-cvstip-20070919/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.c 2007-06-03 15:51:38.000000000 -0500
+++ libc25/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.c 2007-10-26 11:14:06.886064808 -0500
@@ -33,16 +33,51 @@
float x;
#endif
{
-#ifdef _IEEE_LIBM
- return __ieee754_sqrtf (x);
-#else
float z;
-/* Power4 (ISA V2.0) and above implement sqrtf in hardware. */
+/* Power4 (ISA V2.0) and above implement sqrt in hardware (not optional).
+ The fsqrts instruction generates the correct value for all inputs and
+ sets the appropriate floating point exceptions. Extended checking is
+ only needed to set errno (via __kernel_standard) if the input value
+ is negative.
+
+ The fsqrts will set FPCC and FU (Floating Point Unordered or NaN)
+ to indicate that the input value was negative or NaN. Use Move to
+ Condition Register from FPSCR to copy the FPCC field to cr1. The
+ branch on summary overflow transfers control to __w_sqrtf to process
+ any error conditions. Otherwise we can return the result directly.
+
+ This function looks like a leaf routine, so gcc will not stack a
+ frame or generate prologue/epilogue code. This means it is safe to
+ transfer directly to __w_sqrtf as long as the input value (f1) is
+ preserved. Putting the sqrt result into f2 (float parameter 2)
+ allows passing both the input value and sqrt result into the extended
+ wrapper so there is no need to recompute.
+
+ This tactic avoids the overhead of stacking a frame for the normal
+ (non-error) case. Until gcc supports prologue shrink-wrapping
+ this is the best we can do. */
__asm __volatile (
- " fsqrts %0,%1\n"
+ " fsqrts 2,%1\n"
+ " mcrfs cr1,4\n"
+ " bso- cr1,__w_sqrtf\n"
+ " fmr %0,2\n"
: "=f" (z)
- : "f" (x));
+ : "f" (x)
+ : "cr1", "fr2");
+
+ return z;
+}
+
+/* This code gets control from the __sqrtf wrapper only if there are
+ errors that need to be reported. For example negative input values
+ or NANs. */
+float
+__w_sqrtf (float x, float z) /* wrapper sqrt errors */
+{
+#ifdef _IEEE_LIBM
+ return z;
+#else
if (__builtin_expect (_LIB_VERSION == _IEEE_, 0))
return z;
diff -urN libc25-cvstip-20070919/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.c libc25/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.c
--- libc25-cvstip-20070919/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.c Wed Dec 31 18:00:00 1969
+++ libc25/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.c Fri Oct 26 11:17:20 2007
@@ -0,0 +1,98 @@
+/* Double-precision floating point square root wrapper.
+ Copyright (C) 2004, 2007 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <math_ldbl_opt.h>
+#include "math.h"
+#include "math_private.h"
+#include <fenv_libc.h>
+
+#ifdef __STDC__
+double
+__sqrt (double x) /* wrapper sqrt */
+#else
+double
+__sqrt (x) /* wrapper sqrt */
+ double x;
+#endif
+{
+ double z;
+/* Power4 (ISA V2.0) and above implement sqrt in hardware (not optional).
+ The fsqrt instruction generates the correct value for all inputs and
+ sets the appropriate floating point exceptions.
+
+ Extended checking is only needed to set errno (via __kernel_standard)
+ if the input value is negative. So compare the input value against
+ the absolute value of itself. This will compare equal unless the
+ value is negative (EDOM) or a NAN, in which case we transfer (tail
+ call via branch) to the extended wrapper. If equal we can return the
+ result directly.
+
+ This function looks like a leaf routine, so gcc will not stack a
+ frame or generate prologue/epilogue code. This means it is safe to
+ transfer directly to __w_sqrt as long as the input value (f1) is
+ preserved. Putting the sqrt result into f2 (float parameter 2)
+ allows passing both the input value and sqrt result into the extended
+ wrapper so there is no need to recompute.
+
+ This tactic avoids the overhead of stacking a frame for the normal
+ (non-error) case. Until gcc supports prologue shrink-wrapping
+ this is the best we can do. */
+ __asm __volatile (
+ " fabs 0,%1\n"
+ " fsqrt 2,%1\n"
+ " fcmpu cr1,0,%1\n"
+ " bne- cr1,__w_sqrt\n"
+ " fmr %0,2\n"
+ : "=f" (z)
+ : "f" (x)
+ : "cr1", "fr0", "fr2");
+
+ return z;
+}
+
+
+/* This code gets control from the __sqrt wrapper only when the input is
+ negative or a NaN. x is the original input and z the sqrt result already
+ computed; only a negative x is reported (via __kernel_standard). */
+double
+__w_sqrt (double x, double z) /* wrapper sqrt errors */
+{
+#ifdef _IEEE_LIBM
+ return z;
+#else
+ if (__builtin_expect (_LIB_VERSION == _IEEE_, 0))
+ return z;
+
+ if (__builtin_expect (x != x, 0))
+ return z;
+
+ if (__builtin_expect (x < 0.0, 0))
+ return __kernel_standard (x, x, 26); /* sqrt(negative) */
+ else
+ return z;
+#endif
+}
+
+weak_alias (__sqrt, sqrt)
+#ifdef NO_LONG_DOUBLE
+ strong_alias (__sqrt, __sqrtl) weak_alias (__sqrt, sqrtl)
+#endif
+#if LONG_DOUBLE_COMPAT(libm, GLIBC_2_0)
+compat_symbol (libm, __sqrt, sqrtl, GLIBC_2_0);
+#endif
diff -urN libc25-cvstip-20070919/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.c libc25/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.c
--- libc25-cvstip-20070919/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.c Wed Dec 31 18:00:00 1969
+++ libc25/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.c Fri Oct 26 11:14:14 2007
@@ -0,0 +1,95 @@
+/* Single-precision floating point square root wrapper.
+ Copyright (C) 2004, 2007 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include "math.h"
+#include "math_private.h"
+#include <fenv_libc.h>
+
+#include <sysdep.h>
+#include <ldsodefs.h>
+
+#ifdef __STDC__
+float
+__sqrtf (float x) /* wrapper sqrtf */
+#else
+float
+__sqrtf (x) /* wrapper sqrtf */
+ float x;
+#endif
+{
+ float z;
+/* Power4 (ISA V2.0) and above implement sqrt in hardware (not optional).
+ The fsqrts instruction generates the correct value for all inputs and
+ sets the appropriate floating point exceptions.
+
+ Extended checking is only needed to set errno (via __kernel_standard)
+ if the input value is negative. So compare the input value against
+ the absolute value of itself. This will compare equal unless the
+ value is negative (EDOM) or a NAN, in which case we transfer (tail
+ call via branch) to the extended wrapper. If equal we can return the
+ result directly.
+
+ This function looks like a leaf routine, so gcc will not stack a
+ frame or generate prologue/epilogue code. This means it is safe to
+ transfer directly to __w_sqrtf as long as the input value (f1) is
+ preserved. Putting the sqrt result into f2 (float parameter 2)
+ allows passing both the input value and sqrt result into the extended
+ wrapper so there is no need to recompute.
+
+ This tactic avoids the overhead of stacking a frame for the normal
+ (non-error) case. Until gcc supports prologue shrink-wrapping
+ this is the best we can do. */
+ __asm __volatile (
+ " fabs 0,%1\n"
+ " fsqrts 2,%1\n"
+ " fcmpu cr1,0,%1\n"
+ " bne- cr1,__w_sqrtf\n"
+ " fmr %0,2\n"
+ : "=f" (z)
+ : "f" (x)
+ : "cr1", "fr0", "fr2");
+
+ return z;
+}
+
+
+/* This code gets control from the __sqrtf wrapper only when the input is
+ negative or a NaN. x is the original input and z the sqrt result already
+ computed; only a negative x is reported (via __kernel_standard). */
+float
+__w_sqrtf (float x, float z) /* wrapper sqrt errors */
+{
+#ifdef _IEEE_LIBM
+ return z;
+#else
+ if (__builtin_expect (_LIB_VERSION == _IEEE_, 0))
+ return z;
+
+ if (__builtin_expect (x != x, 0))
+ return z;
+
+ if (__builtin_expect (x < 0.0, 0))
+ /* sqrtf(negative) */
+ return (float) __kernel_standard ((double) x, (double) x, 126);
+ else
+ return z;
+#endif
+}
+
+weak_alias (__sqrtf, sqrtf)
diff -urN libc25-cvstip-20070919/sysdeps/powerpc/powerpc64/power4/fpu/w_sqrt.c libc25/sysdeps/powerpc/powerpc64/power4/fpu/w_sqrt.c
--- libc25-cvstip-20070919/sysdeps/powerpc/powerpc64/power4/fpu/w_sqrt.c 2007-06-03 16:15:53.000000000 -0500
+++ libc25/sysdeps/powerpc/powerpc64/power4/fpu/w_sqrt.c 2007-10-24 17:13:16.336139032 -0500
@@ -32,11 +32,47 @@
#endif
{
double z;
-/* Power4 (ISA V2.0) and above implement sqrt in hardware. */
+/* Power4 (ISA V2.0) and above implement sqrt in hardware (not optional).
+ The fsqrt instruction generates the correct value for all inputs and
+ sets the appropriate floating point exceptions. Extended checking is
+ only needed to set errno (via __kernel_standard) if the input value
+ is negative.
+
+ The fsqrt will set FPCC and FU (Floating Point Unordered or NaN)
+ to indicate that the input value was negative or NaN. Use Move to
+ Condition Register from FPSCR to copy the FPCC field to cr1. The
+ branch on summary overflow transfers control to __w_sqrt to process
+ any error conditions. Otherwise we can return the result directly.
+
+ This function looks like a leaf routine, so gcc will not stack a
+ frame or generate prologue/epilogue code. This means it is safe to
+ transfer directly to __w_sqrt as long as the input value (f1) is
+ preserved. Putting the sqrt result into f2 (float parameter 2)
+ allows passing both the input value and sqrt result into the extended
+ wrapper so there is no need to recompute.
+
+ This tactic avoids the overhead of stacking a frame for the normal
+ (non-error) case. Until gcc supports prologue shrink-wrapping
+ this is the best we can do. */
__asm __volatile (
- " fsqrt %0,%1\n"
+ " fsqrt 2,%1\n"
+ " mcrfs cr1,4\n"
+ " bso- cr1,__w_sqrt\n"
+ " fmr %0,2\n"
: "=f" (z)
- : "f" (x));
+ : "f" (x)
+ : "cr1", "fr2");
+
+ return z;
+}
+
+
+/* This code gets control from the __sqrt wrapper only if there are
+ errors that need to be reported. For example negative input values
+ or NANs. */
+double
+__w_sqrt (double x, double z) /* wrapper sqrt errors */
+{
#ifdef _IEEE_LIBM
return z;
#else
diff -urN libc25-cvstip-20070919/sysdeps/powerpc/powerpc64/power4/fpu/w_sqrtf.c libc25/sysdeps/powerpc/powerpc64/power4/fpu/w_sqrtf.c
--- libc25-cvstip-20070919/sysdeps/powerpc/powerpc64/power4/fpu/w_sqrtf.c 2007-06-03 16:15:58.000000000 -0500
+++ libc25/sysdeps/powerpc/powerpc64/power4/fpu/w_sqrtf.c 2007-10-24 16:07:55.808110280 -0500
@@ -33,16 +33,51 @@
float x;
#endif
{
-#ifdef _IEEE_LIBM
- return __ieee754_sqrtf (x);
-#else
float z;
-/* Power4 (ISA V2.0) and above implement sqrtf in hardware. */
+/* Power4 (ISA V2.0) and above implement sqrt in hardware (not optional).
+ The fsqrts instruction generates the correct value for all inputs and
+ sets the appropriate floating point exceptions. Extended checking is
+ only needed to set errno (via __kernel_standard) if the input value
+ is negative.
+
+ The fsqrts will set FPCC and FU (Floating Point Unordered or NaN)
+ to indicate that the input value was negative or NaN. Use Move to
+ Condition Register from FPSCR to copy the FPCC field to cr1. The
+ branch on summary overflow transfers control to __w_sqrtf to process
+ any error conditions. Otherwise we can return the result directly.
+
+ This function looks like a leaf routine, so gcc will not stack a
+ frame or generate prologue/epilogue code. This means it is safe to
+ transfer directly to __w_sqrtf as long as the input value (f1) is
+ preserved. Putting the sqrt result into f2 (float parameter 2)
+ allows passing both the input value and sqrt result into the extended
+ wrapper so there is no need to recompute.
+
+ This tactic avoids the overhead of stacking a frame for the normal
+ (non-error) case. Until gcc supports prologue shrink-wrapping
+ this is the best we can do. */
__asm __volatile (
- " fsqrts %0,%1\n"
+ " fsqrts 2,%1\n"
+ " mcrfs cr1,4\n"
+ " bso- cr1,__w_sqrtf\n"
+ " fmr %0,2\n"
: "=f" (z)
- : "f" (x));
+ : "f" (x)
+ : "cr1", "fr2");
+
+ return z;
+}
+
+/* This code gets control from the __sqrtf wrapper only if there are
+ errors that need to be reported. For example negative input values
+ or NANs. */
+float
+__w_sqrtf (float x, float z) /* wrapper sqrt errors */
+{
+#ifdef _IEEE_LIBM
+ return z;
+#else
if (__builtin_expect (_LIB_VERSION == _IEEE_, 0))
return z;
diff -urN libc25-cvstip-20070919/sysdeps/powerpc/powerpc64/power5/fpu/w_sqrt.c libc25/sysdeps/powerpc/powerpc64/power5/fpu/w_sqrt.c
--- libc25-cvstip-20070919/sysdeps/powerpc/powerpc64/power5/fpu/w_sqrt.c Wed Dec 31 18:00:00 1969
+++ libc25/sysdeps/powerpc/powerpc64/power5/fpu/w_sqrt.c Fri Oct 26 11:12:12 2007
@@ -0,0 +1,98 @@
+/* Double-precision floating point square root wrapper.
+ Copyright (C) 2004, 2007 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <math_ldbl_opt.h>
+#include "math.h"
+#include "math_private.h"
+#include <fenv_libc.h>
+
+#ifdef __STDC__
+double
+__sqrt (double x) /* wrapper sqrt */
+#else
+double
+__sqrt (x) /* wrapper sqrt */
+ double x;
+#endif
+{
+ double z;
+/* Power4 (ISA V2.0) and above implement sqrt in hardware (not optional).
+ The fsqrt instruction generates the correct value for all inputs and
+ sets the appropriate floating point exceptions.
+
+ Extended checking is only needed to set errno (via __kernel_standard)
+ if the input value is negative. So compare the input value against
+ the absolute value of itself. This will compare equal unless the
+ value is negative (EDOM) or a NAN, in which case we transfer (tail
+ call via branch) to the extended wrapper. If equal we can return the
+ result directly.
+
+ This function looks like a leaf routine, so gcc will not stack a
+ frame or generate prologue/epilogue code. This means it is safe to
+ transfer directly to __w_sqrt as long as the input value (f1) is
+ preserved. Putting the sqrt result into f2 (float parameter 2)
+ allows passing both the input value and sqrt result into the extended
+ wrapper so there is no need to recompute.
+
+ This tactic avoids the overhead of stacking a frame for the normal
+ (non-error) case. Until gcc supports prologue shrink-wrapping
+ this is the best we can do. */
+ __asm __volatile (
+ " fabs 0,%1\n"
+ " fsqrt 2,%1\n"
+ " fcmpu cr1,0,%1\n"
+ " bne- cr1,__w_sqrt\n"
+ " fmr %0,2\n"
+ : "=f" (z)
+ : "f" (x)
+ : "cr1", "fr0", "fr2");
+
+ return z;
+}
+
+
+/* This code gets control from the __sqrt wrapper only when the input is
+ negative or a NaN. x is the original input and z the sqrt result already
+ computed; only a negative x is reported (via __kernel_standard). */
+double
+__w_sqrt (double x, double z) /* wrapper sqrt errors */
+{
+#ifdef _IEEE_LIBM
+ return z;
+#else
+ if (__builtin_expect (_LIB_VERSION == _IEEE_, 0))
+ return z;
+
+ if (__builtin_expect (x != x, 0))
+ return z;
+
+ if (__builtin_expect (x < 0.0, 0))
+ return __kernel_standard (x, x, 26); /* sqrt(negative) */
+ else
+ return z;
+#endif
+}
+
+weak_alias (__sqrt, sqrt)
+#ifdef NO_LONG_DOUBLE
+ strong_alias (__sqrt, __sqrtl) weak_alias (__sqrt, sqrtl)
+#endif
+#if LONG_DOUBLE_COMPAT(libm, GLIBC_2_3)
+compat_symbol (libm, __sqrt, sqrtl, GLIBC_2_3);
+#endif
diff -urN libc25-cvstip-20070919/sysdeps/powerpc/powerpc64/power5/fpu/w_sqrtf.c libc25/sysdeps/powerpc/powerpc64/power5/fpu/w_sqrtf.c
--- libc25-cvstip-20070919/sysdeps/powerpc/powerpc64/power5/fpu/w_sqrtf.c Wed Dec 31 18:00:00 1969
+++ libc25/sysdeps/powerpc/powerpc64/power5/fpu/w_sqrtf.c Wed Oct 24 10:07:59 2007
@@ -0,0 +1,95 @@
+/* Single-precision floating point square root wrapper.
+ Copyright (C) 2004, 2007 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include "math.h"
+#include "math_private.h"
+#include <fenv_libc.h>
+
+#include <sysdep.h>
+#include <ldsodefs.h>
+
+#ifdef __STDC__
+float
+__sqrtf (float x) /* wrapper sqrtf */
+#else
+float
+__sqrtf (x) /* wrapper sqrtf */
+ float x;
+#endif
+{
+ float z;
+/* Power4 (ISA V2.0) and above implement sqrt in hardware (not optional).
+ The fsqrts instruction generates the correct value for all inputs and
+ sets the appropriate floating point exceptions.
+
+ Extended checking is only needed to set errno (via __kernel_standard)
+ if the input value is negative. So compare the input value against
+ the absolute value of itself. This will compare equal unless the
+ value is negative (EDOM) or a NAN, in which case we transfer (tail
+ call via branch) to the extended wrapper. If equal we can return the
+ result directly.
+
+ This function looks like a leaf routine, so gcc will not stack a
+ frame or generate prologue/epilogue code. This means it is safe to
+ transfer directly to __w_sqrtf as long as the input value (f1) is
+ preserved. Putting the sqrt result into f2 (float parameter 2)
+ allows passing both the input value and sqrt result into the extended
+ wrapper so there is no need to recompute.
+
+ This tactic avoids the overhead of stacking a frame for the normal
+ (non-error) case. Until gcc supports prologue shrink-wrapping
+ this is the best we can do. */
+ __asm __volatile (
+ " fabs 0,%1\n"
+ " fsqrts 2,%1\n"
+ " fcmpu cr1,0,%1\n"
+ " bne- cr1,__w_sqrtf\n"
+ " fmr %0,2\n"
+ : "=f" (z)
+ : "f" (x)
+ : "cr1", "fr0", "fr2");
+
+ return z;
+}
+
+
+/* This code gets control from the __sqrtf wrapper only when the input is
+ negative or a NaN. x is the original input and z the sqrt result already
+ computed; only a negative x is reported (via __kernel_standard). */
+float
+__w_sqrtf (float x, float z) /* wrapper sqrt errors */
+{
+#ifdef _IEEE_LIBM
+ return z;
+#else
+ if (__builtin_expect (_LIB_VERSION == _IEEE_, 0))
+ return z;
+
+ if (__builtin_expect (x != x, 0))
+ return z;
+
+ if (__builtin_expect (x < 0.0, 0))
+ /* sqrtf(negative) */
+ return (float) __kernel_standard ((double) x, (double) x, 126);
+ else
+ return z;
+#endif
+}
+
+weak_alias (__sqrtf, sqrtf)