This is the mail archive of the libc-hacker@sources.redhat.com mailing list for the glibc project.

Note that libc-hacker is a closed list. You may look at the archives of this list, but subscription and posting are not open.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

Re: [PATCH] Fix powl on i386/x86_64 [BZ #258] (take 2)

From: Jakub Jelinek <jakub at redhat dot com>
To: Ulrich Drepper <drepper at redhat dot com>
Cc: Glibc hackers <libc-hacker at sources dot redhat dot com>
Date: Mon, 19 Jul 2004 10:03:53 +0200
Subject: Re: [PATCH] Fix powl on i386/x86_64 [BZ #258] (take 2)
References: <20040714160203.GQ5191@sunsite.ms.mff.cuni.cz> <20040715093555.GA9816@sunsite.ms.mff.cuni.cz> <40F80B44.9020305@redhat.com>
Reply-to: Jakub Jelinek <jakub at redhat dot com>

On Fri, Jul 16, 2004 at 10:07:16AM -0700, Ulrich Drepper wrote:
> Jakub Jelinek wrote:
> 
> > -4:	fldl	MO(one)		// 1 : x
> > +4:	fld1			// 1 : x
> 
> I deliberately avoided the use of fld1 since all my measurements showed
> it is slower.  Please some some measurements of your own.  As far as I
> remember the only time when it paid off to use fld1 is when the fld was
> the only memory operation and therefore it could be avoided to load the
> PIC register.

If there are no other memory references, it is surely noticeably faster,
otherwise I see it the same speed (but the instruction is shorter).
With following proglet I get 10 cycles in both cases on Opteron,
42 on P3, 92 on P4.  AFAIK GCC uses fld1 and fldz in the code it generates
in all cases.

#include <stdio.h>
#include <math.h>

extern long double foo1 (void);
extern long double foo2 (void);
asm (".balign 16; foo1: .section .rodata; .balign 16; mzero: .byte 0, 0, 0, 0, 0, 0, 0, 0x80; .previous; fldz; faddl mzero; fxam; fnstsw; ret;");
asm (".balign 16; foo2: .section .rodata; .balign 16; zero: .double 0.0; .previous; fldl zero; faddl mzero; fxam; fnstsw; ret;");

int main (void)
{
  unsigned long long st, en, m;
  int i;
  m = -1ULL;
  for (i = 0; i < 1000; ++i)
    {
      asm volatile ("rdtsc" : "=A" (st));
      foo1 ();
      asm volatile ("rdtsc" : "=A" (en));
      en -= st;
      if (en < m) m = en;
    }
  printf ("%lld\n", m);
  m = -1ULL;
  for (i = 0; i < 1000; ++i)
    {
      asm volatile ("rdtsc" : "=A" (st));
      foo2 ();
      asm volatile ("rdtsc" : "=A" (en));
      en -= st;
      if (en < m) m = en;
    }
  printf ("%lld\n", m);
  return 0;
}

Anyway, below is a patch variant without fldz and fld1, those changes
weren't fixing anything and thus can be a separate change which can be
argued about independently.

2004-07-19  Jakub Jelinek  <jakub@redhat.com>

	[BZ #258]
	* math/libm-test.inc (max_value, min_value): New variables.
	(initialize): Initialize them.
	(pow_test): Add a couple of new tests.
	* sysdeps/i386/fpu/e_powf.S (__ieee754_powf): Don't generate invalid
	exception if |y| >= 1U<<31.
	* sysdeps/i386/fpu/e_pow.S (__ieee754_pow): Don't generate invalid
	exception if |y| >= 1L<<63.
	* sysdeps/i386/fpu/e_powl.S (__ieee754_powl): Likewise.
	If y*log2(x) overflows to +-inf, return still +inf/+0 instead of NaN.
	* sysdeps/x86_64/fpu/e_powl.S (__ieee754_powl): Likewise.

--- libc/math/libm-test.inc.jj	2004-02-13 12:28:07.000000000 +0100
+++ libc/math/libm-test.inc	2004-07-15 12:34:17.395663150 +0200
@@ -169,7 +169,7 @@ static int output_points;	/* Should the 
 static int ignore_max_ulp;	/* Should we ignore max_ulp?  */
 
 static FLOAT minus_zero, plus_zero;
-static FLOAT plus_infty, minus_infty, nan_value;
+static FLOAT plus_infty, minus_infty, nan_value, max_value, min_value;
 
 static FLOAT max_error, real_max_error, imag_max_error;
 
@@ -3593,6 +3593,28 @@ pow_test (void)
   TEST_ff_f (pow, -1, plus_infty, 1);
   TEST_ff_f (pow, 1, minus_infty, 1);
   TEST_ff_f (pow, -1, minus_infty, 1);
+  TEST_ff_f (pow, 1, 1, 1);
+  TEST_ff_f (pow, 1, -1, 1);
+  TEST_ff_f (pow, 1, 1.25, 1);
+  TEST_ff_f (pow, 1, -1.25, 1);
+  TEST_ff_f (pow, 1, 0x1p62L, 1);
+  TEST_ff_f (pow, 1, 0x1p63L, 1);
+  TEST_ff_f (pow, 1, 0x1p64L, 1);
+  TEST_ff_f (pow, 1, 0x1p72L, 1);
+
+  /* pow (x, +-0) == 1.  */
+  TEST_ff_f (pow, plus_infty, 0, 1);
+  TEST_ff_f (pow, plus_infty, minus_zero, 1);
+  TEST_ff_f (pow, minus_infty, 0, 1);
+  TEST_ff_f (pow, minus_infty, minus_zero, 1);
+  TEST_ff_f (pow, 32.75L, 0, 1);
+  TEST_ff_f (pow, 32.75L, minus_zero, 1);
+  TEST_ff_f (pow, -32.75L, 0, 1);
+  TEST_ff_f (pow, -32.75L, minus_zero, 1);
+  TEST_ff_f (pow, 0x1p72L, 0, 1);
+  TEST_ff_f (pow, 0x1p72L, minus_zero, 1);
+  TEST_ff_f (pow, 0x1p-72L, 0, 1);
+  TEST_ff_f (pow, 0x1p-72L, minus_zero, 1);
 
   TEST_ff_f (pow, -0.1L, 1.1L, nan_value, INVALID_EXCEPTION);
   TEST_ff_f (pow, -0.1L, -1.1L, nan_value, INVALID_EXCEPTION);
@@ -3609,6 +3631,10 @@ pow_test (void)
   TEST_ff_f (pow, minus_zero, -2, plus_infty, DIVIDE_BY_ZERO_EXCEPTION);
   TEST_ff_f (pow, minus_zero, -11.1L, plus_infty, DIVIDE_BY_ZERO_EXCEPTION);
 
+  TEST_ff_f (pow, 0x1p72L, 0x1p72L, plus_infty);
+  TEST_ff_f (pow, 10, -0x1p72L, 0);
+  TEST_ff_f (pow, max_value, max_value, plus_infty);
+  TEST_ff_f (pow, 10, -max_value, 0);
 
   TEST_ff_f (pow, 0, 1, 0);
   TEST_ff_f (pow, 0, 11, 0);
@@ -3623,6 +3649,8 @@ pow_test (void)
 
   TEST_ff_f (pow, minus_zero, 2, 0);
   TEST_ff_f (pow, minus_zero, 11.1L, 0);
+  TEST_ff_f (pow, 0, plus_infty, 0);
+  TEST_ff_f (pow, minus_zero, plus_infty, 0);
 
 #ifndef TEST_INLINE
   /* pow (x, +inf) == +inf for |x| > 1.  */
@@ -3667,6 +3695,11 @@ pow_test (void)
   /* pow (-0, y) == +0 for y > 0 and not an odd integer.  */
   TEST_ff_f (pow, minus_zero, 4, 0.0);
 
+  TEST_ff_f (pow, 16, 0.25L, 2);
+  TEST_ff_f (pow, 0x1p64L, 0.125L, 256);
+  TEST_ff_f (pow, 2, 4, 16);
+  TEST_ff_f (pow, 256, 8, 0x1p64L);
+
   TEST_ff_f (pow, 0.75L, 1.25L, 0.697953644326574699205914060237425566L);
 
 #if defined TEST_DOUBLE || defined TEST_LDOUBLE
@@ -4312,12 +4345,18 @@ initialize (void)
 		       HUGE_VALL, HUGE_VAL, HUGE_VALF);
   minus_infty = CHOOSE (-HUGE_VALL, -HUGE_VAL, -HUGE_VALF,
 			-HUGE_VALL, -HUGE_VAL, -HUGE_VALF);
+  max_value = CHOOSE (LDBL_MAX, DBL_MAX, FLT_MAX,
+		      LDBL_MAX, DBL_MAX, FLT_MAX);
+  min_value = CHOOSE (LDBL_MIN, DBL_MIN, FLT_MIN,
+		      LDBL_MIN, DBL_MIN, FLT_MIN);
 
   (void) &plus_zero;
   (void) &nan_value;
   (void) &minus_zero;
   (void) &plus_infty;
   (void) &minus_infty;
+  (void) &max_value;
+  (void) &min_value;
 
   /* Clear all exceptions.  From now on we must not get random exceptions.  */
   feclearexcept (FE_ALL_EXCEPT);
--- libc/sysdeps/i386/fpu/e_powf.S.jj	2001-07-06 06:55:53.000000000 +0200
+++ libc/sysdeps/i386/fpu/e_powf.S	2004-07-15 13:23:15.977438871 +0200
@@ -1,5 +1,5 @@
 /* ix87 specific implementation of pow function.
-   Copyright (C) 1996, 1997, 1999, 2001 Free Software Foundation, Inc.
+   Copyright (C) 1996, 1997, 1999, 2001, 2004 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
 
@@ -48,6 +48,9 @@ one:	.double 1.0
 	ASM_TYPE_DIRECTIVE(limit,@object)
 limit:	.double 0.29
 	ASM_SIZE_DIRECTIVE(limit)
+	ASM_TYPE_DIRECTIVE(p31,@object)
+p31:	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x41
+	ASM_SIZE_DIRECTIVE(p31)
 
 #ifdef PIC
 #define MO(op) op##@GOTOFF(%ecx)
@@ -96,6 +99,14 @@ ENTRY(__ieee754_powf)
 
 	fxch			// y : x
 
+	/* fistpl raises invalid exception for |y| >= 1L<<31.  */
+	fld	%st		// y : y : x
+	fabs			// |y| : y : x
+	fcompl	MO(p31)		// y : x
+	fnstsw
+	sahf
+	jnc	2f
+
 	/* First see whether `y' is a natural number.  In this case we
 	   can use a more precise algorithm.  */
 	fld	%st		// y : y : x
--- libc/sysdeps/i386/fpu/e_powl.S.jj	2001-07-06 06:55:53.000000000 +0200
+++ libc/sysdeps/i386/fpu/e_powl.S	2004-07-15 12:56:07.678735978 +0200
@@ -1,5 +1,6 @@
 /* ix87 specific implementation of pow function.
-   Copyright (C) 1996, 1997, 1998, 1999, 2001 Free Software Foundation, Inc.
+   Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004
+   Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
 
@@ -48,6 +49,9 @@ one:	.double 1.0
 	ASM_TYPE_DIRECTIVE(limit,@object)
 limit:	.double 0.29
 	ASM_SIZE_DIRECTIVE(limit)
+	ASM_TYPE_DIRECTIVE(p63,@object)
+p63:	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43
+	ASM_SIZE_DIRECTIVE(p63)
 
 #ifdef PIC
 #define MO(op) op##@GOTOFF(%ecx)
@@ -96,6 +100,14 @@ ENTRY(__ieee754_powl)
 
 	fxch			// y : x
 
+	/* fistpll raises invalid exception for |y| >= 1L<<63.  */
+	fld	%st		// y : y : x
+	fabs			// |y| : y : x
+	fcompl	MO(p63)		// y : x
+	fnstsw
+	sahf
+	jnc	2f
+
 	/* First see whether `y' is a natural number.  In this case we
 	   can use a more precise algorithm.  */
 	fld	%st		// y : y : x
@@ -161,6 +173,11 @@ ENTRY(__ieee754_powl)
 
 7:	fyl2x			// log2(x) : y
 8:	fmul	%st(1)		// y*log2(x) : y
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x05, %ah	// is y*log2(x) == ±inf ?
+	je	28f
 	fst	%st(1)		// y*log2(x) : y*log2(x)
 	frndint			// int(y*log2(x)) : y*log2(x)
 	fsubr	%st, %st(1)	// int(y*log2(x)) : fract(y*log2(x))
@@ -172,6 +189,12 @@ ENTRY(__ieee754_powl)
 	fstp	%st(1)		// 2^fract(y*log2(x))*2^int(y*log2(x))
 	ret
 
+28:	fstp	%st(1)		// y*log2(x)
+	fldl	MO(one)		// 1 : y*log2(x)
+	fscale			// 2^(y*log2(x)) : y*log2(x)
+	addl	$8, %esp
+	fstp	%st(1)		// 2^(y*log2(x))
+	ret
 
 	// pow(x,±0) = 1
 	.align ALIGNARG(4)
--- libc/sysdeps/i386/fpu/e_pow.S.jj	2001-07-06 06:55:53.000000000 +0200
+++ libc/sysdeps/i386/fpu/e_pow.S	2004-07-15 13:05:41.601640612 +0200
@@ -1,5 +1,6 @@
 /* ix87 specific implementation of pow function.
-   Copyright (C) 1996, 1997, 1998, 1999, 2001 Free Software Foundation, Inc.
+   Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004
+   Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
 
@@ -48,6 +49,9 @@ one:	.double 1.0
 	ASM_TYPE_DIRECTIVE(limit,@object)
 limit:	.double 0.29
 	ASM_SIZE_DIRECTIVE(limit)
+	ASM_TYPE_DIRECTIVE(p63,@object)
+p63:	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43
+	ASM_SIZE_DIRECTIVE(p63)
 
 #ifdef PIC
 #define MO(op) op##@GOTOFF(%ecx)
@@ -96,6 +100,14 @@ ENTRY(__ieee754_pow)
 
 	fxch			// y : x
 
+	/* fistpll raises invalid exception for |y| >= 1L<<63.  */
+	fld	%st		// y : y : x
+	fabs			// |y| : y : x
+	fcompl	MO(p63)		// y : x
+	fnstsw
+	sahf
+	jnc	2f
+
 	/* First see whether `y' is a natural number.  In this case we
 	   can use a more precise algorithm.  */
 	fld	%st		// y : y : x
--- libc/sysdeps/x86_64/fpu/e_powl.S.jj	2001-09-19 12:24:08.000000000 +0200
+++ libc/sysdeps/x86_64/fpu/e_powl.S	2004-07-14 20:02:06.000000000 +0200
@@ -1,5 +1,5 @@
 /* ix87 specific implementation of pow function.
-   Copyright (C) 1996, 1997, 1998, 1999, 2001 Free Software Foundation, Inc.
+   Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
 
@@ -48,6 +48,10 @@ one:	.double 1.0
 	ASM_TYPE_DIRECTIVE(limit,@object)
 limit:	.double 0.29
 	ASM_SIZE_DIRECTIVE(limit)
+	ASM_TYPE_DIRECTIVE(p63,@object)
+p63:
+	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43
+	ASM_SIZE_DIRECTIVE(p63)
 
 #ifdef PIC
 #define MO(op) op##(%rip)
@@ -87,6 +91,14 @@ ENTRY(__ieee754_powl)
 
 	fxch			// y : x
 
+	/* fistpll raises invalid exception for |y| >= 1L<<63.  */
+	fldl	MO(p63)		// 1L<<63 : y : x
+	fld	%st(1)		// y : 1L<<63 : y : x
+	fabs			// |y| : 1L<<63 : y : x
+	fcomip	%st(1), %st	// 1L<<63 : y : x
+	fstp	%st(0)		// y : x
+	jnc	2f
+
 	/* First see whether `y' is a natural number.  In this case we
 	   can use a more precise algorithm.  */
 	fld	%st		// y : y : x
@@ -148,6 +160,11 @@ ENTRY(__ieee754_powl)
 
 7:	fyl2x			// log2(x) : y
 8:	fmul	%st(1)		// y*log2(x) : y
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x05, %ah      // is y*log2(x) == ±inf ?
+	je	28f
 	fst	%st(1)		// y*log2(x) : y*log2(x)
 	frndint			// int(y*log2(x)) : y*log2(x)
 	fsubr	%st, %st(1)	// int(y*log2(x)) : fract(y*log2(x))
@@ -158,6 +175,11 @@ ENTRY(__ieee754_powl)
 	fstp	%st(1)		// 2^fract(y*log2(x))*2^int(y*log2(x))
 	ret
 
+28:	fstp	%st(1)		// y*log2(x)
+	fldl	MO(one)		// 1 : y*log2(x)
+	fscale			// 2^(y*log2(x)) : y*log2(x)
+	fstp	%st(1)		// 2^(y*log2(x))
+	ret
 
 	// pow(x,±0) = 1
 	.align ALIGNARG(4)


	Jakub

Follow-Ups:
- Re: [PATCH] Fix powl on i386/x86_64 [BZ #258] (take 2)
  - From: Ulrich Drepper

References:
- [PATCH] Fix powl on x86_64 [BZ #258]
  - From: Jakub Jelinek
- [PATCH] Fix powl on i386/x86_64 [BZ #258] (take 2)
  - From: Jakub Jelinek
- Re: [PATCH] Fix powl on i386/x86_64 [BZ #258] (take 2)
  - From: Ulrich Drepper

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]