This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH 1/N] x86_64 vectorization support: vectorized math functions addition to Glibc


On 10-09-2014 12:14, Andrew Senkevich wrote:
> Patch attached here

Before anything I would like to ask you to read the contributor checklist [1].
First, such a change will require you to sign an FSF copyright assignment before any
kind of review.

You also need to describe your intentions with your patch: is it an optimization
of current behavior? Just "Patch attached here" does not say anything about it.
What is the performance evaluation? Which benchmarks did you use? Did you run
the testcase? Does the ULPs file need an update?

For such changes the best way is to provide an internal symbol selected by IFUNC.
I will let x86 maintainers chime in, but I see adding a new symbol under GLIBC 2.2.5
 *and* an external PLT call as unacceptable, IMHO.

The patch also needs a proper ChangeLog. In short: please read the checklist first.

[1] https://sourceware.org/glibc/wiki/Contribution%20checklist

>
>
> --
> WBR,
> Andrew
> diff --git a/math/bits/mathcalls.h b/math/bits/mathcalls.h
> index 8a94a7e..ebfa583 100644
> --- a/math/bits/mathcalls.h
> +++ b/math/bits/mathcalls.h
> @@ -46,6 +46,17 @@
>  # error "Never include <bits/mathcalls.h> directly; include <math.h> instead."
>  #endif
>  
> +#undef __DECL_SIMD
> +
> +#if defined _OPENMP && _OPENMP >= 201307
> +/* For now we have vectorized version only for _Mdouble_ case */
> +# ifdef _Mdouble_
> +#  define __DECL_SIMD _Pragma ("omp declare simd")
> +# endif
> +#else
> +# define __DECL_SIMD
> +#endif
> +
>  
>  /* Trigonometric functions.  */
>  
> @@ -60,6 +71,7 @@ __MATHCALL (atan,, (_Mdouble_ __x));
>  __MATHCALL (atan2,, (_Mdouble_ __y, _Mdouble_ __x));
>  
>  /* Cosine of X.  */
> +__DECL_SIMD
>  __MATHCALL (cos,, (_Mdouble_ __x));
>  /* Sine of X.  */
>  __MATHCALL (sin,, (_Mdouble_ __x));
> diff --git a/sysdeps/unix/sysv/linux/x86_64/64/libm.abilist b/sysdeps/unix/sysv/linux/x86_64/64/libm.abilist
> index 2390934..1aa3099 100644
> --- a/sysdeps/unix/sysv/linux/x86_64/64/libm.abilist
> +++ b/sysdeps/unix/sysv/linux/x86_64/64/libm.abilist
> @@ -89,6 +89,7 @@ GLIBC_2.18
>  GLIBC_2.2.5
>   GLIBC_2.2.5 A
>   _LIB_VERSION D 0x4
> + _ZGVdN4v_cos F
>   __clog10 F
>   __clog10f F
>   __clog10l F
> diff --git a/sysdeps/unix/sysv/linux/x86_64/64/localplt.data b/sysdeps/unix/sysv/linux/x86_64/64/localplt.data
> new file mode 100644
> index 0000000..1a683d9
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/x86_64/64/localplt.data
> @@ -0,0 +1,10 @@
> +# See scripts/check-localplt.awk for how this file is processed.
> +# PLT use is required for the malloc family and for matherr because
> +# users can define their own functions and have library internals call them.
> +libc.so: calloc
> +libc.so: free
> +libc.so: malloc
> +libc.so: memalign
> +libc.so: realloc
> +libm.so: matherr
> +libm.so: cos
> diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile
> new file mode 100644
> index 0000000..1cb3ec5
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/Makefile
> @@ -0,0 +1,3 @@
> +ifeq ($(subdir),math)
> +libm-support += svml_d_cos4_core svml_d_cos_data
> +endif
> diff --git a/sysdeps/x86_64/fpu/Versions b/sysdeps/x86_64/fpu/Versions
> new file mode 100644
> index 0000000..d30fbb3
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/Versions
> @@ -0,0 +1,7 @@
> +libm {
> +  GLIBC_2.2.5 {
> +    # A generic bug got this omitted from other configurations' version
> +    # sets, but we always had it.
> +    _ZGVdN4v_cos;
> +  }
> +}
> diff --git a/sysdeps/x86_64/fpu/svml_d_cos4_core.S b/sysdeps/x86_64/fpu/svml_d_cos4_core.S
> new file mode 100644
> index 0000000..7316d2b
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_d_cos4_core.S
> @@ -0,0 +1,185 @@
> +/* Function cos vectorized with AVX2.
> +   Copyright (C) 2014 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +	.text
> +ENTRY(_ZGVdN4v_cos)
> +
> +/* ALGORITHM DESCRIPTION:
> + *     
> + *    ( low accuracy ( < 4ulp ) or enhanced performance ( half of correct mantissa ) implementation )
> + *     
> + *    Argument representation:
> + *    arg + Pi/2 = (N*Pi + R)
> + *    
> + *    Result calculation:
> + *    cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R)
> + *    sin(R) is approximated by corresponding polynomial
> + */
> +        pushq     %rbp
> +        movq      %rsp, %rbp
> +        andq      $-64, %rsp
> +        subq      $448, %rsp
> +        movq      __gnu_svml_dcos_data@GOTPCREL(%rip), %rax
> +        vmovapd   %ymm0, %ymm1
> +        vmovupd   192(%rax), %ymm4
> +        vmovupd   256(%rax), %ymm5
> +
> +/* ARGUMENT RANGE REDUCTION:
> + * Add Pi/2 to argument: X' = X+Pi/2
> + */
> +        vaddpd    128(%rax), %ymm1, %ymm7
> +
> +/* Get absolute argument value: X' = |X'| */
> +        vandpd    (%rax), %ymm7, %ymm2
> +
> +/* Y = X'*InvPi + RS : right shifter add */
> +        vfmadd213pd %ymm5, %ymm4, %ymm7
> +        vmovupd   1216(%rax), %ymm4
> +
> +/* Check for large arguments path */
> +        vcmpnle_uqpd 64(%rax), %ymm2, %ymm3
> +
> +/* N = Y - RS : right shifter sub */
> +        vsubpd    %ymm5, %ymm7, %ymm6
> +        vmovupd   640(%rax), %ymm2
> +
> +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */
> +        vpsllq    $63, %ymm7, %ymm7
> +
> +/* N = N - 0.5 */
> +        vsubpd    320(%rax), %ymm6, %ymm0
> +        vmovmskpd %ymm3, %ecx
> +
> +/* R = X - N*Pi1 */
> +        vmovapd   %ymm1, %ymm3
> +        vfnmadd231pd %ymm0, %ymm2, %ymm3
> +
> +/* R = R - N*Pi2 */
> +        vfnmadd231pd 704(%rax), %ymm0, %ymm3
> +
> +/* R = R - N*Pi3 */
> +        vfnmadd132pd 768(%rax), %ymm3, %ymm0
> +
> +/* POLYNOMIAL APPROXIMATION:
> + * R2 = R*R
> + */
> +        vmulpd    %ymm0, %ymm0, %ymm5
> +        vfmadd213pd 1152(%rax), %ymm5, %ymm4
> +        vfmadd213pd 1088(%rax), %ymm5, %ymm4
> +        vfmadd213pd 1024(%rax), %ymm5, %ymm4
> +
> +/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */
> +        vfmadd213pd 960(%rax), %ymm5, %ymm4
> +        vfmadd213pd 896(%rax), %ymm5, %ymm4
> +        vfmadd213pd 832(%rax), %ymm5, %ymm4
> +        vmulpd    %ymm5, %ymm4, %ymm6
> +        vfmadd213pd %ymm0, %ymm0, %ymm6
> +
> +/* RECONSTRUCTION:
> + * Final sign setting: Res = Poly^SignRes 
> + */
> +        vxorpd    %ymm7, %ymm6, %ymm0
> +        testl     %ecx, %ecx
> +        jne       _LBL_1_3
> +
> +_LBL_1_2:
> +        movq      %rbp, %rsp
> +        popq      %rbp
> +        ret
> +
> +_LBL_1_3:
> +        vmovupd   %ymm1, 320(%rsp)
> +        vmovupd   %ymm0, 384(%rsp)
> +        je        _LBL_1_2
> +
> +        xorb      %dl, %dl
> +        xorl      %eax, %eax
> +        vmovups   %ymm8, 224(%rsp)
> +        vmovups   %ymm9, 192(%rsp)
> +        vmovups   %ymm10, 160(%rsp)
> +        vmovups   %ymm11, 128(%rsp)
> +        vmovups   %ymm12, 96(%rsp)
> +        vmovups   %ymm13, 64(%rsp)
> +        vmovups   %ymm14, 32(%rsp)
> +        vmovups   %ymm15, (%rsp)
> +        movq      %rsi, 264(%rsp)
> +        movq      %rdi, 256(%rsp)
> +        movq      %r12, 296(%rsp)
> +        movb      %dl, %r12b
> +        movq      %r13, 288(%rsp)
> +        movl      %ecx, %r13d
> +        movq      %r14, 280(%rsp)
> +        movl      %eax, %r14d
> +        movq      %r15, 272(%rsp)
> +
> +_LBL_1_6:
> +        btl       %r14d, %r13d
> +        jc        _LBL_1_12
> +
> +_LBL_1_7:
> +        lea       1(%r14), %esi
> +        btl       %esi, %r13d
> +        jc        _LBL_1_10
> +
> +_LBL_1_8:
> +        incb      %r12b
> +        addl      $2, %r14d
> +        cmpb      $16, %r12b
> +        jb        _LBL_1_6
> +
> +        vmovups   224(%rsp), %ymm8
> +        vmovups   192(%rsp), %ymm9
> +        vmovups   160(%rsp), %ymm10
> +        vmovups   128(%rsp), %ymm11
> +        vmovups   96(%rsp), %ymm12
> +        vmovups   64(%rsp), %ymm13
> +        vmovups   32(%rsp), %ymm14
> +        vmovups   (%rsp), %ymm15
> +        vmovupd   384(%rsp), %ymm0
> +        movq      264(%rsp), %rsi
> +        movq      256(%rsp), %rdi
> +        movq      296(%rsp), %r12
> +        movq      288(%rsp), %r13
> +        movq      280(%rsp), %r14
> +        movq      272(%rsp), %r15
> +        jmp       _LBL_1_2
> +
> +_LBL_1_10:
> +        movzbl    %r12b, %r15d
> +        shlq      $4, %r15
> +        vmovsd    328(%rsp,%r15), %xmm0
> +        vzeroupper
> +
> +        call      cos@PLT
> +
> +        vmovsd    %xmm0, 392(%rsp,%r15)
> +        jmp       _LBL_1_8
> +
> +_LBL_1_12:
> +        movzbl    %r12b, %r15d
> +        shlq      $4, %r15
> +        vmovsd    320(%rsp,%r15), %xmm0
> +        vzeroupper
> +
> +        call      cos@PLT
> +
> +        vmovsd    %xmm0, 384(%rsp,%r15)
> +        jmp       _LBL_1_7
> +END(_ZGVdN4v_cos)
> diff --git a/sysdeps/x86_64/fpu/svml_d_cos_data.S b/sysdeps/x86_64/fpu/svml_d_cos_data.S
> new file mode 100644
> index 0000000..53f5244
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_d_cos_data.S
> @@ -0,0 +1,424 @@
> +/* Data for function cos vectorized with AVX2.
> +   Copyright (C) 2014 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +
> +	.align 64
> +	.globl __gnu_svml_dcos_data
> +__gnu_svml_dcos_data:
> +	.long	4294967295
> +	.long	2147483647
> +	.long	4294967295
> +	.long	2147483647
> +	.long	4294967295
> +	.long	2147483647
> +	.long	4294967295
> +	.long	2147483647
> +	.long	4294967295
> +	.long	2147483647
> +	.long	4294967295
> +	.long	2147483647
> +	.long	4294967295
> +	.long	2147483647
> +	.long	4294967295
> +	.long	2147483647
> +	.long	0
> +	.long	1096810496
> +	.long	0
> +	.long	1096810496
> +	.long	0
> +	.long	1096810496
> +	.long	0
> +	.long	1096810496
> +	.long	0
> +	.long	1096810496
> +	.long	0
> +	.long	1096810496
> +	.long	0
> +	.long	1096810496
> +	.long	0
> +	.long	1096810496
> +	.long	1413754136
> +	.long	1073291771
> +	.long	1413754136
> +	.long	1073291771
> +	.long	1413754136
> +	.long	1073291771
> +	.long	1413754136
> +	.long	1073291771
> +	.long	1413754136
> +	.long	1073291771
> +	.long	1413754136
> +	.long	1073291771
> +	.long	1413754136
> +	.long	1073291771
> +	.long	1413754136
> +	.long	1073291771
> +	.long	1841940611
> +	.long	1070882608
> +	.long	1841940611
> +	.long	1070882608
> +	.long	1841940611
> +	.long	1070882608
> +	.long	1841940611
> +	.long	1070882608
> +	.long	1841940611
> +	.long	1070882608
> +	.long	1841940611
> +	.long	1070882608
> +	.long	1841940611
> +	.long	1070882608
> +	.long	1841940611
> +	.long	1070882608
> +	.long	0
> +	.long	1127743488
> +	.long	0
> +	.long	1127743488
> +	.long	0
> +	.long	1127743488
> +	.long	0
> +	.long	1127743488
> +	.long	0
> +	.long	1127743488
> +	.long	0
> +	.long	1127743488
> +	.long	0
> +	.long	1127743488
> +	.long	0
> +	.long	1127743488
> +	.long	0
> +	.long	1071644672
> +	.long	0
> +	.long	1071644672
> +	.long	0
> +	.long	1071644672
> +	.long	0
> +	.long	1071644672
> +	.long	0
> +	.long	1071644672
> +	.long	0
> +	.long	1071644672
> +	.long	0
> +	.long	1071644672
> +	.long	0
> +	.long	1071644672
> +	.long	1073741824
> +	.long	1074340347
> +	.long	1073741824
> +	.long	1074340347
> +	.long	1073741824
> +	.long	1074340347
> +	.long	1073741824
> +	.long	1074340347
> +	.long	1073741824
> +	.long	1074340347
> +	.long	1073741824
> +	.long	1074340347
> +	.long	1073741824
> +	.long	1074340347
> +	.long	1073741824
> +	.long	1074340347
> +	.long	0
> +	.long	1048855597
> +	.long	0
> +	.long	1048855597
> +	.long	0
> +	.long	1048855597
> +	.long	0
> +	.long	1048855597
> +	.long	0
> +	.long	1048855597
> +	.long	0
> +	.long	1048855597
> +	.long	0
> +	.long	1048855597
> +	.long	0
> +	.long	1048855597
> +	.long	2147483648
> +	.long	1023952536
> +	.long	2147483648
> +	.long	1023952536
> +	.long	2147483648
> +	.long	1023952536
> +	.long	2147483648
> +	.long	1023952536
> +	.long	2147483648
> +	.long	1023952536
> +	.long	2147483648
> +	.long	1023952536
> +	.long	2147483648
> +	.long	1023952536
> +	.long	2147483648
> +	.long	1023952536
> +	.long	1880851354
> +	.long	998820945
> +	.long	1880851354
> +	.long	998820945
> +	.long	1880851354
> +	.long	998820945
> +	.long	1880851354
> +	.long	998820945
> +	.long	1880851354
> +	.long	998820945
> +	.long	1880851354
> +	.long	998820945
> +	.long	1880851354
> +	.long	998820945
> +	.long	1880851354
> +	.long	998820945
> +	.long	1413754136
> +	.long	1074340347
> +	.long	1413754136
> +	.long	1074340347
> +	.long	1413754136
> +	.long	1074340347
> +	.long	1413754136
> +	.long	1074340347
> +	.long	1413754136
> +	.long	1074340347
> +	.long	1413754136
> +	.long	1074340347
> +	.long	1413754136
> +	.long	1074340347
> +	.long	1413754136
> +	.long	1074340347
> +	.long	856972294
> +	.long	1017226790
> +	.long	856972294
> +	.long	1017226790
> +	.long	856972294
> +	.long	1017226790
> +	.long	856972294
> +	.long	1017226790
> +	.long	856972294
> +	.long	1017226790
> +	.long	856972294
> +	.long	1017226790
> +	.long	856972294
> +	.long	1017226790
> +	.long	856972294
> +	.long	1017226790
> +	.long	688016905
> +	.long	962338001
> +	.long	688016905
> +	.long	962338001
> +	.long	688016905
> +	.long	962338001
> +	.long	688016905
> +	.long	962338001
> +	.long	688016905
> +	.long	962338001
> +	.long	688016905
> +	.long	962338001
> +	.long	688016905
> +	.long	962338001
> +	.long	688016905
> +	.long	962338001
> +	.long	1431655591
> +	.long	3217380693
> +	.long	1431655591
> +	.long	3217380693
> +	.long	1431655591
> +	.long	3217380693
> +	.long	1431655591
> +	.long	3217380693
> +	.long	1431655591
> +	.long	3217380693
> +	.long	1431655591
> +	.long	3217380693
> +	.long	1431655591
> +	.long	3217380693
> +	.long	1431655591
> +	.long	3217380693
> +	.long	286303400
> +	.long	1065423121
> +	.long	286303400
> +	.long	1065423121
> +	.long	286303400
> +	.long	1065423121
> +	.long	286303400
> +	.long	1065423121
> +	.long	286303400
> +	.long	1065423121
> +	.long	286303400
> +	.long	1065423121
> +	.long	286303400
> +	.long	1065423121
> +	.long	286303400
> +	.long	1065423121
> +	.long	430291053
> +	.long	3207201184
> +	.long	430291053
> +	.long	3207201184
> +	.long	430291053
> +	.long	3207201184
> +	.long	430291053
> +	.long	3207201184
> +	.long	430291053
> +	.long	3207201184
> +	.long	430291053
> +	.long	3207201184
> +	.long	430291053
> +	.long	3207201184
> +	.long	430291053
> +	.long	3207201184
> +	.long	2150694560
> +	.long	1053236707
> +	.long	2150694560
> +	.long	1053236707
> +	.long	2150694560
> +	.long	1053236707
> +	.long	2150694560
> +	.long	1053236707
> +	.long	2150694560
> +	.long	1053236707
> +	.long	2150694560
> +	.long	1053236707
> +	.long	2150694560
> +	.long	1053236707
> +	.long	2150694560
> +	.long	1053236707
> +	.long	1174413873
> +	.long	3193628213
> +	.long	1174413873
> +	.long	3193628213
> +	.long	1174413873
> +	.long	3193628213
> +	.long	1174413873
> +	.long	3193628213
> +	.long	1174413873
> +	.long	3193628213
> +	.long	1174413873
> +	.long	3193628213
> +	.long	1174413873
> +	.long	3193628213
> +	.long	1174413873
> +	.long	3193628213
> +	.long	1470296608
> +	.long	1038487144
> +	.long	1470296608
> +	.long	1038487144
> +	.long	1470296608
> +	.long	1038487144
> +	.long	1470296608
> +	.long	1038487144
> +	.long	1470296608
> +	.long	1038487144
> +	.long	1470296608
> +	.long	1038487144
> +	.long	1470296608
> +	.long	1038487144
> +	.long	1470296608
> +	.long	1038487144
> +	.long	135375560
> +	.long	3177836758
> +	.long	135375560
> +	.long	3177836758
> +	.long	135375560
> +	.long	3177836758
> +	.long	135375560
> +	.long	3177836758
> +	.long	135375560
> +	.long	3177836758
> +	.long	135375560
> +	.long	3177836758
> +	.long	135375560
> +	.long	3177836758
> +	.long	135375560
> +	.long	3177836758
> +	.long	4294967295
> +	.long	2147483647
> +	.long	4294967295
> +	.long	2147483647
> +	.long	4294967295
> +	.long	2147483647
> +	.long	4294967295
> +	.long	2147483647
> +	.long	4294967295
> +	.long	2147483647
> +	.long	4294967295
> +	.long	2147483647
> +	.long	4294967295
> +	.long	2147483647
> +	.long	4294967295
> +	.long	2147483647
> +	.long	1841940611
> +	.long	1070882608
> +	.long	1841940611
> +	.long	1070882608
> +	.long	1841940611
> +	.long	1070882608
> +	.long	1841940611
> +	.long	1070882608
> +	.long	1841940611
> +	.long	1070882608
> +	.long	1841940611
> +	.long	1070882608
> +	.long	1841940611
> +	.long	1070882608
> +	.long	1841940611
> +	.long	1070882608
> +	.long	0
> +	.long	1127219200
> +	.long	0
> +	.long	1127219200
> +	.long	0
> +	.long	1127219200
> +	.long	0
> +	.long	1127219200
> +	.long	0
> +	.long	1127219200
> +	.long	0
> +	.long	1127219200
> +	.long	0
> +	.long	1127219200
> +	.long	0
> +	.long	1127219200
> +	.long	4294967295
> +	.long	1127219199
> +	.long	4294967295
> +	.long	1127219199
> +	.long	4294967295
> +	.long	1127219199
> +	.long	4294967295
> +	.long	1127219199
> +	.long	4294967295
> +	.long	1127219199
> +	.long	4294967295
> +	.long	1127219199
> +	.long	4294967295
> +	.long	1127219199
> +	.long	4294967295
> +	.long	1127219199
> +	.long	8388606
> +	.long	1127219200
> +	.long	8388606
> +	.long	1127219200
> +	.long	8388606
> +	.long	1127219200
> +	.long	8388606
> +	.long	1127219200
> +	.long	8388606
> +	.long	1127219200
> +	.long	8388606
> +	.long	1127219200
> +	.long	8388606
> +	.long	1127219200
> +	.long	8388606
> +	.long	1127219200
> +	.type	__gnu_svml_dcos_data,@object
> +	.size	__gnu_svml_dcos_data,1600


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]