This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH 1/N] x86_64 vectorization support: vectorized math functions addition to Glibc
- From: Adhemerval Zanella <azanella at linux dot vnet dot ibm dot com>
- To: libc-alpha at sourceware dot org
- Date: Wed, 10 Sep 2014 13:55:16 -0300
- Subject: Re: [PATCH 1/N] x86_64 vectorization support: vectorized math functions addition to Glibc
- Authentication-results: sourceware.org; auth=none
- References: <CAMXFM3u01fV=jtOo5MXvLkVAzB6Qaik+Jevk-ydmqJ7xakAV-w at mail dot gmail dot com>
On 10-09-2014 12:14, Andrew Senkevich wrote:
> Patch attached here
Before anything I would like to ask you to read the contributor checklist [1].
First, such change will require you to sign FSF copyright assignment before any
kind of review.
You also need to describe your intentions with your patch: is it an optimization
of the current behavior? Just "Patch attached here" does not say anything about it.
What is the performance evaluation? Which benchmarks did you use? Did you run
the test cases? Does the ULPs file need an update?
For such changes the best way is to provide an internal symbol selected by IFUNC.
I will let x86 maintainers chime in, but I see adding a new symbol under GLIBC 2.2.5
*and* an external PLT call as unacceptable, IMHO.
The patch also needs a proper ChangeLog. In short: read the checklist first, please.
[1] https://sourceware.org/glibc/wiki/Contribution%20checklist
>
>
> --
> WBR,
> Andrew
> diff --git a/math/bits/mathcalls.h b/math/bits/mathcalls.h
> index 8a94a7e..ebfa583 100644
> --- a/math/bits/mathcalls.h
> +++ b/math/bits/mathcalls.h
> @@ -46,6 +46,17 @@
> # error "Never include <bits/mathcalls.h> directly; include <math.h> instead."
> #endif
>
> +#undef __DECL_SIMD
> +
> +#if defined _OPENMP && _OPENMP >= 201307
> +/* For now we have vectorized version only for _Mdouble_ case */
> +# ifdef _Mdouble_
> +# define __DECL_SIMD _Pragma ("omp declare simd")
> +# endif
> +#else
> +# define __DECL_SIMD
> +#endif
> +
>
> /* Trigonometric functions. */
>
> @@ -60,6 +71,7 @@ __MATHCALL (atan,, (_Mdouble_ __x));
> __MATHCALL (atan2,, (_Mdouble_ __y, _Mdouble_ __x));
>
> /* Cosine of X. */
> +__DECL_SIMD
> __MATHCALL (cos,, (_Mdouble_ __x));
> /* Sine of X. */
> __MATHCALL (sin,, (_Mdouble_ __x));
> diff --git a/sysdeps/unix/sysv/linux/x86_64/64/libm.abilist b/sysdeps/unix/sysv/linux/x86_64/64/libm.abilist
> index 2390934..1aa3099 100644
> --- a/sysdeps/unix/sysv/linux/x86_64/64/libm.abilist
> +++ b/sysdeps/unix/sysv/linux/x86_64/64/libm.abilist
> @@ -89,6 +89,7 @@ GLIBC_2.18
> GLIBC_2.2.5
> GLIBC_2.2.5 A
> _LIB_VERSION D 0x4
> + _ZGVdN4v_cos F
> __clog10 F
> __clog10f F
> __clog10l F
> diff --git a/sysdeps/unix/sysv/linux/x86_64/64/localplt.data b/sysdeps/unix/sysv/linux/x86_64/64/localplt.data
> new file mode 100644
> index 0000000..1a683d9
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/x86_64/64/localplt.data
> @@ -0,0 +1,10 @@
> +# See scripts/check-localplt.awk for how this file is processed.
> +# PLT use is required for the malloc family and for matherr because
> +# users can define their own functions and have library internals call them.
> +libc.so: calloc
> +libc.so: free
> +libc.so: malloc
> +libc.so: memalign
> +libc.so: realloc
> +libm.so: matherr
> +libm.so: cos
> diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile
> new file mode 100644
> index 0000000..1cb3ec5
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/Makefile
> @@ -0,0 +1,3 @@
> +ifeq ($(subdir),math)
> +libm-support += svml_d_cos4_core svml_d_cos_data
> +endif
> diff --git a/sysdeps/x86_64/fpu/Versions b/sysdeps/x86_64/fpu/Versions
> new file mode 100644
> index 0000000..d30fbb3
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/Versions
> @@ -0,0 +1,7 @@
> +libm {
> + GLIBC_2.2.5 {
> + # A generic bug got this omitted from other configurations' version
> + # sets, but we always had it.
> + _ZGVdN4v_cos;
> + }
> +}
> diff --git a/sysdeps/x86_64/fpu/svml_d_cos4_core.S b/sysdeps/x86_64/fpu/svml_d_cos4_core.S
> new file mode 100644
> index 0000000..7316d2b
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_d_cos4_core.S
> @@ -0,0 +1,185 @@
> +/* Function cos vectorized with AVX2.
> + Copyright (C) 2014 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +
> + .text
> +ENTRY(_ZGVdN4v_cos)
> +
> +/* ALGORITHM DESCRIPTION:
> + *
> + * ( low accuracy ( < 4ulp ) or enhanced performance ( half of correct mantissa ) implementation )
> + *
> + * Argument representation:
> + * arg + Pi/2 = (N*Pi + R)
> + *
> + * Result calculation:
> + * cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R)
> + * sin(R) is approximated by corresponding polynomial
> + */
> + pushq %rbp
> + movq %rsp, %rbp
> + andq $-64, %rsp
> + subq $448, %rsp
> + movq __gnu_svml_dcos_data@GOTPCREL(%rip), %rax
> + vmovapd %ymm0, %ymm1
> + vmovupd 192(%rax), %ymm4
> + vmovupd 256(%rax), %ymm5
> +
> +/* ARGUMENT RANGE REDUCTION:
> + * Add Pi/2 to argument: X' = X+Pi/2
> + */
> + vaddpd 128(%rax), %ymm1, %ymm7
> +
> +/* Get absolute argument value: X' = |X'| */
> + vandpd (%rax), %ymm7, %ymm2
> +
> +/* Y = X'*InvPi + RS : right shifter add */
> + vfmadd213pd %ymm5, %ymm4, %ymm7
> + vmovupd 1216(%rax), %ymm4
> +
> +/* Check for large arguments path */
> + vcmpnle_uqpd 64(%rax), %ymm2, %ymm3
> +
> +/* N = Y - RS : right shifter sub */
> + vsubpd %ymm5, %ymm7, %ymm6
> + vmovupd 640(%rax), %ymm2
> +
> +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */
> + vpsllq $63, %ymm7, %ymm7
> +
> +/* N = N - 0.5 */
> + vsubpd 320(%rax), %ymm6, %ymm0
> + vmovmskpd %ymm3, %ecx
> +
> +/* R = X - N*Pi1 */
> + vmovapd %ymm1, %ymm3
> + vfnmadd231pd %ymm0, %ymm2, %ymm3
> +
> +/* R = R - N*Pi2 */
> + vfnmadd231pd 704(%rax), %ymm0, %ymm3
> +
> +/* R = R - N*Pi3 */
> + vfnmadd132pd 768(%rax), %ymm3, %ymm0
> +
> +/* POLYNOMIAL APPROXIMATION:
> + * R2 = R*R
> + */
> + vmulpd %ymm0, %ymm0, %ymm5
> + vfmadd213pd 1152(%rax), %ymm5, %ymm4
> + vfmadd213pd 1088(%rax), %ymm5, %ymm4
> + vfmadd213pd 1024(%rax), %ymm5, %ymm4
> +
> +/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */
> + vfmadd213pd 960(%rax), %ymm5, %ymm4
> + vfmadd213pd 896(%rax), %ymm5, %ymm4
> + vfmadd213pd 832(%rax), %ymm5, %ymm4
> + vmulpd %ymm5, %ymm4, %ymm6
> + vfmadd213pd %ymm0, %ymm0, %ymm6
> +
> +/* RECONSTRUCTION:
> + * Final sign setting: Res = Poly^SignRes
> + */
> + vxorpd %ymm7, %ymm6, %ymm0
> + testl %ecx, %ecx
> + jne _LBL_1_3
> +
> +_LBL_1_2:
> + movq %rbp, %rsp
> + popq %rbp
> + ret
> +
> +_LBL_1_3:
> + vmovupd %ymm1, 320(%rsp)
> + vmovupd %ymm0, 384(%rsp)
> + je _LBL_1_2
> +
> + xorb %dl, %dl
> + xorl %eax, %eax
> + vmovups %ymm8, 224(%rsp)
> + vmovups %ymm9, 192(%rsp)
> + vmovups %ymm10, 160(%rsp)
> + vmovups %ymm11, 128(%rsp)
> + vmovups %ymm12, 96(%rsp)
> + vmovups %ymm13, 64(%rsp)
> + vmovups %ymm14, 32(%rsp)
> + vmovups %ymm15, (%rsp)
> + movq %rsi, 264(%rsp)
> + movq %rdi, 256(%rsp)
> + movq %r12, 296(%rsp)
> + movb %dl, %r12b
> + movq %r13, 288(%rsp)
> + movl %ecx, %r13d
> + movq %r14, 280(%rsp)
> + movl %eax, %r14d
> + movq %r15, 272(%rsp)
> +
> +_LBL_1_6:
> + btl %r14d, %r13d
> + jc _LBL_1_12
> +
> +_LBL_1_7:
> + lea 1(%r14), %esi
> + btl %esi, %r13d
> + jc _LBL_1_10
> +
> +_LBL_1_8:
> + incb %r12b
> + addl $2, %r14d
> + cmpb $16, %r12b
> + jb _LBL_1_6
> +
> + vmovups 224(%rsp), %ymm8
> + vmovups 192(%rsp), %ymm9
> + vmovups 160(%rsp), %ymm10
> + vmovups 128(%rsp), %ymm11
> + vmovups 96(%rsp), %ymm12
> + vmovups 64(%rsp), %ymm13
> + vmovups 32(%rsp), %ymm14
> + vmovups (%rsp), %ymm15
> + vmovupd 384(%rsp), %ymm0
> + movq 264(%rsp), %rsi
> + movq 256(%rsp), %rdi
> + movq 296(%rsp), %r12
> + movq 288(%rsp), %r13
> + movq 280(%rsp), %r14
> + movq 272(%rsp), %r15
> + jmp _LBL_1_2
> +
> +_LBL_1_10:
> + movzbl %r12b, %r15d
> + shlq $4, %r15
> + vmovsd 328(%rsp,%r15), %xmm0
> + vzeroupper
> +
> + call cos@PLT
> +
> + vmovsd %xmm0, 392(%rsp,%r15)
> + jmp _LBL_1_8
> +
> +_LBL_1_12:
> + movzbl %r12b, %r15d
> + shlq $4, %r15
> + vmovsd 320(%rsp,%r15), %xmm0
> + vzeroupper
> +
> + call cos@PLT
> +
> + vmovsd %xmm0, 384(%rsp,%r15)
> + jmp _LBL_1_7
> +END(_ZGVdN4v_cos)
> diff --git a/sysdeps/x86_64/fpu/svml_d_cos_data.S b/sysdeps/x86_64/fpu/svml_d_cos_data.S
> new file mode 100644
> index 0000000..53f5244
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_d_cos_data.S
> @@ -0,0 +1,424 @@
> +/* Data for function cos vectorized with AVX2.
> + Copyright (C) 2014 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +
> + .align 64
> + .globl __gnu_svml_dcos_data
> +__gnu_svml_dcos_data:
> + .long 4294967295
> + .long 2147483647
> + .long 4294967295
> + .long 2147483647
> + .long 4294967295
> + .long 2147483647
> + .long 4294967295
> + .long 2147483647
> + .long 4294967295
> + .long 2147483647
> + .long 4294967295
> + .long 2147483647
> + .long 4294967295
> + .long 2147483647
> + .long 4294967295
> + .long 2147483647
> + .long 0
> + .long 1096810496
> + .long 0
> + .long 1096810496
> + .long 0
> + .long 1096810496
> + .long 0
> + .long 1096810496
> + .long 0
> + .long 1096810496
> + .long 0
> + .long 1096810496
> + .long 0
> + .long 1096810496
> + .long 0
> + .long 1096810496
> + .long 1413754136
> + .long 1073291771
> + .long 1413754136
> + .long 1073291771
> + .long 1413754136
> + .long 1073291771
> + .long 1413754136
> + .long 1073291771
> + .long 1413754136
> + .long 1073291771
> + .long 1413754136
> + .long 1073291771
> + .long 1413754136
> + .long 1073291771
> + .long 1413754136
> + .long 1073291771
> + .long 1841940611
> + .long 1070882608
> + .long 1841940611
> + .long 1070882608
> + .long 1841940611
> + .long 1070882608
> + .long 1841940611
> + .long 1070882608
> + .long 1841940611
> + .long 1070882608
> + .long 1841940611
> + .long 1070882608
> + .long 1841940611
> + .long 1070882608
> + .long 1841940611
> + .long 1070882608
> + .long 0
> + .long 1127743488
> + .long 0
> + .long 1127743488
> + .long 0
> + .long 1127743488
> + .long 0
> + .long 1127743488
> + .long 0
> + .long 1127743488
> + .long 0
> + .long 1127743488
> + .long 0
> + .long 1127743488
> + .long 0
> + .long 1127743488
> + .long 0
> + .long 1071644672
> + .long 0
> + .long 1071644672
> + .long 0
> + .long 1071644672
> + .long 0
> + .long 1071644672
> + .long 0
> + .long 1071644672
> + .long 0
> + .long 1071644672
> + .long 0
> + .long 1071644672
> + .long 0
> + .long 1071644672
> + .long 1073741824
> + .long 1074340347
> + .long 1073741824
> + .long 1074340347
> + .long 1073741824
> + .long 1074340347
> + .long 1073741824
> + .long 1074340347
> + .long 1073741824
> + .long 1074340347
> + .long 1073741824
> + .long 1074340347
> + .long 1073741824
> + .long 1074340347
> + .long 1073741824
> + .long 1074340347
> + .long 0
> + .long 1048855597
> + .long 0
> + .long 1048855597
> + .long 0
> + .long 1048855597
> + .long 0
> + .long 1048855597
> + .long 0
> + .long 1048855597
> + .long 0
> + .long 1048855597
> + .long 0
> + .long 1048855597
> + .long 0
> + .long 1048855597
> + .long 2147483648
> + .long 1023952536
> + .long 2147483648
> + .long 1023952536
> + .long 2147483648
> + .long 1023952536
> + .long 2147483648
> + .long 1023952536
> + .long 2147483648
> + .long 1023952536
> + .long 2147483648
> + .long 1023952536
> + .long 2147483648
> + .long 1023952536
> + .long 2147483648
> + .long 1023952536
> + .long 1880851354
> + .long 998820945
> + .long 1880851354
> + .long 998820945
> + .long 1880851354
> + .long 998820945
> + .long 1880851354
> + .long 998820945
> + .long 1880851354
> + .long 998820945
> + .long 1880851354
> + .long 998820945
> + .long 1880851354
> + .long 998820945
> + .long 1880851354
> + .long 998820945
> + .long 1413754136
> + .long 1074340347
> + .long 1413754136
> + .long 1074340347
> + .long 1413754136
> + .long 1074340347
> + .long 1413754136
> + .long 1074340347
> + .long 1413754136
> + .long 1074340347
> + .long 1413754136
> + .long 1074340347
> + .long 1413754136
> + .long 1074340347
> + .long 1413754136
> + .long 1074340347
> + .long 856972294
> + .long 1017226790
> + .long 856972294
> + .long 1017226790
> + .long 856972294
> + .long 1017226790
> + .long 856972294
> + .long 1017226790
> + .long 856972294
> + .long 1017226790
> + .long 856972294
> + .long 1017226790
> + .long 856972294
> + .long 1017226790
> + .long 856972294
> + .long 1017226790
> + .long 688016905
> + .long 962338001
> + .long 688016905
> + .long 962338001
> + .long 688016905
> + .long 962338001
> + .long 688016905
> + .long 962338001
> + .long 688016905
> + .long 962338001
> + .long 688016905
> + .long 962338001
> + .long 688016905
> + .long 962338001
> + .long 688016905
> + .long 962338001
> + .long 1431655591
> + .long 3217380693
> + .long 1431655591
> + .long 3217380693
> + .long 1431655591
> + .long 3217380693
> + .long 1431655591
> + .long 3217380693
> + .long 1431655591
> + .long 3217380693
> + .long 1431655591
> + .long 3217380693
> + .long 1431655591
> + .long 3217380693
> + .long 1431655591
> + .long 3217380693
> + .long 286303400
> + .long 1065423121
> + .long 286303400
> + .long 1065423121
> + .long 286303400
> + .long 1065423121
> + .long 286303400
> + .long 1065423121
> + .long 286303400
> + .long 1065423121
> + .long 286303400
> + .long 1065423121
> + .long 286303400
> + .long 1065423121
> + .long 286303400
> + .long 1065423121
> + .long 430291053
> + .long 3207201184
> + .long 430291053
> + .long 3207201184
> + .long 430291053
> + .long 3207201184
> + .long 430291053
> + .long 3207201184
> + .long 430291053
> + .long 3207201184
> + .long 430291053
> + .long 3207201184
> + .long 430291053
> + .long 3207201184
> + .long 430291053
> + .long 3207201184
> + .long 2150694560
> + .long 1053236707
> + .long 2150694560
> + .long 1053236707
> + .long 2150694560
> + .long 1053236707
> + .long 2150694560
> + .long 1053236707
> + .long 2150694560
> + .long 1053236707
> + .long 2150694560
> + .long 1053236707
> + .long 2150694560
> + .long 1053236707
> + .long 2150694560
> + .long 1053236707
> + .long 1174413873
> + .long 3193628213
> + .long 1174413873
> + .long 3193628213
> + .long 1174413873
> + .long 3193628213
> + .long 1174413873
> + .long 3193628213
> + .long 1174413873
> + .long 3193628213
> + .long 1174413873
> + .long 3193628213
> + .long 1174413873
> + .long 3193628213
> + .long 1174413873
> + .long 3193628213
> + .long 1470296608
> + .long 1038487144
> + .long 1470296608
> + .long 1038487144
> + .long 1470296608
> + .long 1038487144
> + .long 1470296608
> + .long 1038487144
> + .long 1470296608
> + .long 1038487144
> + .long 1470296608
> + .long 1038487144
> + .long 1470296608
> + .long 1038487144
> + .long 1470296608
> + .long 1038487144
> + .long 135375560
> + .long 3177836758
> + .long 135375560
> + .long 3177836758
> + .long 135375560
> + .long 3177836758
> + .long 135375560
> + .long 3177836758
> + .long 135375560
> + .long 3177836758
> + .long 135375560
> + .long 3177836758
> + .long 135375560
> + .long 3177836758
> + .long 135375560
> + .long 3177836758
> + .long 4294967295
> + .long 2147483647
> + .long 4294967295
> + .long 2147483647
> + .long 4294967295
> + .long 2147483647
> + .long 4294967295
> + .long 2147483647
> + .long 4294967295
> + .long 2147483647
> + .long 4294967295
> + .long 2147483647
> + .long 4294967295
> + .long 2147483647
> + .long 4294967295
> + .long 2147483647
> + .long 1841940611
> + .long 1070882608
> + .long 1841940611
> + .long 1070882608
> + .long 1841940611
> + .long 1070882608
> + .long 1841940611
> + .long 1070882608
> + .long 1841940611
> + .long 1070882608
> + .long 1841940611
> + .long 1070882608
> + .long 1841940611
> + .long 1070882608
> + .long 1841940611
> + .long 1070882608
> + .long 0
> + .long 1127219200
> + .long 0
> + .long 1127219200
> + .long 0
> + .long 1127219200
> + .long 0
> + .long 1127219200
> + .long 0
> + .long 1127219200
> + .long 0
> + .long 1127219200
> + .long 0
> + .long 1127219200
> + .long 0
> + .long 1127219200
> + .long 4294967295
> + .long 1127219199
> + .long 4294967295
> + .long 1127219199
> + .long 4294967295
> + .long 1127219199
> + .long 4294967295
> + .long 1127219199
> + .long 4294967295
> + .long 1127219199
> + .long 4294967295
> + .long 1127219199
> + .long 4294967295
> + .long 1127219199
> + .long 4294967295
> + .long 1127219199
> + .long 8388606
> + .long 1127219200
> + .long 8388606
> + .long 1127219200
> + .long 8388606
> + .long 1127219200
> + .long 8388606
> + .long 1127219200
> + .long 8388606
> + .long 1127219200
> + .long 8388606
> + .long 1127219200
> + .long 8388606
> + .long 1127219200
> + .long 8388606
> + .long 1127219200
> + .type __gnu_svml_dcos_data,@object
> + .size __gnu_svml_dcos_data,1600