This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] x86_64: Replace AVX512F .byte sequences with instructions


Since binutils 2.25 or later is required to build glibc, we can replace
AVX512F .byte sequences with AVX512F instructions.

Tested on x86-64 and x32.  There are no code differences in libmvec.so
and libmvec.a.

I am checking it in now.

H.J.
----
	* sysdeps/x86_64/fpu/svml_d_sincos8_core.S: Replace AVX512F
	.byte sequences with AVX512F instructions.
	* sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Likewise.
	* sysdeps/x86_64/fpu/svml_s_sincosf16_core.S: Likewise.
	* sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S:
	Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S:
	Likewise.
---
 ChangeLog                                          | 12 +++
 .../fpu/multiarch/svml_d_sincos8_core_avx512.S     | 19 +----
 .../fpu/multiarch/svml_s_sincosf16_core_avx512.S   | 63 ++--------------
 sysdeps/x86_64/fpu/svml_d_sincos8_core.S           | 41 +----------
 sysdeps/x86_64/fpu/svml_d_wrapper_impl.h           | 57 ++-------------
 sysdeps/x86_64/fpu/svml_s_sincosf16_core.S         | 85 ++--------------------
 sysdeps/x86_64/fpu/svml_s_wrapper_impl.h           | 57 ++-------------
 7 files changed, 44 insertions(+), 290 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 43c8880a76..a58de05af6 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,15 @@
+2017-08-23  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* sysdeps/x86_64/fpu/svml_d_sincos8_core.S: Replace AVX512F
+	.byte sequences with AVX512F instructions.
+	* sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Likewise.
+	* sysdeps/x86_64/fpu/svml_s_sincosf16_core.S: Likewise.
+	* sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S:
+	Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S:
+	Likewise.
+
 2017-08-22  Szabolcs Nagy  <szabolcs.nagy@arm.com>
 	    Steve Ellcey  <sellcey@cavium.com>
 
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
index c9207558c5..3667faa0fa 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
@@ -599,24 +599,9 @@ libmvec_hidden_def(_ZGVeN8vl8l8_sincos_skx)
         cfi_def_cfa_register (%rbp)
         andq      $-64, %rsp
         subq      $256, %rsp
-        /* Encoding for vmovups %zmm1, 128(%rsp).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x11
-        .byte 0x4c
-        .byte 0x24
-        .byte 0x02
+        vmovups   %zmm1, 128(%rsp)
         lea       (%rsp), %rdi
-        /* Encoding for vmovups %zmm2, 192(%rdi).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x11
-        .byte 0x57
-        .byte 0x03
+        vmovups   %zmm2, 192(%rdi)
         lea       64(%rsp), %rsi
         call      HIDDEN_JUMPTARGET(\callee)
         movq      128(%rsp), %rdx
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
index f73ab7de7c..8fa4255d6d 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
@@ -510,40 +510,11 @@ libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_skx)
         cfi_def_cfa_register (%rbp)
         andq      $-64, %rsp
         subq      $384, %rsp
-        /* Encoding for vmovups %zmm1, 128(%rsp).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x11
-        .byte 0x4c
-        .byte 0x24
-        .byte 0x02
+        vmovups   %zmm1, 128(%rsp)
         lea       (%rsp), %rdi
-        /* Encoding for vmovups %zmm2, 192(%rdi).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x11
-        .byte 0x57
-        .byte 0x03
-        /* Encoding for vmovups %zmm3, 256(%rdi).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x11
-        .byte 0x5f
-        .byte 0x04
-        /* Encoding for vmovups %zmm4, 320(%rdi).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x11
-        .byte 0x67
-        .byte 0x05
+        vmovups   %zmm2, 192(%rdi)
+        vmovups   %zmm3, 256(%rdi)
+        vmovups   %zmm4, 320(%rdi)
         lea       64(%rsp), %rsi
         call      HIDDEN_JUMPTARGET(\callee)
         movq      128(%rsp), %rdx
@@ -661,30 +632,8 @@ libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_skx)
         leal    -112(%rbp), %esi
         leal    -176(%rbp), %edi
         subl    $296, %esp
-        /* Encoding for vmovdqa64 %zmm1, -240(%ebp).  */
-        .byte 0x67
-        .byte 0x62
-        .byte 0xf1
-        .byte 0xfd
-        .byte 0x48
-        .byte 0x7f
-        .byte 0x8d
-        .byte 0x10
-        .byte 0xff
-        .byte 0xff
-        .byte 0xff
-        /* Encoding for vmovdqa64 %zmm2, -304(%ebp).  */
-        .byte 0x67
-        .byte 0x62
-        .byte 0xf1
-        .byte 0xfd
-        .byte 0x48
-        .byte 0x7f
-        .byte 0x95
-        .byte 0xd0
-        .byte 0xfe
-        .byte 0xff
-        .byte 0xff
+        vmovdqa64 %zmm1, -240(%ebp)
+        vmovdqa64 %zmm2, -304(%ebp)
         call    HIDDEN_JUMPTARGET(\callee)
         movl    -240(%ebp), %eax
         vmovss  -176(%ebp), %xmm0
diff --git a/sysdeps/x86_64/fpu/svml_d_sincos8_core.S b/sysdeps/x86_64/fpu/svml_d_sincos8_core.S
index c104539821..cdea30409a 100644
--- a/sysdeps/x86_64/fpu/svml_d_sincos8_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_sincos8_core.S
@@ -35,32 +35,10 @@ END (_ZGVeN8vl8l8_sincos)
         cfi_def_cfa_register (%rbp)
         andq      $-64, %rsp
         subq      $320, %rsp
-        /* Encoding for vmovups %zmm0, 256(%rsp).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x11
-        .byte 0x44
-        .byte 0x24
-        .byte 0x04
+        vmovups    %zmm0, 256(%rsp)
         lea       (%rsp), %rdi
-        /* Encoding for vmovups %zmm1, 128(%rdi).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x11
-        .byte 0x4f
-        .byte 0x02
-        /* Encoding for vmovups %zmm2, 192(%rdi).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x11
-        .byte 0x57
-        .byte 0x03
+        vmovups   %zmm1, 128(%rdi)
+        vmovups   %zmm2, 192(%rdi)
         lea       64(%rsp), %rsi
         call      HIDDEN_JUMPTARGET(\callee)
         vmovdqu   288(%rsp), %ymm0
@@ -142,18 +120,7 @@ END (_ZGVeN8vl8l8_sincos)
         subl    $280, %esp
         vmovdqa %ymm1, -208(%ebp)
         vmovdqa %ymm2, -240(%ebp)
-        /* Encoding for vmovapd %zmm0, -304(%ebp).  */
-        .byte 0x67
-        .byte 0x62
-        .byte 0xf1
-        .byte 0xfd
-        .byte 0x48
-        .byte 0x29
-        .byte 0x85
-        .byte 0xd0
-        .byte 0xfe
-        .byte 0xff
-        .byte 0xff
+        vmovapd %zmm0, -304(%ebp)
         call    HIDDEN_JUMPTARGET(\callee)
         leal    32(%r12), %esi
         vmovupd -272(%ebp), %ymm0
diff --git a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
index 625eb6642b..39336447ab 100644
--- a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
@@ -201,29 +201,14 @@
         cfi_def_cfa_register (%rbp)
         andq      $-64, %rsp
         subq      $128, %rsp
-/* Below is encoding for vmovups %zmm0, (%rsp).  */
-        .byte   0x62
-        .byte   0xf1
-        .byte   0x7c
-        .byte   0x48
-        .byte   0x11
-        .byte   0x04
-        .byte   0x24
+        vmovups   %zmm0, (%rsp)
         vmovupd   (%rsp), %ymm0
         call      HIDDEN_JUMPTARGET(\callee)
         vmovupd   %ymm0, 64(%rsp)
         vmovupd   32(%rsp), %ymm0
         call      HIDDEN_JUMPTARGET(\callee)
         vmovupd   %ymm0, 96(%rsp)
-/* Below is encoding for vmovups 64(%rsp), %zmm0.  */
-        .byte   0x62
-        .byte   0xf1
-        .byte   0x7c
-        .byte   0x48
-        .byte   0x10
-        .byte   0x44
-        .byte   0x24
-        .byte   0x01
+        vmovups   64(%rsp), %zmm0
         movq      %rbp, %rsp
         cfi_def_cfa_register (%rsp)
         popq      %rbp
@@ -241,23 +226,8 @@
         cfi_def_cfa_register (%rbp)
         andq      $-64, %rsp
         subq      $192, %rsp
-/* Below is encoding for vmovups %zmm0, (%rsp).  */
-        .byte   0x62
-        .byte   0xf1
-        .byte   0x7c
-        .byte   0x48
-        .byte   0x11
-        .byte   0x04
-        .byte   0x24
-/* Below is encoding for vmovups %zmm1, 64(%rsp).  */
-        .byte   0x62
-        .byte   0xf1
-        .byte   0x7c
-        .byte   0x48
-        .byte   0x11
-        .byte   0x4c
-        .byte   0x24
-        .byte   0x01
+        vmovups   %zmm0, (%rsp)
+        vmovups   %zmm1, 64(%rsp)
         vmovupd   (%rsp), %ymm0
         vmovupd   64(%rsp), %ymm1
         call      HIDDEN_JUMPTARGET(\callee)
@@ -266,15 +236,7 @@
         vmovupd   96(%rsp), %ymm1
         call      HIDDEN_JUMPTARGET(\callee)
         vmovupd   %ymm0, 160(%rsp)
-/* Below is encoding for vmovups 128(%rsp), %zmm0.  */
-        .byte   0x62
-        .byte   0xf1
-        .byte   0x7c
-        .byte   0x48
-        .byte   0x10
-        .byte   0x44
-        .byte   0x24
-        .byte   0x02
+        vmovups   128(%rsp), %zmm0
         movq      %rbp, %rsp
         cfi_def_cfa_register (%rsp)
         popq      %rbp
@@ -299,14 +261,7 @@
         cfi_rel_offset (%r13, 0)
         subq      $176, %rsp
         movq      %rsi, %r13
-/* Below is encoding for vmovups %zmm0, (%rsp).  */
-        .byte	0x62
-        .byte	0xf1
-        .byte	0x7c
-        .byte	0x48
-        .byte	0x11
-        .byte	0x04
-        .byte	0x24
+        vmovups   %zmm0, (%rsp)
         movq    %rdi, %r12
         vmovupd (%rsp), %ymm0
         call      HIDDEN_JUMPTARGET(\callee)
diff --git a/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S b/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S
index d86c91380e..8ebcebb296 100644
--- a/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S
@@ -35,48 +35,12 @@ END (_ZGVeN16vl4l4_sincosf)
         cfi_def_cfa_register (%rbp)
         andq      $-64, %rsp
         subq      $448, %rsp
-        /* Encoding for vmovups %zmm0, 384(%rsp).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x11
-        .byte 0x44
-        .byte 0x24
-        .byte 0x06
+        vmovups   %zmm0, 384(%rsp)
         lea       (%rsp), %rdi
-        /* Encoding for vmovups %zmm1, 128(%rdi).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x11
-        .byte 0x4f
-        .byte 0x02
-        /* Encoding for vmovups %zmm2, 192(%rdi).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x11
-        .byte 0x57
-        .byte 0x03
-        /* Encoding for vmovups %zmm3, 256(%rdi).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x11
-        .byte 0x5f
-        .byte 0x04
-        /* Encoding for vmovups %zmm4, 320(%rdi).  */
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x11
-        .byte 0x67
-        .byte 0x05
+        vmovups   %zmm1, 128(%rdi)
+        vmovups   %zmm2, 192(%rdi)
+        vmovups   %zmm3, 256(%rdi)
+        vmovups   %zmm4, 320(%rdi)
         lea       64(%rsp), %rsi
         call      HIDDEN_JUMPTARGET(\callee)
         vmovdqu   416(%rsp), %ymm0
@@ -204,42 +168,9 @@ END (_ZGVeN16vl4l4_sincosf)
         .cfi_escape 0x10,0x3,0x2,0x76,0x68
         movq    %rdi, %rbx
         subl    $344, %esp
-        /* Encoding for vmovdqa64 %zmm1, -240(%ebp).  */
-        .byte 0x67
-        .byte 0x62
-        .byte 0xf1
-        .byte 0xfd
-        .byte 0x48
-        .byte 0x7f
-        .byte 0x8d
-        .byte 0x10
-        .byte 0xff
-        .byte 0xff
-        .byte 0xff
-        /* Encoding for vmovdqa64 %zmm2, -304(%ebp).  */
-        .byte 0x67
-        .byte 0x62
-        .byte 0xf1
-        .byte 0xfd
-        .byte 0x48
-        .byte 0x7f
-        .byte 0x95
-        .byte 0xd0
-        .byte 0xfe
-        .byte 0xff
-        .byte 0xff
-        /* Encoding for vmovaps %zmm0, -368(%ebp).  */
-        .byte 0x67
-        .byte 0x62
-        .byte 0xf1
-        .byte 0x7c
-        .byte 0x48
-        .byte 0x29
-        .byte 0x85
-        .byte 0x90
-        .byte 0xfe
-        .byte 0xff
-        .byte 0xff
+        vmovdqa64 %zmm1, -240(%ebp)
+        vmovdqa64 %zmm2, -304(%ebp)
+        vmovaps   %zmm0, -368(%ebp)
         call    HIDDEN_JUMPTARGET(\callee)
         leal    32(%r12), %esi
         vmovups -336(%ebp), %ymm0
diff --git a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
index cd6d58361c..00b86cd377 100644
--- a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
@@ -246,29 +246,14 @@
         cfi_def_cfa_register (%rbp)
         andq      $-64, %rsp
         subq      $128, %rsp
-/* Below is encoding for vmovups %zmm0, (%rsp).  */
-        .byte   0x62
-        .byte   0xf1
-        .byte   0x7c
-        .byte   0x48
-        .byte   0x11
-        .byte   0x04
-        .byte   0x24
+        vmovups   %zmm0, (%rsp)
         vmovupd   (%rsp), %ymm0
         call      HIDDEN_JUMPTARGET(\callee)
         vmovupd   %ymm0, 64(%rsp)
         vmovupd   32(%rsp), %ymm0
         call      HIDDEN_JUMPTARGET(\callee)
         vmovupd   %ymm0, 96(%rsp)
-/* Below is encoding for vmovups 64(%rsp), %zmm0.  */
-        .byte   0x62
-        .byte   0xf1
-        .byte   0x7c
-        .byte   0x48
-        .byte   0x10
-        .byte   0x44
-        .byte   0x24
-        .byte   0x01
+        vmovups   64(%rsp), %zmm0
         movq      %rbp, %rsp
         cfi_def_cfa_register (%rsp)
         popq      %rbp
@@ -286,23 +271,8 @@
         cfi_def_cfa_register (%rbp)
         andq      $-64, %rsp
         subq      $192, %rsp
-/* Below is encoding for vmovups %zmm0, (%rsp).  */
-        .byte   0x62
-        .byte   0xf1
-        .byte   0x7c
-        .byte   0x48
-        .byte   0x11
-        .byte   0x04
-        .byte   0x24
-/* Below is encoding for vmovups %zmm1, 64(%rsp).  */
-        .byte   0x62
-        .byte   0xf1
-        .byte   0x7c
-        .byte   0x48
-        .byte   0x11
-        .byte   0x4c
-        .byte   0x24
-        .byte   0x01
+        vmovups   %zmm0, (%rsp)
+        vmovups   %zmm1, 64(%rsp)
         vmovups   (%rsp), %ymm0
         vmovups   64(%rsp), %ymm1
         call      HIDDEN_JUMPTARGET(\callee)
@@ -311,15 +281,7 @@
         vmovups   96(%rsp), %ymm1
         call      HIDDEN_JUMPTARGET(\callee)
         vmovups   %ymm0, 160(%rsp)
-/* Below is encoding for vmovups 128(%rsp), %zmm0.  */
-        .byte   0x62
-        .byte   0xf1
-        .byte   0x7c
-        .byte   0x48
-        .byte   0x10
-        .byte   0x44
-        .byte   0x24
-        .byte   0x02
+        vmovups   128(%rsp), %zmm0
         movq      %rbp, %rsp
         cfi_def_cfa_register (%rsp)
         popq      %rbp
@@ -340,14 +302,7 @@
         pushq     %r13
         subq      $176, %rsp
         movq      %rsi, %r13
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte	0x62
-        .byte	0xf1
-        .byte	0x7c
-        .byte	0x48
-        .byte	0x29
-        .byte	0x04
-        .byte	0x24
+        vmovaps   %zmm0, (%rsp)
         movq      %rdi, %r12
         vmovaps   (%rsp), %ymm0
         call      HIDDEN_JUMPTARGET(\callee)
-- 
2.13.5


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]