This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH] x86_64: Remove redundant REX bytes from memrchr.S
- From: "H.J. Lu" <hjl dot tools at gmail dot com>
- To: GNU C Library <libc-alpha at sourceware dot org>
- Date: Sun, 4 Jun 2017 08:58:34 -0700
- Subject: Re: [PATCH] x86_64: Remove redundant REX bytes from memrchr.S
- Authentication-results: sourceware.org; auth=none
- References: <20170530200441.GA8999@lucon.org>
On Tue, May 30, 2017 at 1:04 PM, H.J. Lu <hongjiu.lu@intel.com> wrote:
> By x86-64 specification, 32-bit destination registers are zero-extended
> to 64 bits. There is no need to use 64-bit registers when only the lower
> 32 bits are non-zero. Also 2 instructions in:
>
> mov %rdi, %rcx
> and $15, %rcx
> jz L(length_less16_offset0)
>
> mov %rdi, %rcx <<< redundant
> and $15, %rcx <<< redundant
>
> are redundant.
>
> Any comments?
I will check it in next week.
> H.J.
> --
> * sysdeps/x86_64/memrchr.S (__memrchr): Use 32-bit registers for
> the lower 32 bits. Remove redundant instructions.
> ---
> sysdeps/x86_64/memrchr.S | 36 +++++++++++++++++-------------------
> 1 file changed, 17 insertions(+), 19 deletions(-)
>
> diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
> index aab1a4a..5fa0fe9 100644
> --- a/sysdeps/x86_64/memrchr.S
> +++ b/sysdeps/x86_64/memrchr.S
> @@ -22,7 +22,7 @@
>
> .text
> ENTRY (__memrchr)
> - movd %rsi, %xmm1
> + movd %esi, %xmm1
>
> sub $16, %rdx
> jbe L(length_less16)
> @@ -42,8 +42,8 @@ ENTRY (__memrchr)
> jnz L(matches0)
>
> sub $64, %rdi
> - mov %rdi, %rcx
> - and $15, %rcx
> + mov %edi, %ecx
> + and $15, %ecx
> jz L(loop_prolog)
>
> add $16, %rdi
> @@ -108,8 +108,8 @@ L(loop_prolog):
> test %eax, %eax
> jnz L(matches0)
>
> - mov %rdi, %rcx
> - and $63, %rcx
> + mov %edi, %ecx
> + and $63, %ecx
> jz L(align64_loop)
>
> add $64, %rdi
> @@ -166,8 +166,8 @@ L(align64_loop):
>
> .p2align 4
> L(exit_loop):
> - add $64, %rdx
> - cmp $32, %rdx
> + add $64, %edx
> + cmp $32, %edx
> jbe L(exit_loop_32)
>
> movdqa 48(%rdi), %xmm0
> @@ -187,7 +187,7 @@ L(exit_loop):
> pmovmskb %xmm3, %eax
> test %eax, %eax
> jnz L(matches16_1)
> - cmp $48, %rdx
> + cmp $48, %edx
> jbe L(return_null)
>
> pcmpeqb (%rdi), %xmm1
> @@ -204,7 +204,7 @@ L(exit_loop_32):
> pmovmskb %xmm0, %eax
> test %eax, %eax
> jnz L(matches48_1)
> - cmp $16, %rdx
> + cmp $16, %edx
> jbe L(return_null)
>
> pcmpeqb 32(%rdi), %xmm1
> @@ -276,7 +276,7 @@ L(matches48_1):
>
> .p2align 4
> L(return_null):
> - xor %rax, %rax
> + xor %eax, %eax
> ret
>
> .p2align 4
> @@ -306,18 +306,16 @@ L(length_less16):
> punpcklbw %xmm1, %xmm1
> punpcklbw %xmm1, %xmm1
>
> - add $16, %rdx
> + add $16, %edx
>
> pshufd $0, %xmm1, %xmm1
>
> - mov %rdi, %rcx
> - and $15, %rcx
> + mov %edi, %ecx
> + and $15, %ecx
> jz L(length_less16_offset0)
>
> - mov %rdi, %rcx
> - and $15, %rcx
> mov %cl, %dh
> - mov %rcx, %r8
> + mov %ecx, %esi
> add %dl, %dh
> and $-16, %rdi
>
> @@ -340,7 +338,7 @@ L(length_less16):
>
> bsr %eax, %eax
> add %rdi, %rax
> - add %r8, %rax
> + add %rsi, %rax
> ret
>
> .p2align 4
> @@ -362,14 +360,14 @@ L(length_less16_part2):
> pcmpeqb (%rdi), %xmm1
> pmovmskb %xmm1, %eax
>
> - mov %r8, %rcx
> + mov %esi, %ecx
> sar %cl, %eax
> test %eax, %eax
> jz L(return_null)
>
> bsr %eax, %eax
> add %rdi, %rax
> - add %r8, %rax
> + add %rsi, %rax
> ret
>
> .p2align 4
> --
> 2.9.4
>
--
H.J.