This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH 4/4] S390: Implement mempcpy with help of memcpy. [BZ #19765]



On 26/04/2016 09:07, Stefan Liebler wrote:
> There exist optimized memcpy functions on s390, but no optimized mempcpy.
> This patch adds mempcpy entry points in memcpy.S files, which
> use the memcpy implementation. Now mempcpy itself is also an IFUNC function
> as memcpy is and the variants are listed in ifunc-impl-list.c.
> 
> The s390 string.h defines _HAVE_STRING_ARCH_mempcpy and __mempcpy/mempcpy
> definitions. If n is constant and small enough, __mempcpy/mempcpy is an inlined
> function, which uses memcpy and returns dest + n. In this constant case,
> GCC emits instructions like mvi or mvc and avoids the function call to memcpy.
> If n is not constant, then __mempcpy is called.
> 

AFAIk GLIBC does not have any ports that implements mempcpy and not
memcpy and IMHO the inline version to convert mempcpy to memcpy is 
always a gain.

So I would suggest to just avoid adding arch-specific bits and move the
mempcpy inline optimization at string/string.h to string/string2.h as

--

#define mempcpy(dest, src, n) __mempcpy_inline (dest, src, n)
#define __mempcpy(dest, src, n) __mempcpy_inline (dest, src, n)

__extern_always_inline void *
__mempcpy_inline (void *__restrict __dest,
                  const void *__restrict __src, size_t __n)
{
  return (char *) memcpy (__dest, __src, __n) + __n;
}

--

This is convert any mempcpy to mempcy plus length and provide
the symbol only for older binaries.

> ChangeLog:
> 
> 	[BZ #19765]
> 	* sysdeps/s390/bits/string.h (mempcpy) Redirect to
> 	__mempcpy_inline_memcpy or __mempcpy_inline_mempcpy.
> 	(__mempcpy) Likewise.
> 	(__mempcpy_inline_mempcpy): New inline function.
> 	(__mempcpy_inline_memcpy): Likewise.
> 	(_HAVE_STRING_ARCH_mempcpy): New define.
> 	* sysdeps/s390/mempcpy.S: New File.
> 	* sysdeps/s390/multiarch/mempcpy.c: Likewise.
> 	* sysdeps/s390/multiarch/Makefile (sysdep_routines): Add mempcpy.
> 	* sysdeps/s390/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list):
> 	Add mempcpy variants.
> 	* sysdeps/s390/s390-32/memcpy.S: Add mempcpy entry point.
> 	(memcpy): Adjust to be usable from mempcpy entry point.
> 	(__memcpy_mvcle): Likewise.
> 	* sysdeps/s390/s390-64/memcpy.S: Likewise.
> 	* sysdeps/s390/s390-32/multiarch/memcpy-s390.S: Add entry points
> 	____mempcpy_z196, ____mempcpy_z10 and add __GI_ symbols for mempcpy.
> 	(__memcpy_z196): Adjust to be usable from mempcpy entry point.
> 	(__memcpy_z10): Likewise.
> 	* sysdeps/s390/s390-64/multiarch/memcpy-s390x.S: Likewise.
> ---
>  sysdeps/s390/bits/string.h                    | 36 +++++++++++++++++++
>  sysdeps/s390/mempcpy.S                        | 19 ++++++++++
>  sysdeps/s390/multiarch/Makefile               |  3 +-
>  sysdeps/s390/multiarch/ifunc-impl-list.c      |  7 ++++
>  sysdeps/s390/multiarch/mempcpy.c              | 26 ++++++++++++++
>  sysdeps/s390/s390-32/memcpy.S                 | 50 ++++++++++++++++-----------
>  sysdeps/s390/s390-32/multiarch/memcpy-s390.S  | 31 +++++++++++++++--
>  sysdeps/s390/s390-64/memcpy.S                 | 47 ++++++++++++++-----------
>  sysdeps/s390/s390-64/multiarch/memcpy-s390x.S | 29 ++++++++++++++--
>  9 files changed, 203 insertions(+), 45 deletions(-)
>  create mode 100644 sysdeps/s390/mempcpy.S
>  create mode 100644 sysdeps/s390/multiarch/mempcpy.c
> 
> diff --git a/sysdeps/s390/bits/string.h b/sysdeps/s390/bits/string.h
> index 39e0b7f..eae3c8e 100644
> --- a/sysdeps/s390/bits/string.h
> +++ b/sysdeps/s390/bits/string.h
> @@ -24,6 +24,42 @@
>  /* Use the unaligned string inline ABI.  */
>  #define _STRING_INLINE_unaligned 1
>  
> +#if defined __USE_GNU && defined __OPTIMIZE__ \
> +    && defined __extern_always_inline && __GNUC_PREREQ (3,2)
> +# if !defined _FORCE_INLINES && !defined _HAVE_STRING_ARCH_mempcpy
> +
> +/* Don't inline mempcpy into memcpy as s390 has an optimized mempcpy.  */
> +#  define _HAVE_STRING_ARCH_mempcpy 1
> +
> +__extern_always_inline void *
> +__mempcpy_inline_memcpy (void *__restrict __dest,
> +			 const void *__restrict __src, size_t __n)
> +{
> +  return (char *) memcpy (__dest, __src, __n) + __n;
> +}
> +
> +__extern_always_inline void *
> +__mempcpy_inline_mempcpy (void *__restrict __dest,
> +			 const void *__restrict __src, size_t __n)
> +{
> +  return __mempcpy (__dest, __src, __n);
> +}
> +
> +/* If n is constant and small enough, __mempcpy/mempcpy is an inlined function,
> +   which uses memcpy and returns dest + n. In this constant case, GCC emits
> +   instructions like mvi or mvc and avoids the function call to memcpy.
> +   If n is not constant, then __mempcpy is called.  */
> +#  undef mempcpy
> +#  undef __mempcpy
> +#  define __mempcpy(dest, src, n)					\
> +  (__extension__ (__builtin_constant_p (n) && n <= 65536		\
> +		  ? __mempcpy_inline_memcpy (dest, src, n)		\
> +		  : __mempcpy_inline_mempcpy (dest, src, n)))
> +#  define mempcpy(dest, src, n) __mempcpy (dest, src, n)
> +
> +# endif
> +#endif
> +
>  /* We only provide optimizations if the user selects them and if
>     GNU CC is used.  */
>  #if !defined __NO_STRING_INLINES && defined __USE_STRING_INLINES \
> diff --git a/sysdeps/s390/mempcpy.S b/sysdeps/s390/mempcpy.S
> new file mode 100644
> index 0000000..c62074b
> --- /dev/null
> +++ b/sysdeps/s390/mempcpy.S
> @@ -0,0 +1,19 @@
> +/* CPU specific mempcpy without multiarch - 32/64 bit S/390 version.
> +   Copyright (C) 2016 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +/* mempcpy is implemented in memcpy.S.  */
> diff --git a/sysdeps/s390/multiarch/Makefile b/sysdeps/s390/multiarch/Makefile
> index 0805b07..89324ca 100644
> --- a/sysdeps/s390/multiarch/Makefile
> +++ b/sysdeps/s390/multiarch/Makefile
> @@ -18,7 +18,8 @@ sysdep_routines += strlen strlen-vx strlen-c \
>  		   memchr memchr-vx \
>  		   rawmemchr rawmemchr-vx rawmemchr-c \
>  		   memccpy memccpy-vx memccpy-c \
> -		   memrchr memrchr-vx memrchr-c
> +		   memrchr memrchr-vx memrchr-c \
> +		   mempcpy
>  endif
>  
>  ifeq ($(subdir),wcsmbs)
> diff --git a/sysdeps/s390/multiarch/ifunc-impl-list.c b/sysdeps/s390/multiarch/ifunc-impl-list.c
> index 62a4359..89898ec 100644
> --- a/sysdeps/s390/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/s390/multiarch/ifunc-impl-list.c
> @@ -69,6 +69,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  			      S390_IS_Z10 (stfle_bits), __memcpy_z10)
>  	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_default))
>  
> +  IFUNC_IMPL (i, name, mempcpy,
> +	      IFUNC_IMPL_ADD (array, i, mempcpy,
> +			      S390_IS_Z196 (stfle_bits), ____mempcpy_z196)
> +	      IFUNC_IMPL_ADD (array, i, mempcpy,
> +			      S390_IS_Z10 (stfle_bits), ____mempcpy_z10)
> +	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, ____mempcpy_default))
> +
>  #endif /* SHARED */
>  
>  #ifdef HAVE_S390_VX_ASM_SUPPORT
> diff --git a/sysdeps/s390/multiarch/mempcpy.c b/sysdeps/s390/multiarch/mempcpy.c
> new file mode 100644
> index 0000000..34d8329
> --- /dev/null
> +++ b/sysdeps/s390/multiarch/mempcpy.c
> @@ -0,0 +1,26 @@
> +/* Multiple versions of mempcpy.
> +   Copyright (C) 2016 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +
> +#if defined SHARED && IS_IN (libc)
> +# include <ifunc-resolve.h>
> +s390_libc_ifunc (__mempcpy)
> +
> +__asm__ (".weak mempcpy\n\t"
> +	 ".set mempcpy,__mempcpy\n\t");
> +#endif
> diff --git a/sysdeps/s390/s390-32/memcpy.S b/sysdeps/s390/s390-32/memcpy.S
> index 2ac51ab..6be5104 100644
> --- a/sysdeps/s390/s390-32/memcpy.S
> +++ b/sysdeps/s390/s390-32/memcpy.S
> @@ -25,12 +25,23 @@
>       %r3 = address of source memory area
>       %r4 = number of bytes to copy.  */
>  
> -#ifdef USE_MULTIARCH
> -ENTRY(__memcpy_default)
> -#else
> -ENTRY(memcpy)
> +       .text
> +ENTRY(__mempcpy)
> +	.machine "g5"
> +	lr      %r1,%r2             # Use as dest
> +	la      %r2,0(%r4,%r2)      # Return dest + n
> +	j	.L_G5_start
> +END(__mempcpy)
> +#ifndef USE_MULTIARCH
> +libc_hidden_def (__mempcpy)
> +weak_alias (__mempcpy, mempcpy)
> +libc_hidden_builtin_def (mempcpy)
>  #endif
> +
> +ENTRY(memcpy)
>  	.machine "g5"
> +	lr      %r1,%r2             # r1: Use as dest ; r2: Return dest
> +.L_G5_start:
>  	st      %r13,52(%r15)
>  	.cfi_offset 13, -44
>  	basr    %r13,0
> @@ -41,14 +52,13 @@ ENTRY(memcpy)
>  	lr      %r5,%r4
>  	srl     %r5,8
>  	ltr     %r5,%r5
> -	lr      %r1,%r2
>  	jne     .L_G5_13
>  	ex      %r4,.L_G5_17-.L_G5_16(%r13)
>  .L_G5_4:
>  	l       %r13,52(%r15)
>  	br      %r14
>  .L_G5_13:
> -	chi	%r5,4096             # Switch to mvcle for copies >1MB
> +	chi	%r5,4096            # Switch to mvcle for copies >1MB
>  	jh	__memcpy_mvcle
>  .L_G5_12:
>  	mvc     0(256,%r1),0(%r3)
> @@ -59,24 +69,24 @@ ENTRY(memcpy)
>  	j       .L_G5_4
>  .L_G5_17:
>  	mvc     0(1,%r1),0(%r3)
> -#ifdef USE_MULTIARCH
> -END(__memcpy_default)
> -#else
>  END(memcpy)
> +#ifndef USE_MULTIARCH
>  libc_hidden_builtin_def (memcpy)
>  #endif
>  
>  ENTRY(__memcpy_mvcle)
> -       # Using as standalone function will result in unexpected
> -       # results since the length field is incremented by 1 in order to
> -       # compensate the changes already done in the functions above.
> -       ahi     %r4,1               # length + 1
> -       lr      %r5,%r4             # source length
> -       lr      %r4,%r3             # source address
> -       lr      %r3,%r5             # destination length = source length
> +	# Using as standalone function will result in unexpected
> +	# results since the length field is incremented by 1 in order to
> +	# compensate the changes already done in the functions above.
> +	lr      %r0,%r2             # backup return dest [ + n ]
> +	ahi     %r4,1               # length + 1
> +	lr      %r5,%r4             # source length
> +	lr      %r4,%r3             # source address
> +	lr      %r2,%r1             # destination address
> +	lr      %r3,%r5             # destination length = source length
>  .L_MVCLE_1:
> -       mvcle   %r2,%r4,0           # thats it, MVCLE is your friend
> -       jo      .L_MVCLE_1
> -       lr      %r2,%r1             # return destination address
> -       br      %r14
> +	mvcle   %r2,%r4,0           # thats it, MVCLE is your friend
> +	jo      .L_MVCLE_1
> +	lr      %r2,%r0             # return destination address
> +	br      %r14
>  END(__memcpy_mvcle)
> diff --git a/sysdeps/s390/s390-32/multiarch/memcpy-s390.S b/sysdeps/s390/s390-32/multiarch/memcpy-s390.S
> index 92ffaea..297a894 100644
> --- a/sysdeps/s390/s390-32/multiarch/memcpy-s390.S
> +++ b/sysdeps/s390/s390-32/multiarch/memcpy-s390.S
> @@ -29,14 +29,23 @@
>  
>  #if defined SHARED && IS_IN (libc)
>  
> +ENTRY(____mempcpy_z196)
> +	.machine "z196"
> +	.machinemode "zarch_nohighgprs"
> +	lr      %r1,%r2         # Use as dest
> +	la      %r2,0(%r4,%r2)  # Return dest + n
> +	j	.L_Z196_start
> +END(____mempcpy_z196)
> +
>  ENTRY(__memcpy_z196)
>  	.machine "z196"
>  	.machinemode "zarch_nohighgprs"
> +	lr      %r1,%r2         # r1: Use as dest ; r2: Return dest
> +.L_Z196_start:
>  	llgfr   %r4,%r4
>  	ltgr    %r4,%r4
>  	je      .L_Z196_4
>  	aghi    %r4,-1
> -	lr      %r1,%r2
>  	srlg    %r5,%r4,8
>  	ltgr    %r5,%r5
>  	jne     .L_Z196_5
> @@ -60,13 +69,22 @@ ENTRY(__memcpy_z196)
>  	mvc     0(1,%r1),0(%r3)
>  END(__memcpy_z196)
>  
> +ENTRY(____mempcpy_z10)
> +	.machine "z10"
> +	.machinemode "zarch_nohighgprs"
> +	lr      %r1,%r2         # Use as dest
> +	la      %r2,0(%r4,%r2)  # Return dest + n
> +	j	.L_Z10_start
> +END(____mempcpy_z10)
> +
>  ENTRY(__memcpy_z10)
>  	.machine "z10"
>  	.machinemode "zarch_nohighgprs"
> +	lr      %r1,%r2         # r1: Use as dest ; r2: Return dest
> +.L_Z10_start:
>  	llgfr   %r4,%r4
>  	cgije   %r4,0,.L_Z10_4
>  	aghi    %r4,-1
> -	lr      %r1,%r2
>  	srlg    %r5,%r4,8
>  	cgijlh  %r5,0,.L_Z10_13
>  .L_Z10_3:
> @@ -88,14 +106,23 @@ ENTRY(__memcpy_z10)
>  	mvc     0(1,%r1),0(%r3)
>  END(__memcpy_z10)
>  
> +# define __mempcpy ____mempcpy_default
>  #endif /* SHARED && IS_IN (libc) */
>  
> +#define memcpy __memcpy_default
>  #include "../memcpy.S"
> +#undef memcpy
>  
>  #if defined SHARED && IS_IN (libc)
>  .globl   __GI_memcpy
>  .set     __GI_memcpy,__memcpy_default
> +.globl   __GI_mempcpy
> +.set     __GI_mempcpy,____mempcpy_default
> +.globl   __GI___mempcpy
> +.set     __GI___mempcpy,____mempcpy_default
>  #else
>  .globl   memcpy
>  .set     memcpy,__memcpy_default
> +.weak    mempcpy
> +.set     mempcpy,__mempcpy
>  #endif
> diff --git a/sysdeps/s390/s390-64/memcpy.S b/sysdeps/s390/s390-64/memcpy.S
> index 9d60a14..e3ace0c 100644
> --- a/sysdeps/s390/s390-64/memcpy.S
> +++ b/sysdeps/s390/s390-64/memcpy.S
> @@ -27,19 +27,27 @@
>  
>  
>         .text
> +ENTRY(__mempcpy)
> +	.machine "z900"
> +	lgr     %r1,%r2             # Use as dest
> +	la      %r2,0(%r4,%r2)      # Return dest + n
> +	j	.L_Z900_start
> +END(__mempcpy)
> +#ifndef USE_MULTIARCH
> +libc_hidden_def (__mempcpy)
> +weak_alias (__mempcpy, mempcpy)
> +libc_hidden_builtin_def (mempcpy)
> +#endif
>  
> -#ifdef USE_MULTIARCH
> -ENTRY(__memcpy_default)
> -#else
>  ENTRY(memcpy)
> -#endif
>  	.machine "z900"
> +	lgr     %r1,%r2             # r1: Use as dest ; r2: Return dest
> +.L_Z900_start:
>  	ltgr    %r4,%r4
>  	je      .L_Z900_4
>  	aghi    %r4,-1
>  	srlg    %r5,%r4,8
>  	ltgr    %r5,%r5
> -	lgr     %r1,%r2
>  	jne     .L_Z900_13
>  .L_Z900_3:
>  	larl    %r5,.L_Z900_15
> @@ -57,25 +65,24 @@ ENTRY(memcpy)
>  	j       .L_Z900_3
>  .L_Z900_15:
>  	mvc     0(1,%r1),0(%r3)
> -
> -#ifdef USE_MULTIARCH
> -END(__memcpy_default)
> -#else
>  END(memcpy)
> +#ifndef USE_MULTIARCH
>  libc_hidden_builtin_def (memcpy)
>  #endif
>  
>  ENTRY(__memcpy_mvcle)
> -       # Using as standalone function will result in unexpected
> -       # results since the length field is incremented by 1 in order to
> -       # compensate the changes already done in the functions above.
> -       aghi    %r4,1               # length + 1
> -       lgr     %r5,%r4             # source length
> -       lgr     %r4,%r3             # source address
> -       lgr     %r3,%r5             # destination length = source length
> +	# Using as standalone function will result in unexpected
> +	# results since the length field is incremented by 1 in order to
> +	# compensate the changes already done in the functions above.
> +	lgr     %r0,%r2             # backup return dest [ + n ]
> +	aghi    %r4,1               # length + 1
> +	lgr     %r5,%r4             # source length
> +	lgr     %r4,%r3             # source address
> +	lgr     %r2,%r1             # destination address
> +	lgr     %r3,%r5             # destination length = source length
>  .L_MVCLE_1:
> -       mvcle   %r2,%r4,0           # thats it, MVCLE is your friend
> -       jo      .L_MVCLE_1
> -       lgr     %r2,%r1             # return destination address
> -       br      %r14
> +	mvcle   %r2,%r4,0           # thats it, MVCLE is your friend
> +	jo      .L_MVCLE_1
> +	lgr     %r2,%r0             # return destination address
> +	br      %r14
>  END(__memcpy_mvcle)
> diff --git a/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S b/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S
> index 8f54526..0f5a36e 100644
> --- a/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S
> +++ b/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S
> @@ -29,12 +29,20 @@
>  
>  #if defined SHARED && IS_IN (libc)
>  
> +ENTRY(____mempcpy_z196)
> +	.machine "z196"
> +	lgr     %r1,%r2         # Use as dest
> +	la      %r2,0(%r4,%r2)  # Return dest + n
> +	j	.L_Z196_start
> +END(____mempcpy_z196)
> +
>  ENTRY(__memcpy_z196)
>  	.machine "z196"
> +	lgr     %r1,%r2         # r1: Use as dest ; r2: Return dest
> +.L_Z196_start:
>  	ltgr    %r4,%r4
>  	je      .L_Z196_4
>  	aghi    %r4,-1
> -	lgr     %r1,%r2
>  	srlg    %r5,%r4,8
>  	ltgr    %r5,%r5
>  	jne     .L_Z196_5
> @@ -58,11 +66,19 @@ ENTRY(__memcpy_z196)
>  	mvc     0(1,%r1),0(%r3)
>  END(__memcpy_z196)
>  
> +ENTRY(____mempcpy_z10)
> +	.machine "z10"
> +	lgr     %r1,%r2         # Use as dest
> +	la      %r2,0(%r4,%r2)  # Return dest + n
> +	j	.L_Z10_start
> +END(____mempcpy_z10)
> +
>  ENTRY(__memcpy_z10)
>  	.machine "z10"
> +	lgr     %r1,%r2         # r1: Use as dest ; r2: Return dest
> +.L_Z10_start:
>  	cgije   %r4,0,.L_Z10_4
>  	aghi    %r4,-1
> -	lgr     %r1,%r2
>  	srlg    %r5,%r4,8
>  	cgijlh  %r5,0,.L_Z10_13
>  .L_Z10_3:
> @@ -84,14 +100,23 @@ ENTRY(__memcpy_z10)
>  	mvc     0(1,%r1),0(%r3)
>  END(__memcpy_z10)
>  
> +# define __mempcpy ____mempcpy_default
>  #endif /* SHARED && IS_IN (libc) */
>  
> +#define memcpy __memcpy_default
>  #include "../memcpy.S"
> +#undef memcpy
>  
>  #if defined SHARED && IS_IN (libc)
>  .globl   __GI_memcpy
>  .set     __GI_memcpy,__memcpy_default
> +.globl   __GI_mempcpy
> +.set     __GI_mempcpy,____mempcpy_default
> +.globl   __GI___mempcpy
> +.set     __GI___mempcpy,____mempcpy_default
>  #else
>  .globl   memcpy
>  .set     memcpy,__memcpy_default
> +.weak    mempcpy
> +.set     mempcpy,__mempcpy
>  #endif
> 


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]