This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH 1/2] powerpc: Zero pad using memset in strncpy/stpncpy


I am ok with this patch, the only nit is it now calls memset for all
padding size, where for small padding (less than 16 or maybe 32) I think it would
be still faster than calling memset (since dcbtst will be used only for
large inputs). However I still not sure if embedding the memset write_LT_32
would yield much gain as well.

On 20-04-2016 17:55, Gabriel F. T. Gomes wrote:
> Call __memset_power8 to pad, with zeros, the remaining bytes in the
> dest string on __strncpy_power8 and __stpncpy_power8.  This improves
> performance when n is larger than the input string, giving ~30% gain for
> larger strings without impacting much shorter strings.
> 
> 2016-04-18  Gabriel F. T. Gomes  <gftg@linux.vnet.ibm.com>
> 
> 	* sysdeps/powerpc/powerpc64/power8/strncpy.S: Call memset to pad the
> 	remaining bytes in the dest string, with zeros.
> ---
>  sysdeps/powerpc/powerpc64/power8/strncpy.S | 123 +++++++++++++----------------
>  1 file changed, 56 insertions(+), 67 deletions(-)
> 
> diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
> index 17c3afb..0bb3bd4 100644
> --- a/sysdeps/powerpc/powerpc64/power8/strncpy.S
> +++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S
> @@ -24,6 +24,8 @@
>  # define FUNC_NAME strncpy
>  #endif
>  
> +#define FRAMESIZE (FRAME_MIN_SIZE+48)
> +
>  /* Implements the function
>  
>     char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> @@ -54,8 +56,7 @@ EALIGN (FUNC_NAME, 4, 0)
>  	addi	r10,r4,16
>  	rlwinm	r9,r4,0,19,19
>  
> -	/* Since it is a leaf function, save some non-volatile registers on the
> -	   protected/red zone.  */
> +	/* Save some non-volatile registers on the stack.  */
>  	std	r26,-48(r1)
>  	std	r27,-40(r1)
>  
> @@ -69,6 +70,14 @@ EALIGN (FUNC_NAME, 4, 0)
>  	std	r30,-16(r1)
>  	std	r31,-8(r1)
>  
> +	/* Update CFI.  */
> +	cfi_offset(r26, -48)
> +	cfi_offset(r27, -40)
> +	cfi_offset(r28, -32)
> +	cfi_offset(r29, -24)
> +	cfi_offset(r30, -16)
> +	cfi_offset(r31, -8)
> +
>  	beq	cr7,L(unaligned_lt_16)
>  	rldicl	r9,r4,0,61
>  	subfic	r8,r9,8
> @@ -180,74 +189,58 @@ L(short_path_loop_end):
>  	ld	r31,-8(r1)
>  	blr
>  
> -	/* This code pads the remainder dest with NULL bytes.  The algorithm
> -	   calculate the remanining size and issues a doubleword unrolled
> -	   loops followed by a byte a byte set.  */
> +	/* This code pads the remainder of dest with NULL bytes.  The algorithm
> +	   calculates the remaining size and calls memset.  */
>  	.align	4
>  L(zero_pad_start):
>  	mr	r5,r10
>  	mr	r9,r6
>  L(zero_pad_start_1):
> -	srdi.	r8,r5,r3
> -	mr	r10,r9
> -#ifdef USE_AS_STPNCPY
> -	mr	r3,r9
> +	/* At this point:
> +	     - r5 holds the number of bytes that still have to be written to
> +	       dest.
> +	     - r9 points to the position, in dest, where the first null byte
> +	       will be written.
> +	   The above statements are true both when control reaches this label
> +	   from a branch or when falling through the previous lines.  */
> +#ifndef USE_AS_STPNCPY
> +	mr	r30,r3       /* Save the return value of strncpy.  */
> +#endif
> +	/* Prepare the call to memset.  */
> +	mr	r3,r9        /* Pointer to the area to be zero-filled.  */
> +	li	r4,0         /* Byte to be written (zero).  */
> +
> +	/* We delayed the creation of the stack frame, as well as the saving of
> +	   the link register, because only at this point, we are sure that
> +	   doing so is actually needed.  */
> +
> +	/* Save the link register.  */
> +	mflr	r0
> +	std	r0,16(r1)
> +	cfi_offset(lr, 16)
> +
> +	/* Create the stack frame.  */
> +	stdu	r1,-FRAMESIZE(r1)
> +	cfi_adjust_cfa_offset(FRAMESIZE)
> +
> +	bl	__memset_power8
> +	nop
> +
> +	/* Restore the stack frame.  */
> +	addi	r1,r1,FRAMESIZE
> +	cfi_adjust_cfa_offset(-FRAMESIZE)
> +	/* Restore the link register.  */
> +	ld	r0,16(r1)
> +	mtlr	r0
> +
> +#ifndef USE_AS_STPNCPY
> +	mr	r3,r30       /* Restore the return value of strncpy, i.e.:
> +				dest.  For stpncpy, the return value is the
> +				same as return value of memset.  */
>  #endif
> -	beq-	cr0,L(zero_pad_loop_b_start)
> -	cmpldi	cr7,r8,1
> -	li	cr7,0
> -	std	r7,0(r9)
> -	beq	cr7,L(zero_pad_loop_b_prepare)
> -	addic.	r8,r8,-2
> -	addi	r10,r9,r16
> -	std	r7,8(r9)
> -	beq	cr0,L(zero_pad_loop_dw_2)
> -	std	r7,16(r9)
> -	li	r9,0
> -	b	L(zero_pad_loop_dw_1)
> -
> -	.align	4
> -L(zero_pad_loop_dw):
> -	addi	r10,r10,16
> -	std	r9,-8(r10)
> -	beq	cr0,L(zero_pad_loop_dw_2)
> -	std	r9,0(r10)
> -L(zero_pad_loop_dw_1):
> -	cmpldi	cr7,r8,1
> -	std	r9,0(r10)
> -	addic.	r8,r8,-2
> -	bne	cr7,L(zero_pad_loop_dw)
> -	addi	r10,r10,8
> -L(zero_pad_loop_dw_2):
> -	rldicl	r5,r5,0,61
> -L(zero_pad_loop_b_start):
> -	cmpdi	cr7,r5,0
> -	addi	r5,r5,-1
> -	addi	r9,r10,-1
> -	add	r10,r10,5
> -	subf	r10,r9,r10
> -	li	r8,0
> -	beq-	cr7,L(short_path_loop_end)
> -
> -	/* Write remaining 1-8 bytes.  */
> -        .align  4
> -	addi	r9,r9,1
> -	mtocrf	0x1,r10
> -	bf	29,4f
> -        stw     r8,0(r9)
> -        addi	r9,r9,4
> -
> -        .align  4
> -4:      bf      30,2f
> -        sth     r8,0(r9)
> -        addi	r9,r9,2
> -
> -        .align  4
> -2:      bf	31,1f
> -        stb	r8,0(r9)
>  
> -	/* Restore non-volatile registers.  */
> -1:	ld	r26,-48(r1)
> +	/* Restore non-volatile registers and return.  */
> +	ld	r26,-48(r1)
>  	ld	r27,-40(r1)
>  	ld	r28,-32(r1)
>  	ld	r29,-24(r1)
> @@ -443,10 +436,6 @@ L(short_path_prepare_2_3):
>  	mr	r4,r28
>  	mr	r9,r29
>  	b	L(short_path_2)
> -L(zero_pad_loop_b_prepare):
> -	addi	r10,r9,8
> -	rldicl	r5,r5,0,61
> -	b	L(zero_pad_loop_b_start)
>  L(zero_pad_start_prepare_1):
>  	mr	r5,r6
>  	mr	r9,r8
> 


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]