This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PING][PATCHv3 2/2] aarch64: Reduce zva prologue to 64 bytes to reduce one instruction


Ping!

On Thursday 05 October 2017 10:46 PM, Siddhesh Poyarekar wrote:
> The current zva copy of 64 bytes has a prologue and epilogue of 128
> bytes, which is quite suboptimal for falkor as well as mustang (the
> two arm machines I have access to).  Dropping it to 64 bytes, which is
> the minimum alignment required for 64 byte zva to work correctly,
> results in a decent gain in performance for falkor as well as mustang.
> For falkor the reduction in timings against generic memset goes up to
> about 54% in the best case (i.e. over twice as fast), compared to 40%
> in the previous case.
> 
> Function: memset
> Variant: walk
>                                     simple_memset	__memset_nozva	__memset_zva_64	__memset_generic
> ========================================================================================================================
>                   length=256, char=0:     35933.10 (-705.25%)	     2458.49 ( 44.91%)	     2052.48 ( 54.00%)	     4462.35
>                   length=257, char=0:     36241.40 (-709.07%)	     2449.31 ( 45.32%)	     2088.25 ( 53.38%)	     4479.40
>                   length=258, char=0:     36506.50 (-707.37%)	     2533.37 ( 43.97%)	     2073.68 ( 54.14%)	     4521.66
>                   length=259, char=0:     36766.80 (-710.04%)	     2573.23 ( 43.31%)	     2088.39 ( 53.99%)	     4538.88
>                   length=260, char=0:     36958.30 (-711.39%)	     2638.15 ( 42.08%)	     2098.50 ( 53.93%)	     4554.95
>                   length=261, char=0:     37384.90 (-717.06%)	     2635.56 ( 42.40%)	     2112.04 ( 53.84%)	     4575.56
>                   length=262, char=0:     37582.20 (-720.77%)	     2661.22 ( 41.88%)	     2126.52 ( 53.56%)	     4578.89
>                   length=263, char=0:     37893.80 (-724.00%)	     2705.17 ( 41.18%)	     2155.46 ( 53.13%)	     4598.76
>                   length=264, char=0:     38126.30 (-720.58%)	     2748.52 ( 40.84%)	     2148.82 ( 53.75%)	     4646.25
>                   length=265, char=0:     38430.50 (-725.42%)	     2789.58 ( 40.08%)	     2159.59 ( 53.62%)	     4655.89
>                   length=266, char=0:     38699.40 (-728.77%)	     2850.63 ( 38.95%)	     2170.97 ( 53.51%)	     4669.48
>                   length=267, char=0:     38956.10 (-733.21%)	     2896.94 ( 38.04%)	     2182.96 ( 53.31%)	     4675.43
>                   length=268, char=0:     39198.60 (-734.03%)	     2965.66 ( 36.90%)	     2226.63 ( 52.62%)	     4699.92
>                   length=269, char=0:     39542.90 (-738.41%)	     3069.46 ( 34.92%)	     2229.59 ( 52.73%)	     4716.43
>                   length=270, char=0:     39822.00 (-737.24%)	     3071.14 ( 35.43%)	     2221.98 ( 53.28%)	     4756.34
>                   length=271, char=0:     40095.60 (-739.78%)	     3116.24 ( 34.73%)	     2234.95 ( 53.19%)	     4774.54
>                   length=512, char=0:    137512.00 (-1279.92%)	     8993.82 (  9.75%)	     6227.36 ( 37.51%)	     9965.19
>                   length=513, char=0:    138047.00 (-1282.33%)	     9076.86 (  9.11%)	     6187.04 ( 38.05%)	     9986.55
>                   length=514, char=0:    138744.00 (-1275.79%)	     9174.97 (  9.02%)	     6207.71 ( 38.44%)	    10084.70
>                   length=515, char=0:    139094.00 (-1276.45%)	     9249.32 (  8.47%)	     6225.89 ( 38.39%)	    10105.30
>                   length=516, char=0:    139488.00 (-1286.59%)	     9387.34 (  6.68%)	     6248.57 ( 37.89%)	    10059.80
>                   length=517, char=0:    140146.00 (-1292.00%)	     9406.52 (  6.61%)	     6253.62 ( 37.91%)	    10072.00
>                   length=518, char=0:    140728.00 (-1284.75%)	     9495.81 (  6.56%)	     6274.71 ( 38.26%)	    10162.70
>                   length=519, char=0:    141207.00 (-1294.93%)	     9570.67 (  5.46%)	     6358.77 ( 37.18%)	    10122.90
>                   length=520, char=0:    141825.00 (-1298.09%)	     9800.91 (  3.38%)	     6316.39 ( 37.73%)	    10144.20
>                   length=521, char=0:    142328.00 (-1299.56%)	     9870.21 (  2.94%)	     6336.67 ( 37.69%)	    10169.50
>                   length=522, char=0:    142876.00 (-1291.83%)	     9936.09 (  3.21%)	     6357.73 ( 38.07%)	    10265.30
>                   length=523, char=0:    143333.00 (-1302.71%)	     9946.90 (  2.66%)	     6447.33 ( 36.90%)	    10218.30
>                   length=524, char=0:    143793.00 (-1304.24%)	    10093.50 (  1.43%)	     6403.34 ( 37.47%)	    10239.90
>                   length=525, char=0:    144453.00 (-1298.91%)	    10122.80 (  1.97%)	     6418.25 ( 37.84%)	    10326.10
>                   length=526, char=0:    145077.00 (-1299.05%)	    10208.90 (  1.55%)	     6440.72 ( 37.89%)	    10369.70
>                   length=527, char=0:    145490.00 (-1310.72%)	    10346.70 ( -0.32%)	     6461.50 ( 37.35%)	    10313.20
>                  length=1024, char=0:    537455.00 (-2112.45%)	    34630.10 (-42.56%)	    20549.10 ( 15.41%)	    24292.30
>                  length=1025, char=0:    538527.00 (-2114.96%)	    34632.10 (-42.44%)	    20584.50 ( 15.34%)	    24313.20
>                  length=1026, char=0:    539582.00 (-2115.53%)	    34993.10 (-43.68%)	    20620.10 ( 15.33%)	    24354.50
>                  length=1027, char=0:    540599.00 (-2115.10%)	    35129.10 (-43.94%)	    20644.70 ( 15.41%)	    24405.20
>                  length=1028, char=0:    541682.00 (-2117.35%)	    35523.70 (-45.41%)	    20685.00 ( 15.33%)	    24429.30
>                  length=1029, char=0:    542909.00 (-2119.46%)	    35735.10 (-46.09%)	    20741.20 ( 15.21%)	    24461.30
>                  length=1030, char=0:    543762.00 (-2119.68%)	    35845.00 (-46.32%)	    20760.60 ( 15.25%)	    24497.30
>                  length=1031, char=0:    544860.00 (-2119.82%)	    36079.80 (-46.99%)	    20798.00 ( 15.27%)	    24545.20
>                  length=1032, char=0:    545893.00 (-2120.69%)	    36166.20 (-47.12%)	    20829.20 ( 15.27%)	    24582.10
>                  length=1033, char=0:    546847.00 (-2121.45%)	    36366.10 (-47.73%)	    20877.20 ( 15.19%)	    24616.70
>                  length=1034, char=0:    548095.00 (-2124.11%)	    36520.50 (-48.20%)	    20911.00 ( 15.15%)	    24643.30
>                  length=1035, char=0:    549395.00 (-2125.90%)	    36769.10 (-48.97%)	    20940.70 ( 15.16%)	    24681.90
>                  length=1036, char=0:    549914.00 (-2124.01%)	    36847.90 (-49.02%)	    20979.40 ( 15.15%)	    24726.20
>                  length=1037, char=0:    551102.00 (-2126.26%)	    37040.50 (-49.63%)	    21018.10 ( 15.09%)	    24754.60
>                  length=1038, char=0:    552181.00 (-2127.43%)	    37019.80 (-49.33%)	    21052.80 ( 15.08%)	    24790.10
>                  length=1039, char=0:    553138.00 (-2127.14%)	    37184.30 (-49.72%)	    21086.90 ( 15.10%)	    24836.20
> 
> 	* sysdeps/aarch64/memset.S (do_zva_64): Set 64 bytes in
> 	prologue and epilogue instead of 128 bytes.
> ---
>  sysdeps/aarch64/memset.S | 16 +++++-----------
>  1 file changed, 5 insertions(+), 11 deletions(-)
> 
> diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
> index 9fea4c2..78ead60 100644
> --- a/sysdeps/aarch64/memset.S
> +++ b/sysdeps/aarch64/memset.S
> @@ -60,20 +60,14 @@
>  	str	q0, [dst, 16]
>  	stp	q0, q0, [dst, 32]
>  	bic	dst, dst, 63
> -	stp	q0, q0, [dst, 64]
> -	stp	q0, q0, [dst, 96]
> -	sub	count, dstend, dst	/* Count is now 128 too large.	*/
> -	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
> -	add	dst, dst, 128
> -	nop
> +	add	dst, dst, 64
> +	sub	dstend, dstend, 64
>  1:	dc	zva, dst
>  	add	dst, dst, 64
> -	subs	count, count, 64
> +	cmp	dstend, dst
>  	b.hi	1b
> -	stp	q0, q0, [dst, 0]
> -	stp	q0, q0, [dst, 32]
> -	stp	q0, q0, [dstend, -64]
> -	stp	q0, q0, [dstend, -32]
> +	stp	q0, q0, [dstend]
> +	stp	q0, q0, [dstend, 32]
>  	ret
>  .endm
>  
> 


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]