Re: [PATCH 07/27] S390: Optimize strlen and wcslen.


On 06/26/2015 03:00 PM, Ondřej Bílka wrote:
On Fri, Jun 26, 2015 at 01:51:32PM +0200, Stefan Liebler wrote:
This patch provides optimized versions of strlen and wcslen with the z13 vector instructions.

I didn't read the details about z13, so I will ask. These questions apply to
all functions.

+	lghi	%r5,0		/* current_len = 0.  */
+
+	/* Align s to 16 byte.  */

This way of masking tends to be slow due to inputs where you read only a few
bytes on the first check because of the alignment.

In my experience the fastest approach is to first check for a page cross and,
if there is none, do an unaligned load of 16 bytes. That looks possible with
vll unless it is limited to a cache line.
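
To make the suggested control flow concrete, here is a rough C sketch (not
the actual z13 code; strlen_pagecheck_sketch and PAGE_SIZE are illustrative
names, and the byte loops stand in for a single 16-byte vector compare):

#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096

size_t
strlen_pagecheck_sketch (const char *s)
{
  size_t len = 0;
  size_t in_page = PAGE_SIZE - ((uintptr_t) s & (PAGE_SIZE - 1));

  if (in_page >= 16)
    {
      /* Common case: a full (possibly unaligned) 16-byte load cannot
         cross into the next page, so it cannot fault.  */
      for (size_t i = 0; i < 16; i++)
        if (s[i] == '\0')
          return i;
      len = 16;
    }
  else
    {
      /* Near a page end: only touch the bytes up to the boundary,
         e.g. with a length-limited load such as vll.  */
      for (size_t i = 0; i < in_page; i++)
        if (s[i] == '\0')
          return i;
      len = in_page;
    }

  /* Stand-in for the 16-byte aligned vector loop.  */
  while (s[len] != '\0')
    len++;
  return len;
}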

+	risbg	%r3,%r2,60,128+63,0 /* Test if s is aligned and
+				       %r3 = bits 60-63 'and' 15.  */
+	je	.Lloop1		/* If s is aligned, loop aligned.  */

This is a performance problem as it is a relatively unpredictable branch
(29.8% of calls are aligned to 16 bytes), and you save a few cycles but lose
more.

There are the vlbb (vector load to block boundary, e.g. 4k) and lcbb (load count to block boundary) instructions. With this pair of instructions I can load the vector early, because it does not cause a page fault. lcbb returns 16 or the number of bytes to the block boundary, whichever is smaller. I've changed the alignment code so that the first 16 bytes are processed (or fewer than 16 bytes if we would cross the 4k boundary), and the alignment to 16 bytes is then done before the loop without the branch. In the worst case, 15 bytes are checked twice by the first vl in the loop.

ENTRY(__strlen_vx)
	.machine "z13"
	.machinemode "zarch_nohighgprs"

	vlbb	%v16,0(%r2),6	/* Load s until next 4k-byte boundary.  */
	lcbb	%r1,0(%r2),6	/* Get bytes to 4k-byte boundary or 16.  */
	
	vfenezb	%v16,%v16,%v16	/* Find element not equal with zero search.  */
	vlgvb	%r4,%v16,7	/* Load zero index or 16 if not found.  */
	clr	%r4,%r1		/* Found a zero within the loaded bytes?  */
	locgrl	%r2,%r4		/* Then copy return value.  */
	blr	%r14		/* And return.  */

	/* Align s to 16 byte.  */
	risbgn	%r3,%r2,60,128+63,0 /* %r3 = bits 60-63 of %r2 'and' 15.  */
	lghi	%r5,16		/* current_len = 16.  */
	slr	%r5,%r3		/* Compute bytes to 16bytes boundary.  */

	/* Find zero in 16 byte aligned loop.  */
.Lloop:
	vl	%v16,0(%r5,%r2)	/* Load s.  */
...
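
For reference, the entry path above expressed as a rough C sketch
(strlen_vlbb_sketch is an illustrative name only; the byte loops model the
vector compares done by vfenezb/vfenezbs):

#include <stddef.h>
#include <stdint.h>

size_t
strlen_vlbb_sketch (const char *s)
{
  uintptr_t addr = (uintptr_t) s;

  /* lcbb: bytes until the next 4 KiB block boundary, capped at 16.  */
  size_t loadable = 4096 - (addr & 4095);
  if (loadable > 16)
    loadable = 16;

  /* vlbb + vfenezb + vlgvb: scan only the bytes that were safely loaded.  */
  for (size_t i = 0; i < loadable; i++)
    if (s[i] == '\0')
      return i;

  /* No zero in the first chunk: align to 16 bytes without a branch.
     At most 15 bytes are re-checked by the first vl of the loop.  */
  size_t len = 16 - (addr & 15);

  for (;;)	/* 16-byte aligned loop (vl/vfenezbs).  */
    {
      for (size_t i = 0; i < 16; i++)
        if (s[len + i] == '\0')
          return len + i;
      len += 16;
    }
}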

I'll prepare a patch for the other string/wcs functions.

+	lghi	%r4,15
+	slr	%r4,%r3		/* Compute highest index to load (15-x).  */
+	vll	%v16,%r4,0(%r2) /* Load up to 16 byte boundary. (vll needs
+				   highest index, remaining bytes are 0.)  */
+	ahi	%r4,1		/* Work with loaded byte count.  */
+	vfenezb	%v16,%v16,%v16	/* Find element not equal with zero search.  */
+	vlgvb	%r5,%v16,7	/* Load zero index or 16 if not found.  */
+	clr	%r5,%r4		/* If found zero within loaded bytes?  */
+	locgrl	%r2,%r5		/* Then copy return value.  */
+	blr	%r14		/* And return.  */
+	lgr	%r5,%r4		/* No zero within loaded bytes,
+				   process further bytes aligned.  */
+	/* Find zero in 16 byte aligned loop.  */
+.Lloop1:
+	vl	%v16,0(%r5,%r2)	/* Load s.  */
+	aghi	%r5,16
+	vfenezbs %v16,%v16,%v16 /* Find element not equal with zero search.  */
+	je	.Lfound		/* Jump away if zero was found.  */
+	vl	%v16,0(%r5,%r2)
+	aghi	%r5,16
+	vfenezbs %v16,%v16,%v16
+	je	.Lfound
+	vl	%v16,0(%r5,%r2)
What addressing is allowed? If you could add offsets then the following
looks faster:

      vl       %v16,0(%r2)
      vfenezbs %v16,%v16,%v16
      je      .Lfound0
      vl       %v16,16(%r2)
      vfenezbs %v16,%v16,%v16
      je      .Lfound1



Yes, it is possible. A short test showed that in the case of strlen it is a little bit faster, but slower for wcslen. I don't know why; I have to investigate.
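
Spelled out in C, the suggested offset-based unrolling looks roughly like
this (a sketch only; strlen_offset_sketch is an illustrative name and the
byte loops stand in for the vector compares):

#include <stddef.h>

size_t
strlen_offset_sketch (const char *s)
{
  const char *p = s;
  for (;;)
    {
      for (size_t i = 0; i < 16; i++)	/* vl 0(p) + vfenezbs  */
        if (p[i] == '\0')
          return (size_t) (p - s) + i;	/* .Lfound0: index + 0  */
      for (size_t i = 0; i < 16; i++)	/* vl 16(p) + vfenezbs  */
        if (p[16 + i] == '\0')
          return (size_t) (p - s) + 16 + i;	/* .Lfound1: index + 16  */
      p += 32;	/* One pointer update per unrolled iteration.  */
    }
}

The point is that the pointer is updated once per unrolled iteration and each
found label adds a known constant, instead of incrementing the index register
before every load.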
+	aghi	%r5,16
+	vfenezbs %v16,%v16,%v16
+	je	.Lfound
+	vl	%v16,0(%r5,%r2)
+	aghi	%r5,16
+	vfenezbs %v16,%v16,%v16
+	jne	.Lloop1		/* No zero found -> loop.  */
+
+.Lfound:
+	vlgvb	%r2,%v16,7	/* Load byte index of zero.  */
+	slgfi	%r5,16		/* current_len -=16 */
+	algr	%r2,%r5
+	br	%r14
+END(__strlen_vx)


