This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [RFC] pthread_once: Use unified variant instead of custom x86_64/i386


On Mon, 2014-04-07 at 15:47 +0200, Torvald Riegel wrote:
> On Fri, 2013-10-11 at 23:28 +0300, Torvald Riegel wrote:
> > Assuming the pthread_once unification I sent recently is applied, we
> > still have custom x86_64 and i386 variants of pthread_once.  The
> > algorithm they use is the same as the unified variant, so we would be
> > able to remove the custom variants if this doesn't affect performance.
> > 
> > The common case when pthread_once is executed is that the initialization
> > has already been performed; thus, this is the fast path that we can
> > focus on.  (I haven't looked specifically at the generated code for the
> > slow path, but the algorithm is the same and I assume that the overhead
> > of the synchronizing instructions and futex syscalls determines the
> > performance of it, not any differences between compiler-generated code
> > and the custom code.)
> > 
> > The fast path of the custom assembler version:
> > 	testl	$2, (%rdi)
> > 	jz	1f
> > 	xorl	%eax, %eax
> > 	retq
> > 
> > The fast path of the generic pthread_once C code, as it is after the
> > pthread_once unification patch:
> >   20:   48 89 5c 24 e8          mov    %rbx,-0x18(%rsp)
> >   25:   48 89 6c 24 f0          mov    %rbp,-0x10(%rsp)
> >   2a:   48 89 fb                mov    %rdi,%rbx
> >   2d:   4c 89 64 24 f8          mov    %r12,-0x8(%rsp)
> >   32:   48 89 f5                mov    %rsi,%rbp
> >   35:   48 83 ec 38             sub    $0x38,%rsp
> >   39:   41 b8 ca 00 00 00       mov    $0xca,%r8d
> >   3f:   8b 13                   mov    (%rbx),%edx
> >   41:   f6 c2 02                test   $0x2,%dl
> >   44:   74 16                   je     5c <__pthread_once+0x3c>
> >   46:   31 c0                   xor    %eax,%eax
> >   48:   48 8b 5c 24 20          mov    0x20(%rsp),%rbx
> >   4d:   48 8b 6c 24 28          mov    0x28(%rsp),%rbp
> >   52:   4c 8b 64 24 30          mov    0x30(%rsp),%r12
> >   57:   48 83 c4 38             add    $0x38,%rsp
> >   5b:   c3                      retq   
> > 
> > The only difference is more stack save/restore.  However, a quick run of
> > benchtests/pthread_once (see the patch I sent for review) on my laptop
> > doesn't show any noticeable differences between both (averages of 8 runs
> > of the microbenchmark differ by 0.2%).
> > 
> > When splitting out the slow path like this:
> > 
> > static int
> > __attribute__((noinline))
> > __pthread_once_slow (once_control, init_routine)
> > /* ... */
> > 
> > int
> > __pthread_once (once_control, init_routine)
> >      pthread_once_t *once_control;
> >      void (*init_routine) (void);
> > {
> >   int val;
> >   val = *once_control;
> >   atomic_read_barrier();
> >   if (__builtin_expect ((val & __PTHREAD_ONCE_DONE) != 0, 1))
> >     return 0;
> >   else
> >     return __pthread_once_slow(once_control, init_routine);
> > }
> > 
> > we get this for the C variants fast path:
> > 
> > 00000000000000e0 <__pthread_once>:
> >   e0:   8b 07                   mov    (%rdi),%eax
> >   e2:   a8 02                   test   $0x2,%al
> >   e4:   74 03                   je     e9 <__pthread_once+0x9>
> >   e6:   31 c0                   xor    %eax,%eax
> >   e8:   c3                      retq   
> >   e9:   31 c0                   xor    %eax,%eax
> >   eb:   e9 30 ff ff ff          jmpq   20 <__pthread_once_slow>
> > 
> > This is very close to the fast path of the custom assembler code.
> > 
> > I haven't looked further at i386, but the custom code is pretty similar
> > to the x86_64 variant.
> > 
> > 
> > What do you all prefer?:
> > 1) Keep the x86-specific assembler versions?
> > 2) Remove the x86-specific assembler versions and split out the slow
> > path?
> > 2) Just remove the x86-specific assembler versions?
> > 
> 
> Here is an updated patch for 2).
> 
> Without the fast path, I get the following without and with assembly (on
> my x86_64 laptop):
> 
> "pthread_once": {
> "": {
> "duration": 2.42853e+10, "iterations": 2.16569e+09, "max": 2217.22,
> "min": 11.024, "mean": 11.2137
> }
> }
> "pthread_once": {
> "": {
> "duration": 2.40695e+10, "iterations": 2.65473e+09, "max": 2185.57,
> "min": 9.016, "mean": 9.06665
> }
> }
> 
> With the fast path as split out, I get:
> "pthread_once": {
> "": {
> "duration": 2.40632e+10, "iterations": 2.6526e+09, "max": 2336.96,
> "min": 9.016, "mean": 9.07154
> }
> }
> 
> Okay to commit the fast path (after the pthread_once unification has
> been committed) and remove the x86 assembler variants?

This slipped through and this was never committed.  I took Richard's
advice into account, and the x86_64 code generated for the generic
version is now:
00000000000000d0 <__pthread_once>:
  d0:   8b 07                   mov    (%rdi),%eax
  d2:   a8 02                   test   $0x2,%al
  d4:   74 03                   je     d9 <__pthread_once+0x9>
  d6:   31 c0                   xor    %eax,%eax
  d8:   c3                      retq   
  d9:   e9 42 ff ff ff          jmpq   20 <__pthread_once_slow>


Attached is an updated patch, which I'll commit after two days if nobody
objects.

2014-10-20  Torvald Riegel  <triegel@redhat.com>

	[BZ #15215]
	* nptl/pthread_once.c (__pthread_once): Split out fast path to ...
	(__pthread_once_slow): ... here.
	* sysdeps/unix/sysv/linux/i386/pthread_once.S: Remove file.
	* sysdeps/unix/sysv/linux/x86_64/pthread_once.S: Remove file.

commit bc0ee21b1af4d8def1f3b6860816bf51f981c28e
Author: Torvald Riegel <triegel@redhat.com>
Date:   Sun Oct 19 21:59:26 2014 +0200

    pthread_once: Add fast path and remove x86 variants

diff --git a/nptl/pthread_once.c b/nptl/pthread_once.c
index 595bd7e..27cf06f 100644
--- a/nptl/pthread_once.c
+++ b/nptl/pthread_once.c
@@ -59,10 +59,9 @@ clear_once_control (void *arg)
    once_control are used for the fork generation), and try to initialize
    again, we can deadlock because we can't distinguish the in-progress and
    interrupted cases anymore.  */
-int
-__pthread_once (once_control, init_routine)
-     pthread_once_t *once_control;
-     void (*init_routine) (void);
+static int
+__attribute__((noinline))
+__pthread_once_slow (pthread_once_t *once_control, void (*init_routine) (void))
 {
   while (1)
     {
@@ -130,5 +129,18 @@ __pthread_once (once_control, init_routine)
 
   return 0;
 }
+
+int
+__pthread_once (pthread_once_t *once_control, void (*init_routine) (void))
+{
+  /* Fast path.  See __pthread_once_slow.  */
+  int val;
+  val = *once_control;
+  atomic_read_barrier();
+  if (__glibc_likely ((val & __PTHREAD_ONCE_DONE) != 0))
+    return 0;
+  else
+    return __pthread_once_slow(once_control, init_routine);
+}
 weak_alias (__pthread_once, pthread_once)
 hidden_def (__pthread_once)
diff --git a/sysdeps/unix/sysv/linux/i386/pthread_once.S b/sysdeps/unix/sysv/linux/i386/pthread_once.S
deleted file mode 100644
index dacd724..0000000
--- a/sysdeps/unix/sysv/linux/i386/pthread_once.S
+++ /dev/null
@@ -1,178 +0,0 @@
-/* Copyright (C) 2002-2014 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <unwindbuf.h>
-#include <sysdep.h>
-#include <kernel-features.h>
-#include <lowlevellock.h>
-
-
-	.comm	__fork_generation, 4, 4
-
-	.text
-
-
-	.globl	__pthread_once
-	.type	__pthread_once,@function
-	.align	16
-	cfi_startproc
-__pthread_once:
-	movl	4(%esp), %ecx
-	testl	$2, (%ecx)
-	jz	1f
-	xorl	%eax, %eax
-	ret
-
-1:	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (3, 0)
-	pushl	%esi
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (6, 0)
-	movl	%ecx, %ebx
-	xorl	%esi, %esi
-
-	/* Not yet initialized or initialization in progress.
-	   Get the fork generation counter now.  */
-6:	movl	(%ebx), %eax
-#ifdef PIC
-	LOAD_PIC_REG(cx)
-#endif
-
-5:	movl	%eax, %edx
-
-	testl	$2, %eax
-	jnz	4f
-
-	andl	$3, %edx
-#ifdef PIC
-	orl	__fork_generation@GOTOFF(%ecx), %edx
-#else
-	orl	__fork_generation, %edx
-#endif
-	orl	$1, %edx
-
-	LOCK
-	cmpxchgl %edx, (%ebx)
-	jnz	5b
-
-	/* Check whether another thread already runs the initializer.  */
-	testl	$1, %eax
-	jz	3f	/* No -> do it.  */
-
-	/* Check whether the initializer execution was interrupted
-	   by a fork.  */
-	xorl	%edx, %eax
-	testl	$0xfffffffc, %eax
-	jnz	3f	/* Different for generation -> run initializer.  */
-
-	/* Somebody else got here first.  Wait.  */
-#ifdef __ASSUME_PRIVATE_FUTEX
-	movl	$FUTEX_WAIT|FUTEX_PRIVATE_FLAG, %ecx
-#else
-# if FUTEX_WAIT == 0
-	movl	%gs:PRIVATE_FUTEX, %ecx
-# else
-	movl	$FUTEX_WAIT, %ecx
-	orl	%gs:PRIVATE_FUTEX, %ecx
-# endif
-#endif
-	movl	$SYS_futex, %eax
-	ENTER_KERNEL
-	jmp	6b
-
-3:	/* Call the initializer function after setting up the
-	   cancellation handler.  Note that it is not possible here
-	   to use the unwind-based cleanup handling.  This would require
-	   that the user-provided function and all the code it calls
-	   is compiled with exceptions.  Unfortunately this cannot be
-	   guaranteed.  */
-	subl	$UNWINDBUFSIZE+8, %esp
-	cfi_adjust_cfa_offset (UNWINDBUFSIZE+8)
-	movl	%ecx, %ebx		/* PIC register value.  */
-
-	leal	8+UWJMPBUF(%esp), %eax
-	movl	$0, 4(%esp)
-	movl	%eax, (%esp)
-	call	__sigsetjmp@PLT
-	testl	%eax, %eax
-	jne	7f
-
-	leal	8(%esp), %eax
-	call	HIDDEN_JUMPTARGET(__pthread_register_cancel)
-
-	/* Call the user-provided initialization function.  */
-	call	*24+UNWINDBUFSIZE(%esp)
-
-	/* Pop the cleanup handler.  */
-	leal	8(%esp), %eax
-	call	HIDDEN_JUMPTARGET(__pthread_unregister_cancel)
-	addl	$UNWINDBUFSIZE+8, %esp
-	cfi_adjust_cfa_offset (-UNWINDBUFSIZE-8)
-
-	/* Sucessful run of the initializer.  Signal that we are done.  */
-	movl	12(%esp), %ebx
-	LOCK
-	addl	$1, (%ebx)
-
-	/* Wake up all other threads.  */
-	movl	$0x7fffffff, %edx
-#ifdef __ASSUME_PRIVATE_FUTEX
-	movl	$FUTEX_WAKE|FUTEX_PRIVATE_FLAG, %ecx
-#else
-	movl	$FUTEX_WAKE, %ecx
-	orl	%gs:PRIVATE_FUTEX, %ecx
-#endif
-	movl	$SYS_futex, %eax
-	ENTER_KERNEL
-
-4:	popl	%esi
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (6)
-	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (3)
-	xorl	%eax, %eax
-	ret
-
-7:	/* __sigsetjmp returned for the second time.  */
-	movl	20+UNWINDBUFSIZE(%esp), %ebx
-	cfi_adjust_cfa_offset (UNWINDBUFSIZE+16)
-	cfi_offset (3, -8)
-	cfi_offset (6, -12)
-	movl	$0, (%ebx)
-
-	movl	$0x7fffffff, %edx
-#ifdef __ASSUME_PRIVATE_FUTEX
-	movl	$FUTEX_WAKE|FUTEX_PRIVATE_FLAG, %ecx
-#else
-	movl	$FUTEX_WAKE, %ecx
-	orl	%gs:PRIVATE_FUTEX, %ecx
-#endif
-	movl	$SYS_futex, %eax
-	ENTER_KERNEL
-
-	leal	8(%esp), %eax
-	call	HIDDEN_JUMPTARGET (__pthread_unwind_next)
-	/* NOTREACHED */
-	hlt
-	cfi_endproc
-	.size	__pthread_once,.-__pthread_once
-
-hidden_def (__pthread_once)
-strong_alias (__pthread_once, pthread_once)
diff --git a/sysdeps/unix/sysv/linux/x86_64/pthread_once.S b/sysdeps/unix/sysv/linux/x86_64/pthread_once.S
deleted file mode 100644
index 2cbe2fa..0000000
--- a/sysdeps/unix/sysv/linux/x86_64/pthread_once.S
+++ /dev/null
@@ -1,193 +0,0 @@
-/* Copyright (C) 2002-2014 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include <kernel-features.h>
-#include <tcb-offsets.h>
-#include <lowlevellock.h>
-
-
-	.comm	__fork_generation, 4, 4
-
-	.text
-
-
-	.globl	__pthread_once
-	.type	__pthread_once,@function
-	.align	16
-__pthread_once:
-.LSTARTCODE:
-	cfi_startproc
-#ifdef SHARED
-	cfi_personality(DW_EH_PE_pcrel | DW_EH_PE_sdata4 | DW_EH_PE_indirect,
-			DW.ref.__gcc_personality_v0)
-	cfi_lsda(DW_EH_PE_pcrel | DW_EH_PE_sdata4, .LexceptSTART)
-#else
-	cfi_personality(DW_EH_PE_udata4, __gcc_personality_v0)
-	cfi_lsda(DW_EH_PE_udata4, .LexceptSTART)
-#endif
-	testl	$2, (%rdi)
-	jz	1f
-	xorl	%eax, %eax
-	retq
-
-	/* Preserve the function pointer.  */
-1:	pushq	%rsi
-	cfi_adjust_cfa_offset(8)
-	xorq	%r10, %r10
-
-	/* Not yet initialized or initialization in progress.
-	   Get the fork generation counter now.  */
-6:	movl	(%rdi), %eax
-
-5:	movl	%eax, %edx
-
-	testl	$2, %eax
-	jnz	4f
-
-	andl	$3, %edx
-	orl	__fork_generation(%rip), %edx
-	orl	$1, %edx
-
-	LOCK
-	cmpxchgl %edx, (%rdi)
-	jnz	5b
-
-	/* Check whether another thread already runs the initializer.  */
-	testl	$1, %eax
-	jz	3f	/* No -> do it.  */
-
-	/* Check whether the initializer execution was interrupted
-	   by a fork.  */
-	xorl	%edx, %eax
-	testl	$0xfffffffc, %eax
-	jnz	3f	/* Different for generation -> run initializer.  */
-
-	/* Somebody else got here first.  Wait.  */
-#ifdef __ASSUME_PRIVATE_FUTEX
-	movl	$FUTEX_WAIT|FUTEX_PRIVATE_FLAG, %esi
-#else
-# if FUTEX_WAIT == 0
-	movl	%fs:PRIVATE_FUTEX, %esi
-# else
-	movl	$FUTEX_WAIT, %esi
-	orl	%fs:PRIVATE_FUTEX, %esi
-# endif
-#endif
-	movl	$SYS_futex, %eax
-	syscall
-	jmp	6b
-
-	/* Preserve the pointer to the control variable.  */
-3:	pushq	%rdi
-	cfi_adjust_cfa_offset(8)
-	pushq	%rdi
-	cfi_adjust_cfa_offset(8)
-
-.LcleanupSTART:
-	callq	*16(%rsp)
-.LcleanupEND:
-
-	/* Get the control variable address back.  */
-	popq	%rdi
-	cfi_adjust_cfa_offset(-8)
-
-	/* Sucessful run of the initializer.  Signal that we are done.  */
-	LOCK
-	incl	(%rdi)
-
-	addq	$8, %rsp
-	cfi_adjust_cfa_offset(-8)
-
-	/* Wake up all other threads.  */
-	movl	$0x7fffffff, %edx
-#ifdef __ASSUME_PRIVATE_FUTEX
-	movl	$FUTEX_WAKE|FUTEX_PRIVATE_FLAG, %esi
-#else
-	movl	$FUTEX_WAKE, %esi
-	orl	%fs:PRIVATE_FUTEX, %esi
-#endif
-	movl	$SYS_futex, %eax
-	syscall
-
-4:	addq	$8, %rsp
-	cfi_adjust_cfa_offset(-8)
-	xorl	%eax, %eax
-	retq
-	.size	__pthread_once,.-__pthread_once
-
-
-hidden_def (__pthread_once)
-strong_alias (__pthread_once, pthread_once)
-
-
-	.type	clear_once_control,@function
-	.align	16
-clear_once_control:
-	cfi_adjust_cfa_offset(3 * 8)
-	movq	(%rsp), %rdi
-	movq	%rax, %r8
-	movl	$0, (%rdi)
-
-	movl	$0x7fffffff, %edx
-#ifdef __ASSUME_PRIVATE_FUTEX
-	movl	$FUTEX_WAKE|FUTEX_PRIVATE_FLAG, %esi
-#else
-	movl	$FUTEX_WAKE, %esi
-	orl	%fs:PRIVATE_FUTEX, %esi
-#endif
-	movl	$SYS_futex, %eax
-	syscall
-
-	movq	%r8, %rdi
-.LcallUR:
-	call	_Unwind_Resume@PLT
-	hlt
-.LENDCODE:
-	cfi_endproc
-	.size	clear_once_control,.-clear_once_control
-
-
-	.section .gcc_except_table,"a",@progbits
-.LexceptSTART:
-	.byte	DW_EH_PE_omit			# @LPStart format
-	.byte	DW_EH_PE_omit			# @TType format
-	.byte	DW_EH_PE_uleb128		# call-site format
-	.uleb128 .Lcstend-.Lcstbegin
-.Lcstbegin:
-	.uleb128 .LcleanupSTART-.LSTARTCODE
-	.uleb128 .LcleanupEND-.LcleanupSTART
-	.uleb128 clear_once_control-.LSTARTCODE
-	.uleb128  0
-	.uleb128 .LcallUR-.LSTARTCODE
-	.uleb128 .LENDCODE-.LcallUR
-	.uleb128 0
-	.uleb128  0
-.Lcstend:
-
-
-#ifdef SHARED
-	.hidden	DW.ref.__gcc_personality_v0
-	.weak	DW.ref.__gcc_personality_v0
-	.section .gnu.linkonce.d.DW.ref.__gcc_personality_v0,"aw",@progbits
-	.align	LP_SIZE
-	.type	DW.ref.__gcc_personality_v0, @object
-	.size	DW.ref.__gcc_personality_v0, LP_SIZE
-DW.ref.__gcc_personality_v0:
-	ASM_ADDR __gcc_personality_v0
-#endif

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]