This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [RFC] Enable RTM lock elision in current rwlock
- From: Andrew Senkevich <andrew dot n dot senkevich at gmail dot com>
- To: libc-alpha <libc-alpha at sourceware dot org>, Torvald Riegel <triegel at redhat dot com>
- Date: Tue, 13 Jun 2017 12:50:11 +0200
- Subject: Re: [RFC] Enable RTM lock elision in current rwlock
- Authentication-results: sourceware.org; auth=none
- References: <CAMXFM3tui-xJ1pGWuK_YJS3XOR+KV4PL7NeKpgPJ_KC9pRARxw@mail.gmail.com>
2017-04-05 11:23 GMT+02:00 Andrew Senkevich <andrew.n.senkevich@gmail.com>:
> Hi,
>
> here is the patch to enable RTM lock elision in current rwlock implementation.
>
> Patch adds enabling itself using existing wrappers macro with new
> conditions and fixes typo in elision tuning.
>
> ChangeLog:
>
> * nptl/pthread_rwlock_rdlock.c: Add RTM lock elision.
> * nptl/pthread_rwlock_tryrdlock.c: Likewise.
> * nptl/pthread_rwlock_trywrlock.c: Likewise.
> * nptl/pthread_rwlock_wrlock.c: Likewise.
> * nptl/pthread_rwlock_unlock.c: Add RTM unlock elision.
> * sysdeps/x86/elide.h (ELIDE_LOCK): Fixed typo.
>
> diff --git a/nptl/pthread_rwlock_rdlock.c b/nptl/pthread_rwlock_rdlock.c
> index e07581b..3073dfc 100644
> --- a/nptl/pthread_rwlock_rdlock.c
> +++ b/nptl/pthread_rwlock_rdlock.c
> @@ -16,6 +16,8 @@
> License along with the GNU C Library; if not, see
> <http://www.gnu.org/licenses/>. */
>
> +#include <elide.h>
> +
> #include "pthread_rwlock_common.c"
>
> /* See pthread_rwlock_common.c. */
> @@ -24,6 +26,12 @@ __pthread_rwlock_rdlock (pthread_rwlock_t *rwlock)
> {
> LIBC_PROBE (rdlock_entry, 1, rwlock);
>
> + if (ELIDE_LOCK (rwlock->__data.__rwelision,
> + ((rwlock->__data.__readers & PTHREAD_RWLOCK_WRPHASE) &&
> (rwlock->__data.__readers & PTHREAD_RWLOCK_WRLOCKED)) == 0
> + && (rwlock->__data.__readers >> PTHREAD_RWLOCK_READER_SHIFT) == 0
> + && rwlock->__data.__writers_futex == 0))
> + return 0;
> +
> int result = __pthread_rwlock_rdlock_full (rwlock, NULL);
> LIBC_PROBE (rdlock_acquire_read, 1, rwlock);
> return result;
> diff --git a/nptl/pthread_rwlock_tryrdlock.c b/nptl/pthread_rwlock_tryrdlock.c
> index 6c3014c..7cff149 100644
> --- a/nptl/pthread_rwlock_tryrdlock.c
> +++ b/nptl/pthread_rwlock_tryrdlock.c
> @@ -20,6 +20,8 @@
> #include "pthreadP.h"
> #include <atomic.h>
> #include <stdbool.h>
> +#include <elide.h>
> +
> #include "pthread_rwlock_common.c"
>
>
> @@ -27,6 +29,12 @@
> int
> __pthread_rwlock_tryrdlock (pthread_rwlock_t *rwlock)
> {
> + if (ELIDE_TRYLOCK (rwlock->__data.__rwelision,
> + ((rwlock->__data.__readers & PTHREAD_RWLOCK_WRPHASE) &&
> (rwlock->__data.__readers & PTHREAD_RWLOCK_WRLOCKED)) == 0
> + && (rwlock->__data.__readers >> PTHREAD_RWLOCK_READER_SHIFT) == 0
> + && rwlock->__data.__writers_futex, 0))
> + return 0;
> +
> /* For tryrdlock, we could speculate that we will succeed and go ahead and
> register as a reader. However, if we misspeculate, we have to do the
> same steps as a timed-out rdlock, which will increase contention.
> diff --git a/nptl/pthread_rwlock_trywrlock.c b/nptl/pthread_rwlock_trywrlock.c
> index 0d9ccaf..dce32da 100644
> --- a/nptl/pthread_rwlock_trywrlock.c
> +++ b/nptl/pthread_rwlock_trywrlock.c
> @@ -19,11 +19,18 @@
> #include <errno.h>
> #include "pthreadP.h"
> #include <atomic.h>
> +#include <elide.h>
>
> /* See pthread_rwlock_common.c for an overview. */
> int
> __pthread_rwlock_trywrlock (pthread_rwlock_t *rwlock)
> {
> + if (ELIDE_TRYLOCK (rwlock->__data.__rwelision,
> + ((rwlock->__data.__readers & PTHREAD_RWLOCK_WRPHASE) &&
> (rwlock->__data.__readers & PTHREAD_RWLOCK_WRLOCKED)) == 0
> + && (rwlock->__data.__readers >> PTHREAD_RWLOCK_READER_SHIFT) == 0
> + && rwlock->__data.__writers_futex, 1))
> + return 0;
> +
> /* When in a trywrlock, we can acquire the write lock if it is in states
> #1 (idle and read phase) and #5 (idle and write phase), and also in #6
> (readers waiting, write phase) if we prefer writers.
> diff --git a/nptl/pthread_rwlock_unlock.c b/nptl/pthread_rwlock_unlock.c
> index ef46e88..fa4555d 100644
> --- a/nptl/pthread_rwlock_unlock.c
> +++ b/nptl/pthread_rwlock_unlock.c
> @@ -22,6 +22,7 @@
> #include <pthread.h>
> #include <pthreadP.h>
> #include <stap-probe.h>
> +#include <elide.h>
>
> #include "pthread_rwlock_common.c"
>
> @@ -31,6 +32,10 @@ __pthread_rwlock_unlock (pthread_rwlock_t *rwlock)
> {
> LIBC_PROBE (rwlock_unlock, 1, rwlock);
>
> + if (ELIDE_UNLOCK ((rwlock->__data.__readers >>
> PTHREAD_RWLOCK_READER_SHIFT) == 0
> + && rwlock->__data.__writers_futex == 0))
> + return 0;
> +
> /* We distinguish between having acquired a read vs. a write lock by looking
> at the writer TID. If it's equal to our TID, we must be the writer
> because nobody else can have stored this value. Also, if we are a
> diff --git a/nptl/pthread_rwlock_wrlock.c b/nptl/pthread_rwlock_wrlock.c
> index 335fcd1..d041aad 100644
> --- a/nptl/pthread_rwlock_wrlock.c
> +++ b/nptl/pthread_rwlock_wrlock.c
> @@ -16,6 +16,8 @@
> License along with the GNU C Library; if not, see
> <http://www.gnu.org/licenses/>. */
>
> +#include <elide.h>
> +
> #include "pthread_rwlock_common.c"
>
> /* See pthread_rwlock_common.c. */
> @@ -24,6 +26,12 @@ __pthread_rwlock_wrlock (pthread_rwlock_t *rwlock)
> {
> LIBC_PROBE (wrlock_entry, 1, rwlock);
>
> + if (ELIDE_LOCK (rwlock->__data.__rwelision,
> + ((rwlock->__data.__readers & PTHREAD_RWLOCK_WRPHASE) &&
> (rwlock->__data.__readers & PTHREAD_RWLOCK_WRLOCKED)) == 0
> + && (rwlock->__data.__readers >> PTHREAD_RWLOCK_READER_SHIFT) == 0
> + && rwlock->__data.__writers_futex == 0))
> + return 0;
> +
> int result = __pthread_rwlock_wrlock_full (rwlock, NULL);
> LIBC_PROBE (wrlock_acquire_write, 1, rwlock);
> return result;
> diff --git a/sysdeps/x86/elide.h b/sysdeps/x86/elide.h
> index 53de418..3eaf15c 100644
> --- a/sysdeps/x86/elide.h
> +++ b/sysdeps/x86/elide.h
> @@ -77,7 +77,7 @@ elision_adapt(signed char *adapt_count, unsigned int status)
> } \
> _xabort (_ABORT_LOCK_BUSY); \
> } \
> - if (!elision_adapt (&(adapt_count), status)) \
> + if (elision_adapt (&(adapt_count), status)) \
> break; \
> } \
> } \
>
> Performance was measured with bench-multithread.c posted at
> https://sourceware.org/ml/libc-alpha/2017-01/msg00168.html on i7-6700
> (Skylake).
>
> With R=60 and shared W=0 lock elision helps to increase scalability
> with some slowdown in single-thread case:
> w/o patch:
> threads=1 iter/s= 4060317 timing_t=17040157458
> threads=2 iter/s= 6752786 timing_t=17040102928
> threads=3 iter/s= 9392445 timing_t=17040102070
> threads=4 iter/s= 10463599 timing_t=17040103938
>
> with patch:
> threads=1 iter/s= 3402230 timing_t=17040139858
> threads=2 iter/s= 6784424 timing_t=17040102620
> threads=3 iter/s= 10157296 timing_t=17040103664
> threads=4 iter/s= 13509321 timing_t=17040108804
>
> But with R=60 and shared W=1 we have not so good picture:
> w/o patch:
> threads=1 iter/s= 3939640 timing_t=17040142322
> threads=2 iter/s= 6018330 timing_t=17040101032
> threads=3 iter/s= 7598935 timing_t=17040101858
> threads=4 iter/s= 7873063 timing_t=17040107686
>
> with patch:
> threads=1 iter/s= 3317374 timing_t=17040140564
> threads=2 iter/s= 4555148 timing_t=17040106910
> threads=3 iter/s= 6583628 timing_t=17040105330
> threads=4 iter/s= 7421028 timing_t=17040101884
> (perf stat -T tells we have 9.44% transactional cycles and 3.84% aborted cycles)
>
> I have tried to tune lock elision and it change performance but
> haven't achieve results better or equal than w/o patch:
>
> with patch and tuning:
> threads=1 iter/s= 3324805 timing_t=17040143918
> threads=2 iter/s= 5360456 timing_t=17040100294
> threads=3 iter/s= 6997841 timing_t=17040107640
> threads=4 iter/s= 7406136 timing_t=17040104018
> (perf stat -T tells we have 5.84% transactional cycles and 0.46% aborted cycles)
>
> Is condition for wrapper macro enough correct?
> Maybe it have sence to make tuning configurable by tunables and have
> additional switch for turn it on/off?
Ping.
Just a note that here are two parts, one part is only typo fix:
> * sysdeps/x86/elide.h (ELIDE_LOCK): Fix typo.
>
> diff --git a/sysdeps/x86/elide.h b/sysdeps/x86/elide.h
> index 53de418..3eaf15c 100644
> --- a/sysdeps/x86/elide.h
> +++ b/sysdeps/x86/elide.h
> @@ -77,7 +77,7 @@ elision_adapt(signed char *adapt_count, unsigned int status)
> } \
> _xabort (_ABORT_LOCK_BUSY); \
> } \
> - if (!elision_adapt (&(adapt_count), status)) \
> + if (elision_adapt (&(adapt_count), status)) \
> break; \
> } \
> } \
--
WBR,
Andrew