This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [RFC] Enable RTM lock elision in current rwlock


2017-04-05 11:23 GMT+02:00 Andrew Senkevich <andrew.n.senkevich@gmail.com>:
> Hi,
>
> here is the patch to enable RTM lock elision in current rwlock implementation.
>
> Patch adds enabling itself using existing wrappers macro with new
> conditions and fixes typo in elision tuning.
>
> ChangeLog:
>
>         * nptl/pthread_rwlock_rdlock.c: Add RTM lock elision.
>         * nptl/pthread_rwlock_tryrdlock.c: Likewise.
>         * nptl/pthread_rwlock_trywrlock.c: Likewise.
>         * nptl/pthread_rwlock_wrlock.c: Likewise.
>         * nptl/pthread_rwlock_unlock.c: Add RTM unlock elision.
>         * sysdeps/x86/elide.h (ELIDE_LOCK): Fixed typo.
>
> diff --git a/nptl/pthread_rwlock_rdlock.c b/nptl/pthread_rwlock_rdlock.c
> index e07581b..3073dfc 100644
> --- a/nptl/pthread_rwlock_rdlock.c
> +++ b/nptl/pthread_rwlock_rdlock.c
> @@ -16,6 +16,8 @@
>     License along with the GNU C Library; if not, see
>     <http://www.gnu.org/licenses/>.  */
>
> +#include <elide.h>
> +
>  #include "pthread_rwlock_common.c"
>
>  /* See pthread_rwlock_common.c.  */
> @@ -24,6 +26,12 @@ __pthread_rwlock_rdlock (pthread_rwlock_t *rwlock)
>  {
>    LIBC_PROBE (rdlock_entry, 1, rwlock);
>
> +  if (ELIDE_LOCK (rwlock->__data.__rwelision,
> +  ((rwlock->__data.__readers & PTHREAD_RWLOCK_WRPHASE) &&
> (rwlock->__data.__readers & PTHREAD_RWLOCK_WRLOCKED)) == 0
> +  && (rwlock->__data.__readers >> PTHREAD_RWLOCK_READER_SHIFT) == 0
> +  && rwlock->__data.__writers_futex == 0))
> +    return 0;
> +
>    int result = __pthread_rwlock_rdlock_full (rwlock, NULL);
>    LIBC_PROBE (rdlock_acquire_read, 1, rwlock);
>    return result;
> diff --git a/nptl/pthread_rwlock_tryrdlock.c b/nptl/pthread_rwlock_tryrdlock.c
> index 6c3014c..7cff149 100644
> --- a/nptl/pthread_rwlock_tryrdlock.c
> +++ b/nptl/pthread_rwlock_tryrdlock.c
> @@ -20,6 +20,8 @@
>  #include "pthreadP.h"
>  #include <atomic.h>
>  #include <stdbool.h>
> +#include <elide.h>
> +
>  #include "pthread_rwlock_common.c"
>
>
> @@ -27,6 +29,12 @@
>  int
>  __pthread_rwlock_tryrdlock (pthread_rwlock_t *rwlock)
>  {
> +  if (ELIDE_TRYLOCK (rwlock->__data.__rwelision,
> +     ((rwlock->__data.__readers & PTHREAD_RWLOCK_WRPHASE) &&
> (rwlock->__data.__readers & PTHREAD_RWLOCK_WRLOCKED)) == 0
> +     && (rwlock->__data.__readers >> PTHREAD_RWLOCK_READER_SHIFT) == 0
> +     && rwlock->__data.__writers_futex, 0))
> +    return 0;
> +
>    /* For tryrdlock, we could speculate that we will succeed and go ahead and
>       register as a reader.  However, if we misspeculate, we have to do the
>       same steps as a timed-out rdlock, which will increase contention.
> diff --git a/nptl/pthread_rwlock_trywrlock.c b/nptl/pthread_rwlock_trywrlock.c
> index 0d9ccaf..dce32da 100644
> --- a/nptl/pthread_rwlock_trywrlock.c
> +++ b/nptl/pthread_rwlock_trywrlock.c
> @@ -19,11 +19,18 @@
>  #include <errno.h>
>  #include "pthreadP.h"
>  #include <atomic.h>
> +#include <elide.h>
>
>  /* See pthread_rwlock_common.c for an overview.  */
>  int
>  __pthread_rwlock_trywrlock (pthread_rwlock_t *rwlock)
>  {
> +  if (ELIDE_TRYLOCK (rwlock->__data.__rwelision,
> +     ((rwlock->__data.__readers & PTHREAD_RWLOCK_WRPHASE) &&
> (rwlock->__data.__readers & PTHREAD_RWLOCK_WRLOCKED)) == 0
> +     && (rwlock->__data.__readers >> PTHREAD_RWLOCK_READER_SHIFT) == 0
> +     && rwlock->__data.__writers_futex, 1))
> +    return 0;
> +
>    /* When in a trywrlock, we can acquire the write lock if it is in states
>       #1 (idle and read phase) and #5 (idle and write phase), and also in #6
>       (readers waiting, write phase) if we prefer writers.
> diff --git a/nptl/pthread_rwlock_unlock.c b/nptl/pthread_rwlock_unlock.c
> index ef46e88..fa4555d 100644
> --- a/nptl/pthread_rwlock_unlock.c
> +++ b/nptl/pthread_rwlock_unlock.c
> @@ -22,6 +22,7 @@
>  #include <pthread.h>
>  #include <pthreadP.h>
>  #include <stap-probe.h>
> +#include <elide.h>
>
>  #include "pthread_rwlock_common.c"
>
> @@ -31,6 +32,10 @@ __pthread_rwlock_unlock (pthread_rwlock_t *rwlock)
>  {
>    LIBC_PROBE (rwlock_unlock, 1, rwlock);
>
> +  if (ELIDE_UNLOCK ((rwlock->__data.__readers >>
> PTHREAD_RWLOCK_READER_SHIFT) == 0
> +    && rwlock->__data.__writers_futex == 0))
> +    return 0;
> +
>    /* We distinguish between having acquired a read vs. a write lock by looking
>       at the writer TID.  If it's equal to our TID, we must be the writer
>       because nobody else can have stored this value.  Also, if we are a
> diff --git a/nptl/pthread_rwlock_wrlock.c b/nptl/pthread_rwlock_wrlock.c
> index 335fcd1..d041aad 100644
> --- a/nptl/pthread_rwlock_wrlock.c
> +++ b/nptl/pthread_rwlock_wrlock.c
> @@ -16,6 +16,8 @@
>     License along with the GNU C Library; if not, see
>     <http://www.gnu.org/licenses/>.  */
>
> +#include <elide.h>
> +
>  #include "pthread_rwlock_common.c"
>
>  /* See pthread_rwlock_common.c.  */
> @@ -24,6 +26,12 @@ __pthread_rwlock_wrlock (pthread_rwlock_t *rwlock)
>  {
>    LIBC_PROBE (wrlock_entry, 1, rwlock);
>
> +  if (ELIDE_LOCK (rwlock->__data.__rwelision,
> +  ((rwlock->__data.__readers & PTHREAD_RWLOCK_WRPHASE) &&
> (rwlock->__data.__readers & PTHREAD_RWLOCK_WRLOCKED)) == 0
> +  && (rwlock->__data.__readers >> PTHREAD_RWLOCK_READER_SHIFT) == 0
> +  && rwlock->__data.__writers_futex == 0))
> +    return 0;
> +
>    int result = __pthread_rwlock_wrlock_full (rwlock, NULL);
>    LIBC_PROBE (wrlock_acquire_write, 1, rwlock);
>    return result;
> diff --git a/sysdeps/x86/elide.h b/sysdeps/x86/elide.h
> index 53de418..3eaf15c 100644
> --- a/sysdeps/x86/elide.h
> +++ b/sysdeps/x86/elide.h
> @@ -77,7 +77,7 @@ elision_adapt(signed char *adapt_count, unsigned int status)
>            } \
>          _xabort (_ABORT_LOCK_BUSY); \
>        } \
> -    if (!elision_adapt (&(adapt_count), status)) \
> +    if (elision_adapt (&(adapt_count), status)) \
>        break; \
>            } \
>        } \
>
> Performance was measured with bench-multithread.c posted at
> https://sourceware.org/ml/libc-alpha/2017-01/msg00168.html on i7-6700
> (Skylake).
>
> With R=60 and shared W=0 lock elision helps to increase scalability
> with some slowdown in single-thread case:
> w/o patch:
> threads=1  iter/s=        4060317  timing_t=17040157458
> threads=2  iter/s=        6752786  timing_t=17040102928
> threads=3  iter/s=        9392445  timing_t=17040102070
> threads=4  iter/s=       10463599  timing_t=17040103938
>
> with patch:
> threads=1  iter/s=        3402230  timing_t=17040139858
> threads=2  iter/s=        6784424  timing_t=17040102620
> threads=3  iter/s=       10157296  timing_t=17040103664
> threads=4  iter/s=       13509321  timing_t=17040108804
>
> But with R=60 and shared W=1 we have not so good picture:
> w/o patch:
> threads=1  iter/s=        3939640  timing_t=17040142322
> threads=2  iter/s=        6018330  timing_t=17040101032
> threads=3  iter/s=        7598935  timing_t=17040101858
> threads=4  iter/s=        7873063  timing_t=17040107686
>
> with patch:
> threads=1  iter/s=        3317374  timing_t=17040140564
> threads=2  iter/s=        4555148  timing_t=17040106910
> threads=3  iter/s=        6583628  timing_t=17040105330
> threads=4  iter/s=        7421028  timing_t=17040101884
> (perf stat -T tells we have 9.44% transactional cycles and 3.84% aborted cycles)
>
> I have tried to tune lock elision and it change performance but
> haven't achieve results better or equal than w/o patch:
>
> with patch and tuning:
> threads=1  iter/s=        3324805  timing_t=17040143918
> threads=2  iter/s=        5360456  timing_t=17040100294
> threads=3  iter/s=        6997841  timing_t=17040107640
> threads=4  iter/s=        7406136  timing_t=17040104018
> (perf stat -T tells we have 5.84% transactional cycles and 0.46% aborted cycles)
>
> Is condition for wrapper macro enough correct?
> Maybe it have sence to make tuning configurable by tunables and have
> additional switch for turn it on/off?

Ping.

Just a note that here are two parts, one part is only typo fix:

>         * sysdeps/x86/elide.h (ELIDE_LOCK): Fix typo.
>
> diff --git a/sysdeps/x86/elide.h b/sysdeps/x86/elide.h
> index 53de418..3eaf15c 100644
> --- a/sysdeps/x86/elide.h
> +++ b/sysdeps/x86/elide.h
> @@ -77,7 +77,7 @@ elision_adapt(signed char *adapt_count, unsigned int status)
>            } \
>          _xabort (_ABORT_LOCK_BUSY); \
>        } \
> -    if (!elision_adapt (&(adapt_count), status)) \
> +    if (elision_adapt (&(adapt_count), status)) \
>        break; \
>            } \
>        } \


--
WBR,
Andrew


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]