This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[RFC] Enable RTM lock elision in current rwlock


Hi,

here is the patch to enable RTM lock elision in current rwlock implementation.

Patch adds enabling itself using existing wrappers macro with new
conditions and fixes typo in elision tuning.

ChangeLog:

        * nptl/pthread_rwlock_rdlock.c: Add RTM lock elision.
        * nptl/pthread_rwlock_tryrdlock.c: Likewise.
        * nptl/pthread_rwlock_trywrlock.c: Likewise.
        * nptl/pthread_rwlock_wrlock.c: Likewise.
        * nptl/pthread_rwlock_unlock.c: Add RTM unlock elision.
        * sysdeps/x86/elide.h (ELIDE_LOCK): Fixed typo.

diff --git a/nptl/pthread_rwlock_rdlock.c b/nptl/pthread_rwlock_rdlock.c
index e07581b..3073dfc 100644
--- a/nptl/pthread_rwlock_rdlock.c
+++ b/nptl/pthread_rwlock_rdlock.c
@@ -16,6 +16,8 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */

+#include <elide.h>
+
 #include "pthread_rwlock_common.c"

 /* See pthread_rwlock_common.c.  */
@@ -24,6 +26,12 @@ __pthread_rwlock_rdlock (pthread_rwlock_t *rwlock)
 {
   LIBC_PROBE (rdlock_entry, 1, rwlock);

+  if (ELIDE_LOCK (rwlock->__data.__rwelision,
+  ((rwlock->__data.__readers & PTHREAD_RWLOCK_WRPHASE) &&
(rwlock->__data.__readers & PTHREAD_RWLOCK_WRLOCKED)) == 0
+  && (rwlock->__data.__readers >> PTHREAD_RWLOCK_READER_SHIFT) == 0
+  && rwlock->__data.__writers_futex == 0))
+    return 0;
+
   int result = __pthread_rwlock_rdlock_full (rwlock, NULL);
   LIBC_PROBE (rdlock_acquire_read, 1, rwlock);
   return result;
diff --git a/nptl/pthread_rwlock_tryrdlock.c b/nptl/pthread_rwlock_tryrdlock.c
index 6c3014c..7cff149 100644
--- a/nptl/pthread_rwlock_tryrdlock.c
+++ b/nptl/pthread_rwlock_tryrdlock.c
@@ -20,6 +20,8 @@
 #include "pthreadP.h"
 #include <atomic.h>
 #include <stdbool.h>
+#include <elide.h>
+
 #include "pthread_rwlock_common.c"


@@ -27,6 +29,12 @@
 int
 __pthread_rwlock_tryrdlock (pthread_rwlock_t *rwlock)
 {
+  if (ELIDE_TRYLOCK (rwlock->__data.__rwelision,
+     ((rwlock->__data.__readers & PTHREAD_RWLOCK_WRPHASE) &&
(rwlock->__data.__readers & PTHREAD_RWLOCK_WRLOCKED)) == 0
+     && (rwlock->__data.__readers >> PTHREAD_RWLOCK_READER_SHIFT) == 0
+     && rwlock->__data.__writers_futex, 0))
+    return 0;
+
   /* For tryrdlock, we could speculate that we will succeed and go ahead and
      register as a reader.  However, if we misspeculate, we have to do the
      same steps as a timed-out rdlock, which will increase contention.
diff --git a/nptl/pthread_rwlock_trywrlock.c b/nptl/pthread_rwlock_trywrlock.c
index 0d9ccaf..dce32da 100644
--- a/nptl/pthread_rwlock_trywrlock.c
+++ b/nptl/pthread_rwlock_trywrlock.c
@@ -19,11 +19,18 @@
 #include <errno.h>
 #include "pthreadP.h"
 #include <atomic.h>
+#include <elide.h>

 /* See pthread_rwlock_common.c for an overview.  */
 int
 __pthread_rwlock_trywrlock (pthread_rwlock_t *rwlock)
 {
+  if (ELIDE_TRYLOCK (rwlock->__data.__rwelision,
+     ((rwlock->__data.__readers & PTHREAD_RWLOCK_WRPHASE) &&
(rwlock->__data.__readers & PTHREAD_RWLOCK_WRLOCKED)) == 0
+     && (rwlock->__data.__readers >> PTHREAD_RWLOCK_READER_SHIFT) == 0
+     && rwlock->__data.__writers_futex, 1))
+    return 0;
+
   /* When in a trywrlock, we can acquire the write lock if it is in states
      #1 (idle and read phase) and #5 (idle and write phase), and also in #6
      (readers waiting, write phase) if we prefer writers.
diff --git a/nptl/pthread_rwlock_unlock.c b/nptl/pthread_rwlock_unlock.c
index ef46e88..fa4555d 100644
--- a/nptl/pthread_rwlock_unlock.c
+++ b/nptl/pthread_rwlock_unlock.c
@@ -22,6 +22,7 @@
 #include <pthread.h>
 #include <pthreadP.h>
 #include <stap-probe.h>
+#include <elide.h>

 #include "pthread_rwlock_common.c"

@@ -31,6 +32,10 @@ __pthread_rwlock_unlock (pthread_rwlock_t *rwlock)
 {
   LIBC_PROBE (rwlock_unlock, 1, rwlock);

+  if (ELIDE_UNLOCK ((rwlock->__data.__readers >>
PTHREAD_RWLOCK_READER_SHIFT) == 0
+    && rwlock->__data.__writers_futex == 0))
+    return 0;
+
   /* We distinguish between having acquired a read vs. a write lock by looking
      at the writer TID.  If it's equal to our TID, we must be the writer
      because nobody else can have stored this value.  Also, if we are a
diff --git a/nptl/pthread_rwlock_wrlock.c b/nptl/pthread_rwlock_wrlock.c
index 335fcd1..d041aad 100644
--- a/nptl/pthread_rwlock_wrlock.c
+++ b/nptl/pthread_rwlock_wrlock.c
@@ -16,6 +16,8 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */

+#include <elide.h>
+
 #include "pthread_rwlock_common.c"

 /* See pthread_rwlock_common.c.  */
@@ -24,6 +26,12 @@ __pthread_rwlock_wrlock (pthread_rwlock_t *rwlock)
 {
   LIBC_PROBE (wrlock_entry, 1, rwlock);

+  if (ELIDE_LOCK (rwlock->__data.__rwelision,
+  ((rwlock->__data.__readers & PTHREAD_RWLOCK_WRPHASE) &&
(rwlock->__data.__readers & PTHREAD_RWLOCK_WRLOCKED)) == 0
+  && (rwlock->__data.__readers >> PTHREAD_RWLOCK_READER_SHIFT) == 0
+  && rwlock->__data.__writers_futex == 0))
+    return 0;
+
   int result = __pthread_rwlock_wrlock_full (rwlock, NULL);
   LIBC_PROBE (wrlock_acquire_write, 1, rwlock);
   return result;
diff --git a/sysdeps/x86/elide.h b/sysdeps/x86/elide.h
index 53de418..3eaf15c 100644
--- a/sysdeps/x86/elide.h
+++ b/sysdeps/x86/elide.h
@@ -77,7 +77,7 @@ elision_adapt(signed char *adapt_count, unsigned int status)
           } \
         _xabort (_ABORT_LOCK_BUSY); \
       } \
-    if (!elision_adapt (&(adapt_count), status)) \
+    if (elision_adapt (&(adapt_count), status)) \
       break; \
           } \
       } \

Performance was measured with bench-multithread.c posted at
https://sourceware.org/ml/libc-alpha/2017-01/msg00168.html on i7-6700
(Skylake).

With R=60 and shared W=0 lock elision helps to increase scalability
with some slowdown in single-thread case:
w/o patch:
threads=1  iter/s=        4060317  timing_t=17040157458
threads=2  iter/s=        6752786  timing_t=17040102928
threads=3  iter/s=        9392445  timing_t=17040102070
threads=4  iter/s=       10463599  timing_t=17040103938

with patch:
threads=1  iter/s=        3402230  timing_t=17040139858
threads=2  iter/s=        6784424  timing_t=17040102620
threads=3  iter/s=       10157296  timing_t=17040103664
threads=4  iter/s=       13509321  timing_t=17040108804

But with R=60 and shared W=1 we have not so good picture:
w/o patch:
threads=1  iter/s=        3939640  timing_t=17040142322
threads=2  iter/s=        6018330  timing_t=17040101032
threads=3  iter/s=        7598935  timing_t=17040101858
threads=4  iter/s=        7873063  timing_t=17040107686

with patch:
threads=1  iter/s=        3317374  timing_t=17040140564
threads=2  iter/s=        4555148  timing_t=17040106910
threads=3  iter/s=        6583628  timing_t=17040105330
threads=4  iter/s=        7421028  timing_t=17040101884
(perf stat -T tells we have 9.44% transactional cycles and 3.84% aborted cycles)

I have tried to tune lock elision and it change performance but
haven't achieve results better or equal than w/o patch:

with patch and tuning:
threads=1  iter/s=        3324805  timing_t=17040143918
threads=2  iter/s=        5360456  timing_t=17040100294
threads=3  iter/s=        6997841  timing_t=17040107640
threads=4  iter/s=        7406136  timing_t=17040104018
(perf stat -T tells we have 5.84% transactional cycles and 0.46% aborted cycles)

Is condition for wrapper macro enough correct?
Maybe it have sence to make tuning configurable by tunables and have
additional switch for turn it on/off?


--
WBR,
Andrew


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]