This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH v3] PowerPC: libc single-thread lock optimization
- From: "Tulio Magno Quites Machado Filho" <tuliom at linux dot vnet dot ibm dot com>
- To: libc-alpha at sourceware dot org
- Cc: adhemerval dot zanella at linaro dot org, munroesj at linux dot vnet dot ibm dot com
- Date: Fri, 11 Mar 2016 15:35:37 -0300
- Subject: [PATCH v3] PowerPC: libc single-thread lock optimization
- Authentication-results: sourceware.org; auth=none
- References: <540080DF dot 6030205 at linux dot vnet dot ibm dot com>
I continued the work started by Adhemerval. The discussion around version 2
of this patch is available at http://patchwork.sourceware.org/patch/2516/
Nowadays, we already require GCC 4.7, so we can safely rely on compiler
built-ins for most of our atomic primitives.
Changes since v2:
- Updated ChangeLog and commit message.
- Replaced the following atomic primitives by compiler built-ins:
exchange*, and* and or*.
---8<---
Add relaxed atomics as a lock optimization. Addressing the concerns
raised in previous discussions, the primitives are still signal-safe
(although not thread-safe), so if future implementations relying on
this code (e.g. malloc) are changed to be async-safe, they won't require
adjustments to the powerpc atomics.
For catomic_and and catomic_or I follow the definition at 'include/atomic.h'
(which powerpc is currently using) and implemented the atomics with acquire
semantics. The new implementation is based on compiler built-ins.
On synthetic benchmarks it shows an improvement of 5-10% for malloc
calls and a performance increase of 7-8% in 483.xalancbmk from
speccpu2006 (numbers from a POWER8 machine).
Checked on powerpc32, powerpc64 and powerpc64le.
2016-03-11 Adhemerval Zanella Netto <azanella@linux.vnet.ibm.com>
Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
* malloc/malloc.c (malloc_consolidate): Replace 0 by NULL in
order to match the type of p when calling atomic_exchange_acq().
* sysdeps/powerpc/atomic-machine.h
(__arch_atomic_exchange_32_acq): Removed.
(__arch_atomic_exchange_32_rel): Likewise.
(__arch_compare_and_exchange_val_32_relaxed): New macro: atomic compare
and exchange with relaxed semantics.
(atomic_compare_and_exchange_val_relaxed): Likewise.
(__atomic_is_single_thread): New macro: check whether the program is
single-threaded.
(atomic_compare_and_exchange_val_acq): Add relaxed operation for
the single-threaded case.
(atomic_compare_and_exchange_val_rel): Likewise.
(atomic_exchange_acq): Likewise.
(atomic_exchange_rel): Likewise.
(catomic_and): Add relaxed operation and use compiler built-ins.
(catomic_or): Likewise.
(atomic_exchange_acq): Modify to use compiler built-ins.
(atomic_exchange_rel): Likewise.
* sysdeps/powerpc/powerpc32/atomic-machine.h
(__arch_compare_and_exchange_val_64_relaxed): New macro: add empty
implementation.
(__arch_atomic_exchange_64_relaxed): Likewise.
* sysdeps/powerpc/powerpc64/atomic-machine.h
(__arch_compare_and_exchange_val_64_relaxed): New macro: atomic compare
and exchange with relaxed semantics.
(__arch_atomic_exchange_64_acq): Removed.
(__arch_atomic_exchange_64_rel): Removed.
---
malloc/malloc.c | 2 +-
sysdeps/powerpc/atomic-machine.h | 128 ++++++++++++++++++-----------
sysdeps/powerpc/powerpc32/atomic-machine.h | 6 ++
sysdeps/powerpc/powerpc64/atomic-machine.h | 38 ++++-----
4 files changed, 103 insertions(+), 71 deletions(-)
diff --git a/malloc/malloc.c b/malloc/malloc.c
index b8a43bf..1eed794 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -4150,7 +4150,7 @@ static void malloc_consolidate(mstate av)
maxfb = &fastbin (av, NFASTBINS - 1);
fb = &fastbin (av, 0);
do {
- p = atomic_exchange_acq (fb, 0);
+ p = atomic_exchange_acq (fb, NULL);
if (p != 0) {
do {
check_inuse_chunk(av, p);
diff --git a/sysdeps/powerpc/atomic-machine.h b/sysdeps/powerpc/atomic-machine.h
index 8b0e1e7..7e6c699 100644
--- a/sysdeps/powerpc/atomic-machine.h
+++ b/sysdeps/powerpc/atomic-machine.h
@@ -27,6 +27,7 @@
*/
#include <stdint.h>
+#include <tls.h>
typedef int32_t atomic32_t;
typedef uint32_t uatomic32_t;
@@ -78,6 +79,9 @@ typedef uintmax_t uatomic_max_t;
#define atomic_full_barrier() __asm ("sync" ::: "memory")
+/* We can't convert __arch_compare_and_exchange_val_* to compiler built-ins
+ yet because the built-ins expect a pointer to the expected value while
+ our current implementation passes the value directly. */
#define __arch_compare_and_exchange_val_32_acq(mem, newval, oldval) \
({ \
__typeof (*(mem)) __tmp; \
@@ -112,33 +116,24 @@ typedef uintmax_t uatomic_max_t;
__tmp; \
})
-#define __arch_atomic_exchange_32_acq(mem, value) \
+#define __arch_compare_and_exchange_val_32_relaxed(mem, newval, oldval) \
({ \
- __typeof (*mem) __val; \
- __asm __volatile ( \
- "1: lwarx %0,0,%2" MUTEX_HINT_ACQ "\n" \
- " stwcx. %3,0,%2\n" \
- " bne- 1b\n" \
- " " __ARCH_ACQ_INSTR \
- : "=&r" (__val), "=m" (*mem) \
- : "b" (mem), "r" (value), "m" (*mem) \
- : "cr0", "memory"); \
- __val; \
- })
-
-#define __arch_atomic_exchange_32_rel(mem, value) \
- ({ \
- __typeof (*mem) __val; \
- __asm __volatile (__ARCH_REL_INSTR "\n" \
- "1: lwarx %0,0,%2" MUTEX_HINT_REL "\n" \
- " stwcx. %3,0,%2\n" \
- " bne- 1b" \
- : "=&r" (__val), "=m" (*mem) \
- : "b" (mem), "r" (value), "m" (*mem) \
- : "cr0", "memory"); \
- __val; \
+ __typeof (*(mem)) __tmp; \
+ __typeof (mem) __memp = (mem); \
+ __asm __volatile ( \
+ "1: lwarx %0,0,%1\n" \
+ " cmpw %0,%2\n" \
+ " bne 2f\n" \
+ " stwcx. %3,0,%1\n" \
+ " bne- 1b\n" \
+ "2: " \
+ : "=&r" (__tmp) \
+ : "b" (__memp), "r" (oldval), "r" (newval) \
+ : "cr0", "memory"); \
+ __tmp; \
})
+/* The following atomic primitives aren't available as compiler built-ins. */
#define __arch_atomic_exchange_and_add_32(mem, value) \
({ \
__typeof (*mem) __val, __tmp; \
@@ -221,10 +216,30 @@ typedef uintmax_t uatomic_max_t;
__val; \
})
+#define __atomic_is_single_thread \
+ (THREAD_GETMEM (THREAD_SELF, header.multiple_threads) == 0)
+
+#define atomic_compare_and_exchange_val_relaxed(mem, newval, oldval) \
+ ({ \
+ __typeof (*(mem)) __result; \
+ if (sizeof (*mem) == 4) \
+ __result = __arch_compare_and_exchange_val_32_relaxed(mem, newval, \
+ oldval); \
+ else if (sizeof (*mem) == 8) \
+ __result = __arch_compare_and_exchange_val_64_relaxed(mem, newval, \
+ oldval); \
+ else \
+ abort (); \
+ __result; \
+ })
+
#define atomic_compare_and_exchange_val_acq(mem, newval, oldval) \
({ \
__typeof (*(mem)) __result; \
- if (sizeof (*mem) == 4) \
+ if (__atomic_is_single_thread) \
+ __result = atomic_compare_and_exchange_val_relaxed (mem, newval, \
+ oldval); \
+ else if (sizeof (*mem) == 4) \
__result = __arch_compare_and_exchange_val_32_acq(mem, newval, oldval); \
else if (sizeof (*mem) == 8) \
__result = __arch_compare_and_exchange_val_64_acq(mem, newval, oldval); \
@@ -236,7 +251,10 @@ typedef uintmax_t uatomic_max_t;
#define atomic_compare_and_exchange_val_rel(mem, newval, oldval) \
({ \
__typeof (*(mem)) __result; \
- if (sizeof (*mem) == 4) \
+ if (__atomic_is_single_thread) \
+ __result = atomic_compare_and_exchange_val_relaxed (mem, newval, \
+ oldval); \
+ else if (sizeof (*mem) == 4) \
__result = __arch_compare_and_exchange_val_32_rel(mem, newval, oldval); \
else if (sizeof (*mem) == 8) \
__result = __arch_compare_and_exchange_val_64_rel(mem, newval, oldval); \
@@ -245,28 +263,24 @@ typedef uintmax_t uatomic_max_t;
__result; \
})
-#define atomic_exchange_acq(mem, value) \
- ({ \
- __typeof (*(mem)) __result; \
- if (sizeof (*mem) == 4) \
- __result = __arch_atomic_exchange_32_acq (mem, value); \
- else if (sizeof (*mem) == 8) \
- __result = __arch_atomic_exchange_64_acq (mem, value); \
- else \
- abort (); \
- __result; \
+#define atomic_exchange_acq(mem, value) \
+ ({ \
+ __typeof (value) __ret; \
+ if (__atomic_is_single_thread) \
+ __ret = __atomic_exchange_n ((mem), (value), __ATOMIC_RELAXED); \
+ else \
+ __ret = __atomic_exchange_n ((mem), (value), __ATOMIC_ACQUIRE); \
+ __ret; \
})
-#define atomic_exchange_rel(mem, value) \
- ({ \
- __typeof (*(mem)) __result; \
- if (sizeof (*mem) == 4) \
- __result = __arch_atomic_exchange_32_rel (mem, value); \
- else if (sizeof (*mem) == 8) \
- __result = __arch_atomic_exchange_64_rel (mem, value); \
- else \
- abort (); \
- __result; \
+#define atomic_exchange_rel(mem, value) \
+ ({ \
+ __typeof (value) __ret; \
+ if (__atomic_is_single_thread) \
+ __ret = __atomic_exchange_n ((mem), (value), __ATOMIC_RELAXED); \
+ else \
+ __ret = __atomic_exchange_n ((mem), (value), __ATOMIC_RELEASE); \
+ __ret; \
})
#define atomic_exchange_and_add(mem, value) \
@@ -280,6 +294,7 @@ typedef uintmax_t uatomic_max_t;
abort (); \
__result; \
})
+
#define atomic_exchange_and_add_acq(mem, value) \
({ \
__typeof (*(mem)) __result; \
@@ -291,6 +306,7 @@ typedef uintmax_t uatomic_max_t;
abort (); \
__result; \
})
+
#define atomic_exchange_and_add_rel(mem, value) \
({ \
__typeof (*(mem)) __result; \
@@ -343,3 +359,23 @@ typedef uintmax_t uatomic_max_t;
abort (); \
__result; \
})
+
+#define catomic_and(mem, arg) \
+ ({ \
+ __typeof (arg) __ret; \
+ if (__atomic_is_single_thread) \
+ __ret = __atomic_fetch_and((mem), arg, __ATOMIC_RELAXED); \
+ else \
+ __ret = __atomic_fetch_and((mem), arg, __ATOMIC_ACQUIRE); \
+ __ret; \
+ })
+
+#define catomic_or(mem, arg) \
+ ({ \
+ __typeof (arg) __ret; \
+ if (__atomic_is_single_thread) \
+ __ret = __atomic_fetch_or((mem), arg, __ATOMIC_RELAXED); \
+ else \
+ __ret = __atomic_fetch_or((mem), arg, __ATOMIC_ACQUIRE); \
+ __ret; \
+ })
diff --git a/sysdeps/powerpc/powerpc32/atomic-machine.h b/sysdeps/powerpc/powerpc32/atomic-machine.h
index 1d407b3..c733d43 100644
--- a/sysdeps/powerpc/powerpc32/atomic-machine.h
+++ b/sysdeps/powerpc/powerpc32/atomic-machine.h
@@ -86,6 +86,9 @@
#define __arch_compare_and_exchange_bool_64_rel(mem, newval, oldval) \
(abort (), 0)
+#define __arch_compare_and_exchange_val_64_relaxed(mem, newval, oldval) \
+ (abort (), (__typeof (*mem)) 0)
+
#define __arch_compare_and_exchange_val_64_rel(mem, newval, oldval) \
(abort (), (__typeof (*mem)) 0)
@@ -95,6 +98,9 @@
#define __arch_atomic_exchange_64_rel(mem, value) \
({ abort (); (*mem) = (value); })
+#define __arch_atomic_exchange_64_relaxed(mem, value) \
+ ({ abort (); (*mem) = (value); })
+
#define __arch_atomic_exchange_and_add_64(mem, value) \
({ abort (); (*mem) = (value); })
diff --git a/sysdeps/powerpc/powerpc64/atomic-machine.h b/sysdeps/powerpc/powerpc64/atomic-machine.h
index 751487a..515572e 100644
--- a/sysdeps/powerpc/powerpc64/atomic-machine.h
+++ b/sysdeps/powerpc/powerpc64/atomic-machine.h
@@ -146,32 +146,22 @@
__tmp; \
})
-#define __arch_atomic_exchange_64_acq(mem, value) \
- ({ \
- __typeof (*mem) __val; \
- __asm __volatile (__ARCH_REL_INSTR "\n" \
- "1: ldarx %0,0,%2" MUTEX_HINT_ACQ "\n" \
- " stdcx. %3,0,%2\n" \
+#define __arch_compare_and_exchange_val_64_relaxed(mem, newval, oldval) \
+ ({ \
+ __typeof (*(mem)) __tmp; \
+ __typeof (mem) __memp = (mem); \
+ __asm __volatile ("\n" \
+ "1: ldarx %0,0,%1\n" \
+ " cmpd %0,%2\n" \
+ " bne 2f\n" \
+ " stdcx. %3,0,%1\n" \
" bne- 1b\n" \
- " " __ARCH_ACQ_INSTR \
- : "=&r" (__val), "=m" (*mem) \
- : "b" (mem), "r" (value), "m" (*mem) \
+ "2: " \
+ : "=&r" (__tmp) \
+ : "b" (__memp), "r" (oldval), "r" (newval) \
: "cr0", "memory"); \
- __val; \
- })
-
-#define __arch_atomic_exchange_64_rel(mem, value) \
- ({ \
- __typeof (*mem) __val; \
- __asm __volatile (__ARCH_REL_INSTR "\n" \
- "1: ldarx %0,0,%2" MUTEX_HINT_REL "\n" \
- " stdcx. %3,0,%2\n" \
- " bne- 1b" \
- : "=&r" (__val), "=m" (*mem) \
- : "b" (mem), "r" (value), "m" (*mem) \
- : "cr0", "memory"); \
- __val; \
- })
+ __tmp; \
+ })
#define __arch_atomic_exchange_and_add_64(mem, value) \
({ \
--
2.1.0