[PATCH v2] PowerPC: libc single-thread lock optimization
- From: Adhemerval Zanella <azanella at linux dot vnet dot ibm dot com>
- To: "GNU C. Library" <libc-alpha at sourceware dot org>
- Date: Fri, 22 Aug 2014 10:50:11 -0300
- Subject: [PATCH v2] PowerPC: libc single-thread lock optimization
Hi,
Following comments on my first patch to optimize single-thread internal
glibc locking/atomics [1], I have changed the implementation to use
relaxed atomics instead.  Addressing the concerns raised in the last
discussion, the primitives are still signal-safe (although not
thread-safe), so if a future malloc implementation is changed to be
async-safe, it won't require adjusting the powerpc atomics.
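To illustrate the distinction, here is a rough sketch of the two orderings
written with the GCC __atomic builtins (the function names below are made
up for this mail; the patch itself adds hand-written lwarx/stwcx. sequences,
shown further down).  Both forms perform the update as one indivisible
operation, so a signal handler interrupting them can only observe the old
or the new value; only the trailing barrier differs:

#include <stdbool.h>
#include <stdint.h>

/* Sketch only: conceptual equivalents of the acquire and relaxed
   32-bit compare-and-exchange.  Each returns the previous value of
   *mem, as the glibc macros do.  */

static inline uint32_t
cas32_acq_sketch (uint32_t *mem, uint32_t newval, uint32_t oldval)
{
  uint32_t prev = oldval;
  __atomic_compare_exchange_n (mem, &prev, newval, false,
                               __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE);
  return prev;
}

static inline uint32_t
cas32_relaxed_sketch (uint32_t *mem, uint32_t newval, uint32_t oldval)
{
  uint32_t prev = oldval;
  __atomic_compare_exchange_n (mem, &prev, newval, false,
                               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  return prev;
}

In the patch, the relaxed variants simply drop the trailing isync (or
leading lwsync) that the acquire/release variants issue; the
lwarx/stwcx. retry loop itself is unchanged.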
For catomic_and and catomic_or I followed the definitions in
'include/atomic.h' (which powerpc currently uses) and implemented the
atomics with acquire semantics.  The new implementation is also simpler.
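To make the dispatch concrete, here is a trimmed sketch of the shape of the
new catomic_or (the name catomic_or_sketch is only for this mail; the real
macro in the patch below also handles the 8-byte case and aborts on
unsupported sizes, and __atomic_is_single_thread needs <tls.h>):

/* Trimmed sketch: use the barrier-free variant while the process is
   single-threaded, the acquire variant once other threads exist.  */
#define __atomic_is_single_thread \
  (THREAD_GETMEM (THREAD_SELF, header.multiple_threads) == 0)

#define catomic_or_sketch(mem, arg)                  \
  ({                                                 \
    __typeof (*(mem)) __result;                      \
    if (__atomic_is_single_thread)                   \
      __result = atomic_or_relaxed (mem, arg);       \
    else                                             \
      __result = __arch_atomic_or_32 (mem, arg);     \
    __result;                                        \
  })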
On synthetic benchmarks it shows an improvement of 5-10% for malloc
calls and a performance increase of 7-8% in 483.xalancbmk from
SPEC CPU2006 (numbers from a POWER8 machine).
Checked on powerpc64, powerpc32 and powerpc64le.
[1] https://sourceware.org/ml/libc-alpha/2014-05/msg00118.html
--
* sysdeps/powerpc/bits/atomic.h
(__arch_compare_and_exchange_val_32_relaxed): New macro: atomic compare
and exchange with relaxed semantics.
(atomic_compare_and_exchange_val_relaxed): Likewise.
(__arch_atomic_exchange_32_relaxed): New macro: atomic exchange with
relaxed semantics.
(atomic_exchange_relaxed): Likewise.
(__arch_atomic_and_32): New macro: atomic bitwise and with acquire
semantics.
(__arch_atomic_and_32_relaxed): New macro: atomic bitwise and with
relaxed semantics.
(atomic_and_relaxed): Likewise.
(__arch_atomic_or_32): New macro: atomic bitwise or with acquire
semantics.
(__arch_atomic_or_32_relaxed): New macro: atomic bitwise or with
relaxed semantics.
(atomic_or_relaxed): Likewise.
(__atomic_is_single_thread): New macro: check whether the program is
single-threaded.
(atomic_compare_and_exchange_val_acq): Use the relaxed operation in
the single-thread case.
(atomic_compare_and_exchange_val_rel): Likewise.
(atomic_exchange_acq): Likewise.
(atomic_exchange_rel): Likewise.
(catomic_and): Likewise.
(catomic_or): Likewise.
* sysdeps/powerpc/powerpc32/bits/atomic.h
(__arch_compare_and_exchange_val_64_relaxed): New macro: empty
implementation.
(__arch_atomic_exchange_64_relaxed): Likewise.
* sysdeps/powerpc/powerpc64/bits/atomic.h
(__arch_compare_and_exchange_val_64_relaxed): New macro: atomic compare
and exchange with relaxed semantics.
(__arch_atomic_exchange_64_relaxed): New macro: atomic exchange with
relaxed semantics.
(__arch_atomic_and_64_relaxed): New macro: atomic bitwise and with
relaxed semantics.
(__arch_atomic_and_64): New macro: atomic bitwise and with acquire
semantics.
(__arch_atomic_or_64_relaxed): New macro: atomic bitwise or with
relaxed semantics.
(__arch_atomic_or_64): New macro: atomic bitwise or with acquire
semantics.
---
diff --git a/sysdeps/powerpc/bits/atomic.h b/sysdeps/powerpc/bits/atomic.h
index 2ffba48..be590c7 100644
--- a/sysdeps/powerpc/bits/atomic.h
+++ b/sysdeps/powerpc/bits/atomic.h
@@ -27,6 +27,7 @@
*/
#include <stdint.h>
+#include <tls.h>
typedef int32_t atomic32_t;
typedef uint32_t uatomic32_t;
@@ -113,6 +114,23 @@ typedef uintmax_t uatomic_max_t;
__tmp; \
})
+#define __arch_compare_and_exchange_val_32_relaxed(mem, newval, oldval) \
+ ({ \
+ __typeof (*(mem)) __tmp; \
+ __typeof (mem) __memp = (mem); \
+ __asm __volatile ( \
+ "1: lwarx %0,0,%1\n" \
+ " cmpw %0,%2\n" \
+ " bne 2f\n" \
+ " stwcx. %3,0,%1\n" \
+ " bne- 1b\n" \
+ "2: " \
+ : "=&r" (__tmp) \
+ : "b" (__memp), "r" (oldval), "r" (newval) \
+ : "cr0", "memory"); \
+ __tmp; \
+ })
+
#define __arch_atomic_exchange_32_acq(mem, value) \
({ \
__typeof (*mem) __val; \
@@ -127,6 +145,18 @@ typedef uintmax_t uatomic_max_t;
__val; \
})
+#define __arch_atomic_exchange_32_relaxed(mem, value) \
+ ({ \
+ __typeof (*mem) __val; \
+ __asm __volatile ("1: lwarx %0,0,%2\n" \
+ " stwcx. %3,0,%2\n" \
+ " bne- 1b\n" \
+ : "=&r" (__val), "=m" (*mem) \
+ : "b" (mem), "r" (value), "m" (*mem) \
+ : "cr0", "memory"); \
+ __val; \
+ })
+
#define __arch_atomic_exchange_32_rel(mem, value) \
({ \
__typeof (*mem) __val; \
@@ -140,6 +170,7 @@ typedef uintmax_t uatomic_max_t;
__val; \
})
+
#define __arch_atomic_exchange_and_add_32(mem, value) \
({ \
__typeof (*mem) __val, __tmp; \
@@ -153,6 +184,62 @@ typedef uintmax_t uatomic_max_t;
__val; \
})
+#define __arch_atomic_and_32(mem, value) \
+ ({ \
+ __typeof (*mem) __val, __tmp; \
+ __asm __volatile ("1: lwarx %0,0,%3\n" \
+ " add %1,%0,%4\n" \
+ " stwcx. %1,0,%3\n" \
+ " bne- 1b\n" \
+ " " __ARCH_ACQ_INSTR \
+ : "=&b" (__val), "=&r" (__tmp), "=m" (*mem) \
+ : "b" (mem), "r" (value), "m" (*mem) \
+ : "cr0", "memory"); \
+ __val; \
+ })
+
+#define __arch_atomic_and_32_relaxed(mem, value) \
+ ({ \
+ __typeof (*mem) __val, __tmp; \
+ __asm __volatile (" \n" \
+ "1: lwarx %0,0,%3\n" \
+ " and %1,%0,%4\n" \
+ " stwcx. %1,0,%3\n" \
+ " bne- 1b\n" \
+ : "=&b" (__val), "=&r" (__tmp), "=m" (*mem) \
+ : "b" (mem), "r" (value), "m" (*mem) \
+ : "cr0", "memory"); \
+ __val; \
+ })
+
+#define __arch_atomic_or_32(mem, value) \
+ ({ \
+ __typeof (*mem) __val, __tmp; \
+ __asm __volatile ("1: lwarx %0,0,%3\n" \
+ " or %1,%0,%4\n" \
+ " stwcx. %1,0,%3\n" \
+ " bne- 1b\n" \
+ " " __ARCH_ACQ_INSTR \
+ : "=&b" (__val), "=&r" (__tmp), "=m" (*mem) \
+ : "b" (mem), "r" (value), "m" (*mem) \
+ : "cr0", "memory"); \
+ __val; \
+ })
+
+#define __arch_atomic_or_32_relaxed(mem, value) \
+ ({ \
+ __typeof (*mem) __val, __tmp; \
+ __asm __volatile (" \n" \
+ "1: lwarx %0,0,%3\n" \
+ " or %1,%0,%4\n" \
+ " stwcx. %1,0,%3\n" \
+ " bne- 1b\n" \
+ : "=&b" (__val), "=&r" (__tmp), "=m" (*mem) \
+ : "b" (mem), "r" (value), "m" (*mem) \
+ : "cr0", "memory"); \
+ __val; \
+ })
+
#define __arch_atomic_increment_val_32(mem) \
({ \
__typeof (*(mem)) __val; \
@@ -194,10 +281,27 @@ typedef uintmax_t uatomic_max_t;
__val; \
})
-#define atomic_compare_and_exchange_val_acq(mem, newval, oldval) \
+#define __atomic_is_single_thread \
+ (THREAD_GETMEM (THREAD_SELF, header.multiple_threads) == 0)
+
+#define atomic_compare_and_exchange_val_relaxed(mem, newval, oldval) \
({ \
__typeof (*(mem)) __result; \
if (sizeof (*mem) == 4) \
+ __result = __arch_compare_and_exchange_val_32_relaxed(mem, newval, oldval); \
+ else if (sizeof (*mem) == 8) \
+ __result = __arch_compare_and_exchange_val_64_relaxed(mem, newval, oldval); \
+ else \
+ abort (); \
+ __result; \
+ })
+
+#define atomic_compare_and_exchange_val_acq(mem, newval, oldval) \
+ ({ \
+ __typeof (*(mem)) __result; \
+ if (__atomic_is_single_thread) \
+ __result = atomic_compare_and_exchange_val_relaxed (mem, newval, oldval); \
+ else if (sizeof (*mem) == 4) \
__result = __arch_compare_and_exchange_val_32_acq(mem, newval, oldval); \
else if (sizeof (*mem) == 8) \
__result = __arch_compare_and_exchange_val_64_acq(mem, newval, oldval); \
@@ -209,7 +313,9 @@ typedef uintmax_t uatomic_max_t;
#define atomic_compare_and_exchange_val_rel(mem, newval, oldval) \
({ \
__typeof (*(mem)) __result; \
- if (sizeof (*mem) == 4) \
+ if (__atomic_is_single_thread) \
+ __result = atomic_compare_and_exchange_val_relaxed (mem, newval, oldval); \
+ else if (sizeof (*mem) == 4) \
__result = __arch_compare_and_exchange_val_32_rel(mem, newval, oldval); \
else if (sizeof (*mem) == 8) \
__result = __arch_compare_and_exchange_val_64_rel(mem, newval, oldval); \
@@ -218,10 +324,24 @@ typedef uintmax_t uatomic_max_t;
__result; \
})
-#define atomic_exchange_acq(mem, value) \
+#define atomic_exchange_relaxed(mem, value) \
({ \
__typeof (*(mem)) __result; \
if (sizeof (*mem) == 4) \
+ __result = __arch_atomic_exchange_32_relaxed (mem, value); \
+ else if (sizeof (*mem) == 8) \
+ __result = __arch_atomic_exchange_64_relaxed (mem, value); \
+ else \
+ abort (); \
+ __result; \
+ })
+
+#define atomic_exchange_acq(mem, value) \
+ ({ \
+ __typeof (*(mem)) __result; \
+ if (__atomic_is_single_thread) \
+ __result = atomic_exchange_relaxed (mem, value); \
+ else if (sizeof (*mem) == 4) \
__result = __arch_atomic_exchange_32_acq (mem, value); \
else if (sizeof (*mem) == 8) \
__result = __arch_atomic_exchange_64_acq (mem, value); \
@@ -233,7 +353,9 @@ typedef uintmax_t uatomic_max_t;
#define atomic_exchange_rel(mem, value) \
({ \
__typeof (*(mem)) __result; \
- if (sizeof (*mem) == 4) \
+ if (__atomic_is_single_thread) \
+ __result = atomic_exchange_relaxed (mem, value); \
+ else if (sizeof (*mem) == 4) \
__result = __arch_atomic_exchange_32_rel (mem, value); \
else if (sizeof (*mem) == 8) \
__result = __arch_atomic_exchange_64_rel (mem, value); \
@@ -294,3 +416,55 @@ typedef uintmax_t uatomic_max_t;
abort (); \
__result; \
})
+
+#define atomic_and_relaxed(mem, arg) \
+ ({ \
+ __typeof (*(mem)) __result; \
+ if (sizeof (*mem) == 4) \
+ __result = __arch_atomic_and_32_relaxed(mem, arg); \
+ else if (sizeof (*mem) == 8) \
+ __result = __arch_atomic_and_64_relaxed(mem, arg); \
+ else \
+ abort (); \
+ __result; \
+ })
+
+#define catomic_and(mem, arg) \
+ ({ \
+ __typeof (*(mem)) __result; \
+ if (__atomic_is_single_thread) \
+ __result = atomic_and_relaxed (mem, arg); \
+ else if (sizeof (*mem) == 4) \
+ __result = __arch_atomic_and_32(mem, arg); \
+ else if (sizeof (*mem) == 8) \
+ __result = __arch_atomic_and_64(mem, arg); \
+ else \
+ abort (); \
+ __result; \
+ })
+
+#define atomic_or_relaxed(mem, arg) \
+ ({ \
+ __typeof (*(mem)) __result; \
+ if (sizeof (*mem) == 4) \
+ __result = __arch_atomic_or_32_relaxed(mem, arg); \
+ else if (sizeof (*mem) == 8) \
+ __result = __arch_atomic_or_64_relaxed(mem, arg); \
+ else \
+ abort (); \
+ __result; \
+ })
+
+#define catomic_or(mem, arg) \
+ ({ \
+ __typeof (*(mem)) __result; \
+ if (__atomic_is_single_thread) \
+ __result = atomic_or_relaxed (mem, arg); \
+ else if (sizeof (*mem) == 4) \
+ __result = __arch_atomic_or_32(mem, arg); \
+ else if (sizeof (*mem) == 8) \
+ __result = __arch_atomic_or_64(mem, arg); \
+ else \
+ abort (); \
+ __result; \
+ })
diff --git a/sysdeps/powerpc/powerpc32/bits/atomic.h b/sysdeps/powerpc/powerpc32/bits/atomic.h
index 7613bdc..1b9f82a 100644
--- a/sysdeps/powerpc/powerpc32/bits/atomic.h
+++ b/sysdeps/powerpc/powerpc32/bits/atomic.h
@@ -83,6 +83,9 @@
#define __arch_compare_and_exchange_bool_64_rel(mem, newval, oldval) \
(abort (), 0)
+#define __arch_compare_and_exchange_val_64_relaxed(mem, newval, oldval) \
+ (abort (), (__typeof (*mem)) 0)
+
#define __arch_compare_and_exchange_val_64_rel(mem, newval, oldval) \
(abort (), (__typeof (*mem)) 0)
@@ -92,6 +95,9 @@
#define __arch_atomic_exchange_64_rel(mem, value) \
({ abort (); (*mem) = (value); })
+#define __arch_atomic_exchange_64_relaxed(mem, value) \
+ ({ abort (); (*mem) = (value); })
+
#define __arch_atomic_exchange_and_add_64(mem, value) \
({ abort (); (*mem) = (value); })
diff --git a/sysdeps/powerpc/powerpc64/bits/atomic.h b/sysdeps/powerpc/powerpc64/bits/atomic.h
index 527fe7c..b8a1035 100644
--- a/sysdeps/powerpc/powerpc64/bits/atomic.h
+++ b/sysdeps/powerpc/powerpc64/bits/atomic.h
@@ -143,6 +143,23 @@
__tmp; \
})
+#define __arch_compare_and_exchange_val_64_relaxed(mem, newval, oldval) \
+ ({ \
+ __typeof (*(mem)) __tmp; \
+ __typeof (mem) __memp = (mem); \
+ __asm __volatile ("\n" \
+ "1: ldarx %0,0,%1\n" \
+ " cmpd %0,%2\n" \
+ " bne 2f\n" \
+ " stdcx. %3,0,%1\n" \
+ " bne- 1b\n" \
+ "2: " \
+ : "=&r" (__tmp) \
+ : "b" (__memp), "r" (oldval), "r" (newval) \
+ : "cr0", "memory"); \
+ __tmp; \
+ })
+
#define __arch_atomic_exchange_64_acq(mem, value) \
({ \
__typeof (*mem) __val; \
@@ -170,6 +187,18 @@
__val; \
})
+#define __arch_atomic_exchange_64_relaxed(mem, value) \
+ ({ \
+ __typeof (*mem) __val; \
+ __asm __volatile ("1: ldarx %0,0,%2\n" \
+ " stdcx. %3,0,%2\n" \
+ " bne- 1b" \
+ : "=&r" (__val), "=m" (*mem) \
+ : "b" (mem), "r" (value), "m" (*mem) \
+ : "cr0", "memory"); \
+ __val; \
+ })
+
#define __arch_atomic_exchange_and_add_64(mem, value) \
({ \
__typeof (*mem) __val, __tmp; \
@@ -224,6 +253,60 @@
__val; \
})
+#define __arch_atomic_and_64_relaxed(mem, value) \
+ ({ \
+ __typeof (*mem) __val, __tmp; \
+ __asm __volatile ("1: ldarx %0,0,%3\n" \
+ " and %1,%0,%4\n" \
+ " stdcx. %1,0,%3\n" \
+ " bne- 1b\n" \
+ : "=&b" (__val), "=&r" (__tmp), "=m" (*mem) \
+ : "b" (mem), "r" (value), "m" (*mem) \
+ : "cr0", "memory"); \
+ __val; \
+ })
+
+#define __arch_atomic_and_64(mem, value) \
+ ({ \
+ __typeof (*mem) __val, __tmp; \
+ __asm __volatile ("1: ldarx %0,0,%3\n" \
+ " and %1,%0,%4\n" \
+ " stdcx. %1,0,%3\n" \
+ " bne- 1b\n" \
+ " " __ARCH_ACQ_INSTR \
+ : "=&b" (__val), "=&r" (__tmp), "=m" (*mem) \
+ : "b" (mem), "r" (value), "m" (*mem) \
+ : "cr0", "memory"); \
+ __val; \
+ })
+
+#define __arch_atomic_or_64_relaxed(mem, value) \
+ ({ \
+ __typeof (*mem) __val, __tmp; \
+ __asm __volatile ("1: ldarx %0,0,%3\n" \
+ " or %1,%0,%4\n" \
+ " stdcx. %1,0,%3\n" \
+ " bne- 1b\n" \
+ : "=&b" (__val), "=&r" (__tmp), "=m" (*mem) \
+ : "b" (mem), "r" (value), "m" (*mem) \
+ : "cr0", "memory"); \
+ __val; \
+ })
+
+#define __arch_atomic_or_64(mem, value) \
+ ({ \
+ __typeof (*mem) __val, __tmp; \
+ __asm __volatile ("1: ldarx %0,0,%3\n" \
+ " or %1,%0,%4\n" \
+ " stdcx. %1,0,%3\n" \
+ " bne- 1b\n" \
+ " " __ARCH_ACQ_INSTR \
+ : "=&b" (__val), "=&r" (__tmp), "=m" (*mem) \
+ : "b" (mem), "r" (value), "m" (*mem) \
+ : "cr0", "memory"); \
+ __val; \
+ })
+
/*
* All powerpc64 processors support the new "light weight" sync (lwsync).
*/