This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH v3] PowerPC: libc single-thread lock optimization


I continued the work started by Adhemerval.  The discussion around version 2
of this patch is available at http://patchwork.sourceware.org/patch/2516/

Nowadays, we already require GCC 4.7, so we can safely rely on compiler
built-ins for most of our atomic primitives.

Changes since v2:
 - Updated ChangeLog and commit message.
 - Replaced the following atomic primitives by compiler built-ins:
   exchange*, and* and or*.

---8<---

Add relaxed atomics as a lock optimization.  Addressing the concerns
raised in previous discussions, the primitives are still signal-safe
(although not thread-safe), so if future implementations relying on
this code (e.g. malloc) are changed to be async-safe, they won't require
adjusting the powerpc atomics.

For catomic_and and catomic_or I followed the definition at 'include/atomic.h'
(which powerpc is currently using) and implemented the atomics with acquire
semantics.  The new implementation is based on compiler built-ins.

On synthetic benchmarks it shows an improvement of 5-10% for malloc
calls and a performance increase of 7-8% in 483.xalancbmk from
SPEC CPU2006 (numbers from a POWER8 machine).

Checked on powerpc32, powerpc64 and powerpc64le.

2016-03-11  Adhemerval Zanella Netto  <azanella@linux.vnet.ibm.com>
            Tulio Magno Quites Machado Filho  <tuliom@linux.vnet.ibm.com>

	* malloc/malloc.c (malloc_consolidate): Replace 0 by NULL in
	order to match the type of p when calling atomic_exchange_acq().
	* sysdeps/powerpc/atomic-machine.h
	(__arch_atomic_exchange_32_acq): Removed.
	(__arch_atomic_exchange_32_rel): Likewise.
	(__arch_compare_and_exchange_val_32_relaxed): New macro: atomic compare
	and exchange with relaxed semantics.
	(atomic_compare_and_exchange_val_relaxed): Likewise.
	(__atomic_is_single_thread): New macro: check if program is
	single-thread.
	(atomic_compare_and_exchange_val_acq): Add relaxed operation for
	single-thread.
	(atomic_compare_and_exchange_val_rel): Likewise.
	(atomic_exchange_acq): Likewise.
	(atomic_exchange_rel): Likewise.
	(catomic_and): Add relaxed operation and use compiler built-ins.
	(catomic_or): Likewise.
	(atomic_exchange_acq): Modify to use compiler built-ins.
	(atomic_exchange_rel): Likewise.
	* sysdeps/powerpc/powerpc32/atomic-machine.h
	(__arch_compare_and_exchange_val_64_relaxed): New macro: add empty
	implementation.
	(__arch_atomic_exchange_64_relaxed): Likewise.
	* sysdeps/powerpc/powerpc64/atomic-machine.h
	(__arch_compare_and_exchange_val_64_relaxed): New macro: atomic compare
	and exchange with relaxed semantics.
	(__arch_atomic_exchange_64_acq): Removed.
	(__arch_atomic_exchange_64_rel): Removed.
---
 malloc/malloc.c                            |   2 +-
 sysdeps/powerpc/atomic-machine.h           | 128 ++++++++++++++++++-----------
 sysdeps/powerpc/powerpc32/atomic-machine.h |   6 ++
 sysdeps/powerpc/powerpc64/atomic-machine.h |  38 ++++-----
 4 files changed, 103 insertions(+), 71 deletions(-)

diff --git a/malloc/malloc.c b/malloc/malloc.c
index b8a43bf..1eed794 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -4150,7 +4150,7 @@ static void malloc_consolidate(mstate av)
     maxfb = &fastbin (av, NFASTBINS - 1);
     fb = &fastbin (av, 0);
     do {
-      p = atomic_exchange_acq (fb, 0);
+      p = atomic_exchange_acq (fb, NULL);
       if (p != 0) {
 	do {
 	  check_inuse_chunk(av, p);
diff --git a/sysdeps/powerpc/atomic-machine.h b/sysdeps/powerpc/atomic-machine.h
index 8b0e1e7..7e6c699 100644
--- a/sysdeps/powerpc/atomic-machine.h
+++ b/sysdeps/powerpc/atomic-machine.h
@@ -27,6 +27,7 @@
  */
 
 #include <stdint.h>
+#include <tls.h>
 
 typedef int32_t atomic32_t;
 typedef uint32_t uatomic32_t;
@@ -78,6 +79,9 @@ typedef uintmax_t uatomic_max_t;
 
 #define atomic_full_barrier()	__asm ("sync" ::: "memory")
 
+/* We can't convert __arch_compare_and_exchange_val_* to compiler built-ins
+   yet because the built-ins expect a pointer to the expected value while
+   our current implementation pass the value directly.  */
 #define __arch_compare_and_exchange_val_32_acq(mem, newval, oldval)	      \
   ({									      \
       __typeof (*(mem)) __tmp;						      \
@@ -112,33 +116,24 @@ typedef uintmax_t uatomic_max_t;
       __tmp;								      \
   })
 
-#define __arch_atomic_exchange_32_acq(mem, value)			      \
+#define __arch_compare_and_exchange_val_32_relaxed(mem, newval, oldval)	      \
   ({									      \
-    __typeof (*mem) __val;						      \
-    __asm __volatile (							      \
-		      "1:	lwarx	%0,0,%2" MUTEX_HINT_ACQ "\n"	      \
-		      "		stwcx.	%3,0,%2\n"			      \
-		      "		bne-	1b\n"				      \
-		      "   " __ARCH_ACQ_INSTR				      \
-		      : "=&r" (__val), "=m" (*mem)			      \
-		      : "b" (mem), "r" (value), "m" (*mem)		      \
-		      : "cr0", "memory");				      \
-    __val;								      \
-  })
-
-#define __arch_atomic_exchange_32_rel(mem, value) \
-  ({									      \
-    __typeof (*mem) __val;						      \
-    __asm __volatile (__ARCH_REL_INSTR "\n"				      \
-		      "1:	lwarx	%0,0,%2" MUTEX_HINT_REL "\n"	      \
-		      "		stwcx.	%3,0,%2\n"			      \
-		      "		bne-	1b"				      \
-		      : "=&r" (__val), "=m" (*mem)			      \
-		      : "b" (mem), "r" (value), "m" (*mem)		      \
-		      : "cr0", "memory");				      \
-    __val;								      \
+      __typeof (*(mem)) __tmp;						      \
+      __typeof (mem)  __memp = (mem);					      \
+      __asm __volatile (						      \
+		        "1:	lwarx	%0,0,%1\n"			      \
+		        "	cmpw	%0,%2\n"			      \
+		        "	bne	2f\n"				      \
+		        "	stwcx.	%3,0,%1\n"			      \
+		        "	bne-	1b\n"				      \
+		        "2:	"					      \
+		        : "=&r" (__tmp)					      \
+		        : "b" (__memp), "r" (oldval), "r" (newval)	      \
+		        : "cr0", "memory");				      \
+      __tmp;								      \
   })
 
+/* The following atomic primitives aren't available as compiler built-ins.  */
 #define __arch_atomic_exchange_and_add_32(mem, value) \
   ({									      \
     __typeof (*mem) __val, __tmp;					      \
@@ -221,10 +216,30 @@ typedef uintmax_t uatomic_max_t;
      __val;								      \
   })
 
+#define __atomic_is_single_thread				\
+  (THREAD_GETMEM (THREAD_SELF, header.multiple_threads) == 0)
+
+#define atomic_compare_and_exchange_val_relaxed(mem, newval, oldval)	      \
+  ({									      \
+    __typeof (*(mem)) __result;						      \
+    if (sizeof (*mem) == 4)						      \
+      __result = __arch_compare_and_exchange_val_32_relaxed(mem, newval,      \
+							    oldval);	      \
+    else if (sizeof (*mem) == 8)					      \
+      __result = __arch_compare_and_exchange_val_64_relaxed(mem, newval,      \
+							    oldval);	      \
+    else								      \
+       abort ();							      \
+    __result;								      \
+  })
+
 #define atomic_compare_and_exchange_val_acq(mem, newval, oldval) \
   ({									      \
     __typeof (*(mem)) __result;						      \
-    if (sizeof (*mem) == 4)						      \
+    if (__atomic_is_single_thread)					      \
+      __result = atomic_compare_and_exchange_val_relaxed (mem, newval,	      \
+							  oldval);	      \
+    else if (sizeof (*mem) == 4)					      \
       __result = __arch_compare_and_exchange_val_32_acq(mem, newval, oldval); \
     else if (sizeof (*mem) == 8)					      \
       __result = __arch_compare_and_exchange_val_64_acq(mem, newval, oldval); \
@@ -236,7 +251,10 @@ typedef uintmax_t uatomic_max_t;
 #define atomic_compare_and_exchange_val_rel(mem, newval, oldval) \
   ({									      \
     __typeof (*(mem)) __result;						      \
-    if (sizeof (*mem) == 4)						      \
+    if (__atomic_is_single_thread)					      \
+      __result = atomic_compare_and_exchange_val_relaxed (mem, newval,	      \
+							  oldval);	      \
+    else if (sizeof (*mem) == 4)					      \
       __result = __arch_compare_and_exchange_val_32_rel(mem, newval, oldval); \
     else if (sizeof (*mem) == 8)					      \
       __result = __arch_compare_and_exchange_val_64_rel(mem, newval, oldval); \
@@ -245,28 +263,24 @@ typedef uintmax_t uatomic_max_t;
     __result;								      \
   })
 
-#define atomic_exchange_acq(mem, value) \
-  ({									      \
-    __typeof (*(mem)) __result;						      \
-    if (sizeof (*mem) == 4)						      \
-      __result = __arch_atomic_exchange_32_acq (mem, value);		      \
-    else if (sizeof (*mem) == 8)					      \
-      __result = __arch_atomic_exchange_64_acq (mem, value);		      \
-    else 								      \
-       abort ();							      \
-    __result;								      \
+#define atomic_exchange_acq(mem, value)					\
+  ({									\
+    __typeof (value) __ret;						\
+    if (__atomic_is_single_thread)					\
+      __ret = __atomic_exchange_n ((mem), (value), __ATOMIC_RELAXED);	\
+    else								\
+      __ret = __atomic_exchange_n ((mem), (value), __ATOMIC_ACQUIRE);	\
+    __ret;								\
   })
 
-#define atomic_exchange_rel(mem, value) \
-  ({									      \
-    __typeof (*(mem)) __result;						      \
-    if (sizeof (*mem) == 4)						      \
-      __result = __arch_atomic_exchange_32_rel (mem, value);		      \
-    else if (sizeof (*mem) == 8)					      \
-      __result = __arch_atomic_exchange_64_rel (mem, value);		      \
-    else 								      \
-       abort ();							      \
-    __result;								      \
+#define atomic_exchange_rel(mem, value)					\
+  ({									\
+    __typeof (value) __ret;						\
+    if (__atomic_is_single_thread)					\
+      __ret = __atomic_exchange_n ((mem), (value), __ATOMIC_RELAXED);	\
+    else								\
+      __ret = __atomic_exchange_n ((mem), (value), __ATOMIC_RELEASE);	\
+    __ret;								\
   })
 
 #define atomic_exchange_and_add(mem, value) \
@@ -280,6 +294,7 @@ typedef uintmax_t uatomic_max_t;
        abort ();							      \
     __result;								      \
   })
+
 #define atomic_exchange_and_add_acq(mem, value) \
   ({									      \
     __typeof (*(mem)) __result;						      \
@@ -291,6 +306,7 @@ typedef uintmax_t uatomic_max_t;
        abort ();							      \
     __result;								      \
   })
+
 #define atomic_exchange_and_add_rel(mem, value) \
   ({									      \
     __typeof (*(mem)) __result;						      \
@@ -343,3 +359,23 @@ typedef uintmax_t uatomic_max_t;
        abort ();							      \
     __result;								      \
   })
+
+#define catomic_and(mem, arg)						\
+  ({									\
+    __typeof (arg) __ret;						\
+    if (__atomic_is_single_thread)					\
+      __ret = __atomic_fetch_and((mem), arg, __ATOMIC_RELAXED);		\
+    else								\
+      __ret = __atomic_fetch_and((mem), arg, __ATOMIC_ACQUIRE);		\
+    __ret;								\
+  })
+
+#define catomic_or(mem, arg)						\
+  ({									\
+    __typeof (arg) __ret;						\
+    if (__atomic_is_single_thread)					\
+      __ret = __atomic_fetch_or((mem), arg, __ATOMIC_RELAXED);		\
+    else								\
+      __ret = __atomic_fetch_or((mem), arg, __ATOMIC_ACQUIRE);		\
+    __ret;								\
+  })
diff --git a/sysdeps/powerpc/powerpc32/atomic-machine.h b/sysdeps/powerpc/powerpc32/atomic-machine.h
index 1d407b3..c733d43 100644
--- a/sysdeps/powerpc/powerpc32/atomic-machine.h
+++ b/sysdeps/powerpc/powerpc32/atomic-machine.h
@@ -86,6 +86,9 @@
 #define __arch_compare_and_exchange_bool_64_rel(mem, newval, oldval) \
   (abort (), 0)
 
+#define __arch_compare_and_exchange_val_64_relaxed(mem, newval, oldval) \
+  (abort (), (__typeof (*mem)) 0)
+
 #define __arch_compare_and_exchange_val_64_rel(mem, newval, oldval) \
   (abort (), (__typeof (*mem)) 0)
 
@@ -95,6 +98,9 @@
 #define __arch_atomic_exchange_64_rel(mem, value) \
     ({ abort (); (*mem) = (value); })
 
+#define __arch_atomic_exchange_64_relaxed(mem, value) \
+    ({ abort (); (*mem) = (value); })
+
 #define __arch_atomic_exchange_and_add_64(mem, value) \
     ({ abort (); (*mem) = (value); })
 
diff --git a/sysdeps/powerpc/powerpc64/atomic-machine.h b/sysdeps/powerpc/powerpc64/atomic-machine.h
index 751487a..515572e 100644
--- a/sysdeps/powerpc/powerpc64/atomic-machine.h
+++ b/sysdeps/powerpc/powerpc64/atomic-machine.h
@@ -146,32 +146,22 @@
       __tmp;								      \
   })
 
-#define __arch_atomic_exchange_64_acq(mem, value) \
-    ({									      \
-      __typeof (*mem) __val;						      \
-      __asm __volatile (__ARCH_REL_INSTR "\n"				      \
-			"1:	ldarx	%0,0,%2" MUTEX_HINT_ACQ "\n"	      \
-			"	stdcx.	%3,0,%2\n"			      \
+#define __arch_compare_and_exchange_val_64_relaxed(mem, newval, oldval)	      \
+  ({									      \
+      __typeof (*(mem)) __tmp;						      \
+      __typeof (mem)  __memp = (mem);					      \
+      __asm __volatile ("\n"						      \
+			"1:	ldarx	%0,0,%1\n"			      \
+			"	cmpd	%0,%2\n"			      \
+			"	bne	2f\n"				      \
+			"	stdcx.	%3,0,%1\n"			      \
 			"	bne-	1b\n"				      \
-		  " " __ARCH_ACQ_INSTR					      \
-			: "=&r" (__val), "=m" (*mem)			      \
-			: "b" (mem), "r" (value), "m" (*mem)		      \
+			"2:	"					      \
+			: "=&r" (__tmp)					      \
+			: "b" (__memp), "r" (oldval), "r" (newval)	      \
 			: "cr0", "memory");				      \
-      __val;								      \
-    })
-
-#define __arch_atomic_exchange_64_rel(mem, value) \
-    ({									      \
-      __typeof (*mem) __val;						      \
-      __asm __volatile (__ARCH_REL_INSTR "\n"				      \
-			"1:	ldarx	%0,0,%2" MUTEX_HINT_REL "\n"	      \
-			"	stdcx.	%3,0,%2\n"			      \
-			"	bne-	1b"				      \
-			: "=&r" (__val), "=m" (*mem)			      \
-			: "b" (mem), "r" (value), "m" (*mem)		      \
-			: "cr0", "memory");				      \
-      __val;								      \
-    })
+      __tmp;								      \
+  })
 
 #define __arch_atomic_exchange_and_add_64(mem, value) \
     ({									      \
-- 
2.1.0


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]