[PATCH v2] PowerPC: libc single-thread lock optimization


Hi,

Following comments on my first patch to optimize the single-thread internal
glibc locking/atomics [1], I have changed the implementation to now use
relaxed atomics instead.  Addressing the concerns raised in the last
discussion, the primitives are still signal-safe (although not thread-safe),
so if a future malloc implementation is changed to be async-signal-safe, it
won't require adjusting the powerpc atomics.
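
To make the idea concrete outside of glibc's macro machinery, here is a
minimal stand-alone sketch using C11 atomics (not the patch's actual code;
__process_multiple_threads is a hypothetical stand-in for the
header.multiple_threads TCB field the patch reads via THREAD_GETMEM):

  #include <stdatomic.h>
  #include <stdbool.h>

  /* Hypothetical stand-in for the TCB multiple_threads field; it is
     set once the process creates a second thread.  */
  extern bool __process_multiple_threads;

  /* Acquire exchange that skips the memory barrier while the process
     is single-threaded.  A relaxed atomic RMW is still one indivisible
     operation, so a signal handler interrupting it observes either the
     old or the new value, never a torn one -- which is why the fast
     path remains signal-safe.  */
  static inline int
  exchange_acq (atomic_int *mem, int value)
  {
    if (!__process_multiple_threads)
      return atomic_exchange_explicit (mem, value, memory_order_relaxed);
    return atomic_exchange_explicit (mem, value, memory_order_acquire);
  }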

For catomic_and and catomic_or I followed the definitions in
'include/atomic.h' (which powerpc currently uses) and implemented the
atomics with acquire semantics.  The new implementation is also simpler.
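
For reference, the generic fallback in 'include/atomic.h' is essentially a
compare-and-swap retry loop with acquire semantics, along these lines (a
sketch from memory; the exact glibc definition may differ in details):

  /* Retry an acquire CAS until the masked value is stored;
     catomic_compare_and_exchange_bool_acq returns nonzero on failure.  */
  #define catomic_and(mem, mask)                                        \
    do                                                                  \
      {                                                                 \
        __typeof (*(mem)) __old, __mask = (mask);                       \
        do                                                              \
          __old = *(mem);                                               \
        while (catomic_compare_and_exchange_bool_acq (mem,              \
                                                      __old & __mask,   \
                                                      __old));          \
      }                                                                 \
    while (0)

The powerpc versions added below do the same job in a single lwarx/stwcx.
loop rather than a CAS retry loop, which is where the simplification comes
from.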

On synthetic benchmarks it shows an improvement of 5-10% for malloc
calls and a performance increase of 7-8% in 483.xalancbmk from
SPEC CPU2006 (numbers from a POWER8 machine).

Checked on powerpc64, powerpc32 and powerpc64le.

[1] https://sourceware.org/ml/libc-alpha/2014-05/msg00118.html

--

	* sysdeps/powerpc/bits/atomic.h
	(__arch_compare_and_exchange_val_32_relaxed): New macro: atomic compare
	and exchange with relaxed semantics.
	(atomic_compare_and_exchange_val_relaxed): Likewise.
	(__arch_atomic_exchange_32_relaxed): New macro: atomic exchange with
	relaxed semantics.
	(atomic_exchange_relaxed): Likewise.
	(__arch_atomic_and_32): New macro: atomic bitwise and with acquire
	semantics.
	(__arch_atomic_and_32_relaxed): New macro: atomic bitwise and with
	relaxed semantics.
	(atomic_and_relaxed): Likewise.
	(__arch_atomic_or_32): New macro: atomic bitwise or with acquire
	semantics.
	(__arch_atomic_or_32_relaxed): New macro: atomic bitwise or with
	relaxed semantics.
	(atomic_or_relaxed): Likewise.
	(__atomic_is_single_thread): New macro: check whether the program is
	single-threaded.
	(atomic_compare_and_exchange_val_acq): Add relaxed operation for
	the single-thread case.
	(atomic_compare_and_exchange_val_rel): Likewise.
	(atomic_exchange_acq): Likewise.
	(atomic_exchange_rel): Likewise.
	(catomic_and): Likewise.
	(catomic_or): Likewise.
	* sysdeps/powerpc/powerpc32/bits/atomic.h
	(__arch_compare_and_exchange_val_64_relaxed): New macro: empty
	implementation.
	(__arch_atomic_exchange_64_relaxed): Likewise.
	* sysdeps/powerpc/powerpc64/bits/atomic.h
	(__arch_compare_and_exchange_val_64_relaxed): New macro: atomic compare
	and exchange with relaxed semantics.
	(__arch_atomic_exchange_64_relaxed): New macro: atomic exchange with
	relaxed semantics.
	(__arch_atomic_and_64_relaxed): New macro: atomic bitwise and with
	relaxed semantics.
	(__arch_atomic_and_64): New macro: atomic bitwise and with acquire
	semantics.
	(__arch_atomic_or_64_relaxed): New macro: atomic bitwise or with
	relaxed semantics.
	(__arch_atomic_or_64): New macro: atomic bitwise or with acquire
	semantics.

---

diff --git a/sysdeps/powerpc/bits/atomic.h b/sysdeps/powerpc/bits/atomic.h
index 2ffba48..be590c7 100644
--- a/sysdeps/powerpc/bits/atomic.h
+++ b/sysdeps/powerpc/bits/atomic.h
@@ -27,6 +27,7 @@
  */
 
 #include <stdint.h>
+#include <tls.h>
 
 typedef int32_t atomic32_t;
 typedef uint32_t uatomic32_t;
@@ -113,6 +114,23 @@ typedef uintmax_t uatomic_max_t;
       __tmp;								      \
   })
 
+#define __arch_compare_and_exchange_val_32_relaxed(mem, newval, oldval)	      \
+  ({									      \
+      __typeof (*(mem)) __tmp;						      \
+      __typeof (mem)  __memp = (mem);					      \
+      __asm __volatile (						      \
+		        "1:	lwarx	%0,0,%1\n"			      \
+		        "	cmpw	%0,%2\n"			      \
+		        "	bne	2f\n"				      \
+		        "	stwcx.	%3,0,%1\n"			      \
+		        "	bne-	1b\n"				      \
+		        "2:	"					      \
+		        : "=&r" (__tmp)					      \
+		        : "b" (__memp), "r" (oldval), "r" (newval)	      \
+		        : "cr0", "memory");				      \
+      __tmp;								      \
+  })
+
 #define __arch_atomic_exchange_32_acq(mem, value)			      \
   ({									      \
     __typeof (*mem) __val;						      \
@@ -127,6 +145,18 @@ typedef uintmax_t uatomic_max_t;
     __val;								      \
   })
 
+#define __arch_atomic_exchange_32_relaxed(mem, value)			      \
+  ({									      \
+    __typeof (*mem) __val;						      \
+    __asm __volatile ("1:	lwarx	%0,0,%2\n"			      \
+		      "		stwcx.	%3,0,%2\n"			      \
+		      "		bne-	1b\n"				      \
+		      : "=&r" (__val), "=m" (*mem)			      \
+		      : "b" (mem), "r" (value), "m" (*mem)		      \
+		      : "cr0", "memory");				      \
+    __val;								      \
+  })
+
 #define __arch_atomic_exchange_32_rel(mem, value) \
   ({									      \
     __typeof (*mem) __val;						      \
@@ -140,6 +170,7 @@ typedef uintmax_t uatomic_max_t;
     __val;								      \
   })
 
+
 #define __arch_atomic_exchange_and_add_32(mem, value) \
   ({									      \
     __typeof (*mem) __val, __tmp;					      \
@@ -153,6 +184,62 @@ typedef uintmax_t uatomic_max_t;
     __val;								      \
   })
 
+#define __arch_atomic_and_32(mem, value) \
+  ({									      \
+    __typeof (*mem) __val, __tmp;					      \
+    __asm __volatile ("1:	lwarx	%0,0,%3\n"			      \
+		      "		and	%1,%0,%4\n"			      \
+		      "		stwcx.	%1,0,%3\n"			      \
+		      "		bne-	1b\n"				      \
+		      "         " __ARCH_ACQ_INSTR			      \
+		      : "=&b" (__val), "=&r" (__tmp), "=m" (*mem)	      \
+		      : "b" (mem), "r" (value), "m" (*mem)		      \
+		      : "cr0", "memory");				      \
+    __val;								      \
+  })
+
+#define __arch_atomic_and_32_relaxed(mem, value) \
+  ({									      \
+    __typeof (*mem) __val, __tmp;					      \
+    __asm __volatile ("		\n"					      \
+		      "1:	lwarx	%0,0,%3\n"			      \
+		      "		and	%1,%0,%4\n"			      \
+		      "		stwcx.	%1,0,%3\n"			      \
+		      "		bne-	1b\n"				      \
+		      : "=&b" (__val), "=&r" (__tmp), "=m" (*mem)	      \
+		      : "b" (mem), "r" (value), "m" (*mem)		      \
+		      : "cr0", "memory");				      \
+    __val;								      \
+  })
+
+#define __arch_atomic_or_32(mem, value) \
+  ({									      \
+    __typeof (*mem) __val, __tmp;					      \
+    __asm __volatile ("1:	lwarx	%0,0,%3\n"			      \
+		      "		or	%1,%0,%4\n"			      \
+		      "		stwcx.	%1,0,%3\n"			      \
+		      "		bne-	1b\n"				      \
+		      "         " __ARCH_ACQ_INSTR	 		      \
+		      : "=&b" (__val), "=&r" (__tmp), "=m" (*mem)	      \
+		      : "b" (mem), "r" (value), "m" (*mem)		      \
+		      : "cr0", "memory");				      \
+    __val;								      \
+  })
+
+#define __arch_atomic_or_32_relaxed(mem, value) \
+  ({									      \
+    __typeof (*mem) __val, __tmp;					      \
+    __asm __volatile ("		\n"					      \
+		      "1:	lwarx	%0,0,%3\n"			      \
+		      "		or	%1,%0,%4\n"			      \
+		      "		stwcx.	%1,0,%3\n"			      \
+		      "		bne-	1b\n"				      \
+		      : "=&b" (__val), "=&r" (__tmp), "=m" (*mem)	      \
+		      : "b" (mem), "r" (value), "m" (*mem)		      \
+		      : "cr0", "memory");				      \
+    __val;								      \
+  })
+
 #define __arch_atomic_increment_val_32(mem) \
   ({									      \
     __typeof (*(mem)) __val;						      \
@@ -194,10 +281,27 @@ typedef uintmax_t uatomic_max_t;
      __val;								      \
   })
 
-#define atomic_compare_and_exchange_val_acq(mem, newval, oldval) \
+#define __atomic_is_single_thread                                             \
+  (THREAD_GETMEM (THREAD_SELF, header.multiple_threads) == 0)
+
+#define atomic_compare_and_exchange_val_relaxed(mem, newval, oldval) \
   ({									      \
     __typeof (*(mem)) __result;						      \
     if (sizeof (*mem) == 4)						      \
+      __result = __arch_compare_and_exchange_val_32_relaxed(mem, newval, oldval); \
+    else if (sizeof (*mem) == 8)					      \
+      __result = __arch_compare_and_exchange_val_64_relaxed(mem, newval, oldval); \
+    else 								      \
+       abort ();							      \
+    __result;								      \
+  })
+
+#define atomic_compare_and_exchange_val_acq(mem, newval, oldval) \
+  ({									      \
+    __typeof (*(mem)) __result;						      \
+    if (__atomic_is_single_thread)					      \
+      __result = atomic_compare_and_exchange_val_relaxed (mem, newval, oldval); \
+    else if (sizeof (*mem) == 4)					      \
       __result = __arch_compare_and_exchange_val_32_acq(mem, newval, oldval); \
     else if (sizeof (*mem) == 8)					      \
       __result = __arch_compare_and_exchange_val_64_acq(mem, newval, oldval); \
@@ -209,7 +313,9 @@ typedef uintmax_t uatomic_max_t;
 #define atomic_compare_and_exchange_val_rel(mem, newval, oldval) \
   ({									      \
     __typeof (*(mem)) __result;						      \
-    if (sizeof (*mem) == 4)						      \
+    if (__atomic_is_single_thread)					      \
+      __result = atomic_compare_and_exchange_val_relaxed (mem, newval, oldval); \
+    else if (sizeof (*mem) == 4)					      \
       __result = __arch_compare_and_exchange_val_32_rel(mem, newval, oldval); \
     else if (sizeof (*mem) == 8)					      \
       __result = __arch_compare_and_exchange_val_64_rel(mem, newval, oldval); \
@@ -218,10 +324,24 @@ typedef uintmax_t uatomic_max_t;
     __result;								      \
   })
 
-#define atomic_exchange_acq(mem, value) \
+#define atomic_exchange_relaxed(mem, value) \
   ({									      \
     __typeof (*(mem)) __result;						      \
     if (sizeof (*mem) == 4)						      \
+      __result = __arch_atomic_exchange_32_relaxed (mem, value);	      \
+    else if (sizeof (*mem) == 8)					      \
+      __result = __arch_atomic_exchange_64_relaxed (mem, value);	      \
+    else 								      \
+       abort ();							      \
+    __result;								      \
+  })
+
+#define atomic_exchange_acq(mem, value)					      \
+  ({									      \
+    __typeof (*(mem)) __result;						      \
+    if (__atomic_is_single_thread)					      \
+      __result = atomic_exchange_relaxed (mem, value);			      \
+    else if (sizeof (*mem) == 4)					      \
       __result = __arch_atomic_exchange_32_acq (mem, value);		      \
     else if (sizeof (*mem) == 8)					      \
       __result = __arch_atomic_exchange_64_acq (mem, value);		      \
@@ -233,7 +353,9 @@ typedef uintmax_t uatomic_max_t;
 #define atomic_exchange_rel(mem, value) \
   ({									      \
     __typeof (*(mem)) __result;						      \
-    if (sizeof (*mem) == 4)						      \
+    if (__atomic_is_single_thread)					      \
+      __result = atomic_exchange_relaxed (mem, value);			      \
+    else if (sizeof (*mem) == 4)					      \
       __result = __arch_atomic_exchange_32_rel (mem, value);		      \
     else if (sizeof (*mem) == 8)					      \
       __result = __arch_atomic_exchange_64_rel (mem, value);		      \
@@ -294,3 +416,55 @@ typedef uintmax_t uatomic_max_t;
        abort ();							      \
     __result;								      \
   })
+
+#define atomic_and_relaxed(mem, arg) \
+  ({									      \
+    __typeof (*(mem)) __result;						      \
+    if (sizeof (*mem) == 4)						      \
+      __result = __arch_atomic_and_32_relaxed(mem, arg);		      \
+    else if (sizeof (*mem) == 8)					      \
+      __result = __arch_atomic_and_64_relaxed(mem, arg);		      \
+    else 								      \
+       abort ();							      \
+    __result;								      \
+  })
+
+#define catomic_and(mem, arg) \
+  ({									      \
+    __typeof (*(mem)) __result;						      \
+    if (__atomic_is_single_thread)					      \
+      __result = atomic_and_relaxed (mem, arg); \
+    else if (sizeof (*mem) == 4)					      \
+      __result = __arch_atomic_and_32(mem, arg); \
+    else if (sizeof (*mem) == 8)					      \
+      __result = __arch_atomic_and_64(mem, arg); \
+    else 								      \
+       abort ();							      \
+    __result;								      \
+  })
+
+#define atomic_or_relaxed(mem, arg) \
+  ({									      \
+    __typeof (*(mem)) __result;						      \
+    if (sizeof (*mem) == 4)						      \
+      __result = __arch_atomic_or_32_relaxed(mem, arg);			      \
+    else if (sizeof (*mem) == 8)					      \
+      __result = __arch_atomic_or_64_relaxed(mem, arg);			      \
+    else 								      \
+       abort ();							      \
+    __result;								      \
+  })
+
+#define catomic_or(mem, arg) \
+  ({									      \
+    __typeof (*(mem)) __result;						      \
+    if (__atomic_is_single_thread)					      \
+      __result = atomic_or_relaxed (mem, arg); \
+    else if (sizeof (*mem) == 4)					      \
+      __result = __arch_atomic_or_32(mem, arg); \
+    else if (sizeof (*mem) == 8)					      \
+      __result = __arch_atomic_or_64(mem, arg); \
+    else 								      \
+       abort ();							      \
+    __result;								      \
+  })
diff --git a/sysdeps/powerpc/powerpc32/bits/atomic.h b/sysdeps/powerpc/powerpc32/bits/atomic.h
index 7613bdc..1b9f82a 100644
--- a/sysdeps/powerpc/powerpc32/bits/atomic.h
+++ b/sysdeps/powerpc/powerpc32/bits/atomic.h
@@ -83,6 +83,9 @@
 #define __arch_compare_and_exchange_bool_64_rel(mem, newval, oldval) \
   (abort (), 0)
 
+#define __arch_compare_and_exchange_val_64_relaxed(mem, newval, oldval) \
+  (abort (), (__typeof (*mem)) 0)
+
 #define __arch_compare_and_exchange_val_64_rel(mem, newval, oldval) \
   (abort (), (__typeof (*mem)) 0)
 
@@ -92,6 +95,9 @@
 #define __arch_atomic_exchange_64_rel(mem, value) \
     ({ abort (); (*mem) = (value); })
 
+#define __arch_atomic_exchange_64_relaxed(mem, value) \
+    ({ abort (); (*mem) = (value); })
+
 #define __arch_atomic_exchange_and_add_64(mem, value) \
     ({ abort (); (*mem) = (value); })
 
diff --git a/sysdeps/powerpc/powerpc64/bits/atomic.h b/sysdeps/powerpc/powerpc64/bits/atomic.h
index 527fe7c..b8a1035 100644
--- a/sysdeps/powerpc/powerpc64/bits/atomic.h
+++ b/sysdeps/powerpc/powerpc64/bits/atomic.h
@@ -143,6 +143,23 @@
       __tmp;								      \
   })
 
+#define __arch_compare_and_exchange_val_64_relaxed(mem, newval, oldval) \
+  ({									      \
+      __typeof (*(mem)) __tmp;						      \
+      __typeof (mem)  __memp = (mem);					      \
+      __asm __volatile ("\n"				      \
+		        "1:	ldarx	%0,0,%1\n"	      \
+		        "	cmpd	%0,%2\n"			      \
+		        "	bne	2f\n"				      \
+		        "	stdcx.	%3,0,%1\n"			      \
+		        "	bne-	1b\n"				      \
+		        "2:	"					      \
+		        : "=&r" (__tmp)					      \
+		        : "b" (__memp), "r" (oldval), "r" (newval)	      \
+		        : "cr0", "memory");				      \
+      __tmp;								      \
+  })
+
 #define __arch_atomic_exchange_64_acq(mem, value) \
     ({									      \
       __typeof (*mem) __val;						      \
@@ -170,6 +187,18 @@
       __val;								      \
     })
 
+#define __arch_atomic_exchange_64_relaxed(mem, value) \
+    ({									      \
+      __typeof (*mem) __val;						      \
+      __asm __volatile ("1:	ldarx	%0,0,%2\n"			      \
+			"	stdcx.	%3,0,%2\n"			      \
+			"	bne-	1b"				      \
+			: "=&r" (__val), "=m" (*mem)			      \
+			: "b" (mem), "r" (value), "m" (*mem)		      \
+			: "cr0", "memory");				      \
+      __val;								      \
+    })
+
 #define __arch_atomic_exchange_and_add_64(mem, value) \
     ({									      \
       __typeof (*mem) __val, __tmp;					      \
@@ -224,6 +253,60 @@
      __val;								      \
   })
 
+#define __arch_atomic_and_64_relaxed(mem, value) \
+  ({									      \
+    __typeof (*mem) __val, __tmp;					      \
+    __asm __volatile ("1:	ldarx	%0,0,%3\n"			      \
+		      "		and	%1,%0,%4\n"			      \
+		      "		stdcx.	%1,0,%3\n"			      \
+		      "		bne-	1b\n"				      \
+		      : "=&b" (__val), "=&r" (__tmp), "=m" (*mem)	      \
+		      : "b" (mem), "r" (value), "m" (*mem)		      \
+		      : "cr0", "memory");				      \
+    __val;								      \
+  })
+
+#define __arch_atomic_and_64(mem, value) \
+  ({									      \
+    __typeof (*mem) __val, __tmp;					      \
+    __asm __volatile ("1:	ldarx	%0,0,%3\n"			      \
+		      "		and	%1,%0,%4\n"			      \
+		      "		stdcx.	%1,0,%3\n"			      \
+		      "		bne-	1b\n"				      \
+		      "       " __ARCH_ACQ_INSTR			      \
+		      : "=&b" (__val), "=&r" (__tmp), "=m" (*mem)	      \
+		      : "b" (mem), "r" (value), "m" (*mem)		      \
+		      : "cr0", "memory");				      \
+    __val;								      \
+  })
+
+#define __arch_atomic_or_64_relaxed(mem, value)				      \
+  ({									      \
+    __typeof (*mem) __val, __tmp;					      \
+    __asm __volatile ("1:	ldarx	%0,0,%3\n"			      \
+		      "		or	%1,%0,%4\n"			      \
+		      "		stdcx.	%1,0,%3\n"			      \
+		      "		bne-	1b\n"				      \
+		      : "=&b" (__val), "=&r" (__tmp), "=m" (*mem)	      \
+		      : "b" (mem), "r" (value), "m" (*mem)		      \
+		      : "cr0", "memory");				      \
+    __val;								      \
+  })
+
+#define __arch_atomic_or_64(mem, value)					      \
+  ({									      \
+    __typeof (*mem) __val, __tmp;					      \
+    __asm __volatile ("1:	ldarx	%0,0,%3\n"			      \
+		      "		or	%1,%0,%4\n"			      \
+		      "		stdcx.	%1,0,%3\n"			      \
+		      "		bne-	1b\n"				      \
+		      "       " __ARCH_ACQ_INSTR			      \
+		      : "=&b" (__val), "=&r" (__tmp), "=m" (*mem)	      \
+		      : "b" (mem), "r" (value), "m" (*mem)		      \
+		      : "cr0", "memory");				      \
+    __val;								      \
+  })
+
 /*
  * All powerpc64 processors support the new "light weight"  sync (lwsync).
  */

