[SYSTEMTAP/PATCH 2/4] rt : replace read write lock with rcu
- From: Santosh Shukla <sshukla at mvista dot com>
- To: systemtap at sourceware dot org
- Cc: sshukla at mvista dot com
- Date: Tue, 9 Sep 2014 12:38:17 +0530
- Subject: [SYSTEMTAP/PATCH 2/4] rt : replace read write lock with rcu
- Authentication-results: sourceware.org; auth=none
- References: <1410246499-6938-1-git-send-email-sshukla at mvista dot com>
Noticed that in -rt mode, read_lock and write_lock lead to several BUG_ONs:
[ 1033.544821] ffff880035463d38 ffffffff815fcaed ffff880035463d90 ffffffff816057f7
[ 1033.544826] ffff880035463fd8 0000000000017e80 0000000000017e80 ffffffff810be5fb
[ 1033.544826] Call Trace:
[ 1033.544830] [<ffffffff81602329>] dump_stack+0x4e/0x7a
[ 1033.544834] [<ffffffff815fcaed>] __schedule_bug+0x9f/0xad
[ 1033.544838] [<ffffffff816057f7>] __schedule+0x627/0x6a0
[ 1033.544842] [<ffffffff810be5fb>] ? task_blocks_on_rt_mutex+0x19b/0x220
[ 1033.544846] [<ffffffff816058a0>] schedule+0x30/0xa0
[ 1033.544850] [<ffffffff8160727d>] rt_spin_lock_slowlock+0xbd/0x1f0
[ 1033.544856] [<ffffffff81607df5>] __rt_spin_lock+0x25/0x30
[ 1033.544858] [<ffffffff816080b0>] rt_read_lock+0x30/0x40
[ 1033.544861] [<ffffffff816080ce>] rt_read_lock_irqsave+0xe/0x20
[ 1033.544867] [<ffffffffa08a9b89>] __stp_tf_get_map_entry+0x19/0xc0 [stap_e40dcb2c46d7c0a2fb8d70ba343e393a_15235]
[ 1033.544873] [<ffffffffa08afedd>] __stp_utrace_task_finder_target_syscall_exit+0x5d/0x350 [stap_e40dcb2c46d7c0a2fb8d70ba343e393a_15235]
[ 1033.544889] [<ffffffffa08a91c5>] utrace_report_syscall_exit+0xc5/0x110 [stap_e40dcb2c46d7c0a2fb8d70ba343e393a_15235]
[ 1033.544893] [<ffffffff81023ce0>] syscall_trace_leave+0x100/0x130
[ 1033.544896] [<ffffffff8161090b>] int_check_syscall_exit_work+0x34/0x3d
[ 1033.544902] CPU: 5 PID: 20766 Comm: make Tainted: GF W O 3.14.12-rt-rt9+ #2
[ 1033.544904] Hardware name: Dell Inc. PowerEdge T620/0658N7, BIOS 1.4.6 10/26/2012
[ 1033.544912] ffff880236f57e80 ffff8801cab21d28 ffffffff81602329 ffff8802243064a0
[ 1033.544919] ffff8801cab21d38 ffffffff815fcaed ffff8801cab21d90 ffffffff816057f7
[ 1033.544924] ffff8801cab21fd8 0000000000017e80 0000000000017e80 ffffffff810be5fb
[ 1033.544925] Call Trace:
[ 1033.544929] [<ffffffff81602329>] dump_stack+0x4e/0x7a
[ 1033.544934] [<ffffffff815fcaed>] __schedule_bug+0x9f/0xad
[ 1033.544938] [<ffffffff816057f7>] __schedule+0x627/0x6a0
[ 1033.544942] [<ffffffff810be5fb>] ? task_blocks_on_rt_mutex+0x19b/0x220
[ 1033.544946] [<ffffffff816058a0>] schedule+0x30/0xa0
[ 1033.544951] [<ffffffff8160727d>] rt_spin_lock_slowlock+0xbd/0x1f0
[ 1033.544956] [<ffffffff81607df5>] __rt_spin_lock+0x25/0x30
[ 1033.544959] [<ffffffff816080b0>] rt_read_lock+0x30/0x40
[ 1033.544962] [<ffffffff816080ce>] rt_read_lock_irqsave+0xe/0x20
[ 1033.544968] [<ffffffffa08a9b89>] __stp_tf_get_map_entry+0x19/0xc0 [stap_e40dcb2c46d7c0a2fb8d70ba343e393a_15235]
[ 1033.544975] [<ffffffffa08afedd>] __stp_utrace_task_finder_target_syscall_exit+0x5d/0x350 [stap_e40dcb2c46d7c0a2fb8d70ba343e393a_15235]
[ 1033.544981] [<ffffffffa08a91c5>] utrace_report_syscall_exit+0xc5/0x110 [stap_e40dcb2c46d7c0a2fb8d70ba343e393a_15235]
[ 1033.544984] [<ffffffff81023ce0>] syscall_trace_leave+0x100/0x130
[ 1033.544988] [<ffffffff8161090b>] int_check_syscall_exit_work+0x34/0x3d
Replacing read_lock_irqsave with RCU suppresses these BUG_ONs in -rt mode.
The writer lock is likewise replaced with a raw spinlock, which behaves as a
normal spinlock on non-rt kernels. The hlist APIs are also changed to their
_rcu variants.
Signed-off-by: Santosh Shukla <sshukla@mvista.com>
---
runtime/linux/addr-map.c | 16 ++++++++-------
runtime/linux/runtime.h | 2 +-
runtime/linux/task_finder_map.c | 38 ++++++++++++++++++----------------
runtime/task_finder_vma.c | 43 ++++++++++++++++++++++-----------------
4 files changed, 55 insertions(+), 44 deletions(-)
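The same conversion pattern is applied in all three files; condensed into a
minimal standalone sketch (demo_lock, demo_list, demo_lookup and demo_add are
illustrative names, not from the patch), it looks as follows. Readers move
from read_lock_irqsave(), whose rwlock is a sleeping lock on -rt, to an RCU
read-side critical section; writers serialize on a raw spinlock, which is a
plain non-sleeping spinlock whether or not the kernel is -rt; and list updates
go through the _rcu hlist primitives so a lockless reader always sees a
consistent list. (The patch additionally keeps local_irq_save()/restore()
around the read side to preserve the irq-disabled semantics of the original
read_lock_irqsave(); the sketch omits that for brevity.)

	#include <linux/rculist.h>
	#include <linux/spinlock.h>

	static DEFINE_RAW_SPINLOCK(demo_lock);	/* serializes writers only */
	static HLIST_HEAD(demo_list);

	struct demo_entry {
		struct hlist_node hlist;
		pid_t pid;
		unsigned long data;
	};

	/* Read side: no lock is taken, so nothing can sleep here on -rt.
	 * Results are copied out before rcu_read_unlock(). */
	static int demo_lookup(pid_t pid, unsigned long *data)
	{
		struct demo_entry *e;
		int found = 0;

		rcu_read_lock();
		hlist_for_each_entry_rcu(e, &demo_list, hlist) {
			if (e->pid == pid) {
				*data = e->data;
				found = 1;
				break;
			}
		}
		rcu_read_unlock();
		return found;
	}

	/* Write side: raw spinlock plus the _rcu list primitive, so a
	 * concurrent reader sees either the old list or the new one,
	 * never a half-updated one. */
	static void demo_add(struct demo_entry *e)
	{
		unsigned long flags;

		raw_spin_lock_irqsave(&demo_lock, flags);
		hlist_add_head_rcu(&e->hlist, &demo_list);
		raw_spin_unlock_irqrestore(&demo_lock, flags);
	}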
diff --git a/runtime/linux/addr-map.c b/runtime/linux/addr-map.c
index 3f5aca7..679ea7d 100644
--- a/runtime/linux/addr-map.c
+++ b/runtime/linux/addr-map.c
@@ -28,7 +28,7 @@ struct addr_map
struct addr_map_entry entries[0];
};
-static DEFINE_RWLOCK(addr_map_lock);
+static DEFINE_RAW_SPINLOCK(addr_map_lock);
static struct addr_map* blackmap;
/* Find address of entry where we can insert a new one. */
@@ -127,9 +127,11 @@ lookup_bad_addr(unsigned long addr, size_t size)
#endif
/* Search for the given range in the black-listed map. */
- read_lock_irqsave(&addr_map_lock, flags);
+ rcu_read_lock();
+ local_irq_save(flags);
result = lookup_addr_aux(addr, size, blackmap);
- read_unlock_irqrestore(&addr_map_lock, flags);
+ local_irq_restore(flags);
+ rcu_read_unlock();
if (result)
return 1;
else
@@ -154,7 +156,7 @@ add_bad_addr_entry(unsigned long min_addr, unsigned long max_addr,
while (1)
{
size_t old_size = 0;
- write_lock_irqsave(&addr_map_lock, flags);
+ raw_spin_lock_irqsave(&addr_map_lock, flags);
old_map = blackmap;
if (old_map)
old_size = old_map->size;
@@ -163,7 +165,7 @@ add_bad_addr_entry(unsigned long min_addr, unsigned long max_addr,
added an entry while we were sleeping. */
if (!new_map || (new_map && new_map->size < old_size + 1))
{
- write_unlock_irqrestore(&addr_map_lock, flags);
+ raw_spin_unlock_irqrestore(&addr_map_lock, flags);
if (new_map)
{
_stp_kfree(new_map);
@@ -192,7 +194,7 @@ add_bad_addr_entry(unsigned long min_addr, unsigned long max_addr,
*existing_min = min_entry;
if (existing_max)
*existing_max = max_entry;
- write_unlock_irqrestore(&addr_map_lock, flags);
+ raw_spin_unlock_irqrestore(&addr_map_lock, flags);
_stp_kfree(new_map);
return 1;
}
@@ -210,7 +212,7 @@ add_bad_addr_entry(unsigned long min_addr, unsigned long max_addr,
(old_map->size - existing) * sizeof(*new_entry));
}
blackmap = new_map;
- write_unlock_irqrestore(&addr_map_lock, flags);
+ raw_spin_unlock_irqrestore(&addr_map_lock, flags);
if (old_map)
_stp_kfree(old_map);
return 0;
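One consequence of moving the blackmap readers to RCU: the writer no longer
excludes them, so a reader may still be walking old_map at the moment blackmap
is repointed at new_map. The textbook way to reclaim the old map is to publish
the new pointer and then wait out a grace period before freeing, roughly as in
the sketch below; whether synchronize_rcu() is usable here depends on
add_bad_addr_entry() being called from sleepable context, which this sketch
assumes.

	raw_spin_lock_irqsave(&addr_map_lock, flags);
	old_map = blackmap;
	rcu_assign_pointer(blackmap, new_map);	/* publish new map */
	raw_spin_unlock_irqrestore(&addr_map_lock, flags);

	synchronize_rcu();	/* wait for in-flight readers of old_map */
	if (old_map)
		_stp_kfree(old_map);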
diff --git a/runtime/linux/runtime.h b/runtime/linux/runtime.h
index 76dbea4..0267808 100644
--- a/runtime/linux/runtime.h
+++ b/runtime/linux/runtime.h
@@ -74,7 +74,7 @@ static void _stp_exit(void);
#ifdef STAPCONF_HLIST_4ARGS
-#define stap_hlist_for_each_entry(a,b,c,d) hlist_for_each_entry(a,b,c,d)
+#define stap_hlist_for_each_entry(a,b,c,d) hlist_for_each_entry_rcu(a,b,c,d)
#define stap_hlist_for_each_entry_safe(a,b,c,d,e) hlist_for_each_entry_safe(a,b,c,d,e)
#else
#define stap_hlist_for_each_entry(a,b,c,d) (void) b; hlist_for_each_entry(a,c,d)
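Note that the #else branch, for kernels whose hlist_for_each_entry takes three
arguments (which presumably includes the 3.14-rt kernel in the traces above),
still expands to the plain iterator; for the RCU read side to be complete on
those kernels it would presumably need the same treatment, along the lines of:

	#define stap_hlist_for_each_entry(a,b,c,d) (void) b; hlist_for_each_entry_rcu(a,c,d)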
diff --git a/runtime/linux/task_finder_map.c b/runtime/linux/task_finder_map.c
index d515e9f..0256393 100644
--- a/runtime/linux/task_finder_map.c
+++ b/runtime/linux/task_finder_map.c
@@ -13,7 +13,7 @@
// contents in interrupt context (which should only ever call
// stap_find_map_map_info for getting stored info). So we might
// want to look into that if this seems a bottleneck.
-static DEFINE_RWLOCK(__stp_tf_map_lock);
+static DEFINE_RAW_SPINLOCK(__stp_tf_map_lock);
#define __STP_TF_HASH_BITS 4
#define __STP_TF_TABLE_SIZE (1 << __STP_TF_HASH_BITS)
@@ -51,11 +51,11 @@ __stp_tf_map_initialize(void)
struct hlist_head *head = &__stp_tf_map_free_list[0];
unsigned long flags;
- write_lock_irqsave(&__stp_tf_map_lock, flags);
+ raw_spin_lock_irqsave(&__stp_tf_map_lock, flags);
for (i = 0; i < TASK_FINDER_MAP_ENTRY_ITEMS; i++) {
- hlist_add_head(&__stp_tf_map_free_list_items[i].hlist, head);
+ hlist_add_head_rcu(&__stp_tf_map_free_list_items[i].hlist, head);
}
- write_unlock_irqrestore(&__stp_tf_map_lock, flags);
+ raw_spin_unlock_irqrestore(&__stp_tf_map_lock, flags);
}
@@ -75,7 +75,7 @@ __stp_tf_map_get_free_entry(void)
break;
}
if (entry != NULL)
- hlist_del(&entry->hlist);
+ hlist_del_rcu(&entry->hlist);
return entry;
}
@@ -87,7 +87,7 @@ static void
__stp_tf_map_put_free_entry(struct __stp_tf_map_entry *entry)
{
struct hlist_head *head = &__stp_tf_map_free_list[0];
- hlist_add_head(&entry->hlist, head);
+ hlist_add_head_rcu(&entry->hlist, head);
}
@@ -109,15 +109,19 @@ __stp_tf_get_map_entry(struct task_struct *tsk)
struct __stp_tf_map_entry *entry;
unsigned long flags;
- read_lock_irqsave(&__stp_tf_map_lock, flags);
+ rcu_read_lock();
+ local_irq_save(flags);
head = &__stp_tf_map_table[__stp_tf_map_hash(tsk)];
stap_hlist_for_each_entry(entry, node, head, hlist) {
if (tsk->pid == entry->pid) {
- read_unlock_irqrestore(&__stp_tf_map_lock, flags);
+ local_irq_restore(flags);
+ rcu_read_unlock();
return entry;
}
}
- read_unlock_irqrestore(&__stp_tf_map_lock, flags);
+ local_irq_restore(flags);
+ rcu_read_unlock();
+
return NULL;
}
@@ -133,14 +137,14 @@ __stp_tf_add_map(struct task_struct *tsk, long syscall_no, unsigned long arg0,
struct __stp_tf_map_entry *entry;
unsigned long flags;
- write_lock_irqsave(&__stp_tf_map_lock, flags);
+ raw_spin_lock_irqsave(&__stp_tf_map_lock, flags);
head = &__stp_tf_map_table[__stp_tf_map_hash(tsk)];
stap_hlist_for_each_entry(entry, node, head, hlist) {
// If we find an existing entry, just increment the
// usage count.
if (tsk->pid == entry->pid) {
entry->usage++;
- write_unlock_irqrestore(&__stp_tf_map_lock, flags);
+ raw_spin_unlock_irqrestore(&__stp_tf_map_lock, flags);
return 0;
}
}
@@ -148,7 +152,7 @@ __stp_tf_add_map(struct task_struct *tsk, long syscall_no, unsigned long arg0,
// Get an element from the free list.
entry = __stp_tf_map_get_free_entry();
if (!entry) {
- write_unlock_irqrestore(&__stp_tf_map_lock, flags);
+ raw_spin_unlock_irqrestore(&__stp_tf_map_lock, flags);
return -ENOMEM;
}
entry->usage = 1;
@@ -157,8 +161,8 @@ __stp_tf_add_map(struct task_struct *tsk, long syscall_no, unsigned long arg0,
entry->arg0 = arg0;
entry->arg1 = arg1;
entry->arg2 = arg2;
- hlist_add_head(&entry->hlist, head);
- write_unlock_irqrestore(&__stp_tf_map_lock, flags);
+ hlist_add_head_rcu(&entry->hlist, head);
+ raw_spin_unlock_irqrestore(&__stp_tf_map_lock, flags);
return 0;
}
@@ -174,7 +178,7 @@ __stp_tf_remove_map_entry(struct __stp_tf_map_entry *entry)
if (entry != NULL) {
unsigned long flags;
- write_lock_irqsave(&__stp_tf_map_lock, flags);
+ raw_spin_lock_irqsave(&__stp_tf_map_lock, flags);
// Decrement the usage count.
entry->usage--;
@@ -182,10 +186,10 @@ __stp_tf_remove_map_entry(struct __stp_tf_map_entry *entry)
// If the entry is unused, put it back on the free
// list.
if (entry->usage == 0) {
- hlist_del(&entry->hlist);
+ hlist_del_rcu(&entry->hlist);
__stp_tf_map_put_free_entry(entry);
}
- write_unlock_irqrestore(&__stp_tf_map_lock, flags);
+ raw_spin_unlock_irqrestore(&__stp_tf_map_lock, flags);
}
return 0;
}
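A subtlety in the free-list handling: after hlist_del_rcu(), a concurrent
reader may still hold a reference to the entry for the remainder of its RCU
critical section, so putting the entry straight back on the free list leaves a
window where it could be recycled and rewritten under that reader. The usual
remedy defers the recycling past a grace period with call_rcu(); a sketch,
assuming struct __stp_tf_map_entry gained a struct rcu_head member named rcu
(an illustrative addition, not part of this patch):

	static void __stp_tf_map_entry_rcu_free(struct rcu_head *head)
	{
		struct __stp_tf_map_entry *entry =
			container_of(head, struct __stp_tf_map_entry, rcu);
		/* All pre-existing readers are done; safe to recycle. */
		__stp_tf_map_put_free_entry(entry);
	}

	/* in __stp_tf_remove_map_entry(), once usage drops to zero: */
	hlist_del_rcu(&entry->hlist);
	call_rcu(&entry->rcu, __stp_tf_map_entry_rcu_free);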
diff --git a/runtime/task_finder_vma.c b/runtime/task_finder_vma.c
index f826982..ff7f2f3 100644
--- a/runtime/task_finder_vma.c
+++ b/runtime/task_finder_vma.c
@@ -15,7 +15,7 @@
// contents in interrupt context (which should only ever call
// stap_find_vma_map_info for getting stored vma info). So we might
// want to look into that if this seems a bottleneck.
-static DEFINE_RWLOCK(__stp_tf_vma_lock);
+static DEFINE_RAW_SPINLOCK(__stp_tf_vma_lock);
#define __STP_TF_HASH_BITS 4
#define __STP_TF_TABLE_SIZE (1 << __STP_TF_HASH_BITS)
@@ -103,7 +103,7 @@ stap_destroy_vma_map(void)
continue;
stap_hlist_for_each_entry_safe(entry, node, n, head, hlist) {
- hlist_del(&entry->hlist);
+ hlist_del_rcu(&entry->hlist);
__stp_tf_vma_release_entry(entry);
}
}
@@ -180,17 +180,17 @@ stap_add_vma_map_info(struct task_struct *tsk,
// Take a write lock, since we are most likely going to write
// after reading. But reserve a new entry first outside the lock.
new_entry = __stp_tf_vma_new_entry();
- write_lock_irqsave(&__stp_tf_vma_lock, flags);
+ raw_spin_lock_irqsave(&__stp_tf_vma_lock, flags);
entry = __stp_tf_get_vma_map_entry_internal(tsk, vm_start);
if (entry != NULL) {
- write_unlock_irqrestore(&__stp_tf_vma_lock, flags);
+ raw_spin_unlock_irqrestore(&__stp_tf_vma_lock, flags);
if (new_entry)
__stp_tf_vma_release_entry(new_entry);
return -EBUSY; /* Already there */
}
if (!new_entry) {
- write_unlock_irqrestore(&__stp_tf_vma_lock, flags);
+ raw_spin_unlock_irqrestore(&__stp_tf_vma_lock, flags);
return -ENOMEM;
}
@@ -212,8 +212,8 @@ stap_add_vma_map_info(struct task_struct *tsk,
entry->user = user;
head = &__stp_tf_vma_map[__stp_tf_vma_map_hash(tsk)];
- hlist_add_head(&entry->hlist, head);
- write_unlock_irqrestore(&__stp_tf_vma_lock, flags);
+ hlist_add_head_rcu(&entry->hlist, head);
+ raw_spin_unlock_irqrestore(&__stp_tf_vma_lock, flags);
return 0;
}
@@ -234,13 +234,13 @@ stap_extend_vma_map_info(struct task_struct *tsk,
// Take a write lock, since we are most likely going to write
// to the entry after reading, if its vm_end matches our vm_start.
- write_lock_irqsave(&__stp_tf_vma_lock, flags);
+ raw_spin_lock_irqsave(&__stp_tf_vma_lock, flags);
entry = __stp_tf_get_vma_map_entry_end_internal(tsk, vm_start);
if (entry != NULL) {
entry->vm_end = vm_end;
res = 0;
}
- write_unlock_irqrestore(&__stp_tf_vma_lock, flags);
+ raw_spin_unlock_irqrestore(&__stp_tf_vma_lock, flags);
return res;
}
@@ -258,14 +258,14 @@ stap_remove_vma_map_info(struct task_struct *tsk, unsigned long vm_start)
// Take a write lock since we are most likely going to delete
// after reading.
unsigned long flags;
- write_lock_irqsave(&__stp_tf_vma_lock, flags);
+ raw_spin_lock_irqsave(&__stp_tf_vma_lock, flags);
entry = __stp_tf_get_vma_map_entry_internal(tsk, vm_start);
if (entry != NULL) {
- hlist_del(&entry->hlist);
+ hlist_del_rcu(&entry->hlist);
__stp_tf_vma_release_entry(entry);
rc = 0;
}
- write_unlock_irqrestore(&__stp_tf_vma_lock, flags);
+ raw_spin_unlock_irqrestore(&__stp_tf_vma_lock, flags);
return rc;
}
@@ -288,7 +288,8 @@ stap_find_vma_map_info(struct task_struct *tsk, unsigned long addr,
if (__stp_tf_vma_map == NULL)
return rc;
- read_lock_irqsave(&__stp_tf_vma_lock, flags);
+ rcu_read_lock();
+ local_irq_save(flags);
head = &__stp_tf_vma_map[__stp_tf_vma_map_hash(tsk)];
stap_hlist_for_each_entry(entry, node, head, hlist) {
if (tsk->pid == entry->pid
@@ -309,7 +310,9 @@ stap_find_vma_map_info(struct task_struct *tsk, unsigned long addr,
*user = found_entry->user;
rc = 0;
}
- read_unlock_irqrestore(&__stp_tf_vma_lock, flags);
+ local_irq_restore(flags);
+ rcu_read_unlock();
+
return rc;
}
@@ -332,7 +335,8 @@ stap_find_vma_map_info_user(struct task_struct *tsk, void *user,
if (__stp_tf_vma_map == NULL)
return rc;
- read_lock_irqsave(&__stp_tf_vma_lock, flags);
+ rcu_read_lock();
+ local_irq_save(flags);
head = &__stp_tf_vma_map[__stp_tf_vma_map_hash(tsk)];
stap_hlist_for_each_entry(entry, node, head, hlist) {
if (tsk->pid == entry->pid
@@ -350,7 +354,8 @@ stap_find_vma_map_info_user(struct task_struct *tsk, void *user,
*path = found_entry->path;
rc = 0;
}
- read_unlock_irqrestore(&__stp_tf_vma_lock, flags);
+ local_irq_restore(flags);
+ rcu_read_unlock();
return rc;
}
@@ -363,15 +368,15 @@ stap_drop_vma_maps(struct task_struct *tsk)
struct __stp_tf_vma_entry *entry;
unsigned long flags;
- write_lock_irqsave(&__stp_tf_vma_lock, flags);
+ raw_spin_lock_irqsave(&__stp_tf_vma_lock, flags);
head = &__stp_tf_vma_map[__stp_tf_vma_map_hash(tsk)];
stap_hlist_for_each_entry_safe(entry, node, n, head, hlist) {
if (tsk->pid == entry->pid) {
- hlist_del(&entry->hlist);
+ hlist_del_rcu(&entry->hlist);
__stp_tf_vma_release_entry(entry);
}
}
- write_unlock_irqrestore(&__stp_tf_vma_lock, flags);
+ raw_spin_unlock_irqrestore(&__stp_tf_vma_lock, flags);
return 0;
}
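The vma lookup paths show the safe read-side shape: stap_find_vma_map_info()
and stap_find_vma_map_info_user() copy the fields they need into
caller-provided storage while still inside the RCU critical section, so no
pointer into the hash table outlives rcu_read_unlock(). Condensed from
stap_find_vma_map_info():

	rcu_read_lock();
	local_irq_save(flags);
	head = &__stp_tf_vma_map[__stp_tf_vma_map_hash(tsk)];
	stap_hlist_for_each_entry(entry, node, head, hlist) {
		if (tsk->pid == entry->pid
		    && addr >= entry->vm_start && addr < entry->vm_end) {
			if (vm_start)
				*vm_start = entry->vm_start;	/* copy, don't keep */
			rc = 0;
			break;
		}
	}
	local_irq_restore(flags);
	rcu_read_unlock();

By contrast, __stp_tf_get_map_entry() in task_finder_map.c returns the entry
pointer itself after rcu_read_unlock(), which stays safe only as long as the
entry cannot be recycled out from under the caller.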
--
1.7.9.5