This is the mail archive of the
systemtap@sourceware.org
mailing list for the systemtap project.
[PATCH] Scheduler Tapset based on kernel tracepoints
- From: Kiran <kiran at linux dot vnet dot ibm dot com>
- To: systemtap at sources dot redhat dot com
- Date: Thu, 17 Sep 2009 14:40:28 +0530
- Subject: [PATCH] Scheduler Tapset based on kernel tracepoints
Hi,
This patch adds kernel-tracepoint-based probes to the scheduler tapset,
along with a test case, scheduler-test-tracepoints.stp, and an example
script, sched_switch.stp.
Signed-off-by: Kiran Prakash <kiran@linux.vnet.ibm.com>
diff -Naur systemtap-0.9.9-orig/tapset/scheduler.stp systemtap-0.9.9/tapset/scheduler.stp
--- systemtap-0.9.9-orig/tapset/scheduler.stp 2009-09-17 02:35:18.000000000 -0400
+++ systemtap-0.9.9/tapset/scheduler.stp 2009-09-17 02:32:49.000000000 -0400
@@ -33,7 +33,7 @@
* idle - boolean indicating whether current is the idle process
*/
probe scheduler.cpu_off
- = kernel.function("context_switch")
+ = kernel.trace("sched_switch")
{
task_prev = $prev
task_next = $next
@@ -124,6 +124,7 @@
%( arch != "x86_64" && arch != "ia64" %?
kernel.function("__switch_to")
%:
+ kernel.trace("sched_switch") ?,
kernel.function("context_switch")
%)
{
@@ -147,3 +148,167 @@
prevtsk_state = $prev_p->state
%) %)
}
+
+/**
+ * probe scheduler.kthread_stop - A thread created by kthread_create is being stopped.
+ * @thread_pid: PID of the thread being stopped ($t->pid).
+ * @thread_priority: Priority of that thread ($t->prio).
+ */
+probe scheduler.kthread_stop
+ = kernel.trace("sched_kthread_stop")
+{
+ thread_priority = $t->prio
+ thread_pid = $t->pid
+}
+
+
+/**
+ * probe scheduler.kthread_stop.return - Fires once the kthread has been stopped and exposes the return value.
+ * @return_value: Return value reported after stopping the thread ($ret).
+ */
+probe scheduler.kthread_stop.return
+ = kernel.trace("sched_kthread_stop_ret")
+{
+ return_value = $ret
+}
+
+/**
+ * probe scheduler.wait_task - Fires when waiting on a task to unschedule.
+ * It waits till the task becomes inactive.
+ * @task_pid: pid of the task the scheduler is waiting on ($p->pid).
+ * @task_priority: priority of the task ($p->prio).
+ */
+probe scheduler.wait_task
+ = kernel.trace("sched_wait_task") !, // '!' = optional and sufficient: use the function probe below only as a fallback
+ kernel.function("wait_task_inactive")
+{
+ task_pid = $p->pid
+ task_priority = $p->prio
+}
+
+/**
+ * probe scheduler.wakeup - A task is being woken up.
+ * @task_pid: PID of the task being woken up ($p->pid).
+ * @task_priority: Priority of the task being woken up ($p->prio).
+ * @success: The tracepoint's $success value; 1 if the wakeup is successful.
+ */
+probe scheduler.wakeup
+ = kernel.trace("sched_wakeup")
+{
+ success = $success
+ task_priority = $p->prio
+ task_pid = $p->pid
+
+}
+
+/**
+ * probe scheduler.wakeup_new - A newly created task is being woken up for the first time.
+ * @task_pid: PID of the new task being woken up ($p->pid).
+ * @task_priority: Priority of the new task ($p->prio).
+ * @success: The tracepoint's $success value; 1 if the wake-up is successful.
+ */
+probe scheduler.wakeup_new
+ = kernel.trace("sched_wakeup_new")
+{
+ success = $success
+ task_priority = $p->prio
+ task_pid = $p->pid
+}
+
+/**
+ * probe scheduler.sched_switch - Traces the context switches performed by the scheduler
+ * @prev_pid: pid of the task being switched out ($prev->pid).
+ * @prev_priority: priority of the task being switched out ($prev->prio).
+ * @prev_state: state field of the task being switched out ($prev->state).
+ * @prev_tid: thread id of the task being switched out, from task_tid($prev).
+ * @prev_task_name: name of the task being switched out, from task_execname($prev).
+ * @next_pid: pid of the task being switched in ($next->pid).
+ * @next_priority: priority of the task being switched in ($next->prio).
+ * @next_state: state field of the task being switched in ($next->state).
+ * @next_tid: thread id of the task being switched in, from task_tid($next).
+ * @next_task_name: name of the task being switched in, from task_execname($next).
+ */
+probe scheduler.sched_switch = kernel.trace("sched_switch")
+{
+ prev_pid = $prev->pid; next_pid = $next->pid
+ prev_priority = $prev->prio; next_priority = $next->prio
+ prev_state = $prev->state; next_state = $next->state
+ prev_tid = task_tid($prev); next_tid = task_tid($next)
+ prev_task_name = task_execname($prev); next_task_name = task_execname($next)
+}
+
+/**
+ * probe scheduler.migrate_task - The scheduler is migrating a task to another cpu.
+ * @pid: PID of the task being migrated ($p->pid).
+ * @priority: Priority of the task being migrated ($p->prio).
+ * @original_cpu: The original cpu, read with task_cpu($p).
+ * @destination_cpu: The destination cpu ($dest_cpu).
+ */
+probe scheduler.migrate_task
+ = kernel.trace("sched_migrate_task")
+{
+ destination_cpu = $dest_cpu
+ original_cpu = task_cpu($p)
+ priority = $p->prio
+ pid = $p->pid
+}
+
+/**
+ * probe scheduler.process_free - A process is being freed.
+ * @pid: PID of the process being freed ($p->pid).
+ * @priority: Priority of the process being freed ($p->prio).
+ */
+probe scheduler.process_free
+ = kernel.trace("sched_process_free")
+{
+ priority = $p->prio
+ pid = $p->pid
+}
+
+/**
+ * probe scheduler.process_exit - A process is exiting.
+ * @pid: PID of the exiting process ($p->pid).
+ * @priority: Priority of the exiting process ($p->prio).
+ */
+probe scheduler.process_exit
+ = kernel.trace("sched_process_exit")
+{
+ priority = $p->prio
+ pid = $p->pid
+}
+
+/**
+ * probe scheduler.process_wait - Fires when scheduler waits on a process
+ * @pid: value of the tracepoint's $pid argument, intended to be the PID of the process the scheduler is waiting on
+ */
+probe scheduler.process_wait
+ = kernel.trace("sched_process_wait")
+{
+ pid = $pid // NOTE(review): confirm $pid is a numeric pid here and not a struct pid pointer that needs converting
+}
+
+
+/**
+ * probe scheduler.process_fork - A process is being forked.
+ * @parent_pid: PID of the parent process ($parent->pid).
+ * @child_pid: PID of the child process ($child->pid).
+ */
+probe scheduler.process_fork
+ = kernel.trace("sched_process_fork")
+{
+ child_pid = $child->pid
+ parent_pid = $parent->pid
+}
+
+/**
+ * probe scheduler.signal_send - Probes the tracepoint for sending a signal
+ * @pid: pid of the task passed to the tracepoint ($p->pid)
+ * @signal_number: signal number ($sig)
+ */
+probe scheduler.signal_send
+ = kernel.trace("sched_signal_send")
+{
+ pid = $p->pid // NOTE(review): $p is presumably the task the signal is sent to, not the sender - confirm against the kernel tracepoint
+ signal_number = $sig
+}
+
diff -Naur systemtap-0.9.9-orig/testsuite/buildok/scheduler-test-tracepoints.stp systemtap-0.9.9/testsuite/buildok/scheduler-test-tracepoints.stp
--- systemtap-0.9.9-orig/testsuite/buildok/scheduler-test-tracepoints.stp 1969-12-31 19:00:00.000000000 -0500
+++ systemtap-0.9.9/testsuite/buildok/scheduler-test-tracepoints.stp 2009-09-16 03:21:34.000000000 -0400
@@ -0,0 +1,53 @@
+#! stap -up4
+
+//Tests if all probes in the scheduler tapset are resolvable; each probe prints only variables that its tapset alias sets.
+
+probe scheduler.kthread_stop {
+ printf("pid = %d, priority = %d\n", thread_pid, thread_priority);
+}
+
+probe scheduler.kthread_stop.return {
+ printf("return value = %d\n", return_value);
+}
+
+probe scheduler.wait_task {
+ printf("pid = %d, priority = %d\n", task_pid, task_priority);
+}
+
+probe scheduler.wakeup {
+ printf("pid = %d, priority = %d, success = %d\n", task_pid, task_priority, success);
+}
+
+probe scheduler.wakeup_new {
+ printf("pid = %d, priority = %d, success = %d\n", task_pid, task_priority, success);
+}
+
+probe scheduler.sched_switch {
+ printf("prev_pid = %d, prev_priority = %d, prev_state = %d, prev_task_name = %s, prev_tid = %d, next_pid = %d, next_priority = %d, next_state = %d, next_task_name = %s, next_tid = %d\n", prev_pid, prev_priority, prev_state, prev_task_name, prev_tid, next_pid, next_priority, next_state, next_task_name, next_tid);
+}
+
+probe scheduler.migrate_task {
+ printf("pid = %d, priority = %d, original cpu = %d destination cpu = %d\n", pid, priority, original_cpu, destination_cpu);
+}
+
+probe scheduler.process_free {
+ printf("pid = %d, priority = %d\n", pid, priority);
+}
+
+probe scheduler.process_exit {
+ printf("pid = %d, priority = %d\n", pid, priority);
+}
+
+probe scheduler.process_wait {
+ printf("pid = %d\n", pid);
+}
+
+probe scheduler.process_fork {
+ printf("parent pid = %d, child pid = %d\n", parent_pid, child_pid);
+}
+
+probe scheduler.signal_send {
+ printf("pid = %d, signal = %d\n", pid, signal_number);
+}
+
+
diff -Naur systemtap-0.9.9-orig/testsuite/systemtap.examples/profiling/sched_switch.meta systemtap-0.9.9/testsuite/systemtap.examples/profiling/sched_switch.meta
--- systemtap-0.9.9-orig/testsuite/systemtap.examples/profiling/sched_switch.meta 1969-12-31 19:00:00.000000000 -0500
+++ systemtap-0.9.9/testsuite/systemtap.examples/profiling/sched_switch.meta 2009-09-16 03:21:51.000000000 -0400
@@ -0,0 +1,14 @@
+title: Display the task switches happening in the scheduler
+name: sched_switch.stp
+version: 1.0
+author: kiran
+keywords: profiling functions
+subsystem: kernel
+status: production
+exit: user-controlled
+output: sorted-list on-exit
+scope: system-wide
+description: The sched_switch.stp script takes two arguments, first argument can be "pid" or "name" to indicate what is being passed as second argument. The script will trace the process based on pid/name and print the scheduler switches happening with the process. If no arguments are passed, it displays all the scheduler switches.
+test_check: stap -p4 sched_switch.stp
+test_installcheck: stap sched_switch.stp -c "sleep 1"
+
diff -Naur systemtap-0.9.9-orig/testsuite/systemtap.examples/profiling/sched_switch.stp systemtap-0.9.9/testsuite/systemtap.examples/profiling/sched_switch.stp
--- systemtap-0.9.9-orig/testsuite/systemtap.examples/profiling/sched_switch.stp 1969-12-31 19:00:00.000000000 -0500
+++ systemtap-0.9.9/testsuite/systemtap.examples/profiling/sched_switch.stp 2009-09-16 03:21:53.000000000 -0400
@@ -0,0 +1,71 @@
+/* This script works similarly to ftrace's sched_switch. It displays a list of
+ * processes which get switched in and out of the scheduler. The format of display
+ * is PROCESS_NAME PROCESS_PID CPU TIMESTAMP PID:PRIORITY:PROCESS_STATE ==>/+
+ * NEXT_PID:NEXT_PRIORITY:NEXT_STATE NEXT_PROCESS_NAME
+ * ==> indicates that the previous process is scheduled out and the next process is
+ * scheduled in.
+ * + indicates that the previous process has woken up the next process.
+ * The usage is: sched_switch.stp <"pid"/"name"> <pid/name>
+ */
+
+global task_cpu_old[9999]
+global pids[999]
+global processes
+global prev
+/* Map a task state value to the label this script prints; status is left unassigned for any other value. */
+function state_calc(state) {
+ if (state == 0)
+ status = "R"
+ else if (state == 1)
+ status = "S"
+ else if (state == 2)
+ status = "D"
+ else if (state == 4)
+ status = "T"
+ else if (state == 8)
+ status = "T"
+ else if (state == 16)
+ status = "Z"
+ else if (state == 32)
+ status = "EXIT_DEAD"
+ return status
+}
+probe scheduler.wakeup
+{
+ pids[task_pid]++ // count the wakeups seen for this pid
+ prev[task_pid] = task_current() // task that was current when the wakeup fired
+ processes[task_pid] = $p; // task being woken up
+
+}
+probe scheduler.sched_switch
+{
+ tid = next_tid
+ tid1 = prev_tid
+ state = prev_state
+ /* tid, tid1 and state are local shorthands for variables set by the tapset alias. */
+
+ %( $# == 2 %?
+ /* Two script arguments given: skip switches where neither task matches the requested pid/name. */
+ if(@1 == "pid")
+ if (tid != $2 && tid1 != $2)
+ next
+ if(@1 == "name")
+ if (task_execname(task_current()) != @2 && task_execname($next) != @2)
+ next
+ /* Print the wakeups recorded by the scheduler.wakeup probe ("+" lines) that involve the requested task. */
+ foreach (name in pids-) {
+ if ((@1 == "pid" && (name == $2 || task_pid(prev[name]) == $2)) ||
+ (@1 == "name" && (task_execname(prev[name]) == @2 || task_execname(processes[name]) == @2)))
+ printf("%s\t\t%d\t%d\t%d\t%d:%d:%s + %d:%d:%s %s\n",
+ task_execname(prev[name]), task_pid(prev[name]), task_cpu(processes[name]), gettimeofday_ns(),
+ task_pid(prev[name]), task_prio(prev[name]), state_calc(task_state(prev[name])),
+ task_pid(processes[name]), task_prio(processes[name]), state_calc(task_state(processes[name])),
+ task_execname(processes[name]))
+ } %)
+ /* Context-switch ("==>") line; the incoming task's priority is next_priority, the name the tapset alias sets. */
+ old_cpu = task_cpu_old[tid]
+ printf("%s\t\t%d\t%d\t%d\t%d:%d:%s ==> %d:%d:%s %s\n",task_execname(task_current()),tid1,
+ old_cpu,gettimeofday_ns(),tid1,task_prio(task_current()),state_calc(state),next_pid,
+ next_priority,state_calc(next_state),next_task_name )
+ task_cpu_old[next_tid] = cpu()
+}
Thanks,
Kiran