This is the mail archive of the
systemtap@sourceware.org
mailing list for the systemtap project.
Re: Pass 4 (non-)optimization speedup
On 07/13/2009 03:35 PM, Mark Wielaard wrote:
> This is really a shame, gcc is really a lot slower compiling with -O2.
> It would be good to find some other way to make pass 4 quicker by
> default.
Another approach mentioned on IRC is trying to simplify the code that we
present to GCC.
I took a stab at consolidating the probe locking code, patch attached.
It makes each probe body contain just a static array of locks needed,
and then call to a single function that actually loops over the locks.
This will only be a win for scripts with many distinct probe bodies,
like topsys.stp. For that script, I see pass-4 time go from 5 seconds
to 4.4 seconds. In runtime, it's hard to get consistent measurement,
but the average probe time is about 2-3% slower.
So, is that overhead worth 1/2 second compile time? Can anyone see ways
to improve this patch, or suggest other areas that could benefit from focus?
Josh
commit d8a8b06d763d006be5efe0ec53c7b5a41df30223
Author: Josh Stone <jistone@redhat.com>
Date: Mon Jul 13 13:22:11 2009 -0700
make probe locking data-driven
diff --git a/translate.cxx b/translate.cxx
index 9c90106..a35be58 100644
--- a/translate.cxx
+++ b/translate.cxx
@@ -1590,14 +1590,7 @@ c_unparser::emit_probe (derived_probe* v)
// initialize frame pointer
o->newline() << "struct " << v->name << "_locals * __restrict__ l =";
o->newline(1) << "& c->locals[0]." << v->name << ";";
- o->newline(-1) << "(void) l;"; // make sure "l" is marked used
-
- o->newline() << "#ifdef STP_TIMING";
- o->newline() << "c->statp = & time_" << v->basest()->name << ";";
- o->newline() << "#endif";
-
- // emit probe local initialization block
- v->emit_probe_local_init(o);
+ o->indent (-1);
// emit all read/write locks for global variables
varuse_collecting_visitor vut;
@@ -1607,6 +1600,15 @@ c_unparser::emit_probe (derived_probe* v)
emit_locks (vut);
}
+ o->newline() << "(void) l;"; // make sure "l" is marked used
+
+ o->newline() << "#ifdef STP_TIMING";
+ o->newline() << "c->statp = & time_" << v->basest()->name << ";";
+ o->newline() << "#endif";
+
+ // emit probe local initialization block
+ v->emit_probe_local_init(o);
+
// initialize locals
for (unsigned j=0; j<v->locals.size(); j++)
{
@@ -1658,11 +1660,8 @@ c_unparser::emit_probe (derived_probe* v)
void
c_unparser::emit_locks(const varuse_collecting_visitor& vut)
{
- o->newline() << "{";
- o->newline(1) << "unsigned numtrylock = 0;";
- o->newline() << "(void) numtrylock;";
+ bool has_locks = false;
- string last_locked_var;
for (unsigned i = 0; i < session->globals.size(); i++)
{
vardecl* v = session->globals[i];
@@ -1694,37 +1693,36 @@ c_unparser::emit_locks(const varuse_collecting_visitor& vut)
continue;
}
- string lockcall =
- string (write_p ? "write" : "read") +
- "_trylock (& global.s_" + v->name + "_lock)";
+ if (!has_locks)
+ {
+ o->newline() << "static const probe_lock_t locks[] = {";
+ o->indent(1);
+ has_locks = true;
+ }
- o->newline() << "while (! " << lockcall
- << "&& (++numtrylock < MAXTRYLOCK))";
- o->newline(1) << "ndelay (TRYLOCKDELAY);";
- o->newline(-1) << "if (unlikely (numtrylock >= MAXTRYLOCK)) {";
- o->newline(1) << "atomic_inc (& skipped_count);";
+ o->newline() << "{";
+ o->newline(1) << ".lock = & global.s_" << c_varname (v->name) << "_lock,";
o->newline() << "#ifdef STP_TIMING";
- o->newline() << "atomic_inc (& global.s_" << c_varname (v->name) << "_lock_skip_count);";
+ o->newline() << ".skip = & global.s_" << c_varname (v->name) << "_lock_skip_count,";
o->newline() << "#endif";
- // The following works even if i==0. Note that using
- // globals[i-1]->name is wrong since that global may not have
- // been lockworthy by this probe.
- o->newline() << "goto unlock_" << last_locked_var << ";";
- o->newline(-1) << "}";
-
- last_locked_var = v->name;
+ o->newline() << ".write_p = " << (write_p ? 1 : 0) << ",";
+ o->newline(-1) << "},";
}
- o->newline() << "if (0) goto unlock_;";
-
- o->newline(-1) << "}";
+ if (has_locks)
+ {
+ o->newline(-1) << "};";
+ o->newline() << "if (!probe_trylock(locks, ARRAY_SIZE(locks)))";
+ o->newline(1) << "goto unlocked;";
+ o->indent(-1);
+ }
}
void
c_unparser::emit_unlocks(const varuse_collecting_visitor& vut)
{
- unsigned numvars = 0;
+ bool has_locks = false;
if (session->verbose>1)
clog << current_probe->name << " locks ";
@@ -1751,32 +1749,24 @@ c_unparser::emit_unlocks(const varuse_collecting_visitor& vut)
continue;
}
- numvars ++;
- o->newline(-1) << "unlock_" << v->name << ":";
- o->indent(1);
-
+ has_locks = true;
if (session->verbose>1)
clog << v->name << "[" << (read_p ? "r" : "")
<< (write_p ? "w" : "") << "] ";
-
- if (write_p) // emit write lock
- o->newline() << "write_unlock (& global.s_" << v->name << "_lock);";
- else // (read_p && !write_p) : emit read lock
- o->newline() << "read_unlock (& global.s_" << v->name << "_lock);";
-
- // fall through to next variable; thus the reverse ordering
}
- // emit plain "unlock" label, used if the very first lock failed.
- o->newline(-1) << "unlock_: ;";
- o->indent(1);
-
- if (numvars) // is there a chance that any lock attempt failed?
+ if (has_locks) // is there a chance that any lock attempt failed?
{
// Formerly, we checked skipped_count > MAXSKIPPED here, and set
// SYSTEMTAP_SESSION_ERROR if so. But now, this check is shared
// via common_probe_entryfn_epilogue().
+ o->newline() << "probe_unlock(locks, ARRAY_SIZE(locks));";
+
+ // emit plain "unlocked" label, used if any locks failed
+ o->newline(-1) << "unlocked: ;";
+ o->indent(1);
+
if (session->verbose>1)
clog << endl;
}
@@ -5223,6 +5213,46 @@ translate_pass (systemtap_session& s)
}
s.op->assert_0_indent();
+ s.op->newline() << "typedef struct {";
+ s.op->newline(1) << "rwlock_t *lock;";
+ s.op->newline() << "#ifdef STP_TIMING";
+ s.op->newline() << "atomic_t *skip;";
+ s.op->newline() << "#endif";
+ s.op->newline() << "int write_p;";
+ s.op->newline(-1) << "} probe_lock_t;";
+ s.op->newline() << "static void";
+ s.op->newline() << "probe_unlock(const probe_lock_t *locks, unsigned n)";
+ s.op->newline() << "{";
+ s.op->newline(1) << "while (n--)";
+ s.op->newline(1) << "if (locks[n].write_p)";
+ s.op->newline(1) << "write_unlock(locks[n].lock);";
+ s.op->newline(-1) << "else";
+ s.op->newline(1) << "read_unlock(locks[n].lock);";
+ s.op->newline(-3) << "}";
+ s.op->newline() << "static int";
+ s.op->newline() << "probe_trylock(const probe_lock_t *locks, unsigned n)";
+ s.op->newline() << "{";
+ s.op->newline(1) << "unsigned i, numtrylock = 0;";
+ s.op->newline(0) << "for (i = 0; i < n; ++i) {";
+ s.op->newline(1) << "if (locks[i].write_p)";
+ s.op->newline(1) << "while (!write_trylock(locks[i].lock) && (++numtrylock < MAXTRYLOCK))";
+ s.op->newline(1) << "ndelay (TRYLOCKDELAY);";
+ s.op->newline(-2) << "else";
+ s.op->newline(1) << "while (!read_trylock(locks[i].lock) && (++numtrylock < MAXTRYLOCK))";
+ s.op->newline(1) << "ndelay (TRYLOCKDELAY);";
+ s.op->newline(-2) << "if (unlikely (numtrylock >= MAXTRYLOCK)) {";
+ s.op->newline(1) << "atomic_inc (& skipped_count);";
+ s.op->newline() << "#ifdef STP_TIMING";
+ s.op->newline() << "atomic_inc (locks[i].skip);";
+ s.op->newline() << "#endif";
+ s.op->newline() << "probe_unlock(locks, i);";
+ s.op->newline() << "return 0;";
+ s.op->newline(-1) << "}";
+ s.op->newline(-1) << "}";
+ s.op->newline() << "return 1;";
+ s.op->newline(-1) << "}";
+ s.op->assert_0_indent();
+
// Run a varuse_collecting_visitor over probes that need global
// variable locks. We'll use this information later in
// emit_locks()/emit_unlocks().