This is the mail archive of the
gdb-patches@sourceware.org
mailing list for the GDB project.
[just fyi, the ugly version] Re: handle leader exits; don't get stuck when all resumed threads exit
- From: Pedro Alves <pedro at codesourcery dot com>
- To: gdb-patches at sourceware dot org
- Cc: Jan Kratochvil <jan dot kratochvil at redhat dot com>
- Date: Wed, 26 Oct 2011 20:08:59 +0100
- Subject: [just fyi, the ugly version] Re: handle leader exits; don't get stuck when all resumed threads exit
- References: <201110262000.03322.pedro@codesourcery.com>
On Wednesday 26 October 2011 20:00:03, Pedro Alves wrote:
> This one's got a long history behind it... It was much bigger at one point,
> and then I hit a brick wall and came up with this much smaller and simpler
> patch.
...
> So at this point, I came up with a patch that when a zombie
> leader is detected, it would try to see if all other threads in
> the thread group were still around, and reap/exit them if they were not,
> while being careful that there was one thread that would not report any
> exit (the execing one), so blocking in sigsuspend was no good. But grew
> into an ugly monster, that was very tricky to get right, and still,
> wasn't 100% correct.
And for reference, here's what I had. linux-nat.c:handle_leader_exit_or_mtexec
has most of the interesting bits. On the core side, it used
TARGET_WAITKIND_IGNORE+minus_one_ptid to mean the same as TARGET_WAITKIND_NO_RESUME,
but that's a minor detail that I was going to change if I had pursued this further.
One problem that made me stop pushing it further and start over was
that the sigsuspend in handle_leader_exit_or_mtexec meant we'd
break multi-process+non-stop support, as we could end up in the
sigsuspend in linux_nat_wait_1 with events still left to collect with
waitpid (the former sigsuspend would eat the SIGCHLD for events of
other inferiors).
--
Pedro Alves
---
gdb/infrun.c | 89 +++++-
gdb/linux-nat.c | 734 ++++++++++++++++++++++++++++++++++++++++++++++++--------
gdb/linux-nat.h | 7
gdb/remote.c | 4
4 files changed, 716 insertions(+), 118 deletions(-)
Index: src/gdb/linux-nat.c
===================================================================
--- src.orig/gdb/linux-nat.c 2011-10-26 18:03:17.000000000 +0100
+++ src/gdb/linux-nat.c 2011-10-26 18:12:00.400742575 +0100
@@ -1268,28 +1268,78 @@ pid_is_stopped (pid_t pid)
FILE *status_file;
char buf[100];
int retval = 0;
+ int have_state = 0;
snprintf (buf, sizeof (buf), "/proc/%d/status", (int) pid);
status_file = fopen (buf, "r");
- if (status_file != NULL)
+ if (status_file == NULL)
{
- int have_state = 0;
-
- while (fgets (buf, sizeof (buf), status_file))
+ if (debug_linux_nat)
+ fprintf_unfiltered (gdb_stdlog,
+ _("unable to open /proc file '%s' "
+ "to check for `T (stopped)'"), buf);
+ return 0;
+ }
+ while (fgets (buf, sizeof (buf), status_file))
+ {
+ if (strncmp (buf, "State:", 6) == 0)
{
- if (strncmp (buf, "State:", 6) == 0)
- {
- have_state = 1;
- break;
- }
+ have_state = 1;
+ break;
}
- if (have_state && strstr (buf, "T (stopped)") != NULL)
- retval = 1;
- fclose (status_file);
}
+ if (have_state && strstr (buf, "T (stopped)") != NULL)
+ retval = 1;
+ fclose (status_file);
return retval;
}
+/* Detect `T (tracing stop)' in `/proc/PID/status'. Other states are
+ reported as false. Return the tracer's PID in *TRACER, or zero if
+ PID is not being traced. Returns -1 on error (interpreted as
+ meaning the thread is gone). */
+
+static int
+pid_is_ptrace_stopped (pid_t pid, pid_t *tracer)
+{
+ FILE *status_file;
+ char buf[100];
+ int have_state = 0;
+
+ snprintf (buf, sizeof (buf), "/proc/%d/status", (int) pid);
+ status_file = fopen (buf, "r");
+ if (status_file == NULL)
+ {
+ if (debug_linux_nat)
+ fprintf_unfiltered (gdb_stdlog,
+ _("unable to open /proc file '%s' to check for"
+ " `t (tracing stop)'"), buf);
+ return -1;
+ }
+
+ have_state = 0;
+ *tracer = 0;
+ while ((!have_state || *tracer == 0)
+ && fgets (buf, sizeof (buf), status_file))
+ {
+ if (strncmp (buf, "TracerPid:", 10) == 0)
+ {
+ *tracer = strtoul (buf + 10, NULL, 10);
+ if (*tracer == 0)
+ break;
+ }
+ else if (strncmp (buf, "State:", 6) == 0)
+ {
+ /* Accept both "T (tracing stop)" and "t (tracing stop)". */
+ if (strstr (buf, "(tracing stop)") == NULL)
+ break;
+ have_state = 1;
+ }
+ }
+ fclose (status_file);
+ return have_state && *tracer != 0;
+}
+
/* Wait for the LWP specified by LP, which we have just attached to.
Returns a wait status for that LWP, to cache. */
@@ -1985,6 +2035,17 @@ linux_nat_resume (struct target_ops *ops
target_async (inferior_event_handler, 0);
}
+/* Return the caller's thread ID (TID). We assume a single-thread
+ GDB, or more accurately, assume the ptracer thread is GDB's main
+ thread, thus avoiding the need to autoconf support for gettid. */
+
+static pid_t
+gdb_gettid (void)
+{
+ /* Assume single-thread. */
+ return getpid ();
+}
+
/* Send a signal to an LWP. */
static int
@@ -2142,6 +2203,18 @@ linux_handle_syscall_trap (struct lwp_in
return 1;
}
+/* Return non-zero if LWP PID has a pending SIGSTOP. */
+
+static int
+linux_nat_has_pending_sigstop (int pid)
+{
+ sigset_t pending, blocked, ignored;
+
+ linux_proc_pending_signals (pid, &pending, &blocked, &ignored);
+
+ return sigismember (&pending, SIGSTOP);
+}
+
/* Handle a GNU/Linux extended wait response. If we see a clone
event, we need to add the new LWP to our list (and not report the
trap to higher layers). This function returns non-zero if the
@@ -2333,7 +2406,7 @@ linux_handle_extended_wait (struct lwp_i
if (debug_linux_nat)
fprintf_unfiltered (gdb_stdlog,
"LHEW: resuming parent LWP %d\n", pid);
- linux_ops->to_resume (linux_ops, pid_to_ptid (GET_LWP (lp->ptid)),
+ linux_ops->to_resume (linux_ops, pid_to_ptid (pid),
0, TARGET_SIGNAL_0);
return 1;
@@ -2346,13 +2419,46 @@ linux_handle_extended_wait (struct lwp_i
{
if (debug_linux_nat)
fprintf_unfiltered (gdb_stdlog,
- "LHEW: Got exec event from LWP %ld\n",
- GET_LWP (lp->ptid));
+ "LHEW: Got exec event from LWP %d\n", pid);
ourstatus->kind = TARGET_WAITKIND_EXECD;
ourstatus->value.execd_pathname
= xstrdup (linux_child_pid_to_exec_file (pid));
+ /* We might have managed to get this child with a pending SIGSTOP
+ we had sent it ourselves, without having set its SIGNALLED
+ flag -- if this exec was the result of a non-leader thread
+ execing, and as a result we saw the leader zombie, and then
+ sent all other threads a SIGSTOP in order to try to check if
+ they were still alive (see linux_nat_wait_1's handling of a
+ zombie leader). It happens that a SIGSTOP is queueable in
+ the thread that execs (triggering this PTRACE_EVENT_EXEC)
+ when we still see it existing with the pre-exec tid in the "D
+ (sleep)" state (before we reap all other threads, at which
+ point the execing thread vanishes). That SIGSTOP survives
+ the exec all the way to this new process incarnation. Since
+ in reality, the kernel changes the thread's tid across the
+ exec, we can't tell which was the original TID that execd
+ (which is gone from the lwp list by now), and so we can't
+ check that lwp's SIGNALLED flag to copy it to the post-exec
+ lwp. To solve this, we peek at /proc/PID/status for a
+ pending SIGSTOP instead. It may also happen the SIGSTOP the
+ process has pending is there because something else (outside
+ GDB, a job control stop) sent it. To address that, we would
+ need to check siginfo.si_pid==gettid() later when we
+ determine whether a SIGSTOP is a delayed SIGSTOP that we had
+ sent ourselves. We don't do that presently, though, but
+ that's not a problem specific to this case alone. */
+ if (linux_nat_has_pending_sigstop (pid))
+ {
+ if (debug_linux_nat)
+ fprintf_unfiltered (gdb_stdlog,
+ "LHEW: execd LWP %d has SIGSTOP pending. Maybe ignoring it.\n",
+ pid);
+
+ lp->signalled = 1;
+ }
+
return 0;
}
@@ -2363,8 +2469,8 @@ linux_handle_extended_wait (struct lwp_i
if (debug_linux_nat)
fprintf_unfiltered (gdb_stdlog,
"LHEW: Got expected PTRACE_EVENT_"
- "VFORK_DONE from LWP %ld: stopping\n",
- GET_LWP (lp->ptid));
+ "VFORK_DONE from LWP %d: stopping\n",
+ pid);
ourstatus->kind = TARGET_WAITKIND_VFORK_DONE;
return 0;
@@ -2373,9 +2479,9 @@ linux_handle_extended_wait (struct lwp_i
if (debug_linux_nat)
fprintf_unfiltered (gdb_stdlog,
"LHEW: Got PTRACE_EVENT_VFORK_DONE "
- "from LWP %ld: resuming\n",
- GET_LWP (lp->ptid));
- ptrace (PTRACE_CONT, GET_LWP (lp->ptid), 0, 0);
+ "from LWP %d: resuming\n",
+ pid);
+ ptrace (PTRACE_CONT, pid, 0, 0);
return 1;
}
@@ -2383,7 +2489,8 @@ linux_handle_extended_wait (struct lwp_i
_("unknown ptrace event %d"), event);
}
-/* Return non-zero if LWP is a zombie. */
+/* Return positive if LWP is a zombie, zero if not, negative if we
+ failed to open the status file (the thread vanished). */
static int
linux_lwp_is_zombie (long lwp)
@@ -2397,8 +2504,11 @@ linux_lwp_is_zombie (long lwp)
procfile = fopen (buffer, "r");
if (procfile == NULL)
{
- warning (_("unable to open /proc file '%s'"), buffer);
- return 0;
+ if (debug_linux_nat)
+ fprintf_unfiltered (gdb_stdlog,
+ _("unable to open /proc file '%s' "
+ "to check zombieness\n"), buffer);
+ return -1;
}
have_state = 0;
@@ -2418,10 +2528,9 @@ linux_lwp_is_zombie (long lwp)
exited. */
static int
-wait_lwp (struct lwp_info *lp)
+wait_lwp (struct lwp_info *lp, int *status, int options)
{
pid_t pid;
- int status = 0;
int thread_dead = 0;
sigset_t prev_mask;
@@ -2429,16 +2538,17 @@ wait_lwp (struct lwp_info *lp)
gdb_assert (lp->status == 0);
/* Make sure SIGCHLD is blocked for sigsuspend avoiding a race below. */
- block_child_signals (&prev_mask);
+ if ((options & WNOHANG) == 0)
+ block_child_signals (&prev_mask);
for (;;)
{
/* If my_waitpid returns 0 it means the __WCLONE vs. non-__WCLONE kind
was right and we should just call sigsuspend. */
- pid = my_waitpid (GET_LWP (lp->ptid), &status, WNOHANG);
+ pid = my_waitpid (GET_LWP (lp->ptid), status, WNOHANG);
if (pid == -1 && errno == ECHILD)
- pid = my_waitpid (GET_LWP (lp->ptid), &status, __WCLONE | WNOHANG);
+ pid = my_waitpid (GET_LWP (lp->ptid), status, __WCLONE | WNOHANG);
if (pid == -1 && errno == ECHILD)
{
/* The thread has previously exited. We need to delete it
@@ -2451,6 +2561,9 @@ wait_lwp (struct lwp_info *lp)
fprintf_unfiltered (gdb_stdlog, "WL: %s vanished.\n",
target_pid_to_str (lp->ptid));
}
+ if (pid == 0 && (options & WNOHANG) != 0)
+ return 0;
+
if (pid != 0)
break;
@@ -2465,9 +2578,10 @@ wait_lwp (struct lwp_info *lp)
As a workaround, check if we're waiting for the thread group leader and
if it's a zombie, and avoid calling waitpid if it is.
- This is racy, what if the tgl becomes a zombie right after we check?
- Therefore always use WNOHANG with sigsuspend - it is equivalent to
- waiting waitpid but the linux_lwp_is_zombie is safe this way. */
+ This is racy, what if the tgl becomes a zombie right after we
+ check? Therefore always use WNOHANG with sigsuspend - it is
+ equivalent to waiting with waitpid but the check for
+ zombieness is safe this way. */
if (GET_PID (lp->ptid) == GET_LWP (lp->ptid)
&& linux_lwp_is_zombie (GET_LWP (lp->ptid)))
@@ -2480,17 +2594,19 @@ wait_lwp (struct lwp_info *lp)
break;
}
- /* Wait for next SIGCHLD and try again. This may let SIGCHLD handlers
- get invoked despite our caller had them intentionally blocked by
- block_child_signals. This is sensitive only to the loop of
- linux_nat_wait_1 and there if we get called my_waitpid gets called
- again before it gets to sigsuspend so we can safely let the handlers
- get executed here. */
-
- sigsuspend (&suspend_mask);
+ /* Wait for next SIGCHLD and try again. This may let the
+ SIGCHLD handler run despite our caller having it
+ intentionally blocked (with block_child_signals). This is
+ sensitive only to the big wait loop in linux_nat_wait_1, and
+ if we get called from there, my_waitpid is called again
+ before sigsuspend is reached, so we can safely let the
+ SIGCHLD handler get executed here. */
+ if ((options & WNOHANG) == 0)
+ sigsuspend (&suspend_mask);
}
- restore_child_signals_mask (&prev_mask);
+ if ((options & WNOHANG) == 0)
+ restore_child_signals_mask (&prev_mask);
if (!thread_dead)
{
@@ -2501,11 +2617,11 @@ wait_lwp (struct lwp_info *lp)
fprintf_unfiltered (gdb_stdlog,
"WL: waitpid %s received %s\n",
target_pid_to_str (lp->ptid),
- status_to_str (status));
+ status_to_str (*status));
}
/* Check if the thread has exited. */
- if (WIFEXITED (status) || WIFSIGNALED (status))
+ if (WIFEXITED (*status) || WIFSIGNALED (*status))
{
thread_dead = 1;
if (debug_linux_nat)
@@ -2517,35 +2633,35 @@ wait_lwp (struct lwp_info *lp)
if (thread_dead)
{
exit_lwp (lp);
- return 0;
+ return -1;
}
- gdb_assert (WIFSTOPPED (status));
+ gdb_assert (WIFSTOPPED (*status));
/* Handle GNU/Linux's syscall SIGTRAPs. */
- if (WIFSTOPPED (status) && WSTOPSIG (status) == SYSCALL_SIGTRAP)
+ if (WIFSTOPPED (*status) && WSTOPSIG (*status) == SYSCALL_SIGTRAP)
{
/* No longer need the sysgood bit. The ptrace event ends up
recorded in lp->waitstatus if we care for it. We can carry
on handling the event like a regular SIGTRAP from here
on. */
- status = W_STOPCODE (SIGTRAP);
+ *status = W_STOPCODE (SIGTRAP);
if (linux_handle_syscall_trap (lp, 1))
- return wait_lwp (lp);
+ return wait_lwp (lp, status, options);
}
/* Handle GNU/Linux's extended waitstatus for trace events. */
- if (WIFSTOPPED (status) && WSTOPSIG (status) == SIGTRAP && status >> 16 != 0)
+ if (WIFSTOPPED (*status) && WSTOPSIG (*status) == SIGTRAP && *status >> 16 != 0)
{
if (debug_linux_nat)
fprintf_unfiltered (gdb_stdlog,
"WL: Handling extended status 0x%06x\n",
- status);
- if (linux_handle_extended_wait (lp, status, 1))
- return wait_lwp (lp);
+ *status);
+ if (linux_handle_extended_wait (lp, *status, 1))
+ return wait_lwp (lp, status, options);
}
- return status;
+ return GET_LWP (lp->ptid);
}
/* Save the most recent siginfo for LP. This is currently only called
@@ -2589,7 +2705,15 @@ stop_callback (struct lwp_info *lp, void
errno ? safe_strerror (errno) : "ERRNO-OK");
}
- lp->signalled = 1;
+ if (ret == 0)
+ {
+ if (debug_linux_nat)
+ fprintf_unfiltered (gdb_stdlog,
+ "SC: LWP %ld is now signalled\n",
+ GET_LWP (lp->ptid));
+ lp->signalled = 1;
+ }
+
gdb_assert (lp->status == 0);
}
@@ -2769,10 +2893,11 @@ stop_wait_callback (struct lwp_info *lp,
if (!lp->stopped)
{
+ int ret;
int status;
- status = wait_lwp (lp);
- if (status == 0)
+ ret = wait_lwp (lp, &status, 0);
+ if (ret < 0)
return 0;
if (lp->ignore_sigint && WIFSTOPPED (status)
@@ -3191,7 +3316,27 @@ linux_nat_filter_event (int lwpid, int s
fork, vfork, and clone events, then we'll just add the
new one to our list and go back to waiting for the event
to be reported - the stopped process might be returned
- from waitpid before or after the event is. */
+ from waitpid before or after the event is.
+
+ But note the case of a non-leader thread exec'ing after the
+ leader having exited, and gone from our lists. The non-leader
+ thread changes its tid to the tgid. */
+
+ if (WIFSTOPPED (status) && lp == NULL
+ && (WSTOPSIG (status) == SIGTRAP && status >> 16 == PTRACE_EVENT_EXEC))
+ {
+ /* A multi-thread exec after we had seen the leader exiting. */
+ if (debug_linux_nat)
+ fprintf_unfiltered (gdb_stdlog,
+ "LLW: Re-adding thread group leader LWP %d.\n",
+ lwpid);
+
+ lp = add_lwp (BUILD_LWP (lwpid, lwpid));
+ lp->stopped = 1;
+ lp->resumed = 1;
+ add_thread (lp->ptid);
+ }
+
if (WIFSTOPPED (status) && !lp)
{
add_to_pid_list (&stopped_pids, lwpid, status);
@@ -3393,6 +3538,371 @@ linux_nat_filter_event (int lwpid, int s
}
static ptid_t
+return_ignore (ptid_t wait_ptid, struct target_waitstatus *ourstatus)
+{
+ ourstatus->kind = TARGET_WAITKIND_IGNORE;
+
+ if (iterate_over_lwps (wait_ptid, resumed_callback, NULL) == NULL)
+ {
+ if (debug_linux_nat)
+ fprintf_unfiltered (gdb_stdlog, "LLW: exit (no resumed LWP)\n");
+ return minus_one_ptid;
+ }
+ else
+ {
+ if (debug_linux_nat)
+ fprintf_unfiltered (gdb_stdlog, "LLW: exit (ignore)\n");
+ return null_ptid;
+ }
+}
+
+static int
+status_pending_p (struct lwp_info *lp)
+{
+ return (lp->status
+ || lp->waitstatus.kind != TARGET_WAITKIND_IGNORE);
+}
+
+static int
+handle_leader_exit_or_mtexec (int *status, int *new_pending)
+{
+ pid_t lwpid;
+ struct inferior *inf;
+
+ *new_pending = 0;
+
+ /* Check for zombie thread group leaders. We can't wait for
+ those if there are still other threads in the thread group.
+ If the leader becomes zombie right after this check, we'll
+ get a SIGCHLD signal, and wake up on the sigsuspend call
+ below (SIGCHLD is blocked at this point). */
+
+ ALL_INFERIORS (inf)
+ {
+ int tracer;
+ struct lwp_info *leader_lp;
+ int leader_zombie;
+ int got_exec;
+ int num;
+
+ if (inf->pid == 0)
+ continue;
+
+ leader_zombie = 0;
+ leader_lp = NULL;
+ got_exec = 0;
+ leader_lp = find_lwp_pid (pid_to_ptid (inf->pid));
+
+ if (debug_linux_nat)
+ fprintf_unfiltered (gdb_stdlog,
+ "LLW: group leader %d in our list?: %s\n",
+ inf->pid, leader_lp != NULL ? "yes" : "no");
+
+ if (leader_lp == NULL
+ && pid_is_ptrace_stopped (inf->pid, &tracer) > 0
+ && tracer != 0 && tracer == gdb_gettid ())
+ {
+ struct thread_info *tp;
+
+ /* We've seen the leader exit before, hence it's not in
+ our lists. But, we're magically tracing it, and it's
+ in ptrace-stop. This can only mean that we were
+ debugging only one other thread, and it exec'ed
+ (assuming it's not possible for a new clone to reuse
+ the thread group's id!). If there were more threads
+ in the group, the leader would remain zombie until we
+ reaped the other threads. */
+ leader_lp = add_lwp (BUILD_LWP (inf->pid, inf->pid));
+ leader_lp->stopped = 1;
+ leader_lp->resumed = 1;
+ add_thread (leader_lp->ptid);
+
+ got_exec = 1;
+
+ if (debug_linux_nat)
+ fprintf_unfiltered (gdb_stdlog,
+ "LLW: Thread group leader %s reapeared "
+ "(another thread execd).\n",
+ target_pid_to_str (leader_lp->ptid));
+ }
+ else
+ {
+ int ret;
+
+ ret = linux_lwp_is_zombie (inf->pid);
+ if (ret > 0)
+ {
+ leader_zombie = 1;
+
+ if (debug_linux_nat)
+ fprintf_unfiltered (gdb_stdlog,
+ "LLW: Thread group leader %d is zombie "
+ "(it exited, or another thread execd).\n",
+ inf->pid);
+ }
+ else if (ret < 0)
+ {
+ /* If the leader is really gone instead of having
+ remained zombie on exit, then this inferior used raw
+ clone without CLONE_THREAD to spawn processes. IOW,
+ the leader really wasn't the leader of the other
+ "threads". */
+ if (debug_linux_nat)
+ fprintf_unfiltered (gdb_stdlog,
+ "LLW: Thread group leader %d vanished"
+ "(not really leader)\n",
+ inf->pid);
+ }
+ }
+
+ if (got_exec || leader_zombie)
+ {
+ /* A non-leader thread execs, or we found the leader
+ zombie. */
+
+ struct lwp_info *other_lp;
+
+ /* Reap zombies first. On kernels that don't report
+ exit events for non-leader threads, we'll not see
+ them as zombies. On kernels that do, we need to do
+ this to unblock the thread that execd (it will remain
+ in "State: D (disk sleep)" until we do). */
+ ALL_LWPS (other_lp)
+ if (GET_PID (other_lp->ptid) == inf->pid
+ && GET_LWP (other_lp->ptid) != inf->pid)
+ stop_callback (other_lp, NULL);
+
+ for (;;)
+ {
+ int need_wait = 0;
+ struct lwp_info *tmp;
+
+ ALL_LWPS_SAFE (other_lp, tmp)
+ if (GET_PID (other_lp->ptid) == inf->pid
+ && GET_LWP (other_lp->ptid) != inf->pid)
+ {
+ pid_t pid;
+ int status;
+ int ret;
+
+ if (other_lp->stopped)
+ {
+ ret = linux_lwp_is_zombie (GET_LWP (other_lp->ptid));
+ if (ret > 0)
+ {
+ if (debug_linux_nat)
+ fprintf_unfiltered (gdb_stdlog,
+ "LLW: stopped lwp %s is zombie "
+ "(it exited, or another thread execd).\n",
+ target_pid_to_str (other_lp->ptid));
+ other_lp->stopped = 0;
+ ret = wait_lwp (other_lp, &status, 0);
+ gdb_assert (ret < 0);
+ continue;
+ }
+ else if (ret < 0)
+ {
+ if (debug_linux_nat)
+ fprintf_unfiltered (gdb_stdlog, "LLW: %s vanished.\n",
+ target_pid_to_str (other_lp->ptid));
+ exit_lwp (other_lp);
+ continue;
+ }
+ }
+ else
+ {
+ pid = wait_lwp (other_lp, &status, WNOHANG);
+ if (pid == 0)
+ need_wait = 1;
+ else if (pid < 0)
+ continue;
+ else if (pid > 0)
+ {
+ other_lp->stopped = 1;
+
+ if (other_lp->ignore_sigint && WIFSTOPPED (status)
+ && WSTOPSIG (status) == SIGINT)
+ {
+ other_lp->ignore_sigint = 0;
+ if (other_lp->last_resume_kind == resume_stop)
+ other_lp->status = W_STOPCODE (SIGSTOP);
+ else
+ other_lp->status = 0;
+ }
+ else
+ {
+ if (WSTOPSIG (status) != SIGSTOP)
+ {
+ if (linux_nat_status_is_event (status))
+ {
+ save_siginfo (other_lp);
+ save_sigtrap (other_lp);
+ }
+ }
+ else
+ other_lp->signalled = 0;
+
+ if (WSTOPSIG (status) != SIGSTOP)
+ other_lp->status = status;
+ else if (other_lp->last_resume_kind == resume_stop)
+ other_lp->status = status;
+ else
+ other_lp->status = 0;
+ }
+
+ maybe_clear_ignore_sigint (other_lp);
+ }
+ }
+ }
+
+ if (!need_wait)
+ break; /* done */
+
+ sigsuspend (&suspend_mask);
+ }
+
+ /* Stop all threads and verify if they are still alive.
+ If we stop all the threads and use the
+ stop_wait_callback to check if they have exited we
+ can determine whether this signal should be ignored
+ or whether it means the end of the debugged
+ application. If the latter we'll want to use the
+ leader's exit code as exit code of the
+ application.
+
+ Another case we're handling here is a multi-threaded
+ app having a non-leader thread execing.
+
+ In that case, the Linux kernel destroys all other
+ threads (except the execing one) in the thread group,
+ and resets the execing thread's tid to the tgid. No
+ exit notifications are sent. The execing thread has
+ its tid reset to the thread group id. When this
+ happens, we'll see the leader as zombie. There's a
+ (another) nasty problem here (believed to be a kernel
+ bug). There's a race window where we can successfully
+ send a SIGSTOP to the execing thread, before it
+ changes tid, and that SIGSTOP ends up pending
+ post-exec. If we didn't consider that happening,
+ we'd sometimes see the inferior report a spurious
+ SIGSTOP when the user resumes it after the exec.
+ We're handling this by always queueing the leader a
+ SIGSTOP as soon as we see it report an exec, so that
+ we always know to ignore it. */
+
+ num = num_lwps (inf->pid);
+ if (num == 0)
+ {
+ gdb_assert (leader_lp == NULL);
+
+ /* No LWPs left. Must be an exec after we had seen
+ the leader exiting. */
+ if (debug_linux_nat)
+ fprintf_unfiltered (gdb_stdlog,
+ "LLW: Re-adding thread group leader LWP %d.\n",
+ inf->pid);
+
+ leader_lp = add_lwp (BUILD_LWP (inf->pid, inf->pid));
+ leader_lp->stopped = 1;
+ leader_lp->resumed = 1;
+ add_thread (leader_lp->ptid);
+ num = 1;
+ }
+
+ if (num > 1)
+ {
+ if (leader_lp != NULL)
+ {
+ /* If there is at least one more LWP, then the
+ leader exiting was not the end of the debugged
+ application and should be ignored. */
+ if (debug_linux_nat)
+ fprintf_unfiltered (gdb_stdlog,
+ "LLW: Thread group leader LWP %d vanished.\n",
+ inf->pid);
+ exit_lwp (leader_lp);
+ }
+
+ ALL_LWPS (other_lp)
+ if (GET_PID (other_lp->ptid) == inf->pid
+ && GET_LWP (other_lp->ptid) != inf->pid
+ && other_lp->resumed)
+ {
+ if (!status_pending_p (other_lp))
+ {
+ if (debug_linux_nat)
+ fprintf_unfiltered (gdb_stdlog,
+ "SARC: re-resuming LWP %ld\n",
+ GET_LWP (other_lp->ptid));
+ resume_lwp (other_lp, other_lp->step);
+ }
+ else
+ {
+ if (debug_linux_nat)
+ fprintf_unfiltered (gdb_stdlog,
+ "SARC: not re-resuming LWP %ld (has pending)\n",
+ GET_LWP (other_lp->ptid));
+ *new_pending = 1;
+ }
+ }
+ }
+ else if (leader_lp != NULL)
+ {
+ /* We only have the leader left, then this could either
+ have been an exec or we'll see the leader exit. We
+ won't get another SIGCHLD though, so wait here. */
+ if (debug_linux_nat)
+ fprintf_unfiltered (gdb_stdlog,
+ "LLW: only the leader left in tgid %d. poll it.\n",
+ inf->pid);
+
+ lwpid = my_waitpid (inf->pid, status, 0);
+ gdb_assert (lwpid > 0);
+
+ /* Even if there are signals (with signo < SIGTRAP)
+ pending on the thread, we should see the exec event
+ first. */
+ gdb_assert (!WIFSTOPPED (*status)
+ || (WSTOPSIG (*status) == SIGTRAP
+ && *status >> 16 == PTRACE_EVENT_EXEC));
+
+ return lwpid;
+ }
+ else
+ {
+ ALL_LWPS (other_lp)
+ if (GET_PID (other_lp->ptid) == inf->pid
+ && GET_LWP (other_lp->ptid) != inf->pid)
+ break;
+ gdb_assert (other_lp);
+
+ if (other_lp->resumed)
+ {
+ if (!status_pending_p (other_lp))
+ {
+ if (debug_linux_nat)
+ fprintf_unfiltered (gdb_stdlog,
+ "SARC: re-resuming LWP %ld\n",
+ GET_LWP (other_lp->ptid));
+ resume_lwp (other_lp, other_lp->step);
+ }
+ else
+ {
+ if (debug_linux_nat)
+ fprintf_unfiltered (gdb_stdlog,
+ "SARC: not re-resuming LWP %ld (has pending)\n",
+ GET_LWP (other_lp->ptid));
+ *new_pending = 1;
+ }
+ }
+ }
+ }
+ }
+
+ return 0;
+}
+
+static ptid_t
linux_nat_wait_1 (struct target_ops *ops,
ptid_t ptid, struct target_waitstatus *ourstatus,
int target_options)
@@ -3436,25 +3946,11 @@ linux_nat_wait_1 (struct target_ops *ops
pid = GET_LWP (ptid);
retry:
+ options = 0;
lp = NULL;
status = 0;
options = 0;
- /* Make sure that of those LWPs we want to get an event from, there
- is at least one LWP that has been resumed. If there's none, just
- bail out. The core may just be flushing asynchronously all
- events. */
- if (iterate_over_lwps (ptid, resumed_callback, NULL) == NULL)
- {
- ourstatus->kind = TARGET_WAITKIND_IGNORE;
-
- if (debug_linux_nat)
- fprintf_unfiltered (gdb_stdlog, "LLW: exit (no resumed LWP)\n");
-
- restore_child_signals_mask (&prev_mask);
- return minus_one_ptid;
- }
-
/* First check if there is a LWP with a wait status pending. */
if (pid == -1)
{
@@ -3472,7 +3968,7 @@ retry:
/* But if we don't find one, we'll have to wait, and check both
cloned and uncloned processes. We start with the cloned
processes. */
- options = __WCLONE | WNOHANG;
+ options = __WCLONE;
}
else if (is_lwp (ptid))
{
@@ -3557,22 +4053,25 @@ retry:
set_sigint_trap ();
}
- /* Translate generic target_wait options into waitpid options. */
- if (target_options & TARGET_WNOHANG)
- options |= WNOHANG;
+ /* Always use WNOHANG, due to a kernel/ptrace quirk where if the
+ thread group leader exits while other threads in the thread group
+ still exist, waitpid hangs. */
+ options |= WNOHANG;
while (lp == NULL)
{
pid_t lwpid;
+ struct inferior *inf;
+ /* If this is true, then we paused LWPs momentarily, and may now
+ have pending events to handle. */
+ int new_pending;
+ errno = 0;
lwpid = my_waitpid (pid, &status, options);
if (lwpid > 0)
{
- /* If this is true, then we paused LWPs momentarily, and may
- now have pending events to handle. */
- int new_pending;
-
+ got_event:
gdb_assert (pid == -1 || lwpid == pid);
if (debug_linux_nat)
@@ -3677,17 +4176,21 @@ retry:
if (new_pending)
goto retry;
- if (pid == -1)
- {
- /* waitpid did return something. Restart over. */
- options |= __WCLONE;
- }
- continue;
+ /* We got here because waitpid did return something, but
+ the event was filtered out. It could have been an
+ lwp exit, and so we may end up with no resumed lwps
+ left. Restart over. */
+ goto retry;
}
}
if (pid == -1)
{
+ if (debug_linux_nat)
+ fprintf_unfiltered (gdb_stdlog,
+ "LNW: waitpid(-1, ...) returned %d, %s\n",
+ lwpid, errno ? safe_strerror (errno) : "ERRNO-OK");
+
/* Alternate between checking cloned and uncloned processes. */
options ^= __WCLONE;
@@ -3696,14 +4199,43 @@ retry:
In sync mode, suspend waiting for a SIGCHLD signal. */
if (options & __WCLONE)
{
+ /* Check for zombie thread group leader. We can't wait
+ for those if there are still other threads in the
+ thread group. Even though we're waiting with
+ PID==-1, other threads in the group may have not been
+ resumed (schedlock, for example). If the leader
+ becomes zombie right after this check, we'll get a
+ SIGCHLD signal, and wake up on the sigsuspend call
+ below (SIGCHLD is blocked at this point). */
+ lwpid = handle_leader_exit_or_mtexec (&status, &new_pending);
+ if (lwpid > 0)
+ goto got_event;
+ /* If we now have pending events to consume, go all the way back
+ and handle them. */
+ else if (new_pending)
+ goto retry;
+
if (target_options & TARGET_WNOHANG)
{
/* No interesting event. */
- ourstatus->kind = TARGET_WAITKIND_IGNORE;
+ restore_child_signals_mask (&prev_mask);
+ return return_ignore (ptid, ourstatus);
+ }
+ /* But if there are no resumed children left, bail.
+ We'd be stuck forever in sigsuspend otherwise. Note
+ we can't do this before calling waitpid, because we
+ need to handle a non-leader thread exec'ing. When
+ that happens, the pre-exec leader may have not been
+ resumed, but since the non-leader changes its tid to
+ reincarnate as the leader, we'd never see the exec
+ event. */
+ if (iterate_over_lwps (ptid, resumed_callback, NULL) == NULL)
+ {
if (debug_linux_nat)
- fprintf_unfiltered (gdb_stdlog, "LLW: exit (ignore)\n");
+ fprintf_unfiltered (gdb_stdlog, "LLW: exit (no resumed LWP)\n");
+ ourstatus->kind = TARGET_WAITKIND_IGNORE;
restore_child_signals_mask (&prev_mask);
return minus_one_ptid;
}
@@ -3711,16 +4243,30 @@ retry:
sigsuspend (&suspend_mask);
}
}
- else if (target_options & TARGET_WNOHANG)
+ else
{
- /* No interesting event for PID yet. */
- ourstatus->kind = TARGET_WAITKIND_IGNORE;
+ /* Check for zombie thread group leader. We can't wait for
+ those if there are still other threads in the thread
+ group. If the leader becomes zombie right after this
+ check, we'll get a SIGCHLD signal, and wake up on the
+ sigsuspend call below (SIGCHLD is blocked at this
+ point). */
+ lwpid = handle_leader_exit_or_mtexec (&status, &new_pending);
+ if (lwpid > 0)
+ goto got_event;
+ /* If we now have pending events to consume, go all the way back
+ and handle them. */
+ else if (new_pending)
+ goto retry;
- if (debug_linux_nat)
- fprintf_unfiltered (gdb_stdlog, "LLW: exit (ignore)\n");
+ if (target_options & TARGET_WNOHANG)
+ {
+ /* No interesting event for PID yet. */
+ restore_child_signals_mask (&prev_mask);
+ return return_ignore (ptid, ourstatus);
+ }
- restore_child_signals_mask (&prev_mask);
- return minus_one_ptid;
+ sigsuspend (&suspend_mask);
}
/* We shouldn't end up here unless we want to try again. */
Index: src/gdb/infrun.c
===================================================================
--- src.orig/gdb/infrun.c 2011-10-26 18:03:15.000000000 +0100
+++ src/gdb/infrun.c 2011-10-26 18:09:20.980742552 +0100
@@ -2806,6 +2806,7 @@ fetch_inferior_event (void *client_data)
normal_stop ();
if (target_has_execution
+ && ecs->ws.kind != TARGET_WAITKIND_IGNORE
&& ecs->ws.kind != TARGET_WAITKIND_EXITED
&& ecs->ws.kind != TARGET_WAITKIND_SIGNALLED
&& ecs->event_thread->step_multi
@@ -3149,23 +3150,40 @@ handle_inferior_event (struct execution_
if (ecs->ws.kind == TARGET_WAITKIND_IGNORE)
{
- /* We had an event in the inferior, but we are not interested in
- handling it at this level. The lower layers have already
- done what needs to be done, if anything.
-
- One of the possible circumstances for this is when the
- inferior produces output for the console. The inferior has
- not stopped, and we are ignoring the event. Another possible
- circumstance is any event which the lower level knows will be
- reported multiple times without an intervening resume. */
- if (debug_infrun)
- fprintf_unfiltered (gdb_stdlog, "infrun: TARGET_WAITKIND_IGNORE\n");
- prepare_to_wait (ecs);
- return;
+ if (ptid_equal (ecs->ptid, null_ptid))
+ {
+ /* We had an event in the inferior, but we are not
+ interested in handling it at this level. The lower
+ layers have already done what needs to be done, if
+ anything.
+
+ One of the possible circumstances for this is when the
+ inferior produces output for the console. The inferior
+ has not stopped, and we are ignoring the event. Another
+ possible circumstance is any event which the lower level
+ knows will be reported multiple times without an
+ intervening resume. */
+ prepare_to_wait (ecs);
+ return;
+ }
+
+ if (target_can_async_p ()
+ && !sync_execution
+ && ptid_equal (ecs->ptid, minus_one_ptid))
+ {
+ /* There were no unwaited-for children left in the target,
+ but, we're not synchronously waiting for events either.
+ Just ignore. Otherwise, if we were running a synchronous
+ execution command, we need to cancel it and give the user
+ back the terminal. */
+ prepare_to_wait (ecs);
+ return;
+ }
}
if (ecs->ws.kind != TARGET_WAITKIND_EXITED
- && ecs->ws.kind != TARGET_WAITKIND_SIGNALLED)
+ && ecs->ws.kind != TARGET_WAITKIND_SIGNALLED
+ && ecs->ws.kind != TARGET_WAITKIND_IGNORE)
{
struct inferior *inf = find_inferior_pid (ptid_get_pid (ecs->ptid));
@@ -3182,17 +3200,32 @@ handle_inferior_event (struct execution_
/* Always clear state belonging to the previous time we stopped. */
stop_stack_dummy = STOP_NONE;
- /* If it's a new process, add it to the thread database. */
+ if (ecs->ws.kind == TARGET_WAITKIND_IGNORE
+ && ptid_equal (ecs->ptid, minus_one_ptid))
+ {
+ /* No unwaited-for children left. IOW, all resumed children
+ have exited. */
+ if (debug_infrun)
+ fprintf_unfiltered (gdb_stdlog, "infrun: TARGET_WAITKIND_IGNORE\n");
- ecs->new_thread_event = (!ptid_equal (ecs->ptid, inferior_ptid)
- && !ptid_equal (ecs->ptid, minus_one_ptid)
- && !in_thread_list (ecs->ptid));
+ stop_print_frame = 0;
+ stop_stepping (ecs);
+ return;
+ }
+ /* If it's a new process, add it to the thread database. */
if (ecs->ws.kind != TARGET_WAITKIND_EXITED
- && ecs->ws.kind != TARGET_WAITKIND_SIGNALLED && ecs->new_thread_event)
- add_thread (ecs->ptid);
+ && ecs->ws.kind != TARGET_WAITKIND_SIGNALLED)
+ {
+ ecs->new_thread_event = (!ptid_equal (ecs->ptid, inferior_ptid)
+ && !ptid_equal (ecs->ptid, minus_one_ptid)
+ && !in_thread_list (ecs->ptid));
- ecs->event_thread = find_thread_ptid (ecs->ptid);
+ if (ecs->new_thread_event)
+ ecs->event_thread = add_thread (ecs->ptid);
+ else
+ ecs->event_thread = find_thread_ptid (ecs->ptid);
+ }
/* Dependent on valid ECS->EVENT_THREAD. */
adjust_pc_after_break (ecs);
@@ -5824,7 +5857,8 @@ normal_stop (void)
if (!non_stop)
make_cleanup (finish_thread_state_cleanup, &minus_one_ptid);
else if (last.kind != TARGET_WAITKIND_SIGNALLED
- && last.kind != TARGET_WAITKIND_EXITED)
+ && last.kind != TARGET_WAITKIND_EXITED
+ && last.kind != TARGET_WAITKIND_IGNORE)
make_cleanup (finish_thread_state_cleanup, &inferior_ptid);
/* In non-stop mode, we don't want GDB to switch threads behind the
@@ -5842,6 +5876,7 @@ normal_stop (void)
if (!non_stop
&& !ptid_equal (previous_inferior_ptid, inferior_ptid)
&& target_has_execution
+ && last.kind != TARGET_WAITKIND_IGNORE
&& last.kind != TARGET_WAITKIND_SIGNALLED
&& last.kind != TARGET_WAITKIND_EXITED)
{
@@ -5852,6 +5887,15 @@ normal_stop (void)
previous_inferior_ptid = inferior_ptid;
}
+ if (last.kind == TARGET_WAITKIND_IGNORE)
+ {
+ gdb_assert ((sync_execution || !target_can_async_p ())
+ && ptid_equal (last_ptid, minus_one_ptid));
+
+ target_terminal_ours_for_output ();
+ printf_filtered (_("No unwaited-for children left.\n"));
+ }
+
if (!breakpoints_always_inserted_mode () && target_has_execution)
{
if (remove_breakpoints ())
@@ -6038,6 +6082,7 @@ done:
if (!target_has_execution
|| last.kind == TARGET_WAITKIND_SIGNALLED
|| last.kind == TARGET_WAITKIND_EXITED
+ || last.kind == TARGET_WAITKIND_IGNORE
|| (!inferior_thread ()->step_multi
&& !(inferior_thread ()->control.stop_bpstat
&& inferior_thread ()->control.proceed_to_finish)
Index: src/gdb/linux-nat.h
===================================================================
--- src.orig/gdb/linux-nat.h 2011-10-26 17:49:22.420742373 +0100
+++ src/gdb/linux-nat.h 2011-10-26 18:09:20.980742552 +0100
@@ -124,6 +124,13 @@ extern struct lwp_info *lwp_list;
(LP) != NULL; \
(LP) = (LP)->next)
+/* Iterate over each LWP in lwp_list. TMP caches LP's successor, so
+ this is safe even if the statement deletes the current LWP. */
+#define ALL_LWPS_SAFE(LP, TMP) \
+ for ((LP) = lwp_list; \
+ (LP) != NULL ? ((TMP) = (LP)->next, 1) : 0; \
+ (LP) = (TMP))
+
#define GET_LWP(ptid) ptid_get_lwp (ptid)
#define GET_PID(ptid) ptid_get_pid (ptid)
#define is_lwp(ptid) (GET_LWP (ptid) != 0)
Index: src/gdb/remote.c
===================================================================
--- src.orig/gdb/remote.c 2011-10-26 17:49:22.420742373 +0100
+++ src/gdb/remote.c 2011-10-26 18:09:20.980742552 +0100
@@ -5453,7 +5453,7 @@ remote_wait_ns (ptid_t ptid, struct targ
if (options & TARGET_WNOHANG)
{
status->kind = TARGET_WAITKIND_IGNORE;
- return minus_one_ptid;
+ return null_ptid;
}
/* Otherwise do a blocking wait. */
@@ -5587,7 +5587,7 @@ remote_wait_as (ptid_t ptid, struct targ
/* Nothing interesting happened. If we're doing a non-blocking
poll, we're done. Otherwise, go back to waiting. */
if (options & TARGET_WNOHANG)
- return minus_one_ptid;
+ return null_ptid;
else
goto again;
}