This is the mail archive of the gdb-patches@sourceware.org mailing list for the GDB project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[just fyi, the ugly version] Re: handle leader exits; don't get stuck when all resumed threads exit


On Wednesday 26 October 2011 20:00:03, Pedro Alves wrote:
> This one's got a long history behind it...  It was much bigger at one point, 
> and then I hit a brick wall and came up with this much smaller and simpler
> patch.

...

> So at this point, I came up with a patch that when a zombie
> leader is detected, it would try to see if all other threads in
> the thread group were still around, and reap/exit them if they were not,
> while being careful that there was one thread that would not report any
> exit (the execing one), so blocking in sigsuspend was no good.  But grew
> into an ugly monster, that was very tricky to get right, and still,
> wasn't 100% correct.

And for reference, here's what I had.  linux-nat.c:handle_leader_exit_or_mtexec
has most of the interesting bits.  On the core side, it used
TARGET_WAITKIND_IGNORE+minus_one_ptid to mean the same as TARGET_WAITKIND_NO_RESUME,
but that's a minor detail that I would have changed if I had pursued this further.

One problem that made me stop pushing it further and start over was that
the sigsuspend in handle_leader_exit_or_mtexec would break
multi-process+non-stop support: we could end up in the
sigsuspend in linux_nat_wait_1 with events still left to collect with
waitpid (the former sigsuspend would eat the SIGCHLD for events of
other inferiors).

-- 
Pedro Alves

---
 gdb/infrun.c    |   89 +++++-
 gdb/linux-nat.c |  734 ++++++++++++++++++++++++++++++++++++++++++++++++--------
 gdb/linux-nat.h |    7 
 gdb/remote.c    |    4 
 4 files changed, 716 insertions(+), 118 deletions(-)

Index: src/gdb/linux-nat.c
===================================================================
--- src.orig/gdb/linux-nat.c	2011-10-26 18:03:17.000000000 +0100
+++ src/gdb/linux-nat.c	2011-10-26 18:12:00.400742575 +0100
@@ -1268,28 +1268,78 @@ pid_is_stopped (pid_t pid)
   FILE *status_file;
   char buf[100];
   int retval = 0;
+  int have_state = 0;
 
   snprintf (buf, sizeof (buf), "/proc/%d/status", (int) pid);
   status_file = fopen (buf, "r");
-  if (status_file != NULL)
+  if (status_file == NULL)
     {
-      int have_state = 0;
-
-      while (fgets (buf, sizeof (buf), status_file))
+      if (debug_linux_nat)
+	fprintf_unfiltered (gdb_stdlog,
+			    _("unable to open /proc file '%s' "
+			      "to check for `T (stopped)'"), buf);
+      return 0;
+    }
+  while (fgets (buf, sizeof (buf), status_file))
+    {
+      if (strncmp (buf, "State:", 6) == 0)
 	{
-	  if (strncmp (buf, "State:", 6) == 0)
-	    {
-	      have_state = 1;
-	      break;
-	    }
+	  have_state = 1;
+	  break;
 	}
-      if (have_state && strstr (buf, "T (stopped)") != NULL)
-	retval = 1;
-      fclose (status_file);
     }
+  if (have_state && strstr (buf, "T (stopped)") != NULL)
+    retval = 1;
+  fclose (status_file);
   return retval;
 }
 
+/* Detect `T (tracing stop)' in `/proc/PID/status'.  Other states are
+   reported as false.  Return the tracer's PID in *TRACER, or zero if
+   PID is not being traced.  Returns -1 on error (interpreted as
+   meaning the thread is gone).  */
+
+static int
+pid_is_ptrace_stopped (pid_t pid, pid_t *tracer)
+{
+  FILE *status_file;
+  char buf[100];
+  int have_state = 0;
+
+  snprintf (buf, sizeof (buf), "/proc/%d/status", (int) pid);
+  status_file = fopen (buf, "r");
+  if (status_file == NULL)
+    {
+      if (debug_linux_nat)
+	fprintf_unfiltered (gdb_stdlog,
+			    _("unable to open /proc file '%s' to check for"
+			      " `t (tracing stop)'"), buf);
+      return -1;
+    }
+
+  have_state = 0;
+  *tracer = 0;
+  while ((!have_state || *tracer == 0)
+	 && fgets (buf, sizeof (buf), status_file))
+    {
+      if (strncmp (buf, "TracerPid:", 10) == 0)
+	{
+	  *tracer = strtoul (buf + 10, NULL, 10);
+	  if (*tracer == 0)
+	    break;
+	}
+      else if (strncmp (buf, "State:", 6) == 0)
+	{
+	  /* Accept both "T (tracing stop)" and "t (tracing stop)".  */
+	  if (strstr (buf, "(tracing stop)") == NULL)
+	    break;
+	  have_state = 1;
+	}
+    }
+  fclose (status_file);
+  return have_state && *tracer != 0;
+}
+
 /* Wait for the LWP specified by LP, which we have just attached to.
    Returns a wait status for that LWP, to cache.  */
 
@@ -1985,6 +2035,17 @@ linux_nat_resume (struct target_ops *ops
     target_async (inferior_event_handler, 0);
 }
 
+/* Return the caller's thread ID (TID).  We assume a single-thread
+   GDB, or more accurately, assume the ptracer thread is GDB's main
+   thread, thus avoiding the need to autoconf support for gettid.  */
+
+static pid_t
+gdb_gettid (void)
+{
+  /* Assume single-thread.  */
+  return getpid ();
+}
+
 /* Send a signal to an LWP.  */
 
 static int
@@ -2142,6 +2203,18 @@ linux_handle_syscall_trap (struct lwp_in
   return 1;
 }
 
+/* Return non-zero if LWP PID has a pending SIGSTOP.  */
+
+static int
+linux_nat_has_pending_sigstop (int pid)
+{
+  sigset_t pending, blocked, ignored;
+
+  linux_proc_pending_signals (pid, &pending, &blocked, &ignored);
+
+  return sigismember (&pending, SIGSTOP);
+}
+
 /* Handle a GNU/Linux extended wait response.  If we see a clone
    event, we need to add the new LWP to our list (and not report the
    trap to higher layers).  This function returns non-zero if the
@@ -2333,7 +2406,7 @@ linux_handle_extended_wait (struct lwp_i
 	  if (debug_linux_nat)
 	    fprintf_unfiltered (gdb_stdlog,
 				"LHEW: resuming parent LWP %d\n", pid);
-	  linux_ops->to_resume (linux_ops, pid_to_ptid (GET_LWP (lp->ptid)),
+	  linux_ops->to_resume (linux_ops, pid_to_ptid (pid),
 				0, TARGET_SIGNAL_0);
 
 	  return 1;
@@ -2346,13 +2419,46 @@ linux_handle_extended_wait (struct lwp_i
     {
       if (debug_linux_nat)
 	fprintf_unfiltered (gdb_stdlog,
-			    "LHEW: Got exec event from LWP %ld\n",
-			    GET_LWP (lp->ptid));
+			    "LHEW: Got exec event from LWP %d\n", pid);
 
       ourstatus->kind = TARGET_WAITKIND_EXECD;
       ourstatus->value.execd_pathname
 	= xstrdup (linux_child_pid_to_exec_file (pid));
 
+      /* We might have managed get this child with a pending SIGSTOP
+	 we had sent it ourselves, without having set its SIGNALLED
+	 flag -- if this exec was the result of a non-leader thread
+	 execing, and as a result we saw the leader zombie, and then
+	 sent all other threads a SIGSTOP in order to try to check if
+	 they were still alive (see linux_nat_wait_1's handling of a
+	 zombie leader).  It happens that a SIGSTOP is queueable in
+	 the thread that execs (triggering this PTRACE_EVENT_EXEC)
+	 when we still see it existing with the pre-exec tid in the "D
+	 (sleep)" state (before we reap all other threads, at which
+	 point the execing thread vanishes).  That SIGSTOP survives
+	 the exec all the way to this new process incarnation.  Since
+	 in reality, the kernel changes the thread's tid across the
+	 exec, we can't tell which was the original TID that execd
+	 (which is gone from the lwp list by now), and so we can't
+	 check that lwp's SIGNALLED flag to copy it to the post-exec
+	 lwp.  To solve this, We peek at /proc/PID/status for a
+	 pending SIGSTOP instead.  It may also happen the SIGSTOP the
+	 process has pending is there because something else (outside
+	 GDB, a job control stop) sent it.  To address that, we would
+	 need to check siginfo.si_pid==gettid() later when we
+	 determine whether a SIGSTOP is a delayed SIGSTOP that we had
+	 sent ourselves.  We don't do that presently, though, but
+	 that's not a problem specific to this case alone.  */
+      if (linux_nat_has_pending_sigstop (pid))
+	{
+	  if (debug_linux_nat)
+	    fprintf_unfiltered (gdb_stdlog,
+				"LHEW: execd LWP %d has SIGSTOP pending.  Maybe ignoring it.\n",
+				pid);
+
+	  lp->signalled = 1;
+	}
+
       return 0;
     }
 
@@ -2363,8 +2469,8 @@ linux_handle_extended_wait (struct lwp_i
 	  if (debug_linux_nat)
 	    fprintf_unfiltered (gdb_stdlog,
 				"LHEW: Got expected PTRACE_EVENT_"
-				"VFORK_DONE from LWP %ld: stopping\n",
-				GET_LWP (lp->ptid));
+				"VFORK_DONE from LWP %d: stopping\n",
+				pid);
 
 	  ourstatus->kind = TARGET_WAITKIND_VFORK_DONE;
 	  return 0;
@@ -2373,9 +2479,9 @@ linux_handle_extended_wait (struct lwp_i
       if (debug_linux_nat)
 	fprintf_unfiltered (gdb_stdlog,
 			    "LHEW: Got PTRACE_EVENT_VFORK_DONE "
-			    "from LWP %ld: resuming\n",
-			    GET_LWP (lp->ptid));
-      ptrace (PTRACE_CONT, GET_LWP (lp->ptid), 0, 0);
+			    "from LWP %d: resuming\n",
+			    pid);
+      ptrace (PTRACE_CONT, pid, 0, 0);
       return 1;
     }
 
@@ -2383,7 +2489,8 @@ linux_handle_extended_wait (struct lwp_i
 		  _("unknown ptrace event %d"), event);
 }
 
-/* Return non-zero if LWP is a zombie.  */
+/* Return positive if LWP is a zombie, false if not, negative if we
+   failed to open the status file (the thread vanished).  */
 
 static int
 linux_lwp_is_zombie (long lwp)
@@ -2397,8 +2504,11 @@ linux_lwp_is_zombie (long lwp)
   procfile = fopen (buffer, "r");
   if (procfile == NULL)
     {
-      warning (_("unable to open /proc file '%s'"), buffer);
-      return 0;
+      if (debug_linux_nat)
+	fprintf_unfiltered (gdb_stdlog,
+			    _("unable to open /proc file '%s' "
+			      "to check zombieness\n"), buffer);
+      return -1;
     }
 
   have_state = 0;
@@ -2418,10 +2528,9 @@ linux_lwp_is_zombie (long lwp)
    exited.  */
 
 static int
-wait_lwp (struct lwp_info *lp)
+wait_lwp (struct lwp_info *lp, int *status, int options)
 {
   pid_t pid;
-  int status = 0;
   int thread_dead = 0;
   sigset_t prev_mask;
 
@@ -2429,16 +2538,17 @@ wait_lwp (struct lwp_info *lp)
   gdb_assert (lp->status == 0);
 
   /* Make sure SIGCHLD is blocked for sigsuspend avoiding a race below.  */
-  block_child_signals (&prev_mask);
+  if ((options & WNOHANG) == 0)
+    block_child_signals (&prev_mask);
 
   for (;;)
     {
       /* If my_waitpid returns 0 it means the __WCLONE vs. non-__WCLONE kind
 	 was right and we should just call sigsuspend.  */
 
-      pid = my_waitpid (GET_LWP (lp->ptid), &status, WNOHANG);
+      pid = my_waitpid (GET_LWP (lp->ptid), status, WNOHANG);
       if (pid == -1 && errno == ECHILD)
-	pid = my_waitpid (GET_LWP (lp->ptid), &status, __WCLONE | WNOHANG);
+	pid = my_waitpid (GET_LWP (lp->ptid), status, __WCLONE | WNOHANG);
       if (pid == -1 && errno == ECHILD)
 	{
 	  /* The thread has previously exited.  We need to delete it
@@ -2451,6 +2561,9 @@ wait_lwp (struct lwp_info *lp)
 	    fprintf_unfiltered (gdb_stdlog, "WL: %s vanished.\n",
 				target_pid_to_str (lp->ptid));
 	}
+      if (pid == 0 && (options & WNOHANG) != 0)
+	return 0;
+
       if (pid != 0)
 	break;
 
@@ -2465,9 +2578,10 @@ wait_lwp (struct lwp_info *lp)
 	 As a workaround, check if we're waiting for the thread group leader and
 	 if it's a zombie, and avoid calling waitpid if it is.
 
-	 This is racy, what if the tgl becomes a zombie right after we check?
-	 Therefore always use WNOHANG with sigsuspend - it is equivalent to
-	 waiting waitpid but the linux_lwp_is_zombie is safe this way.  */
+	 This is racy, what if the tgl becomes a zombie right after we
+	 check?  Therefore always use WNOHANG with sigsuspend - it is
+	 equivalent to waiting with waitpid but the check for
+	 zombieness is safe this way.  */
 
       if (GET_PID (lp->ptid) == GET_LWP (lp->ptid)
 	  && linux_lwp_is_zombie (GET_LWP (lp->ptid)))
@@ -2480,17 +2594,19 @@ wait_lwp (struct lwp_info *lp)
 	  break;
 	}
 
-      /* Wait for next SIGCHLD and try again.  This may let SIGCHLD handlers
-	 get invoked despite our caller had them intentionally blocked by
-	 block_child_signals.  This is sensitive only to the loop of
-	 linux_nat_wait_1 and there if we get called my_waitpid gets called
-	 again before it gets to sigsuspend so we can safely let the handlers
-	 get executed here.  */
-
-      sigsuspend (&suspend_mask);
+      /* Wait for next SIGCHLD and try again.  This may let the
+	 SIGCHLD handler run despite our caller having it
+	 intentionally blocked (with block_child_signals).  This is
+	 sensitive only to the big wait loop in linux_nat_wait_1, and
+	 if we get called from there, my_waitpid is called again
+	 before sigsuspend is reached, so we can safely let the
+	 SIGCHLD handler get executed here.  */
+      if ((options & WNOHANG) == 0)
+	sigsuspend (&suspend_mask);
     }
 
-  restore_child_signals_mask (&prev_mask);
+  if ((options & WNOHANG) == 0)
+    restore_child_signals_mask (&prev_mask);
 
   if (!thread_dead)
     {
@@ -2501,11 +2617,11 @@ wait_lwp (struct lwp_info *lp)
 	  fprintf_unfiltered (gdb_stdlog,
 			      "WL: waitpid %s received %s\n",
 			      target_pid_to_str (lp->ptid),
-			      status_to_str (status));
+			      status_to_str (*status));
 	}
 
       /* Check if the thread has exited.  */
-      if (WIFEXITED (status) || WIFSIGNALED (status))
+      if (WIFEXITED (*status) || WIFSIGNALED (*status))
 	{
 	  thread_dead = 1;
 	  if (debug_linux_nat)
@@ -2517,35 +2633,35 @@ wait_lwp (struct lwp_info *lp)
   if (thread_dead)
     {
       exit_lwp (lp);
-      return 0;
+      return -1;
     }
 
-  gdb_assert (WIFSTOPPED (status));
+  gdb_assert (WIFSTOPPED (*status));
 
   /* Handle GNU/Linux's syscall SIGTRAPs.  */
-  if (WIFSTOPPED (status) && WSTOPSIG (status) == SYSCALL_SIGTRAP)
+  if (WIFSTOPPED (*status) && WSTOPSIG (*status) == SYSCALL_SIGTRAP)
     {
       /* No longer need the sysgood bit.  The ptrace event ends up
 	 recorded in lp->waitstatus if we care for it.  We can carry
 	 on handling the event like a regular SIGTRAP from here
 	 on.  */
-      status = W_STOPCODE (SIGTRAP);
+      *status = W_STOPCODE (SIGTRAP);
       if (linux_handle_syscall_trap (lp, 1))
-	return wait_lwp (lp);
+	return wait_lwp (lp, status, options);
     }
 
   /* Handle GNU/Linux's extended waitstatus for trace events.  */
-  if (WIFSTOPPED (status) && WSTOPSIG (status) == SIGTRAP && status >> 16 != 0)
+  if (WIFSTOPPED (*status) && WSTOPSIG (*status) == SIGTRAP && *status >> 16 != 0)
     {
       if (debug_linux_nat)
 	fprintf_unfiltered (gdb_stdlog,
 			    "WL: Handling extended status 0x%06x\n",
-			    status);
-      if (linux_handle_extended_wait (lp, status, 1))
-	return wait_lwp (lp);
+			    *status);
+      if (linux_handle_extended_wait (lp, *status, 1))
+	return wait_lwp (lp, status, options);
     }
 
-  return status;
+  return GET_LWP (lp->ptid);
 }
 
 /* Save the most recent siginfo for LP.  This is currently only called
@@ -2589,7 +2705,15 @@ stop_callback (struct lwp_info *lp, void
 			      errno ? safe_strerror (errno) : "ERRNO-OK");
 	}
 
-      lp->signalled = 1;
+      if (ret == 0)
+	{
+	  if (debug_linux_nat)
+	    fprintf_unfiltered (gdb_stdlog,
+				"SC:   LWP %ld is now signalled\n",
+				GET_LWP (lp->ptid));
+	  lp->signalled = 1;
+	}
+
       gdb_assert (lp->status == 0);
     }
 
@@ -2769,10 +2893,11 @@ stop_wait_callback (struct lwp_info *lp,
 
   if (!lp->stopped)
     {
+      int ret;
       int status;
 
-      status = wait_lwp (lp);
-      if (status == 0)
+      ret = wait_lwp (lp, &status, 0);
+      if (ret < 0)
 	return 0;
 
       if (lp->ignore_sigint && WIFSTOPPED (status)
@@ -3191,7 +3316,27 @@ linux_nat_filter_event (int lwpid, int s
      fork, vfork, and clone events, then we'll just add the
      new one to our list and go back to waiting for the event
      to be reported - the stopped process might be returned
-     from waitpid before or after the event is.  */
+     from waitpid before or after the event is.
+
+     But note the case of a non-leader thread exec'ing after the
+     leader having exited, and gone from our lists.  The non-leader
+     thread changes its tid to the tgid.  */
+
+  if (WIFSTOPPED (status) && lp == NULL
+      && (WSTOPSIG (status) == SIGTRAP && status >> 16 == PTRACE_EVENT_EXEC))
+    {
+      /* A multi-thread exec after we had seen the leader exiting.  */
+      if (debug_linux_nat)
+	fprintf_unfiltered (gdb_stdlog,
+			    "LLW: Re-adding thread group leader LWP %d.\n",
+			    lwpid);
+
+      lp = add_lwp (BUILD_LWP (lwpid, lwpid));
+      lp->stopped = 1;
+      lp->resumed = 1;
+      add_thread (lp->ptid);
+    }
+
   if (WIFSTOPPED (status) && !lp)
     {
       add_to_pid_list (&stopped_pids, lwpid, status);
@@ -3393,6 +3538,371 @@ linux_nat_filter_event (int lwpid, int s
 }
 
 static ptid_t
+return_ignore (ptid_t wait_ptid, struct target_waitstatus *ourstatus)
+{
+  ourstatus->kind = TARGET_WAITKIND_IGNORE;
+
+  if (iterate_over_lwps (wait_ptid, resumed_callback, NULL) == NULL)
+    {
+      if (debug_linux_nat)
+	fprintf_unfiltered (gdb_stdlog, "LLW: exit (no resumed LWP)\n");
+      return minus_one_ptid;
+    }
+  else
+    {
+      if (debug_linux_nat)
+	fprintf_unfiltered (gdb_stdlog, "LLW: exit (ignore)\n");
+      return null_ptid;
+    }
+}
+
+static int
+status_pending_p (struct lwp_info *lp)
+{
+  return (lp->status
+	  || lp->waitstatus.kind != TARGET_WAITKIND_IGNORE);
+}
+
+static int
+handle_leader_exit_or_mtexec (int *status, int *new_pending)
+{
+  pid_t lwpid;
+  struct inferior *inf;
+
+  *new_pending = 0;
+
+  /* Check for zombie thread group leaders.  We can't wait for
+     those if there are still other threads in the thread group.
+     If the leader becomes zombie right after this check, we'll
+     get a SIGCHLD signal, and wake up on the sigsuspend call
+     below (SIGCHLD is blocked at this point).  */
+
+  ALL_INFERIORS (inf)
+    {
+      int tracer;
+      struct lwp_info *leader_lp;
+      int leader_zombie;
+      int got_exec;
+      int num;
+
+      if (inf->pid == 0)
+	continue;
+
+      leader_zombie = 0;
+      leader_lp = NULL;
+      got_exec = 0;
+      leader_lp = find_lwp_pid (pid_to_ptid (inf->pid));
+
+      if (debug_linux_nat)
+	fprintf_unfiltered (gdb_stdlog,
+			    "LLW: group leader %d in our list?: %s\n",
+			    inf->pid, leader_lp != NULL ? "yes" : "no");
+
+      if (leader_lp == NULL
+	  && pid_is_ptrace_stopped (inf->pid, &tracer) > 0
+	  && tracer != 0 && tracer == gdb_gettid ())
+	{
+	  struct thread_info *tp;
+
+	  /* We've seen the leader exit before, hence it's not in
+	     our lists.  But, we're magically tracing it, and it's
+	     in ptrace-stop.  This can only mean that we were
+	     debugging only one other thread, and it exec'ed
+	     (assuming it's not possible for a new clone to reuse
+	     the thread group'd id!).  If there were more threads
+	     in the group, the leader would remain zombie until we
+	     reapped the other threads.  */
+	  leader_lp = add_lwp (BUILD_LWP (inf->pid, inf->pid));
+	  leader_lp->stopped = 1;
+	  leader_lp->resumed = 1;
+	  add_thread (leader_lp->ptid);
+
+	  got_exec = 1;
+
+	  if (debug_linux_nat)
+	    fprintf_unfiltered (gdb_stdlog,
+				"LLW: Thread group leader %s reapeared "
+				"(another thread execd).\n",
+				target_pid_to_str (leader_lp->ptid));
+	}
+      else
+	{
+	  int ret;
+
+	  ret = linux_lwp_is_zombie (inf->pid);
+	  if (ret > 0)
+	    {
+	      leader_zombie = 1;
+
+	      if (debug_linux_nat)
+		fprintf_unfiltered (gdb_stdlog,
+				    "LLW: Thread group leader %d is zombie "
+				    "(it exited, or another thread execd).\n",
+				    inf->pid);
+	    }
+	  else if (ret < 0)
+	    {
+	      /* If the leader is really gone instead of having
+		 remained zombie on exit, then this inferior used raw
+		 clone without CLONE_THREAD to spawn processes.  IOW,
+		 the leader really wasn't the leader of the other
+		 "threads".  */
+	      if (debug_linux_nat)
+		fprintf_unfiltered (gdb_stdlog,
+				    "LLW: Thread group leader %d vanished"
+				    "(not really leader)\n",
+				    inf->pid);
+	    }
+	}
+
+      if (got_exec || leader_zombie)
+	{
+	  /* A non-leader thread execs, or we found the leader
+	     zombie.  */
+
+	  struct lwp_info *other_lp;
+
+	  /* Reap zombies first.  On kernels that don't report
+	     exit events for non-leader threads, we'll not see
+	     them as zombies.  On kernels that do, we need to do
+	     this to unblock the thread that execd (it will remain
+	     in "State: D (disk sleep)" until we do).  */
+	  ALL_LWPS (other_lp)
+	    if (GET_PID (other_lp->ptid) == inf->pid
+		&& GET_LWP (other_lp->ptid) != inf->pid)
+	      stop_callback (other_lp, NULL);
+
+	  for (;;)
+	    {
+	      int need_wait = 0;
+	      struct lwp_info *tmp;
+
+	      ALL_LWPS_SAFE (other_lp, tmp)
+		if (GET_PID (other_lp->ptid) == inf->pid
+		    && GET_LWP (other_lp->ptid) != inf->pid)
+		  {
+		    pid_t pid;
+		    int status;
+		    int ret;
+
+		    if (other_lp->stopped)
+		      {
+			ret = linux_lwp_is_zombie (GET_LWP (other_lp->ptid));
+			if (ret > 0)
+			  {
+			    if (debug_linux_nat)
+			      fprintf_unfiltered (gdb_stdlog,
+						  "LLW: stopped lwp %s is zombie "
+						  "(it exited, or another thread execd).\n",
+						  target_pid_to_str (other_lp->ptid));
+			    other_lp->stopped = 0;
+			    ret = wait_lwp (other_lp, &status, 0);
+			    gdb_assert (ret < 0);
+			    continue;
+			  }
+			else if (ret < 0)
+			  {
+			    if (debug_linux_nat)
+			      fprintf_unfiltered (gdb_stdlog, "LLW: %s vanished.\n",
+						  target_pid_to_str (other_lp->ptid));
+			    exit_lwp (other_lp);
+			    continue;
+			  }
+		      }
+		    else
+		      {
+			pid = wait_lwp (other_lp, &status, WNOHANG);
+			if (pid == 0)
+			  need_wait = 1;
+			else if (pid < 0)
+			  continue;
+			else if (pid > 0)
+			  {
+			    other_lp->stopped = 1;
+
+			    if (other_lp->ignore_sigint && WIFSTOPPED (status)
+				&& WSTOPSIG (status) == SIGINT)
+			      {
+				other_lp->ignore_sigint = 0;
+				if (other_lp->last_resume_kind == resume_stop)
+				  other_lp->status = W_STOPCODE (SIGSTOP);
+				else
+				  other_lp->status = 0;
+			      }
+			    else
+			      {
+				if (WSTOPSIG (status) != SIGSTOP)
+				  {
+				    if (linux_nat_status_is_event (status))
+				      {
+					save_siginfo (other_lp);
+					save_sigtrap (other_lp);
+				      }
+				  }
+				else
+				  other_lp->signalled = 0;
+
+				if (WSTOPSIG (status) != SIGSTOP)
+				  other_lp->status = status;
+				else if (other_lp->last_resume_kind == resume_stop)
+				  other_lp->status = status;
+				else
+				  other_lp->status = 0;
+			      }
+
+			    maybe_clear_ignore_sigint (other_lp);
+			  }
+		      }
+		  }
+
+	      if (!need_wait)
+		break;  /* done */
+
+	      sigsuspend (&suspend_mask);
+	    }
+
+	  /* Stop all threads and verify if they are still alive.
+	     If we stop all the threads and use the
+	     stop_wait_callback to check if they have exited we
+	     can determine whether this signal should be ignored
+	     or whether it means the end of the debugged
+	     application.  If the latter we'll want to use the
+	     leader's exit code as exit code of the
+	     application.
+
+	     Another case we're handling here is a multi-threaded
+	     app having a non-leader thread execing.
+
+	     In that case, the Linux kernel destroys all other
+	     threads (except the execing one) in the thread group,
+	     and resets the execing thread's tid to the tgid.  No
+	     exit notifications are sent.  The execing thread has
+	     its tid reset to the thread groud id.  When this
+	     happens, we'll see the leader as zombie.  There's a
+	     (another) nasty problem here (believed to be a kernel
+	     bug).  There's a race window where we can successfuly
+	     send a SIGSTOP to the execing thread, before it
+	     changes tid, and that SIGSTOP ends up pending
+	     post-exec.  If we didn't consider that happening,
+	     we'd sometimes see the inferior report a spurious
+	     SIGSTOP when the user resumes it after the exec.
+	     We're handling this by always queueing the leader a
+	     SIGSTOP as soon as we see it report an exec, so that
+	     we always know to ignore it.  */
+
+	  num = num_lwps (inf->pid);
+	  if (num == 0)
+	    {
+	      gdb_assert (leader_lp == NULL);
+
+	      /* No LWPs left.  Must be an exec after we had seen
+		 the leader exiting.  */
+	      if (debug_linux_nat)
+		fprintf_unfiltered (gdb_stdlog,
+				    "LLW: Re-adding thread group leader LWP %d.\n",
+				    inf->pid);
+
+	      leader_lp = add_lwp (BUILD_LWP (inf->pid, inf->pid));
+	      leader_lp->stopped = 1;
+	      leader_lp->resumed = 1;
+	      add_thread (leader_lp->ptid);
+	      num = 1;
+	    }
+
+	  if (num > 1)
+	    {
+	      if (leader_lp != NULL)
+		{
+		  /* If there is at least one more LWP, then the
+		     leader exiting was not the end of the debugged
+		     application and should be ignored.  */
+		  if (debug_linux_nat)
+		    fprintf_unfiltered (gdb_stdlog,
+					"LLW: Thread group leader LWP %d vanished.\n",
+					inf->pid);
+		  exit_lwp (leader_lp);
+		}
+
+	      ALL_LWPS (other_lp)
+		if (GET_PID (other_lp->ptid) == inf->pid
+		    && GET_LWP (other_lp->ptid) != inf->pid
+		    && other_lp->resumed)
+		  {
+		    if (!status_pending_p (other_lp))
+		      {
+			if (debug_linux_nat)
+			  fprintf_unfiltered (gdb_stdlog,
+					      "SARC: re-resuming LWP %ld\n",
+					      GET_LWP (other_lp->ptid));
+			resume_lwp (other_lp, other_lp->step);
+		      }
+		    else
+		      {
+			if (debug_linux_nat)
+			  fprintf_unfiltered (gdb_stdlog,
+					      "SARC: not re-resuming LWP %ld (has pending)\n",
+					      GET_LWP (other_lp->ptid));
+			*new_pending = 1;
+		      }
+		  }
+	    }
+	  else if (leader_lp != NULL)
+	    {
+	      /* We only have the leader left, then this could either
+		 have been an exec or we'll see the leader exit.  We
+		 won't get another SIGCHLD though, so wait here.  */
+	      if (debug_linux_nat)
+		fprintf_unfiltered (gdb_stdlog,
+				    "LLW: only the leader left in tgid %d.  poll it.\n",
+				    inf->pid);
+
+	      lwpid = my_waitpid (inf->pid, status, 0);
+	      gdb_assert (lwpid > 0);
+
+	      /* Even if there are signals (with signo < SIGTRAP)
+		 pending on the thread, we should see the exec event
+		 first.  */
+	      gdb_assert (!WIFSTOPPED (*status)
+			  || (WSTOPSIG (*status) == SIGTRAP
+			      && *status >> 16 == PTRACE_EVENT_EXEC));
+
+	      return lwpid;
+	    }
+	  else
+	    {
+	      ALL_LWPS (other_lp)
+		if (GET_PID (other_lp->ptid) == inf->pid
+		    && GET_LWP (other_lp->ptid) != inf->pid)
+		  break;
+	      gdb_assert (other_lp);
+
+	      if (other_lp->resumed)
+		{
+		  if (!status_pending_p (other_lp))
+		    {
+		      if (debug_linux_nat)
+			fprintf_unfiltered (gdb_stdlog,
+					    "SARC: re-resuming LWP %ld\n",
+					    GET_LWP (other_lp->ptid));
+		      resume_lwp (other_lp, other_lp->step);
+		    }
+		  else
+		    {
+		      if (debug_linux_nat)
+			fprintf_unfiltered (gdb_stdlog,
+					    "SARC: not re-resuming LWP %ld (has pending)\n",
+					    GET_LWP (other_lp->ptid));
+		      *new_pending = 1;
+		    }
+		}
+	    }
+	}
+    }
+
+  return 0;
+}
+
+static ptid_t
 linux_nat_wait_1 (struct target_ops *ops,
 		  ptid_t ptid, struct target_waitstatus *ourstatus,
 		  int target_options)
@@ -3436,25 +3946,11 @@ linux_nat_wait_1 (struct target_ops *ops
     pid = GET_LWP (ptid);
 
 retry:
+  options = 0;
   lp = NULL;
   status = 0;
   options = 0;
 
-  /* Make sure that of those LWPs we want to get an event from, there
-     is at least one LWP that has been resumed.  If there's none, just
-     bail out.  The core may just be flushing asynchronously all
-     events.  */
-  if (iterate_over_lwps (ptid, resumed_callback, NULL) == NULL)
-    {
-      ourstatus->kind = TARGET_WAITKIND_IGNORE;
-
-      if (debug_linux_nat)
-	fprintf_unfiltered (gdb_stdlog, "LLW: exit (no resumed LWP)\n");
-
-      restore_child_signals_mask (&prev_mask);
-      return minus_one_ptid;
-    }
-
   /* First check if there is a LWP with a wait status pending.  */
   if (pid == -1)
     {
@@ -3472,7 +3968,7 @@ retry:
       /* But if we don't find one, we'll have to wait, and check both
 	 cloned and uncloned processes.  We start with the cloned
 	 processes.  */
-      options = __WCLONE | WNOHANG;
+      options = __WCLONE;
     }
   else if (is_lwp (ptid))
     {
@@ -3557,22 +4053,25 @@ retry:
       set_sigint_trap ();
     }
 
-  /* Translate generic target_wait options into waitpid options.  */
-  if (target_options & TARGET_WNOHANG)
-    options |= WNOHANG;
+  /* Always use WNOHANG, due to a kernel/ptrace quirk where if the
+     thread group leader exits while other threads in the thread group
+     still exist, waitpid hangs.  */
+  options |= WNOHANG;
 
   while (lp == NULL)
     {
       pid_t lwpid;
+      struct inferior *inf;
+      /* If this is true, then we paused LWPs momentarily, and may now
+	 have pending events to handle.  */
+      int new_pending;
 
+      errno = 0;
       lwpid = my_waitpid (pid, &status, options);
 
       if (lwpid > 0)
 	{
-	  /* If this is true, then we paused LWPs momentarily, and may
-	     now have pending events to handle.  */
-	  int new_pending;
-
+	got_event:
 	  gdb_assert (pid == -1 || lwpid == pid);
 
 	  if (debug_linux_nat)
@@ -3677,17 +4176,21 @@ retry:
 	      if (new_pending)
 		goto retry;
 
-	      if (pid == -1)
-		{
-		  /* waitpid did return something.  Restart over.  */
-		  options |= __WCLONE;
-		}
-	      continue;
+	      /* We got here because waitpid did return something, but
+		 the event was filtered out.  It could have been an
+		 lwp exit, and so we may end up with no resumed lwps
+		 left.  Restart over.  */
+	      goto retry;
 	    }
 	}
 
       if (pid == -1)
 	{
+	  if (debug_linux_nat)
+	    fprintf_unfiltered (gdb_stdlog,
+				"LNW: waitpid(-1, ...) returned %d, %s\n",
+				lwpid, errno ? safe_strerror (errno) : "ERRNO-OK");
+
 	  /* Alternate between checking cloned and uncloned processes.  */
 	  options ^= __WCLONE;
 
@@ -3696,14 +4199,43 @@ retry:
 	     In sync mode, suspend waiting for a SIGCHLD signal.  */
 	  if (options & __WCLONE)
 	    {
+	      /* Check for zombie thread group leader.  We can't wait
+		 for those if there are still other threads in the
+		 thread group.  Even though we're waiting with
+		 PID==-1, other threads in the group may have not been
+		 resumed (schedlock, for example).  If the leader
+		 becomes zombie right after this check, we'll get a
+		 SIGCHLD signal, and wake up on the sigsuspend call
+		 below (SIGCHLD is blocked at this point).  */
+	      lwpid = handle_leader_exit_or_mtexec (&status, &new_pending);
+	      if (lwpid > 0)
+		goto got_event;
+	      /* If we now have pending events to consume, go all the way back
+		 and handle them.  */
+	      else if (new_pending)
+		goto retry;
+
 	      if (target_options & TARGET_WNOHANG)
 		{
 		  /* No interesting event.  */
-		  ourstatus->kind = TARGET_WAITKIND_IGNORE;
+		  restore_child_signals_mask (&prev_mask);
+		  return return_ignore (ptid, ourstatus);
+		}
 
+	      /* But if there are no resumed children left, bail.
+		 We'd be stuck forever in sigsuspend otherwise.  Note
+		 we can't do this before calling waitpid, because we
+		 need to handle a non-leader thread exec'ing.  When
+		 that happens, the pre-exec leader may have not been
+		 resumed, but since the non-leader changes its tid to
+		 reincarnate as the leader, we'd never see the exec
+		 event.  */
+	      if (iterate_over_lwps (ptid, resumed_callback, NULL) == NULL)
+		{
 		  if (debug_linux_nat)
-		    fprintf_unfiltered (gdb_stdlog, "LLW: exit (ignore)\n");
+		    fprintf_unfiltered (gdb_stdlog, "LLW: exit (no resumed LWP)\n");
 
+		  ourstatus->kind = TARGET_WAITKIND_IGNORE;
 		  restore_child_signals_mask (&prev_mask);
 		  return minus_one_ptid;
 		}
@@ -3711,16 +4243,30 @@ retry:
 	      sigsuspend (&suspend_mask);
 	    }
 	}
-      else if (target_options & TARGET_WNOHANG)
+      else
 	{
-	  /* No interesting event for PID yet.  */
-	  ourstatus->kind = TARGET_WAITKIND_IGNORE;
+	  /* Check for zombie thread group leader.  We can't wait for
+	     those if there are still other threads in the thread
+	     group.  If the leader becomes zombie right after this
+	     check, we'll get a SIGCHLD signal, and wake up on the
+	     sigsuspend call below (SIGCHLD is blocked at this
+	     point).  */
+	  lwpid = handle_leader_exit_or_mtexec (&status, &new_pending);
+	  if (lwpid > 0)
+	    goto got_event;
+	  /* If we now have pending events to consume, go all the way back
+	     and handle them.  */
+	  else if (new_pending)
+	    goto retry;
 
-	  if (debug_linux_nat)
-	    fprintf_unfiltered (gdb_stdlog, "LLW: exit (ignore)\n");
+	  if (target_options & TARGET_WNOHANG)
+	    {
+	      /* No interesting event for PID yet.  */
+	      restore_child_signals_mask (&prev_mask);
+	      return return_ignore (ptid, ourstatus);
+	    }
 
-	  restore_child_signals_mask (&prev_mask);
-	  return minus_one_ptid;
+	  sigsuspend (&suspend_mask);
 	}
 
       /* We shouldn't end up here unless we want to try again.  */
Index: src/gdb/infrun.c
===================================================================
--- src.orig/gdb/infrun.c	2011-10-26 18:03:15.000000000 +0100
+++ src/gdb/infrun.c	2011-10-26 18:09:20.980742552 +0100
@@ -2806,6 +2806,7 @@ fetch_inferior_event (void *client_data)
 	normal_stop ();
 
       if (target_has_execution
+	  && ecs->ws.kind != TARGET_WAITKIND_IGNORE
 	  && ecs->ws.kind != TARGET_WAITKIND_EXITED
 	  && ecs->ws.kind != TARGET_WAITKIND_SIGNALLED
 	  && ecs->event_thread->step_multi
@@ -3149,23 +3150,40 @@ handle_inferior_event (struct execution_
 
   if (ecs->ws.kind == TARGET_WAITKIND_IGNORE)
     {
-      /* We had an event in the inferior, but we are not interested in
-	 handling it at this level.  The lower layers have already
-	 done what needs to be done, if anything.
-
-	 One of the possible circumstances for this is when the
-	 inferior produces output for the console.  The inferior has
-	 not stopped, and we are ignoring the event.  Another possible
-	 circumstance is any event which the lower level knows will be
-	 reported multiple times without an intervening resume.  */
-      if (debug_infrun)
-	fprintf_unfiltered (gdb_stdlog, "infrun: TARGET_WAITKIND_IGNORE\n");
-      prepare_to_wait (ecs);
-      return;
+      if (ptid_equal (ecs->ptid, null_ptid))
+	{
+	  /* We had an event in the inferior, but we are not
+	     interested in handling it at this level.  The lower
+	     layers have already done what needs to be done, if
+	     anything.
+
+	     One of the possible circumstances for this is when the
+	     inferior produces output for the console.  The inferior
+	     has not stopped, and we are ignoring the event.  Another
+	     possible circumstance is any event which the lower level
+	     knows will be reported multiple times without an
+	     intervening resume.  */
+	  prepare_to_wait (ecs);
+	  return;
+	}
+
+      if (target_can_async_p ()
+	  && !sync_execution
+	  && ptid_equal (ecs->ptid, minus_one_ptid))
+	{
+	  /* There were no unwaited-for children left in the target,
+	     but we're not synchronously waiting for events either.
+	     Just ignore.  Otherwise, if we were running a synchronous
+	     execution command, we need to cancel it and give the user
+	     back the terminal.  */
+	  prepare_to_wait (ecs);
+	  return;
+	}
     }
 
   if (ecs->ws.kind != TARGET_WAITKIND_EXITED
-      && ecs->ws.kind != TARGET_WAITKIND_SIGNALLED)
+      && ecs->ws.kind != TARGET_WAITKIND_SIGNALLED
+      && ecs->ws.kind != TARGET_WAITKIND_IGNORE)
     {
       struct inferior *inf = find_inferior_pid (ptid_get_pid (ecs->ptid));
 
@@ -3182,17 +3200,32 @@ handle_inferior_event (struct execution_
   /* Always clear state belonging to the previous time we stopped.  */
   stop_stack_dummy = STOP_NONE;
 
-  /* If it's a new process, add it to the thread database.  */
+  if (ecs->ws.kind == TARGET_WAITKIND_IGNORE
+      && ptid_equal (ecs->ptid, minus_one_ptid))
+    {
+      /* No unwaited-for children left.  IOW, all resumed children
+	 have exited.  */
+      if (debug_infrun)
+	fprintf_unfiltered (gdb_stdlog, "infrun: TARGET_WAITKIND_IGNORE\n");
 
-  ecs->new_thread_event = (!ptid_equal (ecs->ptid, inferior_ptid)
-			   && !ptid_equal (ecs->ptid, minus_one_ptid)
-			   && !in_thread_list (ecs->ptid));
+      stop_print_frame = 0;
+      stop_stepping (ecs);
+      return;
+    }
 
+  /* If it's a new process, add it to the thread database.  */
   if (ecs->ws.kind != TARGET_WAITKIND_EXITED
-      && ecs->ws.kind != TARGET_WAITKIND_SIGNALLED && ecs->new_thread_event)
-    add_thread (ecs->ptid);
+      && ecs->ws.kind != TARGET_WAITKIND_SIGNALLED)
+    {
+      ecs->new_thread_event = (!ptid_equal (ecs->ptid, inferior_ptid)
+			       && !ptid_equal (ecs->ptid, minus_one_ptid)
+			       && !in_thread_list (ecs->ptid));
 
-  ecs->event_thread = find_thread_ptid (ecs->ptid);
+      if (ecs->new_thread_event)
+	ecs->event_thread = add_thread (ecs->ptid);
+      else
+	ecs->event_thread = find_thread_ptid (ecs->ptid);
+    }
 
   /* Dependent on valid ECS->EVENT_THREAD.  */
   adjust_pc_after_break (ecs);
@@ -5824,7 +5857,8 @@ normal_stop (void)
   if (!non_stop)
     make_cleanup (finish_thread_state_cleanup, &minus_one_ptid);
   else if (last.kind != TARGET_WAITKIND_SIGNALLED
-	   && last.kind != TARGET_WAITKIND_EXITED)
+	   && last.kind != TARGET_WAITKIND_EXITED
+	   && last.kind != TARGET_WAITKIND_IGNORE)
     make_cleanup (finish_thread_state_cleanup, &inferior_ptid);
 
   /* In non-stop mode, we don't want GDB to switch threads behind the
@@ -5842,6 +5876,7 @@ normal_stop (void)
   if (!non_stop
       && !ptid_equal (previous_inferior_ptid, inferior_ptid)
       && target_has_execution
+      && last.kind != TARGET_WAITKIND_IGNORE
       && last.kind != TARGET_WAITKIND_SIGNALLED
       && last.kind != TARGET_WAITKIND_EXITED)
     {
@@ -5852,6 +5887,15 @@ normal_stop (void)
       previous_inferior_ptid = inferior_ptid;
     }
 
+  if (last.kind == TARGET_WAITKIND_IGNORE)
+    {
+      gdb_assert ((sync_execution || !target_can_async_p ())
+		  && ptid_equal (last_ptid, minus_one_ptid));
+
+      target_terminal_ours_for_output ();
+      printf_filtered (_("No unwaited-for children left.\n"));
+    }
+
   if (!breakpoints_always_inserted_mode () && target_has_execution)
     {
       if (remove_breakpoints ())
@@ -6038,6 +6082,7 @@ done:
   if (!target_has_execution
       || last.kind == TARGET_WAITKIND_SIGNALLED
       || last.kind == TARGET_WAITKIND_EXITED
+      || last.kind == TARGET_WAITKIND_IGNORE
       || (!inferior_thread ()->step_multi
 	  && !(inferior_thread ()->control.stop_bpstat
 	       && inferior_thread ()->control.proceed_to_finish)
Index: src/gdb/linux-nat.h
===================================================================
--- src.orig/gdb/linux-nat.h	2011-10-26 17:49:22.420742373 +0100
+++ src/gdb/linux-nat.h	2011-10-26 18:09:20.980742552 +0100
@@ -124,6 +124,13 @@ extern struct lwp_info *lwp_list;
        (LP) != NULL;							\
        (LP) = (LP)->next)
 
+/* Iterate over each active LWP.  Safe even if the statement
+   deletes the current LWP.  */
+#define ALL_LWPS_SAFE(LP, TMP)						\
+  for ((LP) = lwp_list;							\
+       (LP) != NULL ? ((TMP) = (LP)->next, 1) : 0;			\
+       (LP) = (TMP))
+
 #define GET_LWP(ptid)		ptid_get_lwp (ptid)
 #define GET_PID(ptid)		ptid_get_pid (ptid)
 #define is_lwp(ptid)		(GET_LWP (ptid) != 0)
Index: src/gdb/remote.c
===================================================================
--- src.orig/gdb/remote.c	2011-10-26 17:49:22.420742373 +0100
+++ src/gdb/remote.c	2011-10-26 18:09:20.980742552 +0100
@@ -5453,7 +5453,7 @@ remote_wait_ns (ptid_t ptid, struct targ
       if (options & TARGET_WNOHANG)
 	{
 	  status->kind = TARGET_WAITKIND_IGNORE;
-	  return minus_one_ptid;
+	  return null_ptid;
 	}
 
       /* Otherwise do a blocking wait.  */
@@ -5587,7 +5587,7 @@ remote_wait_as (ptid_t ptid, struct targ
       /* Nothing interesting happened.  If we're doing a non-blocking
 	 poll, we're done.  Otherwise, go back to waiting.  */
       if (options & TARGET_WNOHANG)
-	return minus_one_ptid;
+	return null_ptid;
       else
 	goto again;
     }


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]