[PATCH] linux: Fallback to fork and exec for clone fail on posix_spawn (BZ#29115)

Adhemerval Zanella adhemerval.zanella@linaro.org
Mon May 2 19:42:31 GMT 2022



On 02/05/2022 16:15, Alexey Izbyshev wrote:
> On 2022-05-02 21:06, Adhemerval Zanella wrote:
>> Linux clone with CLONE_VM | CLONE_VFORK may fail for some namespace
>> restriction (for instance kernel does not allow processes in different
>> time namespaces to share the sameaddress space).
>>
>> In this case clone fails with EINVAL and thus posix_spawn can not spawn
>> a new process.  However the same process can be spawned with fork and exec.
>>
>> The patch fixes by issues _Fork and exec iff clone fails with a non
>> transient failure (ENOMEM and EAGAIN still returns failure to caller).
>>
>> Failure on prepare phase in helper process does not trigger the fork
>> and exec fallback.
>>
>> Checked on x86_64-linux-gnu.
>> ---
>>  sysdeps/unix/sysv/linux/spawni.c | 175 ++++++++++++++++++++-----------
>>  1 file changed, 116 insertions(+), 59 deletions(-)
>>
>> diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c
>> index d6f5ca89cd..b73715da7e 100644
>> --- a/sysdeps/unix/sysv/linux/spawni.c
>> +++ b/sysdeps/unix/sysv/linux/spawni.c
>> @@ -67,6 +67,7 @@ struct posix_spawn_args
>>    char *const *envp;
>>    int xflags;
>>    int err;
>> +  int pipe[2];
>>  };
>>
>>  /* Older version requires that shell script without shebang definition
>> @@ -103,6 +104,7 @@ __spawni_child (void *arguments)
>>    struct posix_spawn_args *args = arguments;
>>    const posix_spawnattr_t *restrict attr = args->attr;
>>    const posix_spawn_file_actions_t *file_actions = args->fa;
>> +  bool use_pipe = args->pipe[0] != -1;
>>
>>    /* The child must ensure that no signal handler are enabled because it shared
>>       memory with parent, so the signal disposition must be either SIG_DFL or
>> @@ -113,6 +115,9 @@ __spawni_child (void *arguments)
>>    struct sigaction sa;
>>    memset (&sa, '\0', sizeof (sa));
>>
>> +  if (use_pipe)
>> +    __close (args->pipe[0]);
>> +
>>    sigset_t hset;
>>    __sigprocmask (SIG_BLOCK, 0, &hset);
>>    for (int sig = 1; sig < _NSIG; ++sig)
>> @@ -300,45 +305,26 @@ fail:
>>       (EINTERNALBUG) describing that, use ECHILD.  Another option would
>>       be to set args->err to some negative sentinel and have the parent
>>       abort(), but that seems needlessly harsh.  */
>> -  args->err = errno ? : ECHILD;
>> +  int ret = errno ? : ECHILD;
>> +  if (use_pipe)
>> +    {
>> +      __close (args->pipe[0]);
> 
> The read end has already been closed above.

Ack.

> 
>> +      while (__write_nocancel (args->pipe[1], &ret, sizeof (ret)) < 0);
> 
> If the parent is killed, write() will fail with EPIPE, so this loop will become infinite. Maybe check for EPIPE and break?

We block all signals (__libc_signal_block_all), so the parent will be only killed
by signals that can no be caught (SIGKILL).  Same for SIGPIPE.

> 
>> +    }
>> +  else
>> +    args->err = ret;
>> +
>>    _exit (SPAWN_ERROR);
>>  }
>>
>> -/* Spawn a new process executing PATH with the attributes describes in *ATTRP.
>> -   Before running the process perform the actions described in FILE-ACTIONS. */
>> -static int
>> -__spawnix (pid_t * pid, const char *file,
>> -       const posix_spawn_file_actions_t * file_actions,
>> -       const posix_spawnattr_t * attrp, char *const argv[],
>> -       char *const envp[], int xflags,
>> -       int (*exec) (const char *, char *const *, char *const *))
>> +/* Create a new process using clone with CLONE_VM | CLONE_VFORK to optimize
>> +   memory usage.  Return TRUE is the helper process could be spawned, or
>> +   FALSE otherwise.  */
>> +static bool
>> +spawni_clone (struct posix_spawn_args *args, pid_t *new_pid, int *ec)
>>  {
>> -  pid_t new_pid;
>> -  struct posix_spawn_args args;
>> -  int ec;
>> -
>> -  /* To avoid imposing hard limits on posix_spawn{p} the total number of
>> -     arguments is first calculated to allocate a mmap to hold all possible
>> -     values.  */
>> -  ptrdiff_t argc = 0;
>> -  /* Linux allows at most max (0x7FFFFFFF, 1/4 stack size) arguments
>> -     to be used in a execve call.  We limit to INT_MAX minus one due the
>> -     compatiblity code that may execute a shell script (maybe_script_execute)
>> -     where it will construct another argument list with an additional
>> -     argument.  */
>> -  ptrdiff_t limit = INT_MAX - 1;
>> -  while (argv[argc++] != NULL)
>> -    if (argc == limit)
>> -      {
>> -    errno = E2BIG;
>> -    return errno;
>> -      }
>> -
>> -  int prot = (PROT_READ | PROT_WRITE
>> -         | ((GL (dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
>> -
>>    /* Add a slack area for child's stack.  */
>> -  size_t argv_size = (argc * sizeof (void *)) + 512;
>> +  size_t argv_size = (args->argc * sizeof (void *)) + 512;
>>    /* We need at least a few pages in case the compiler's stack checking is
>>       enabled.  In some configs, it is known to use at least 24KiB.  We use
>>       32KiB to be "safe" from anything the compiler might do.  Besides, the
>> @@ -347,28 +333,12 @@ __spawnix (pid_t * pid, const char *file,
>>       where it might use about 1k extra stack space).  */
>>    argv_size += (32 * 1024);
>>    size_t stack_size = ALIGN_UP (argv_size, GLRO(dl_pagesize));
>> +  int prot = (PROT_READ | PROT_WRITE
>> +         | ((GL (dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
>>    void *stack = __mmap (NULL, stack_size, prot,
>>              MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
>>    if (__glibc_unlikely (stack == MAP_FAILED))
>> -    return errno;
>> -
>> -  /* Disable asynchronous cancellation.  */
>> -  int state;
>> -  __pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, &state);
>> -
>> -  /* Child must set args.err to something non-negative - we rely on
>> -     the parent and child sharing VM.  */
>> -  args.err = 0;
>> -  args.file = file;
>> -  args.exec = exec;
>> -  args.fa = file_actions;
>> -  args.attr = attrp ? attrp : &(const posix_spawnattr_t) { 0 };
>> -  args.argv = argv;
>> -  args.argc = argc;
>> -  args.envp = envp;
>> -  args.xflags = xflags;
>> -
>> -  __libc_signal_block_all (&args.oldmask);
> 
> The caller expects the error to be in "*ec".

Ack.

>> +  /* There is no much point in retrying with fork and exec if kernel returns a
>> +     failure due resource exhaustion.  */
>> +  return *new_pid > 0 || (errno != ENOMEM && errno != EAGAIN);
>> +}
>> +
>> +/* Fallback spawn case which uses fork plus exec along with a pipe to
>> +   communicate any helper process prepare pahse or execve failure.  */
>> +static void
>> +spawni_fork_fallback (struct posix_spawn_args *args, pid_t *new_pid, int *ec)
>> +{
>> +  if (__pipe2 (args->pipe, O_CLOEXEC))
> 
> Same here: the caller expects the error to be in "*ec".

Ack.

> 
>> +    return;
>> +
>> +  /* Do not trigger atfork handler nor any internal state reset since the
>> +     helper process will call execve.  */
>> +  *new_pid = _Fork ();
>> +  if (*new_pid == 0)
>> +    __spawni_child (args);
>> +  else if (*new_pid > 0)
>> +    {
>> +      __close (args->pipe[1]);
> 
> The write end should be closed on the error path too.

Ack, I moved the __close (args.pipe[1]) to be along __close (args.pipe[0]).


>> +  pid_t new_pid = 0;
>> +  int ec = -1;
>> +  /* clone with CLONE_VM | CLONE_VFORK may fail for some namespace restriction
>> +     (for instance Linux does not allow processes in different time namespaces
>> +     to share address space) and in this case clone fails with EINVAL.  Retry
>> +     with fork and exec.  */
>> +  if (!spawni_clone (&args, &new_pid, &ec))
>> +    spawni_fork_fallback (&args, &new_pid, &ec);
>> +
>>    if ((ec == 0) && (pid != NULL))
>>      *pid = new_pid;
> 
> Shouldn't also __spawni_child() check whether posix_spawn actions are trying to do something with the error pipe write end (e.g. to close it, to dup() it or dup2() over it, etc.) and prevent that?

Yeah we should, we need to either duplicate the file descriptor if a file operation
might clobber it and return EBADF for dup2 file action.


More information about the Libc-alpha mailing list