Re: proc_flush_task oops

From: Eric W. Biederman
Date: Tue Dec 19 2017 - 13:25:48 EST


Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx> writes:

> On 2017/12/19 12:39, Dave Jones wrote:
>> On Mon, Dec 18, 2017 at 03:50:52PM -0800, Linus Torvalds wrote:
>>
>> > But I don't see what would have changed in this area recently.
>> >
>> > Do you end up saving the seeds that cause crashes? Is this
>> > reproducible? (Other than seeing it twice, of course)
>>
>> Only clue so far is that every time I'm able to trigger it, the last thing
>> the child process that triggers it did was an execveat.
>>
>> Telling it to just fuzz execveat doesn't instantly trigger it, so it
>> must be in combination with some other syscall. I'll leave a script running
>> overnight to see if I can binary search the other syscalls in
>> combination with it.
>>
>> One other thing: I said this was rc4, but it was actually rc4 + all the
>> x86 stuff from today. There's enough creepy stuff in that pile that
>> I'll try with just plain rc4 tomorrow too.
>>
>> Dave
>>
>
> When a fuzzing tool hits an oops in a finalization function, it can help to check
> the corresponding initialization function and any earlier error paths.
>
> It seems to me that there are error paths which allow nsproxy to be replaced
> without assigning ->pid_ns_for_children->proc_mnt.
>
> ----------
> static __latent_entropy struct task_struct *copy_process(
>                                         unsigned long clone_flags,
>                                         unsigned long stack_start,
>                                         unsigned long stack_size,
>                                         int __user *child_tidptr,
>                                         struct pid *pid,
>                                         int trace,
>                                         unsigned long tls,
>                                         int node)
> {
> (...snipped...)
>         retval = copy_namespaces(clone_flags, p); // Allocates p->nsproxy->pid_ns_for_children
>         if (retval)
>                 goto bad_fork_cleanup_mm;
>         retval = copy_io(clone_flags, p);
>         if (retval)
>                 goto bad_fork_cleanup_namespaces; // p->nsproxy->pid_ns_for_children->proc_mnt == NULL.
>         retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
>         if (retval)
>                 goto bad_fork_cleanup_io; // p->nsproxy->pid_ns_for_children->proc_mnt == NULL.
>
>         if (pid != &init_struct_pid) {
>                 pid = alloc_pid(p->nsproxy->pid_ns_for_children); // Initializes p->nsproxy->pid_ns_for_children->proc_mnt upon success.
>                 if (IS_ERR(pid)) {
>                         retval = PTR_ERR(pid);
>                         goto bad_fork_cleanup_thread; // p->nsproxy->pid_ns_for_children->proc_mnt == NULL.
>                 }
>         }
> (...snipped...)
> }

> int copy_namespaces(unsigned long flags, struct task_struct *tsk)
> {
>         struct nsproxy *old_ns = tsk->nsproxy;
>         struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
>         struct nsproxy *new_ns;
>
>         if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
>                               CLONE_NEWPID | CLONE_NEWNET |
>                               CLONE_NEWCGROUP)))) {
>                 get_nsproxy(old_ns);
>                 return 0;
>         }
>
>         if (!ns_capable(user_ns, CAP_SYS_ADMIN))
>                 return -EPERM;
>
>         /*
>          * CLONE_NEWIPC must detach from the undolist: after switching
>          * to a new ipc namespace, the semaphore arrays from the old
>          * namespace are unreachable. In clone parlance, CLONE_SYSVSEM
>          * means share undolist with parent, so we must forbid using
>          * it along with CLONE_NEWIPC.
>          */
>         if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) ==
>                 (CLONE_NEWIPC | CLONE_SYSVSEM))
>                 return -EINVAL;
>
>         new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs); // Allocates new nsproxy.
>         if (IS_ERR(new_ns))
>                 return PTR_ERR(new_ns);
>
>         tsk->nsproxy = new_ns; // p->nsproxy is updated with ->pid_ns_for_children->proc_mnt == NULL.
>         return 0;
> }


> static struct nsproxy *create_new_namespaces(unsigned long flags,
>         struct task_struct *tsk, struct user_namespace *user_ns,
>         struct fs_struct *new_fs)
> {
>         struct nsproxy *new_nsp;
>         int err;
>
>         new_nsp = create_nsproxy();
>         if (!new_nsp)
>                 return ERR_PTR(-ENOMEM);
>
>         new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
>         if (IS_ERR(new_nsp->mnt_ns)) {
>                 err = PTR_ERR(new_nsp->mnt_ns);
>                 goto out_ns;
>         }
>
>         new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);
>         if (IS_ERR(new_nsp->uts_ns)) {
>                 err = PTR_ERR(new_nsp->uts_ns);
>                 goto out_uts;
>         }
>
>         new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns);
>         if (IS_ERR(new_nsp->ipc_ns)) {
>                 err = PTR_ERR(new_nsp->ipc_ns);
>                 goto out_ipc;
>         }
>
>         new_nsp->pid_ns_for_children =
>                 copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns_for_children); // Returns with ->proc_mnt == NULL.
>         if (IS_ERR(new_nsp->pid_ns_for_children)) {
>                 err = PTR_ERR(new_nsp->pid_ns_for_children);
>                 goto out_pid;
>         }
>
>         new_nsp->cgroup_ns = copy_cgroup_ns(flags, user_ns,
>                                             tsk->nsproxy->cgroup_ns);
>         if (IS_ERR(new_nsp->cgroup_ns)) {
>                 err = PTR_ERR(new_nsp->cgroup_ns);
>                 goto out_cgroup;
>         }
>
>         new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
>         if (IS_ERR(new_nsp->net_ns)) {
>                 err = PTR_ERR(new_nsp->net_ns);
>                 goto out_net;
>         }
>
>         return new_nsp; // Returns with ->pid_ns_for_children->proc_mnt == NULL.
>
> out_net:
>         put_cgroup_ns(new_nsp->cgroup_ns);
> out_cgroup:
>         if (new_nsp->pid_ns_for_children)
>                 put_pid_ns(new_nsp->pid_ns_for_children);
> out_pid:
>         if (new_nsp->ipc_ns)
>                 put_ipc_ns(new_nsp->ipc_ns);
> out_ipc:
>         if (new_nsp->uts_ns)
>                 put_uts_ns(new_nsp->uts_ns);
> out_uts:
>         if (new_nsp->mnt_ns)
>                 put_mnt_ns(new_nsp->mnt_ns);
> out_ns:
>         kmem_cache_free(nsproxy_cachep, new_nsp);
>         return ERR_PTR(err);
> }
>
> struct pid_namespace *copy_pid_ns(unsigned long flags,
>         struct user_namespace *user_ns, struct pid_namespace *old_ns)
> {
>         if (!(flags & CLONE_NEWPID))
>                 return get_pid_ns(old_ns);
>         if (task_active_pid_ns(current) != old_ns)
>                 return ERR_PTR(-EINVAL);
>         return create_pid_namespace(user_ns, old_ns); // Returns with ->proc_mnt == NULL.
> }
>
> static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
>         struct pid_namespace *parent_pid_ns)
> {
>         struct pid_namespace *ns;
>         unsigned int level = parent_pid_ns->level + 1;
>         struct ucounts *ucounts;
>         int err;
>
>         err = -EINVAL;
>         if (!in_userns(parent_pid_ns->user_ns, user_ns))
>                 goto out;
>
>         err = -ENOSPC;
>         if (level > MAX_PID_NS_LEVEL)
>                 goto out;
>         ucounts = inc_pid_namespaces(user_ns);
>         if (!ucounts)
>                 goto out;
>
>         err = -ENOMEM;
>         ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); // Initializes ->proc_mnt with NULL.
>         if (ns == NULL)
>                 goto out_dec;
>
>         idr_init(&ns->idr);
>
>         ns->pid_cachep = create_pid_cachep(level + 1);
>         if (ns->pid_cachep == NULL)
>                 goto out_free_idr;
>
>         err = ns_alloc_inum(&ns->ns);
>         if (err)
>                 goto out_free_idr;
>         ns->ns.ops = &pidns_operations;
>
>         kref_init(&ns->kref);
>         ns->level = level;
>         ns->parent = get_pid_ns(parent_pid_ns);
>         ns->user_ns = get_user_ns(user_ns);
>         ns->ucounts = ucounts;
>         ns->pid_allocated = PIDNS_ADDING;
>         INIT_WORK(&ns->proc_work, proc_cleanup_work);
>
>         return ns; // Returns with ->proc_mnt == NULL.
>
> out_free_idr:
>         idr_destroy(&ns->idr);
>         kmem_cache_free(pid_ns_cachep, ns);
> out_dec:
>         dec_pid_namespaces(ucounts);
> out:
>         return ERR_PTR(err);
> }
>
> struct pid *alloc_pid(struct pid_namespace *ns)
> {
> (...snipped...)
>         if (unlikely(is_child_reaper(pid))) {
>                 if (pid_ns_prepare_proc(ns)) {
>                         disable_pid_allocation(ns);
>                         goto out_free;
>                 }
>         }
> (...snipped...)
> }
>
> int pid_ns_prepare_proc(struct pid_namespace *ns)
> {
>         struct vfsmount *mnt;
>
>         mnt = kern_mount_data(&proc_fs_type, ns);
>         if (IS_ERR(mnt))
>                 return PTR_ERR(mnt);
>
>         ns->proc_mnt = mnt;
>         return 0;
> }
> ----------
>
> If one of copy_io(), copy_thread_tls() or alloc_pid() returns an error, creation
> of a child fails with p->nsproxy->pid_ns_for_children->proc_mnt == NULL.
>
> Then, when the child exits, the parent waiting at wait() calls
> proc_flush_task(), which assumes that everything was set up properly.

I don't think so.

proc_mnt is allocated when the first pid in a pid namespace is
allocated, and proc_mnt is freed after the last pid in that namespace
is freed. That is what the schedule_work(&ns->proc_work) call in
free_pid does.
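
For reference, the free_pid side of that looks roughly like this in a
4.15-rc tree (paraphrased from memory, so the exact shape may be
slightly off):

----------
void free_pid(struct pid *pid)
{
        /* We can be called with write_lock_irq(&tasklist_lock) held */
        int i;
        unsigned long flags;

        spin_lock_irqsave(&pidmap_lock, flags);
        for (i = 0; i <= pid->level; i++) {
                struct upid *upid = pid->numbers + i;
                struct pid_namespace *ns = upid->ns;
                switch (--ns->pid_allocated) {
                case 2:
                case 1:
                        /* Only the reaper is left; it may be sleeping
                         * in zap_pid_ns_processes(), so wake it up. */
                        wake_up_process(ns->child_reaper);
                        break;
                case PIDNS_ADDING:
                        /* Handle a fork failure of the first process */
                        WARN_ON(ns->child_reaper);
                        ns->pid_allocated = 0;
                        /* fall through */
                case 0:
                        /* Last pid in this namespace is gone: tear down
                         * proc_mnt (if any) from the workqueue. */
                        schedule_work(&ns->proc_work);
                        break;
                }

                idr_remove(&ns->idr, upid->nr);
        }
        spin_unlock_irqrestore(&pidmap_lock, flags);

        call_rcu(&pid->rcu, delayed_put_pid);
}
----------

proc_cleanup_work is what finally unmounts ns->proc_mnt, and
pid_ns_prepare_proc only ever runs (and sets proc_mnt) when the first
pid in the namespace is successfully allocated.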

Yes, we can occasionally have pid namespaces without pids and
thus without a proc_mnt.

But I don't see how that allows us to create a task_struct that is
not flushable.

If fork fails, there is no hashed pid and no task_struct to call
proc_flush_task on.
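
To be concrete, the tail of copy_process()'s cleanup path goes roughly
like this (paraphrased and heavily snipped from a 4.15-rc tree, so the
label contents may differ in detail): any pid that was allocated goes
straight back through free_pid(), and the half-built task_struct is
dropped without ever going through release_task()/proc_flush_task().

----------
bad_fork_free_pid:
(...snipped...)
        if (pid != &init_struct_pid)
                free_pid(pid);          /* undoes alloc_pid(); see free_pid() above */
bad_fork_cleanup_thread:
        exit_thread(p);
bad_fork_cleanup_io:
        if (p->io_context)
                exit_io_context(p);
bad_fork_cleanup_namespaces:
        exit_task_namespaces(p);        /* drops the (possibly half-set-up) nsproxy */
bad_fork_cleanup_mm:
(...snipped...)
bad_fork_free:
        put_task_struct(p);             /* frees the task; proc_flush_task() is never called */
fork_out:
        return ERR_PTR(retval);
----------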

If fork succeeds, the process contains a pid in a pid namespace.

So while pid_ns_for_children->proc_mnt might be NULL, it does not
follow that the upid->ns->proc_mnt that proc_flush_task dereferences
can be NULL.
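
And proc_flush_task() itself is only reached from release_task(),
i.e. when a fully constructed task is reaped (the wait() path Tetsuo
mentions goes do_wait() -> wait_task_zombie() -> release_task()).
Roughly, again paraphrased:

----------
void release_task(struct task_struct *p)
{
(...snipped...)
        proc_flush_task(p);     /* every upid->ns here had a pid successfully
                                 * allocated in it, so its proc_mnt was set up
                                 * by pid_ns_prepare_proc() */
(...snipped...)
}
----------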

Eric

> ----------
> void proc_flush_task(struct task_struct *task)
> {
>         int i;
>         struct pid *pid, *tgid;
>         struct upid *upid;
>
>         pid = task_pid(task);
>         tgid = task_tgid(task);
>
>         for (i = 0; i <= pid->level; i++) {
>                 upid = &pid->numbers[i];
>                 proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
>                                     tgid->numbers[i].nr);
>         }
> }
> ----------