Re: [PATCH v2 1/1] sched/psi: use kernfs polling functions for PSI trigger polling

From: Suren Baghdasaryan
Date: Thu Jun 29 2023 - 21:04:36 EST


On Thu, Jun 29, 2023 at 5:56 PM Suren Baghdasaryan <surenb@xxxxxxxxxx> wrote:
>
> Destroying psi trigger in cgroup_file_release causes UAF issues when
> a cgroup is removed from under a polling process. This is happening
> because cgroup removal causes a call to cgroup_file_release while the
> actual file is still alive. Destroying the trigger at this point would
> also destroy its waitqueue head and if there is still a polling process
> on that file accessing the waitqueue, it will step on the freed pointer:
>
> do_select
> vfs_poll
> do_rmdir
> cgroup_rmdir
> kernfs_drain_open_files
> cgroup_file_release
> cgroup_pressure_release
> psi_trigger_destroy
> wake_up_pollfree(&t->event_wait)
> // vfs_poll is unblocked
> synchronize_rcu
> kfree(t)
> poll_freewait -> UAF access to the trigger's waitqueue head
>
> Patch [1] fixed this issue for epoll() case using wake_up_pollfree(),
> however the same issue exists for synchronous poll() case.
> The root cause of this issue is that the lifecycles of the psi trigger's
> waitqueue and of the file associated with the trigger are different. Fix
> this by using kernfs_generic_poll function when polling on cgroup-specific
> psi triggers. It internally uses kernfs_open_node->poll waitqueue head
> with its lifecycle tied to the file's lifecycle. This also renders the
> fix in [1] obsolete, so revert it.
>
> [1] commit c2dbe32d5db5 ("sched/psi: Fix use-after-free in ep_remove_wait_queue()")
>
> Fixes: 0e94682b73bf ("psi: introduce psi monitor")
> Reported-by: Lu Jialin <lujialin4@xxxxxxxxxx>
> Closes: https://lore.kernel.org/all/20230613062306.101831-1-lujialin4@xxxxxxxxxx/
> Signed-off-by: Suren Baghdasaryan <surenb@xxxxxxxxxx>

This patch is a replacement for the patchset posted at
https://lore.kernel.org/all/20230626201713.1204982-1-surenb@xxxxxxxxxx/
The original patchset was more appropriate for Tejun's tree but this
one is mostly touching PSI-related code, so I changed the recipient to
PeterZ. That said, I would still greatly appreciate inputs from Tejun
and Greg, and anyone else of course.
Thanks,
Suren.

> ---
> include/linux/psi.h | 5 +++--
> include/linux/psi_types.h | 3 +++
> kernel/cgroup/cgroup.c | 2 +-
> kernel/sched/psi.c | 29 +++++++++++++++++++++--------
> 4 files changed, 28 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/psi.h b/include/linux/psi.h
> index ab26200c2803..e0745873e3f2 100644
> --- a/include/linux/psi.h
> +++ b/include/linux/psi.h
> @@ -23,8 +23,9 @@ void psi_memstall_enter(unsigned long *flags);
> void psi_memstall_leave(unsigned long *flags);
>
> int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
> -struct psi_trigger *psi_trigger_create(struct psi_group *group,
> - char *buf, enum psi_res res, struct file *file);
> +struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf,
> + enum psi_res res, struct file *file,
> + struct kernfs_open_file *of);
> void psi_trigger_destroy(struct psi_trigger *t);
>
> __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
> diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
> index 040c089581c6..f1fd3a8044e0 100644
> --- a/include/linux/psi_types.h
> +++ b/include/linux/psi_types.h
> @@ -137,6 +137,9 @@ struct psi_trigger {
> /* Wait queue for polling */
> wait_queue_head_t event_wait;
>
> + /* Kernfs file for cgroup triggers */
> + struct kernfs_open_file *of;
> +
> /* Pending event flag */
> int event;
>
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index bfe3cd8ccf36..f55a40db065f 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -3730,7 +3730,7 @@ static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
> }
>
> psi = cgroup_psi(cgrp);
> - new = psi_trigger_create(psi, buf, res, of->file);
> + new = psi_trigger_create(psi, buf, res, of->file, of);
> if (IS_ERR(new)) {
> cgroup_put(cgrp);
> return PTR_ERR(new);
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 81fca77397f6..9bb3f2b3ccfc 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -493,8 +493,12 @@ static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total,
> continue;
>
> /* Generate an event */
> - if (cmpxchg(&t->event, 0, 1) == 0)
> - wake_up_interruptible(&t->event_wait);
> + if (cmpxchg(&t->event, 0, 1) == 0) {
> + if (t->of)
> + kernfs_notify(t->of->kn);
> + else
> + wake_up_interruptible(&t->event_wait);
> + }
> t->last_event_time = now;
> /* Reset threshold breach flag once event got generated */
> t->pending_event = false;
> @@ -1271,8 +1275,9 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
> return 0;
> }
>
> -struct psi_trigger *psi_trigger_create(struct psi_group *group,
> - char *buf, enum psi_res res, struct file *file)
> +struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf,
> + enum psi_res res, struct file *file,
> + struct kernfs_open_file *of)
> {
> struct psi_trigger *t;
> enum psi_states state;
> @@ -1331,7 +1336,9 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
>
> t->event = 0;
> t->last_event_time = 0;
> - init_waitqueue_head(&t->event_wait);
> + t->of = of;
> + if (!of)
> + init_waitqueue_head(&t->event_wait);
> t->pending_event = false;
> t->aggregator = privileged ? PSI_POLL : PSI_AVGS;
>
> @@ -1388,7 +1395,10 @@ void psi_trigger_destroy(struct psi_trigger *t)
> * being accessed later. Can happen if cgroup is deleted from under a
> * polling process.
> */
> - wake_up_pollfree(&t->event_wait);
> + if (t->of)
> + kernfs_notify(t->of->kn);
> + else
> + wake_up_interruptible(&t->event_wait);
>
> if (t->aggregator == PSI_AVGS) {
> mutex_lock(&group->avgs_lock);
> @@ -1465,7 +1475,10 @@ __poll_t psi_trigger_poll(void **trigger_ptr,
> if (!t)
> return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
>
> - poll_wait(file, &t->event_wait, wait);
> + if (t->of)
> + kernfs_generic_poll(t->of, wait);
> + else
> + poll_wait(file, &t->event_wait, wait);
>
> if (cmpxchg(&t->event, 1, 0) == 1)
> ret |= EPOLLPRI;
> @@ -1535,7 +1548,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
> return -EBUSY;
> }
>
> - new = psi_trigger_create(&psi_system, buf, res, file);
> + new = psi_trigger_create(&psi_system, buf, res, file, NULL);
> if (IS_ERR(new)) {
> mutex_unlock(&seq->lock);
> return PTR_ERR(new);
> --
> 2.41.0.255.g8b1d071c50-goog
>