Re: [PATCH v11 8/8] vhost: use vhost_tasks for worker threads

From: Mike Christie
Date: Mon May 15 2023 - 23:53:53 EST


On 5/15/23 5:54 PM, Linus Torvalds wrote:
> On Mon, May 15, 2023 at 3:23 PM Mike Christie
> <michael.christie@xxxxxxxxxx> wrote:
>>
>> The vhost layer really doesn't want any signals and wants to work like kthreads
>> for that case. To make it really simple can we do something like this where it
>> separates user and io worker behavior where the major diff is how they handle
>> signals and exit. I also included a fix for the freeze case:
>
> I don't love the SIGKILL special case, but I also don't find this
> deeply offensive. So if this is what it takes, I'm ok with it.
>
> I wonder if we could make that special case simply check for "is
> SIGKILL blocked" instead? No normal case will cause that, and it means

Yeah, it's doable. Updated below.

> that a PF_USER_WORKER thread could decide per-thread what it wants to
> do wrt SIGKILL.
>
> Christian? And I guess we should Cc: Oleg too, since the signal parts
> are an area he's familiar with and has worked on.. Eric Biederman has
> already been on the list and has also been involved
>
> Oleg: see
>
> https://lore.kernel.org/lkml/122b597e-a5fa-daf7-27bb-6f04fa98d496@xxxxxxxxxx/
>
> for the context here.

Oleg and Christian,


Below is an updated patch that doesn't check for PF_USER_WORKER in the
signal.c code and instead will check for if we have blocked the signal.




diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 537cbf9a2ade..e0f5ac90a228 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -29,7 +29,6 @@ struct kernel_clone_args {
u32 io_thread:1;
u32 user_worker:1;
u32 no_files:1;
- u32 ignore_signals:1;
unsigned long stack;
unsigned long stack_size;
unsigned long tls;
diff --git a/kernel/fork.c b/kernel/fork.c
index ed4e01daccaa..fd2970b598b2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2336,8 +2336,15 @@ __latent_entropy struct task_struct *copy_process(
p->flags &= ~PF_KTHREAD;
if (args->kthread)
p->flags |= PF_KTHREAD;
- if (args->user_worker)
+ if (args->user_worker) {
+ /*
+ * User worker are similar to io_threads but they do not
+ * support signals and cleanup is driven via another kernel
+ * interface so even SIGKILL is blocked.
+ */
p->flags |= PF_USER_WORKER;
+ siginitsetinv(&p->blocked, 0);
+ }
if (args->io_thread) {
/*
* Mark us an IO worker, and block any signal that isn't
@@ -2517,8 +2524,8 @@ __latent_entropy struct task_struct *copy_process(
if (retval)
goto bad_fork_cleanup_io;

- if (args->ignore_signals)
- ignore_signals(p);
+ if (args->user_worker)
+ p->flags |= PF_NOFREEZE;

stackleak_task_init(p);

@@ -2860,7 +2867,6 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
.fn = fn,
.fn_arg = arg,
.io_thread = 1,
- .user_worker = 1,
};

return copy_process(NULL, 0, node, &args);
diff --git a/kernel/signal.c b/kernel/signal.c
index 8f6330f0e9ca..bc7e26072437 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -995,6 +995,19 @@ static inline bool wants_signal(int sig, struct task_struct *p)
return task_curr(p) || !task_sigpending(p);
}

+static void try_set_pending_sigkill(struct task_struct *t)
+{
+ /*
+ * User workers don't support signals and their exit is driven through
+ * their kernel layer, so by default block even SIGKILL.
+ */
+ if (sigismember(&t->blocked, SIGKILL))
+ return;
+
+ sigaddset(&t->pending.signal, SIGKILL);
+ signal_wake_up(t, 1);
+}
+
static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
{
struct signal_struct *signal = p->signal;
@@ -1055,8 +1068,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
t = p;
do {
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
- sigaddset(&t->pending.signal, SIGKILL);
- signal_wake_up(t, 1);
+ try_set_pending_sigkill(t);
} while_each_thread(p, t);
return;
}
@@ -1373,8 +1385,7 @@ int zap_other_threads(struct task_struct *p)
/* Don't bother with already dead threads */
if (t->exit_state)
continue;
- sigaddset(&t->pending.signal, SIGKILL);
- signal_wake_up(t, 1);
+ try_set_pending_sigkill(t);
}

return count;
diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
index b7cbd66f889e..2d8d3ebaec4d 100644
--- a/kernel/vhost_task.c
+++ b/kernel/vhost_task.c
@@ -75,13 +75,13 @@ struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg,
const char *name)
{
struct kernel_clone_args args = {
- .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM,
+ .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM |
+ CLONE_THREAD | CLONE_SIGHAND,
.exit_signal = 0,
.fn = vhost_task_fn,
.name = name,
.user_worker = 1,
.no_files = 1,
- .ignore_signals = 1,
};
struct vhost_task *vtsk;
struct task_struct *tsk;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d257916f39e5..255a2147e5c1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1207,12 +1207,12 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
DEFINE_WAIT(wait);

/*
- * Do not throttle user workers, kthreads other than kswapd or
+ * Do not throttle IO/user workers, kthreads other than kswapd or
* workqueues. They may be required for reclaim to make
* forward progress (e.g. journalling workqueues or kthreads).
*/
if (!current_is_kswapd() &&
- current->flags & (PF_USER_WORKER|PF_KTHREAD)) {
+ current->flags & (PF_USER_WORKER|PF_IO_WORKER|PF_KTHREAD)) {
cond_resched();
return;
}