[RFC PATCH 58/86] treewide: x86: remove cond_resched()

From: Ankur Arora
Date: Tue Nov 07 2023 - 18:09:39 EST


There are broadly three sets of uses of cond_resched() (sketched below):

1. Calls to cond_resched() out of the goodness of our heart,
otherwise known as avoiding lockup splats.

2. Open coded variants of cond_resched_lock() which call
cond_resched().

3. Retry or error handling loops, where cond_resched() is used as a
quick alternative to spinning in a tight loop.
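
For illustration, minimal sketches of the first two patterns (the
structures and helpers here, tbl and scrub_entry(), are hypothetical,
not code from any of the files touched below):

    /* Set-1: placate the softlockup watchdog in a long loop. */
    for (i = 0; i < tbl->nr_entries; i++) {
        scrub_entry(&tbl->entries[i]);
        cond_resched();
    }

    /* Set-2: open-coded cond_resched_lock(), dropping and re-taking
     * the lock around the scheduling point.
     */
    spin_lock(&tbl->lock);
    for (i = 0; i < tbl->nr_entries; i++) {
        scrub_entry(&tbl->entries[i]);
        if (need_resched()) {
            spin_unlock(&tbl->lock);
            cond_resched();
            spin_lock(&tbl->lock);
        }
    }
    spin_unlock(&tbl->lock);

Under the model in [1], the set-1 call simply goes away, and set-2 can
rely on the preemption point when the lock is dropped (cf. the
write_unlock() comment in the kvm/mmu hunk below).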

When running under a full preemption model, the cond_resched() reduces
to a NOP (not even a barrier) so removing it obviously cannot matter.

But considering only voluntary preemption models (for, say, code that
has mostly been tested under those), for set-1 and set-2 the
scheduler can now preempt kernel tasks running beyond their time
quanta anywhere they are preemptible() [1], which removes any need
for these explicitly placed scheduling points.

The cond_resched() calls in set-3 are a little more difficult.
To start with, given its NOP character under full preemption, it
never actually saved us from a tight loop.
With voluntary preemption, it's not a NOP, but it might as well
be -- for most workloads the scheduler does not have an endless
supply of runnable tasks on the runqueue, so the call rarely results
in an actual reschedule.

So, cond_resched() is useful for avoiding softlockup splats, but not
terribly good for error handling. Ideally, these loops should be
replaced with some kind of timed or event wait.
For now we use cond_resched_stall(), which tries to schedule if
possible, and executes a cpu_relax() if not.
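
As a sketch of a set-3 conversion (the device-polling loop, status_reg
and STATUS_BUSY are hypothetical; only the cond_resched_stall()
semantics described above are assumed):

    /* Before: cond_resched() as a cheap backoff while retrying. */
    while (readl(dev->status_reg) & STATUS_BUSY)
        cond_resched();

    /* After: schedule if something else is runnable, otherwise
     * cpu_relax(), so the retry never spins back-to-back.
     */
    while (readl(dev->status_reg) & STATUS_BUSY)
        cond_resched_stall();

A timed wait (e.g. something along the lines of readl_poll_timeout())
would still be the better long-term fix, as noted above.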

Most of the instances of cond_resched() here are from set-1 or set-2.
Remove them.

There's one set-3 case, where kvm_recalculate_apic_map() sees an
unexpected APIC ID; there we now use cond_resched_stall() to delay
the retry.

[1] https://lore.kernel.org/lkml/20231107215742.363031-1-ankur.a.arora@xxxxxxxxxx/

Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Borislav Petkov <bp@xxxxxxxxx>
Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
Cc: Sean Christopherson <seanjc@xxxxxxxxxx>
Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>
Cc: Juergen Gross <jgross@xxxxxxxx>
Cc: Boris Ostrovsky <boris.ostrovsky@xxxxxxxxxx>
Cc: "David S. Miller" <davem@xxxxxxxxxxxxx>
Cc: David Ahern <dsahern@xxxxxxxxxx>
Cc: x86@xxxxxxxxxx
Signed-off-by: Ankur Arora <ankur.a.arora@xxxxxxxxxx>
---
arch/x86/kernel/alternative.c | 10 ----------
arch/x86/kernel/cpu/sgx/encl.c | 14 +++++++-------
arch/x86/kernel/cpu/sgx/ioctl.c | 3 ---
arch/x86/kernel/cpu/sgx/main.c | 5 -----
arch/x86/kernel/cpu/sgx/virt.c | 4 ----
arch/x86/kvm/lapic.c | 6 +++++-
arch/x86/kvm/mmu/mmu.c | 2 +-
arch/x86/kvm/svm/sev.c | 5 +++--
arch/x86/net/bpf_jit_comp.c | 1 -
arch/x86/net/bpf_jit_comp32.c | 1 -
arch/x86/xen/mmu_pv.c | 1 -
11 files changed, 16 insertions(+), 36 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 73be3931e4f0..3d0b6a606852 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2189,16 +2189,6 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
*/
atomic_set_release(&bp_desc.refs, 1);

- /*
- * Function tracing can enable thousands of places that need to be
- * updated. This can take quite some time, and with full kernel debugging
- * enabled, this could cause the softlockup watchdog to trigger.
- * This function gets called every 256 entries added to be patched.
- * Call cond_resched() here to make sure that other tasks can get scheduled
- * while processing all the functions being patched.
- */
- cond_resched();
-
/*
* Corresponding read barrier in int3 notifier for making sure the
* nr_entries and handler are correctly ordered wrt. patching.
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 279148e72459..05afb4e2f552 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -549,14 +549,15 @@ int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
break;
}

- /* Reschedule on every XA_CHECK_SCHED iteration. */
+ /*
+ * Drop the lock every XA_CHECK_SCHED iteration so the
+ * scheduler can preempt if needed.
+ */
if (!(++count % XA_CHECK_SCHED)) {
xas_pause(&xas);
xas_unlock(&xas);
mutex_unlock(&encl->lock);

- cond_resched();
-
mutex_lock(&encl->lock);
xas_lock(&xas);
}
@@ -723,16 +724,15 @@ void sgx_encl_release(struct kref *ref)
}

kfree(entry);
+
/*
- * Invoke scheduler on every XA_CHECK_SCHED iteration
- * to prevent soft lockups.
+ * Drop the lock every XA_CHECK_SCHED iteration so the
+ * scheduler can preempt if needed.
*/
if (!(++count % XA_CHECK_SCHED)) {
xas_pause(&xas);
xas_unlock(&xas);

- cond_resched();
-
xas_lock(&xas);
}
}
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 5d390df21440..2b899569bb60 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -439,9 +439,6 @@ static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg)
break;
}

- if (need_resched())
- cond_resched();
-
ret = sgx_encl_add_page(encl, add_arg.src + c, add_arg.offset + c,
&secinfo, add_arg.flags);
if (ret)
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 166692f2d501..f8bd01e56b72 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -98,8 +98,6 @@ static unsigned long __sgx_sanitize_pages(struct list_head *dirty_page_list)
list_move_tail(&page->list, &dirty);
left_dirty++;
}
-
- cond_resched();
}

list_splice(&dirty, dirty_page_list);
@@ -413,8 +411,6 @@ static int ksgxd(void *p)

if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
sgx_reclaim_pages();
-
- cond_resched();
}

return 0;
@@ -581,7 +577,6 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
}

sgx_reclaim_pages();
- cond_resched();
}

if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
index 7aaa3652e31d..6ce0983c6249 100644
--- a/arch/x86/kernel/cpu/sgx/virt.c
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -175,7 +175,6 @@ static long sgx_vepc_remove_all(struct sgx_vepc *vepc)
return -EBUSY;
}
}
- cond_resched();
}

/*
@@ -204,7 +203,6 @@ static int sgx_vepc_release(struct inode *inode, struct file *file)
continue;

xa_erase(&vepc->page_array, index);
- cond_resched();
}

/*
@@ -223,7 +221,6 @@ static int sgx_vepc_release(struct inode *inode, struct file *file)
list_add_tail(&epc_page->list, &secs_pages);

xa_erase(&vepc->page_array, index);
- cond_resched();
}

/*
@@ -245,7 +242,6 @@ static int sgx_vepc_release(struct inode *inode, struct file *file)

if (sgx_vepc_free_page(epc_page))
list_add_tail(&epc_page->list, &secs_pages);
- cond_resched();
}

if (!list_empty(&secs_pages))
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 3e977dbbf993..dd87a8214c80 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -435,7 +435,11 @@ void kvm_recalculate_apic_map(struct kvm *kvm)
kvfree(new);
new = NULL;
if (r == -E2BIG) {
- cond_resched();
+ /*
+ * A vCPU was just added or enabled its APIC.
+ * Give things time to settle before retrying.
+ */
+ cond_resched_stall();
goto retry;
}

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index f7901cb4d2fa..58efaca73dd4 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -6431,8 +6431,8 @@ static int shadow_mmu_try_split_huge_page(struct kvm *kvm,
}

if (need_topup_split_caches_or_resched(kvm)) {
+ /* The preemption point in write_unlock() reschedules if needed. */
write_unlock(&kvm->mmu_lock);
- cond_resched();
/*
* If the topup succeeds, return -EAGAIN to indicate that the
* rmap iterator should be restarted because the MMU lock was
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 4900c078045a..a98f29692a29 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -476,7 +476,6 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages)
page_virtual = kmap_local_page(pages[i]);
clflush_cache_range(page_virtual, PAGE_SIZE);
kunmap_local(page_virtual);
- cond_resched();
}
}

@@ -2157,12 +2156,14 @@ void sev_vm_destroy(struct kvm *kvm)
/*
* if userspace was terminated before unregistering the memory regions
* then lets unpin all the registered memory.
+ *
+ * This might take a while, but we are preemptible so the scheduler
+ * can always preempt if needed.
*/
if (!list_empty(head)) {
list_for_each_safe(pos, q, head) {
__unregister_enc_region_locked(kvm,
list_entry(pos, struct enc_region, list));
- cond_resched();
}
}

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index a5930042139d..bae5b39810bb 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -2819,7 +2819,6 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
prog->aux->extable = (void *) image + roundup(proglen, align);
}
oldproglen = proglen;
- cond_resched();
}

if (bpf_jit_enable > 1)
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index 429a89c5468b..03566f031b23 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -2594,7 +2594,6 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
}
}
oldproglen = proglen;
- cond_resched();
}

if (bpf_jit_enable > 1)
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index b6830554ff69..a046cde342b1 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2510,7 +2510,6 @@ int xen_remap_pfn(struct vm_area_struct *vma, unsigned long addr,
addr += range;
if (err_ptr)
err_ptr += batch;
- cond_resched();
}
out:

--
2.31.1