[RFC PATCH 77/86] treewide: netfilter: remove cond_resched()

From: Ankur Arora
Date: Tue Nov 07 2023 - 18:12:15 EST


There are broadly three sets of uses of cond_resched():

1. Calls to cond_resched() out of the goodness of our heart,
otherwise known as avoiding lockup splats.

2. Open coded variants of cond_resched_lock() which call
cond_resched().

3. Retry or error handling loops, where cond_resched() is used as a
quick alternative to spinning in a tight-loop.

When running under a full preemption model, the cond_resched() reduces
to a NOP (not even a barrier) so removing it obviously cannot matter.

But considering only voluntary preemption models (for, say, code that
has been mostly tested under those), for set-1 and set-2 the
scheduler can now preempt kernel tasks running beyond their time
quanta anywhere they are preemptible() [1], which removes any need
for these explicitly placed scheduling points.

The cond_resched() calls in set-3 are a little more difficult.
To start with, given its NOP character under full preemption, it
never actually saved us from a tight loop.
With voluntary preemption, it's not a NOP, but it might as well be --
for most workloads the scheduler does not have an interminable supply
of runnable tasks on the runqueue.

So, cond_resched() is useful to not get softlockup splats, but not
terribly good for error handling. Ideally, these should be replaced
with some kind of timed or event wait.
For now we use cond_resched_stall(), which tries to schedule if
possible, and executes a cpu_relax() if not.

Most of the uses here are in set-1 (some right after we give up a lock
or enable bottom-halves, which already causes an explicit preemption
check). We can remove all of them.

There's one case where we do "cond_resched(); cpu_relax()" while
spinning on a seqcount. Replace with cond_resched_stall().

[1] https://lore.kernel.org/lkml/20231107215742.363031-1-ankur.a.arora@xxxxxxxxxx/

Cc: Florian Westphal <fw@xxxxxxxxx>
Cc: Eric Dumazet <edumazet@xxxxxxxxxx>
Cc: Jakub Kicinski <kuba@xxxxxxxxxx>
Cc: Paolo Abeni <pabeni@xxxxxxxxxx>
Cc: Simon Horman <horms@xxxxxxxxxxxx>
Cc: Julian Anastasov <ja@xxxxxx>
Cc: "David S. Miller" <davem@xxxxxxxxxxxxx>
Cc: Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx>
Cc: Jozsef Kadlecsik <kadlec@xxxxxxxxxxxxx>
Signed-off-by: Ankur Arora <ankur.a.arora@xxxxxxxxxx>
---
net/netfilter/ipset/ip_set_core.c | 1 -
net/netfilter/ipvs/ip_vs_est.c | 3 ---
net/netfilter/nf_conncount.c | 2 --
net/netfilter/nf_conntrack_core.c | 3 ---
net/netfilter/nf_conntrack_ecache.c | 3 ---
net/netfilter/nf_tables_api.c | 2 --
net/netfilter/nft_set_rbtree.c | 2 --
net/netfilter/x_tables.c | 3 +--
net/netfilter/xt_hashlimit.c | 1 -
9 files changed, 1 insertion(+), 19 deletions(-)

diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 35d2f9c9ada0..f584c5e756ae 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1703,7 +1703,6 @@ call_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb,
if (retried) {
__ip_set_get_netlink(set);
nfnl_unlock(NFNL_SUBSYS_IPSET);
- cond_resched();
nfnl_lock(NFNL_SUBSYS_IPSET);
__ip_set_put_netlink(set);
}
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index c5970ba416ae..5543efeeb3f7 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -622,7 +622,6 @@ static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs)
goto unlock;
}
mutex_unlock(&__ip_vs_mutex);
- cond_resched();
}

unlock:
@@ -681,7 +680,6 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)

if (!ipvs->enable || kthread_should_stop())
goto stop;
- cond_resched();

diff = ktime_to_ns(ktime_sub(t2, t1));
if (diff <= 1 * NSEC_PER_USEC) {
@@ -815,7 +813,6 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
* and deleted (releasing kthread contexts)
*/
mutex_unlock(&__ip_vs_mutex);
- cond_resched();
mutex_lock(&__ip_vs_mutex);

/* Current kt released ? */
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 5d8ed6c90b7e..e7bc39ca204d 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -473,8 +473,6 @@ static void tree_gc_worker(struct work_struct *work)
rcu_read_unlock();
local_bh_enable();

- cond_resched();
-
spin_lock_bh(&nf_conncount_locks[tree]);
if (gc_count < ARRAY_SIZE(gc_nodes))
goto next; /* do not bother */
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 9f6f2e643575..d2f38870bbab 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1563,7 +1563,6 @@ static void gc_worker(struct work_struct *work)
* we will just continue with next hash slot.
*/
rcu_read_unlock();
- cond_resched();
i++;

delta_time = nfct_time_stamp - end_time;
@@ -2393,7 +2392,6 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
}
spin_unlock(lockp);
local_bh_enable();
- cond_resched();
}

return NULL;
@@ -2418,7 +2416,6 @@ static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),

nf_ct_delete(ct, iter_data->portid, iter_data->report);
nf_ct_put(ct);
- cond_resched();
}
mutex_unlock(&nf_conntrack_mutex);
}
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index 69948e1d6974..b568e329bf22 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -84,7 +84,6 @@ static enum retry_state ecache_work_evict_list(struct nf_conntrack_net *cnet)

if (sent++ > 16) {
spin_unlock_bh(&cnet->ecache.dying_lock);
- cond_resched();
goto next;
}
}
@@ -96,8 +95,6 @@ static enum retry_state ecache_work_evict_list(struct nf_conntrack_net *cnet)

hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
nf_ct_put(ct);
-
- cond_resched();
}

return ret;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 29c651804cb2..6ff5515d9b17 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3742,8 +3742,6 @@ static int nft_table_validate(struct net *net, const struct nft_table *table)
err = nft_chain_validate(&ctx, chain);
if (err < 0)
return err;
-
- cond_resched();
}

return 0;
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index e34662f4a71e..9bdf7c0e0831 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -495,8 +495,6 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
if (fatal_signal_pending(current))
return -EINTR;

- cond_resched();
-
write_lock_bh(&priv->lock);
write_seqcount_begin(&priv->count);
err = __nft_rbtree_insert(net, set, rbe, ext);
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 21624d68314f..ab53adf6393d 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1433,8 +1433,7 @@ xt_replace_table(struct xt_table *table,

if (seq & 1) {
do {
- cond_resched();
- cpu_relax();
+ cond_resched_stall();
} while (seq == raw_read_seqcount(s));
}
}
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 0859b8f76764..47a11d49231a 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -372,7 +372,6 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, bool select
dsthash_free(ht, dh);
}
spin_unlock_bh(&ht->lock);
- cond_resched();
}
}

--
2.31.1