Re: Performance regression in ip_set_swap on 6.7.0

From: Jozsef Kadlecsik
Date: Sat Jan 13 2024 - 13:31:37 EST


On Thu, 11 Jan 2024, David Wang wrote:

> I tested the patch with code stressing swap->destroy->create->add 10000
> times; the performance regression still happens, and now it is in
> ip_set_destroy. (I pasted the test code at the end of this mail.)
> time shows that most of the delay is 'off CPU':
> $ time sudo ./stressipset
>
> real 2m45.115s
> user 0m0.019s
> sys 0m0.744s
>
> Most of the time, the call stack is stuck in rcu_barrier:
> $ sudo cat /proc/2158/stack
> [<0>] rcu_barrier+0x1f6/0x2d0
> [<0>] ip_set_destroy+0x84/0x1d0 [ip_set]
> [<0>] nfnetlink_rcv_msg+0x2ac/0x2f0 [nfnetlink]
> [<0>] netlink_rcv_skb+0x57/0x100
> [<0>] netlink_unicast+0x19a/0x280
> [<0>] netlink_sendmsg+0x250/0x4d0
> [<0>] __sys_sendto+0x1be/0x1d0
> [<0>] __x64_sys_sendto+0x20/0x30
> [<0>] do_syscall_64+0x42/0xf0
> [<0>] entry_SYSCALL_64_after_hwframe+0x6e/0x76
>
> perf_event_open profiling shows a similar call signature for rcu_barrier
> and synchronize_rcu:
>
> ip_set_destroy(49.651% 2133/4296)
>   rcu_barrier(80.684% 1721/2133)
>     wait_for_completion(79.198% 1363/1721)
>       schedule_timeout(94.864% 1293/1363)
>         schedule(96.520% 1248/1293)
>           __schedule(97.436% 1216/1248)
>           preempt_count_add(0.240% 3/1248)
>           srso_return_thunk(0.160% 2/1248)
>           preempt_count_sub(0.160% 2/1248)
>         srso_return_thunk(0.077% 1/1293)
>       _raw_spin_unlock_irq(1.027% 14/1363)
>       _raw_spin_lock_irq(0.514% 7/1363)
>       __cond_resched(0.220% 3/1363)
>       srso_return_thunk(0.147% 2/1363)
>
> ip_set_swap(79.842% 709/888) (this profile was captured while
> synchronize_rcu was still used in ip_set_swap)
>   synchronize_rcu(74.330% 527/709)
>     __wait_rcu_gp(89.184% 470/527)
>       wait_for_completion(86.383% 406/470)
>         schedule_timeout(91.133% 370/406)
>           schedule(95.135% 352/370)
>         _raw_spin_unlock_irq(3.202% 13/406)
>         _raw_spin_lock_irq(0.739% 3/406)
>         srso_return_thunk(0.246% 1/406)
>       _raw_spin_unlock_irq(7.021% 33/470)
>       __call_rcu_common.constprop.0(3.830% 18/470)
>     rcu_gp_is_expedited(3.036% 16/527)
>     __cond_resched(0.569% 3/527)
>     srso_return_thunk(0.190% 1/527)
>
> They all call wait_for_completion, which presumably sleeps on purpose,
> waiting for something to finish...

That's OK: ip_set_destroy() calls rcu_barrier(), which is needed to make
sure that flushing has really finished in list:set type sets.
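
To make the ordering concrete, here is a minimal, hypothetical sketch of
the pattern (the struct and function names are invented for illustration;
this is not the actual list:set code): flush frees elements via
call_rcu(), so a destroy that follows must wait with rcu_barrier() for
all queued callbacks to finish before the containing object is freed:

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct member {
	struct list_head list;
	struct rcu_head rcu;
};

static void member_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct member, rcu));
}

static void demo_flush(struct list_head *members)
{
	struct member *m, *next;

	/* Unlink under RCU and defer the actual kfree() to a grace
	 * period, so concurrent readers can still walk the list.
	 */
	list_for_each_entry_safe(m, next, members, list) {
		list_del_rcu(&m->list);
		call_rcu(&m->rcu, member_free_rcu);
	}
}

static void demo_destroy(struct list_head *members)
{
	demo_flush(members);
	/* Without this, member_free_rcu() could still be pending while
	 * the structure embedding the list head is being freed.
	 */
	rcu_barrier();
}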

However, rcu_barrier() together with call_rcu() makes multiple destroys
one after another slow. But rcu_barrier() is needed only for list:set
type sets, and that case can be handled separately. So could you test the
patch below? According to my tests it is even a little bit faster than
the original code, before synchronize_rcu() was added to swap.
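
(The test code David referenced is not part of this excerpt. As a rough,
hypothetical stand-in, a loop like the one below exercises the same
swap -> destroy -> create -> add cycle by shelling out to ipset(8); the
set names 'a' and 'b' and the hash:ip type are arbitrary choices. The
per-iteration exec overhead is much higher than driving netlink directly,
but the same kernel swap/destroy paths are hit.)

#include <stdio.h>
#include <stdlib.h>

/* Run an ipset(8) command and abort on failure; needs root. */
static void run(const char *cmd)
{
	if (system(cmd) != 0) {
		fprintf(stderr, "command failed: %s\n", cmd);
		exit(EXIT_FAILURE);
	}
}

int main(void)
{
	run("ipset create a hash:ip");
	run("ipset create b hash:ip");
	for (int i = 0; i < 10000; i++) {
		run("ipset add a 1.2.3.4");
		run("ipset swap a b");	/* 'b' now holds the entry */
		run("ipset destroy b");
		run("ipset create b hash:ip");
	}
	run("ipset destroy a");
	run("ipset destroy b");
	return 0;
}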

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index e8c350a3ade1..912f750d0bea 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -242,6 +242,8 @@ extern void ip_set_type_unregister(struct ip_set_type *set_type);
 
 /* A generic IP set */
 struct ip_set {
+	/* For call_rcu() in destroy */
+	struct rcu_head rcu;
 	/* The name of the set */
 	char name[IPSET_MAXNAMELEN];
 	/* Lock protecting the set data */
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 4c133e06be1d..3bf9bb345809 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1182,6 +1182,14 @@ ip_set_destroy_set(struct ip_set *set)
 	kfree(set);
 }
 
+static void
+ip_set_destroy_set_rcu(struct rcu_head *head)
+{
+	struct ip_set *set = container_of(head, struct ip_set, rcu);
+
+	ip_set_destroy_set(set);
+}
+
 static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
 			  const struct nlattr * const attr[])
 {
@@ -1193,8 +1201,6 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
 	if (unlikely(protocol_min_failed(attr)))
 		return -IPSET_ERR_PROTOCOL;
 
-	/* Must wait for flush to be really finished in list:set */
-	rcu_barrier();
 
 	/* Commands are serialized and references are
 	 * protected by the ip_set_ref_lock.
@@ -1206,8 +1212,10 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
 	 * counter, so if it's already zero, we can proceed
 	 * without holding the lock.
 	 */
-	read_lock_bh(&ip_set_ref_lock);
 	if (!attr[IPSET_ATTR_SETNAME]) {
+		/* Must wait for flush to be really finished in list:set */
+		rcu_barrier();
+		read_lock_bh(&ip_set_ref_lock);
 		for (i = 0; i < inst->ip_set_max; i++) {
 			s = ip_set(inst, i);
 			if (s && (s->ref || s->ref_netlink)) {
@@ -1228,6 +1236,9 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
 		inst->is_destroyed = false;
 	} else {
 		u32 flags = flag_exist(info->nlh);
+		u16 features = 0;
+
+		read_lock_bh(&ip_set_ref_lock);
 		s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
 				    &i);
 		if (!s) {
@@ -1238,10 +1249,14 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
 			ret = -IPSET_ERR_BUSY;
 			goto out;
 		}
+		features = s->type->features;
 		ip_set(inst, i) = NULL;
 		read_unlock_bh(&ip_set_ref_lock);
-
-		ip_set_destroy_set(s);
+		if (features & IPSET_TYPE_NAME) {
+			/* Must wait for flush to be really finished */
+			rcu_barrier();
+		}
+		call_rcu(&s->rcu, ip_set_destroy_set_rcu);
 	}
 	return 0;
 out:
@@ -1394,9 +1409,6 @@ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info,
 	ip_set(inst, to_id) = from;
 	write_unlock_bh(&ip_set_ref_lock);
 
-	/* Make sure all readers of the old set pointers are completed. */
-	synchronize_rcu();
-
 	return 0;
 }
 
@@ -2357,6 +2369,9 @@ ip_set_net_exit(struct net *net)
 
 	inst->is_deleted = true; /* flag for ip_set_nfnl_put */
 
+	/* Wait for call_rcu() in destroy */
+	rcu_barrier();
+
 	nfnl_lock(NFNL_SUBSYS_IPSET);
 	for (i = 0; i < inst->ip_set_max; i++) {
 		set = ip_set(inst, i);


Best regards,
Jozsef
--
E-mail : kadlec@xxxxxxxxxxxxxxxxx, kadlecsik.jozsef@xxxxxxxxx
PGP key : https://wigner.hu/~kadlec/pgp_public_key.txt
Address : Wigner Research Centre for Physics
H-1525 Budapest 114, POB. 49, Hungary