Re:Performance regression in ip_set_swap on 6.7.0

From: David Wang
Date: Thu Jan 11 2024 - 09:56:01 EST


I tested the patch with code that stresses swap->destroy->create->add 10000 times. The performance regression still happens, but now it is in ip_set_destroy.
(I pasted the test code at the end of this mail.)
`time` shows that most of the delay is 'off CPU':
$ time sudo ./stressipset

real 2m45.115s
user 0m0.019s
sys 0m0.744s

Most of the time, the call stack is stuck in rcu_barrier:
$ sudo cat /proc/2158/stack
[<0>] rcu_barrier+0x1f6/0x2d0
[<0>] ip_set_destroy+0x84/0x1d0 [ip_set]
[<0>] nfnetlink_rcv_msg+0x2ac/0x2f0 [nfnetlink]
[<0>] netlink_rcv_skb+0x57/0x100
[<0>] netlink_unicast+0x19a/0x280
[<0>] netlink_sendmsg+0x250/0x4d0
[<0>] __sys_sendto+0x1be/0x1d0
[<0>] __x64_sys_sendto+0x20/0x30
[<0>] do_syscall_64+0x42/0xf0
[<0>] entry_SYSCALL_64_after_hwframe+0x6e/0x76

perf_event_open profiling shows a similar call signature for rcu_call and synchronize_rcu:

ip_set_destroy(49.651% 2133/4296)
rcu_barrier(80.684% 1721/2133)
wait_for_completion(79.198% 1363/1721)
schedule_timeout(94.864% 1293/1363)
schedule(96.520% 1248/1293)
__schedule(97.436% 1216/1248)
preempt_count_add(0.240% 3/1248)
srso_return_thunk(0.160% 2/1248)
preempt_count_sub(0.160% 2/1248)
srso_return_thunk(0.077% 1/1293)
_raw_spin_unlock_irq(1.027% 14/1363)
_raw_spin_lock_irq(0.514% 7/1363)
__cond_resched(0.220% 3/1363)
srso_return_thunk(0.147% 2/1363)

ip_set_swap(79.842% 709/888) (this profiling was captured when synchronize_rcu is used in ip_set_swap)
synchronize_rcu(74.330% 527/709)
__wait_rcu_gp(89.184% 470/527)
wait_for_completion(86.383% 406/470)
schedule_timeout(91.133% 370/406)
schedule(95.135% 352/370)
_raw_spin_unlock_irq(3.202% 13/406)
_raw_spin_lock_irq(0.739% 3/406)
srso_return_thunk(0.246% 1/406)
_raw_spin_unlock_irq(7.021% 33/470)
__call_rcu_common.constprop.0(3.830% 18/470)
rcu_gp_is_expedited(3.036% 16/527)
__cond_resched(0.569% 3/527)
srso_return_thunk(0.190% 1/527)

They all call wait_for_completion, which I guess sleeps on purpose while waiting for something... (Maybe RCU is not a good choice here?)


I ran another test with 'synchronize_rcu' removed from ip_set_swap and no 'call_rcu' in ip_set_destroy; the performance is much better:
$ time sudo ./stressipset

real 0m2.203s
user 0m0.037s
sys 0m0.188s


Here is my test code. It is very ugly... (I have no knowledge of the netfilter messaging protocol, and just hacked it together with the help of strace.)
Also, I was lazy: before running the test, the foo/bar sets need to be created:

ipset create foo hash:net
ipset create bar hash:net

Here comes the ugly code; it should compile with gcc.

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <linux/netlink.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/ipset/ip_set.h>
#include <string.h>

/* Receive buffer: every reply read with recvmsg() lands here. */
unsigned char mbuffer[4096];
/*
 * Pre-built netlink requests, one buffer per command. They are assembled
 * once before the stress loop; inside the loop only the sequence number in
 * each header is updated before the buffer is re-sent.
 */
unsigned char buf[128]; // cmd buffer for swap (first used for the initial protocol request)
unsigned char dbuf[128]; // cmd buffer for destroy
unsigned char cbuf[128]; // cmd buffer for create
unsigned char abuf[128]; // cmd buffer for add
/* Fixed values used while hand-assembling the ipset requests. */
enum {
    STRESS_ROUNDS     = 10000, /* swap/destroy/create/add cycles to run */
    WIRE_PROTOCOL     = 6,     /* payload of the IPSET_ATTR_PROTOCOL attribute */
    HASH_NET_REVISION = 6      /* payload of the IPSET_ATTR_REVISION attribute */
};

/*
 * Append one netlink attribute (header + payload) to `req` at byte offset
 * `off` and return the offset of the next 4-byte aligned slot.
 *
 * The caller guarantees that the buffer is large enough and already zeroed,
 * so the alignment padding after the payload stays zero.
 */
static unsigned int put_attr(unsigned char *req, unsigned int off,
                             unsigned short type,
                             const void *payload, unsigned short payload_len)
{
    struct nlattr nla;

    nla.nla_len = (unsigned short)(NLA_HDRLEN + payload_len);
    nla.nla_type = type;
    /* memcpy instead of a pointer cast: the byte buffer has no alignment guarantee */
    memcpy(req + off, &nla, sizeof(nla));
    if (payload_len != 0)
        memcpy(req + off + NLA_HDRLEN, payload, payload_len);
    return off + (unsigned int)NLA_ALIGN(nla.nla_len);
}

/*
 * Zero `req` (capacity `cap`) and write the part shared by every request:
 * the netlink header for ipset command `cmd` with `flags`, the nfnetlink
 * header, and the protocol-version attribute.
 *
 * Returns the offset at which command-specific attributes start.
 * nlmsg_len and nlmsg_seq are left at 0; see seal_request() and transact().
 */
static unsigned int begin_request(unsigned char *req, size_t cap,
                                  unsigned short cmd, unsigned short flags)
{
    const unsigned char proto = WIRE_PROTOCOL;
    struct nlmsghdr nlh = {
        .nlmsg_type = (unsigned short)((NFNL_SUBSYS_IPSET << 8) | cmd),
        .nlmsg_flags = flags
    };
    struct nfgenmsg nfg = {
        .nfgen_family = AF_INET,
        .version = NFNETLINK_V0,
        .res_id = htons(0)
    };

    memset(req, 0, cap);
    memcpy(req, &nlh, sizeof(nlh));
    memcpy(req + sizeof(nlh), &nfg, sizeof(nfg));
    return put_attr(req, (unsigned int)(sizeof(nlh) + sizeof(nfg)),
                    IPSET_ATTR_PROTOCOL, &proto, sizeof(proto));
}

/* Record the final message length `len` in the netlink header of `req`. */
static void seal_request(unsigned char *req, unsigned int len)
{
    struct nlmsghdr nlh;

    memcpy(&nlh, req, sizeof(nlh));
    nlh.nlmsg_len = len;
    memcpy(req, &nlh, sizeof(nlh));
}

/*
 * Stamp `seq` into the prepared request `req`, send it to the kernel
 * (netlink port 0) and read one reply into `reply` (capacity `reply_cap`).
 *
 * Returns  0 when the reply is an ACK (NLMSG_ERROR with error == 0) or any
 *            non-error message,
 *         >0 the negated nlmsgerr.error value when the kernel reports one,
 *         -1 on a local send/receive failure (already reported on stderr).
 */
static int transact(int sock, unsigned char *req, unsigned int seq,
                    unsigned char *reply, size_t reply_cap)
{
    struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
    struct sockaddr_nl from;
    struct iovec iov = { .iov_base = reply, .iov_len = reply_cap };
    struct msghdr msg = { 0 };
    struct nlmsghdr nlh;
    ssize_t n;
    int error;

    memcpy(&nlh, req, sizeof(nlh));
    nlh.nlmsg_seq = seq;
    memcpy(req, &nlh, sizeof(nlh));

    n = sendto(sock, req, nlh.nlmsg_len, 0,
               (struct sockaddr *)&kernel, sizeof(kernel));
    if (n < 0) {
        perror("Fail to send request");
        return -1;
    }
    if ((size_t)n != nlh.nlmsg_len) {
        fprintf(stderr, "Short send: %zd of %u bytes\n", n,
                (unsigned int)nlh.nlmsg_len);
        return -1;
    }

    /*
     * recvmsg() stores the sender address through msg_name, so it gets its
     * own scratch address rather than the one used as send destination.
     */
    msg.msg_name = &from;
    msg.msg_namelen = sizeof(from);
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1;
    n = recvmsg(sock, &msg, 0);
    if (n < 0) {
        perror("Fail to receive reply");
        return -1;
    }
    if ((size_t)n < sizeof(nlh)) {
        fprintf(stderr, "Truncated netlink reply (%zd bytes)\n", n);
        return -1;
    }

    memcpy(&nlh, reply, sizeof(nlh));
    if (nlh.nlmsg_type != NLMSG_ERROR)
        return 0;

    /* The nlmsgerr payload starts right after the aligned netlink header. */
    if ((size_t)n < NLMSG_HDRLEN + sizeof(error)) {
        fprintf(stderr, "Truncated netlink error reply (%zd bytes)\n", n);
        return -1;
    }
    memcpy(&error, reply + NLMSG_HDRLEN, sizeof(error));
    return -error;
}

/*
 * Stress test: after one protocol-version exchange, repeat
 * swap(foo, bar) -> destroy(bar) -> create(bar, hash:net) -> add(bar, ...)
 * STRESS_ROUNDS times over a NETLINK_NETFILTER socket, waiting for the
 * kernel's reply after every request.
 *
 * The sets "foo" and "bar" must exist before the program is started.
 * Exit status is 0 when every request was acknowledged, 1 otherwise.
 */
int main(void)
{
    static const char set_foo[] = "foo";
    static const char set_bar[] = "bar";
    static const char type_hash_net[] = "hash:net";
    /*
     * Raw payload of the nested data attribute of the ADD request:
     *   nested attr 1 { attr 1|0x4000: 0a 01 00 00 }
     *   attr 3: 0x10
     *   attr 9|0x4000: 00 00 00 00
     * NOTE(review): presumably IP 10.1.0.0, CIDR 16 and line number 0 -
     * confirm against the ipset ADT attribute numbering.
     */
    static const unsigned char add_data[] = {
        0x0c, 0x00, 0x01, 0x80, 0x08, 0x00, 0x01, 0x40,
        0x0a, 0x01, 0x00, 0x00, 0x05, 0x00, 0x03, 0x00,
        0x10, 0x00, 0x00, 0x00, 0x08, 0x00, 0x09, 0x40,
        0x00, 0x00, 0x00, 0x00
    };
    const unsigned char revision = HASH_NET_REVISION;
    const unsigned char family = AF_INET;
    const unsigned short with_ack = NLM_F_REQUEST | NLM_F_ACK;
    const struct {
        const char *what;
        unsigned char *req;
    } steps[] = {
        { "swap", buf },
        { "destroy", dbuf },
        { "create", cbuf },
        { "add", abuf }
    };
    struct sockaddr_nl local = { .nl_family = AF_NETLINK };
    socklen_t local_len = sizeof(local); /* value-result argument: must hold the buffer size */
    unsigned int seq = 0x12345678;
    unsigned int off;
    int status = 1;
    int rc;
    int sock;

    sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_NETFILTER);
    if (sock < 0) {
        perror("Fail to create socket");
        return 1;
    }
    if (bind(sock, (struct sockaddr *)&local, sizeof(local)) != 0) {
        perror("Fail to bind");
        goto out;
    }
    /* The returned address is not used afterwards. */
    if (getsockname(sock, (struct sockaddr *)&local, &local_len) != 0) {
        perror("Fail to getsockname");
        goto out;
    }

    /* Protocol-version exchange; sent without NLM_F_ACK. */
    off = begin_request(buf, sizeof(buf), IPSET_CMD_PROTOCOL, NLM_F_REQUEST);
    seal_request(buf, off);
    rc = transact(sock, buf, seq, mbuffer, sizeof(mbuffer));
    if (rc != 0) {
        if (rc > 0)
            fprintf(stderr, "fail to query protocol: error %d\n", rc);
        goto out;
    }

    /* swap foo bar */
    off = begin_request(buf, sizeof(buf), IPSET_CMD_SWAP, with_ack);
    off = put_attr(buf, off, IPSET_ATTR_SETNAME, set_foo, sizeof(set_foo));
    off = put_attr(buf, off, IPSET_ATTR_SETNAME2, set_bar, sizeof(set_bar));
    seal_request(buf, off);

    /* destroy bar */
    off = begin_request(dbuf, sizeof(dbuf), IPSET_CMD_DESTROY, with_ack);
    off = put_attr(dbuf, off, IPSET_ATTR_SETNAME, set_bar, sizeof(set_bar));
    seal_request(dbuf, off);

    /* create bar hash:net */
    off = begin_request(cbuf, sizeof(cbuf), IPSET_CMD_CREATE, with_ack);
    off = put_attr(cbuf, off, IPSET_ATTR_SETNAME, set_bar, sizeof(set_bar));
    off = put_attr(cbuf, off, IPSET_ATTR_TYPENAME, type_hash_net,
                   sizeof(type_hash_net));
    off = put_attr(cbuf, off, IPSET_ATTR_REVISION, &revision, sizeof(revision));
    off = put_attr(cbuf, off, IPSET_ATTR_FAMILY, &family, sizeof(family));
    off = put_attr(cbuf, off, NLA_F_NESTED | IPSET_ATTR_DATA, NULL, 0);
    seal_request(cbuf, off);

    /* add an entry to bar */
    off = begin_request(abuf, sizeof(abuf), IPSET_CMD_ADD, with_ack);
    off = put_attr(abuf, off, IPSET_ATTR_SETNAME, set_bar, sizeof(set_bar));
    off = put_attr(abuf, off, NLA_F_NESTED | IPSET_ATTR_DATA, add_data,
                   sizeof(add_data));
    seal_request(abuf, off);

    for (int round = 0; round < STRESS_ROUNDS; round++) {
        for (size_t s = 0; s < sizeof(steps) / sizeof(steps[0]); s++) {
            rc = transact(sock, steps[s].req, ++seq, mbuffer, sizeof(mbuffer));
            if (rc != 0) {
                if (rc > 0)
                    fprintf(stderr, "fail to %s: error %d\n",
                            steps[s].what, rc);
                goto out;
            }
        }
    }
    status = 0;

out:
    close(sock);
    return status;
}


Thanks
David