[RFC] net: add new socket option SO_SETNETNS

From: aloktiagi
Date: Wed Feb 01 2023 - 14:23:20 EST


This socket option provides a mechanism for users to switch a socket's network
namespace. This enables use cases where multiple IPv6-only network namespaces
can use a single IPv4 network namespace for IPv4-only egress connectivity by
switching their sockets from the IPv6 to the IPv4 network namespace. This allows
for migration of systems to IPv6-only while keeping their connectivity to
IPv4-only destinations intact.

Today, we achieve this by setting up a seccomp filter to intercept network
system calls like connect() from a container in a container manager which runs
in an IPv4-only network namespace. The container manager creates a new IPv4
connection and injects the new file descriptor through
SECCOMP_NOTIFY_IOCTL_ADDFD, replacing the original file descriptor from the
connect() call. This does not work for cases where the original file descriptor
is handed off to a system like epoll before the connect() call. After a new file
descriptor is injected, the original file descriptor being referenced by the
epoll fd is no longer valid, leading to failures. As a workaround, the container
manager, when intercepting connect(), loops through all open socket file
descriptors to check if they are referencing the socket attempting the connect()
and replaces the reference with the to-be-injected file descriptor. This
workaround is cumbersome and makes the solution prone to similar, yet to be
discovered, issues.

With SO_SETNETNS, the container manager can simply switch the original
unconnected socket’s network namespace to the IPv4 only network namespace
without the need for injecting any new socket. The container can then proceed
with the connect() call and establish connectivity to the IPv4 only destination.

This socket option is only allowed for sockets that have never been connected,
since connected or recently disconnected sockets may be bound to a network
device in their network namespace, and switching their namespace may lead to
undefined behavior.

Signed-off-by: aloktiagi <aloktiagi@xxxxxxxxx>
---
include/uapi/asm-generic/socket.h | 2 +
net/core/sock.c | 46 +++++
tools/testing/selftests/net/Makefile | 1 +
tools/testing/selftests/net/so_set_netns.c | 208 +++++++++++++++++++++
4 files changed, 257 insertions(+)
create mode 100644 tools/testing/selftests/net/so_set_netns.c

diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 638230899e98..dc9498233fe5 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -132,6 +132,8 @@

#define SO_RCVMARK 75

+#define SO_SETNETNS 76
+
#if !defined(__KERNEL__)

#if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
diff --git a/net/core/sock.c b/net/core/sock.c
index f954d5893e79..34cb72b211a6 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1535,6 +1535,52 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
WRITE_ONCE(sk->sk_txrehash, (u8)val);
break;

+ case SO_SETNETNS: /* move sk into the network namespace referred to by the fd in val */
+ {
+ struct net *other_ns, *my_ns;
+
+ if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) { /* IPv4/IPv6 sockets only */
+ ret = -EOPNOTSUPP;
+ break;
+ }
+
+ if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) { /* stream/datagram sockets only */
+ ret = -EOPNOTSUPP;
+ break;
+ }
+
+ other_ns = get_net_ns_by_fd(val); /* presumably returns a referenced netns; released at out_err on failure */
+ if (IS_ERR(other_ns)) {
+ ret = PTR_ERR(other_ns);
+ break;
+ }
+
+ if (!ns_capable(other_ns->user_ns, CAP_NET_ADMIN)) { /* caller needs CAP_NET_ADMIN in the target's user namespace */
+ ret = -EPERM;
+ goto out_err;
+ }
+
+ /*
+  * Only a socket that is still in TCP_CLOSE and has no shutdown flag set
+  * may be moved, i.e. one that has never been connected; connected or
+  * recently disconnected sockets are rejected.
+  */
+ if (sk->sk_state != TCP_CLOSE || sk->sk_shutdown & SHUTDOWN_MASK) {
+ ret = -EOPNOTSUPP;
+ goto out_err;
+ }
+
+ /* check that the socket is not bound to an interface */
+ if (sk->sk_bound_dev_if != 0) {
+ ret = -EOPNOTSUPP;
+ goto out_err;
+ }
+
+ my_ns = sock_net(sk); /* remember the old netns so its reference can be dropped below */
+ sock_net_set(sk, other_ns); /* sk keeps the other_ns reference obtained above */
+ put_net(my_ns);
+ break;
+out_err:
+ put_net(other_ns); /* failure after lookup: drop the target netns reference */
+ break;
+ }
+
default:
ret = -ENOPROTOOPT;
break;
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 3007e98a6d64..c2e7679e31bb 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -75,6 +75,7 @@ TEST_GEN_PROGS += so_incoming_cpu
TEST_PROGS += sctp_vrf.sh
TEST_GEN_FILES += sctp_hello
TEST_GEN_FILES += csum
+TEST_GEN_PROGS += so_set_netns

TEST_FILES := settings

diff --git a/tools/testing/selftests/net/so_set_netns.c b/tools/testing/selftests/net/so_set_netns.c
new file mode 100644
index 000000000000..cc7767d23a5d
--- /dev/null
+++ b/tools/testing/selftests/net/so_set_netns.c
@@ -0,0 +1,208 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <error.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <linux/tcp.h>
+#include <linux/socket.h>
+
+#include <sys/types.h>
+#include <sys/sendfile.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+
+#include "../kselftest_harness.h"
+
+#ifndef SO_SETNETNS
+#define SO_SETNETNS 76
+#endif
+
+/*
+ * Move the calling process into a newly created network namespace, bring the
+ * loopback interface of that namespace up and return an open file descriptor
+ * referring to the namespace.
+ *
+ * Returns the namespace fd on success and -1 on any failure. No descriptor
+ * is left open on the failure paths.
+ */
+static int unshare_open(void)
+{
+	const char *netns_path = "/proc/self/ns/net";
+	int fd;
+
+	if (unshare(CLONE_NEWNET) != 0)
+		return -1;
+
+	/* 0 is a valid descriptor; only a negative value signals failure. */
+	fd = open(netns_path, O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	/*
+	 * system() reports a failing command through a non-zero wait status,
+	 * not a negative value, so compare against 0 like setup_network() does.
+	 */
+	if (system("ip link set lo up") != 0) {
+		close(fd);
+		return -1;
+	}
+
+	return fd;
+}
+
+/*
+ * Enter the network namespace referred to by @fd.
+ * Returns 0 on success and -1 if setns() fails.
+ */
+static int switch_ns(int fd)
+{
+	int rc = setns(fd, CLONE_NEWNET);
+
+	return rc ? -1 : 0;
+}
+
+/*
+ * Create two new network namespaces, the client one first and the server one
+ * second, and store a file descriptor for each in the output parameters.
+ * Each unshare_open() call moves the process into the namespace it creates,
+ * so on return the process is in the server namespace.
+ */
+static void init_namespaces(struct __test_metadata *_metadata,
+			    int *netns_client, int *netns_server)
+{
+	int ns_fd;
+
+	ns_fd = unshare_open();
+	*netns_client = ns_fd;
+	ASSERT_GE(*netns_client, 0);
+
+	ns_fd = unshare_open();
+	*netns_server = ns_fd;
+	ASSERT_GE(*netns_server, 0);
+}
+
+/*
+ * Configure the loopback addresses used by the tests: fd::1/64 in the client
+ * namespace and 192.168.1.1/24 in the server namespace. The process is left
+ * in the server namespace.
+ */
+static void setup_network(struct __test_metadata *_metadata,
+			  int *netns_client, int *netns_server)
+{
+	const int client_ns = *netns_client;
+	const int server_ns = *netns_server;
+	int ret;
+
+	/* client side: IPv6 address only */
+	ret = switch_ns(client_ns);
+	ASSERT_EQ(ret, 0);
+	ret = system("ip addr add fd::1/64 dev lo");
+	ASSERT_EQ(ret, 0);
+
+	/* server side: IPv4 address only */
+	ret = switch_ns(server_ns);
+	ASSERT_EQ(ret, 0);
+	ret = system("ip addr add 192.168.1.1/24 dev lo");
+	ASSERT_EQ(ret, 0);
+}
+
+/*
+ * Create the TCP sockets used by the tests.
+ *
+ * The client socket is created while the process is in the client namespace
+ * and is left unbound and unconnected. The server socket is created in the
+ * server namespace, bound to 192.168.1.1:80 and put into the listening state.
+ * The process is left in the server namespace.
+ */
+static void setup_client_server(struct __test_metadata *_metadata,
+				int *netns_client, int *netns_server,
+				int *client_fd, int *server_fd)
+{
+	/* The initializer zeroes every field that is not named (e.g. sin_zero). */
+	struct sockaddr_in addr = {
+		.sin_family = AF_INET,
+		.sin_port = htons(80),
+		.sin_addr.s_addr = inet_addr("192.168.1.1"),
+	};
+	int ret;
+
+	ret = switch_ns(*netns_client);
+	ASSERT_EQ(ret, 0);
+
+	*client_fd = socket(AF_INET, SOCK_STREAM, 0);
+	ASSERT_GE(*client_fd, 0);
+
+	ret = switch_ns(*netns_server);
+	ASSERT_EQ(ret, 0);
+
+	*server_fd = socket(AF_INET, SOCK_STREAM, 0);
+	ASSERT_GE(*server_fd, 0);
+
+	/* bind() takes a generic struct sockaddr pointer, hence the cast. */
+	ret = bind(*server_fd, (struct sockaddr *)&addr, sizeof(addr));
+	ASSERT_EQ(ret, 0);
+	ret = listen(*server_fd, 10);
+	ASSERT_EQ(ret, 0);
+}
+
+/* Per-test state shared between setup, the test body and teardown. */
+FIXTURE(so_set_netns)
+{
+	/* file descriptors referring to the client and server namespaces */
+	int netns_client;
+	int netns_server;
+	/* TCP sockets created by setup_client_server() */
+	int client_fd;
+	int server_fd;
+};
+
+/*
+ * Build the test environment: two network namespaces, their loopback
+ * addresses, and one client and one listening server socket.
+ */
+FIXTURE_SETUP(so_set_netns)
+{
+	int *client_ns = &self->netns_client;
+	int *server_ns = &self->netns_server;
+
+	init_namespaces(_metadata, client_ns, server_ns);
+	setup_network(_metadata, client_ns, server_ns);
+	setup_client_server(_metadata, client_ns, server_ns,
+			    &self->client_fd, &self->server_fd);
+}
+
+/* Close both sockets first, then both namespace file descriptors. */
+FIXTURE_TEARDOWN(so_set_netns)
+{
+	const int fds[] = {
+		self->client_fd,
+		self->server_fd,
+		self->netns_client,
+		self->netns_server,
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof(fds) / sizeof(fds[0]); i++)
+		close(fds[i]);
+}
+
+/*
+ * A socket that has never been connected can be switched from the client
+ * namespace to the server namespace and can then connect to the server
+ * address configured there.
+ */
+TEST_F(so_set_netns, test_socket_ns_switch_unconnected)
+{
+	/* The initializer zeroes every field that is not named (e.g. sin_zero). */
+	struct sockaddr_in addr = {
+		.sin_family = AF_INET,
+		.sin_port = htons(80),
+		.sin_addr.s_addr = inet_addr("192.168.1.1"),
+	};
+	int ret;
+
+	ret = switch_ns(self->netns_client);
+	ASSERT_EQ(ret, 0);
+
+	ret = setsockopt(self->client_fd, SOL_SOCKET, SO_SETNETNS,
+			 &self->netns_server, sizeof(self->netns_server));
+	ASSERT_EQ(ret, 0);
+
+	/* connect() takes a generic struct sockaddr pointer, hence the cast. */
+	ret = connect(self->client_fd, (struct sockaddr *)&addr, sizeof(addr));
+	ASSERT_EQ(ret, 0);
+}
+
+/*
+ * Once the socket has been connected, a further SO_SETNETNS request must be
+ * rejected with EOPNOTSUPP.
+ */
+TEST_F(so_set_netns, test_socket_ns_switch_connected)
+{
+	/* The initializer zeroes every field that is not named (e.g. sin_zero). */
+	struct sockaddr_in addr = {
+		.sin_family = AF_INET,
+		.sin_port = htons(80),
+		.sin_addr.s_addr = inet_addr("192.168.1.1"),
+	};
+	int ret;
+
+	ret = setsockopt(self->client_fd, SOL_SOCKET, SO_SETNETNS,
+			 &self->netns_server, sizeof(self->netns_server));
+	ASSERT_EQ(ret, 0);
+
+	/* connect() takes a generic struct sockaddr pointer, hence the cast. */
+	ret = connect(self->client_fd, (struct sockaddr *)&addr, sizeof(addr));
+	ASSERT_EQ(ret, 0);
+
+	/* switching the network namespace of a connected socket should fail */
+	ret = setsockopt(self->client_fd, SOL_SOCKET, SO_SETNETNS,
+			 &self->netns_client, sizeof(self->netns_client));
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EOPNOTSUPP);
+}
+
+/*
+ * After the socket has been connected and the server socket has been closed,
+ * a further SO_SETNETNS request must be rejected with EOPNOTSUPP.
+ */
+TEST_F(so_set_netns, test_socket_ns_switch_disconnected)
+{
+	/* The initializer zeroes every field that is not named (e.g. sin_zero). */
+	struct sockaddr_in addr = {
+		.sin_family = AF_INET,
+		.sin_port = htons(80),
+		.sin_addr.s_addr = inet_addr("192.168.1.1"),
+	};
+	int ret;
+
+	ret = setsockopt(self->client_fd, SOL_SOCKET, SO_SETNETNS,
+			 &self->netns_server, sizeof(self->netns_server));
+	ASSERT_EQ(ret, 0);
+
+	/* connect() takes a generic struct sockaddr pointer, hence the cast. */
+	ret = connect(self->client_fd, (struct sockaddr *)&addr, sizeof(addr));
+	ASSERT_EQ(ret, 0);
+
+	close(self->server_fd);
+	/*
+	 * Teardown closes server_fd as well; invalidate it here so the same
+	 * descriptor number is not closed a second time.
+	 */
+	self->server_fd = -1;
+
+	/*
+	 * switching the network namespace of a recently disconnected
+	 * socket should fail
+	 */
+	ret = setsockopt(self->client_fd, SOL_SOCKET, SO_SETNETNS,
+			 &self->netns_client, sizeof(self->netns_client));
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EOPNOTSUPP);
+}
+
+TEST_HARNESS_MAIN
--
2.34.1