Re: [regression 6.1.76] dlm: cannot start dlm midcomms -97 after backport of e9cdebbe23f1 ("dlm: use kernel_connect() and kernel_bind()")

From: Valentin Kleibel
Date: Thu Feb 08 2024 - 07:18:07 EST


Hi Jordan, hi all

Just a quick look comparing dlm_tcp_listen_bind between the latest 6.1
and 6.6 stable branches,
it looks like there is a mismatch here with the dlm_local_addr[0] parameter.

6.1
----

static int dlm_tcp_listen_bind(struct socket *sock)
{
int addr_len;

/* Bind to our port */
make_sockaddr(dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len);
return kernel_bind(sock, (struct sockaddr *)&dlm_local_addr[0],
addr_len);
}

6.6
----
static int dlm_tcp_listen_bind(struct socket *sock)
{
int addr_len;

/* Bind to our port */
make_sockaddr(&dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len);
return kernel_bind(sock, (struct sockaddr *)&dlm_local_addr[0],
addr_len);
}

6.6 contains commit c51c9cd8 (fs: dlm: don't put dlm_local_addrs on heap) which
changed

static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];

to

static struct sockaddr_storage dlm_local_addr[DLM_MAX_ADDR_COUNT];

It looks like kernel_bind() in 6.1 needs to be modified to match.

We tried to apply commit c51c9cd8 (fs: dlm: don't put dlm_local_addrs on heap) to the debian kernel 6.1.76 and came up with the attached patch. Besides the different offsets there is a slight change dlm_tcp_bind() where in 6.1.76 kernel_bind() is used instead of sock->ops->bind() in the original commit.

This patch solves the issue we experienced.

Thanks for your help,
Valentin--- a/fs/dlm/lowcomms.c 2024-02-08 10:42:19.328861479 +0100
+++ b/fs/dlm/lowcomms.c 2024-02-08 10:57:22.900463149 +0100
@@ -174,7 +174,7 @@
static DEFINE_SPINLOCK(dlm_node_addrs_spin);

static struct listen_connection listen_con;
-static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
+static struct sockaddr_storage dlm_local_addr[DLM_MAX_ADDR_COUNT];
static int dlm_local_count;
int dlm_allow_conn;

@@ -398,7 +398,7 @@
if (!sa_out)
return 0;

- if (dlm_local_addr[0]->ss_family == AF_INET) {
+ if (dlm_local_addr[0].ss_family == AF_INET) {
struct sockaddr_in *in4 = (struct sockaddr_in *) &sas;
struct sockaddr_in *ret4 = (struct sockaddr_in *) sa_out;
ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
@@ -727,7 +727,7 @@
static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
int *addr_len)
{
- saddr->ss_family = dlm_local_addr[0]->ss_family;
+ saddr->ss_family = dlm_local_addr[0].ss_family;
if (saddr->ss_family == AF_INET) {
struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
in4_addr->sin_port = cpu_to_be16(port);
@@ -1167,7 +1167,7 @@
int i, addr_len, result = 0;

for (i = 0; i < dlm_local_count; i++) {
- memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
+ memcpy(&localaddr, &dlm_local_addr[i], sizeof(localaddr));
make_sockaddr(&localaddr, port, &addr_len);

if (!i)
@@ -1187,7 +1187,7 @@
/* Get local addresses */
static void init_local(void)
{
- struct sockaddr_storage sas, *addr;
+ struct sockaddr_storage sas;
int i;

dlm_local_count = 0;
@@ -1195,21 +1195,10 @@
if (dlm_our_addr(&sas, i))
break;

- addr = kmemdup(&sas, sizeof(*addr), GFP_NOFS);
- if (!addr)
- break;
- dlm_local_addr[dlm_local_count++] = addr;
+ memcpy(&dlm_local_addr[dlm_local_count++], &sas, sizeof(sas));
}
}

-static void deinit_local(void)
-{
- int i;
-
- for (i = 0; i < dlm_local_count; i++)
- kfree(dlm_local_addr[i]);
-}
-
static struct writequeue_entry *new_writequeue_entry(struct connection *con)
{
struct writequeue_entry *entry;
@@ -1575,7 +1564,7 @@
}

/* Create a socket to communicate with */
- result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
+ result = sock_create_kern(&init_net, dlm_local_addr[0].ss_family,
SOCK_STREAM, dlm_proto_ops->proto, &sock);
if (result < 0)
goto socket_err;
@@ -1786,7 +1775,6 @@
foreach_conn(free_conn);
srcu_read_unlock(&connections_srcu, idx);
work_stop();
- deinit_local();

dlm_proto_ops = NULL;
}
@@ -1803,7 +1791,7 @@
if (result < 0)
return result;

- result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
+ result = sock_create_kern(&init_net, dlm_local_addr[0].ss_family,
SOCK_STREAM, dlm_proto_ops->proto, &sock);
if (result < 0) {
log_print("Can't create comms socket: %d", result);
@@ -1842,7 +1830,7 @@
/* Bind to our cluster-known address connecting to avoid
* routing problems.
*/
- memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr));
+ memcpy(&src_addr, &dlm_local_addr[0], sizeof(src_addr));
make_sockaddr(&src_addr, 0, &addr_len);

result = kernel_bind(sock, (struct sockaddr *)&src_addr,
@@ -1899,7 +1887,7 @@
int addr_len;

/* Bind to our port */
- make_sockaddr(dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len);
+ make_sockaddr(&dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len);
return kernel_bind(sock, (struct sockaddr *)&dlm_local_addr[0],
addr_len);
}
@@ -1992,7 +1980,7 @@

error = work_start();
if (error)
- goto fail_local;
+ goto fail;

dlm_allow_conn = 1;

@@ -2022,8 +2010,6 @@
fail_proto_ops:
dlm_allow_conn = 0;
work_stop();
-fail_local:
- deinit_local();
fail:
return error;
}