Re: [PATCH] RDMA/device: Fix a race between mad_client and cm_client init

From: Jason Gunthorpe
Date: Wed Jan 03 2024 - 13:48:21 EST


On Mon, Jan 01, 2024 at 07:43:35PM -0800, Shifeng Li wrote:
> diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
> index 67bcea7a153c..85782786993d 100644
> --- a/drivers/infiniband/core/device.c
> +++ b/drivers/infiniband/core/device.c
> @@ -1315,12 +1315,6 @@ static int enable_device_and_get(struct ib_device *device)
> down_write(&devices_rwsem);
> xa_set_mark(&devices, device->index, DEVICE_REGISTERED);
>
> - /*
> - * By using downgrade_write() we ensure that no other thread can clear
> - * DEVICE_REGISTERED while we are completing the client setup.
> - */
> - downgrade_write(&devices_rwsem);
> -
> if (device->ops.enable_driver) {
> ret = device->ops.enable_driver(device);
> if (ret)
> @@ -1337,7 +1331,7 @@ static int enable_device_and_get(struct ib_device *device)
> if (!ret)
> ret = add_compat_devs(device);
> out:
> - up_read(&devices_rwsem);
> + up_write(&devices_rwsem);
> return ret;
> }

I don't think messing with the devices_rwsem here is a great idea, it
would be better to address this on the clients_rwsem side like:

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 67bcea7a153c6a..b956c9f8e62d34 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -1730,7 +1730,7 @@ static int assign_client_id(struct ib_client *client)
{
int ret;

- down_write(&clients_rwsem);
+ lockdep_assert_held(&clients_rwsem);
/*
* The add/remove callbacks must be called in FIFO/LIFO order. To
* achieve this we assign client_ids so they are sorted in
@@ -1739,14 +1739,11 @@ static int assign_client_id(struct ib_client *client)
client->client_id = highest_client_id;
ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL);
if (ret)
- goto out;
+ return ret;

highest_client_id++;
xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED);
-
-out:
- up_write(&clients_rwsem);
- return ret;
+ return 0;
}

static void remove_client_id(struct ib_client *client)
@@ -1776,25 +1773,31 @@ int ib_register_client(struct ib_client *client)
{
struct ib_device *device;
unsigned long index;
+ bool need_unreg = false;
int ret;

refcount_set(&client->uses, 1);
init_completion(&client->uses_zero);
- ret = assign_client_id(client);
- if (ret)
- return ret;

down_read(&devices_rwsem);
+ down_write(&clients_rwsem);
+ ret = assign_client_id(client);
+ if (ret)
+ goto out;
+
+ need_unreg = true;
xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) {
ret = add_client_context(device, client);
- if (ret) {
- up_read(&devices_rwsem);
- ib_unregister_client(client);
- return ret;
- }
+ if (ret)
+ goto out;
}
+ ret = 0;
+out:
+ up_write(&clients_rwsem);
up_read(&devices_rwsem);
- return 0;
+ if (need_unreg && ret)
+ ib_unregister_client(client);
+ return ret;
}
EXPORT_SYMBOL(ib_register_client);