[RFC v3 7/9] nvme-rdma: replace state machine with generic one

From: Daniel Wagner
Date: Thu May 04 2023 - 05:13:34 EST

Replace the RDMA transport specific setup/teardown, reset, reconnect
and error recovery handling with the generic fabrics state machine.
nvme-rdma now only provides the queue and tag set callbacks via
struct nvme_fabrics_ops; controller setup, teardown and recovery are
driven by the common nvmf_* helpers.

Signed-off-by: Daniel Wagner <dwagner@xxxxxxx>
---
drivers/nvme/host/rdma.c | 703 ++++++++++-----------------------------
1 file changed, 173 insertions(+), 530 deletions(-)
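
Note (not part of the commit): a rough sketch of how the generic fabrics
code is expected to consume the callbacks wired up in this patch. The
helper below is illustrative only; the real nvmf_setup_ctrl() /
nvmf_teardown_ctrl() implementations come from an earlier patch in this
series and are not reproduced here. The call order is inferred from the
nvme_rdma_configure_admin_queue() path that this patch removes.

/*
 * Illustrative sketch only, not part of this patch: the generic code is
 * assumed to drive the per-transport admin queue callbacks roughly like
 * this, mirroring the nvme_rdma_configure_admin_queue() path removed
 * below.  Error handling for the later steps (nvme_enable_ctrl() etc.)
 * is omitted.
 */
static int example_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
{
	const struct nvme_fabrics_ops *ops = ctrl->fabrics_ops;
	int ret;

	/* transport allocates CM id, QP, AEN SQE, ... */
	ret = ops->alloc_admin_queue(ctrl);
	if (ret)
		return ret;

	/* the tag set only needs to be created on first controller setup */
	if (new) {
		ret = ops->alloc_admin_tag_set(ctrl);
		if (ret)
			goto out_free_queue;
	}

	/* transport sends the fabrics Connect command on the admin queue */
	ret = ops->start_admin_queue(ctrl);
	if (ret)
		goto out_remove_tag_set;

	return 0;

out_remove_tag_set:
	if (new)
		nvme_remove_admin_tag_set(ctrl);
out_free_queue:
	ops->free_admin_queue(ctrl);
	return ret;
}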

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index b0ab5a9d5fe0..1fde65e8c2b5 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -568,35 +568,16 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
return ret;
}

-static int nvme_rdma_alloc_queue(struct nvme_ctrl *nctrl, int idx)
+static int __nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
+ struct nvme_rdma_queue *queue)
{
- struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
- struct nvme_rdma_queue *queue;
struct sockaddr *src_addr = NULL;
- size_t queue_size;
int ret;

- if (idx == 0)
- queue_size = NVME_AQ_DEPTH;
- else
- queue_size = ctrl->ctrl.sqsize + 1;
-
- queue = &ctrl->queues[idx];
mutex_init(&queue->queue_lock);
queue->ctrl = ctrl;
- if (idx && ctrl->ctrl.max_integrity_segments)
- queue->pi_support = true;
- else
- queue->pi_support = false;
init_completion(&queue->cm_done);

- if (idx > 0)
- queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
- else
- queue->cmnd_capsule_len = sizeof(struct nvme_command);
-
- queue->queue_size = queue_size;
-
queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(queue->cm_id)) {
@@ -638,62 +619,120 @@ static int nvme_rdma_alloc_queue(struct nvme_ctrl *nctrl, int idx)
return ret;
}

-static void __nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
+static void __nvme_rdma_free_queue(struct nvme_rdma_ctrl *ctrl,
+ struct nvme_rdma_queue *queue)
{
- rdma_disconnect(queue->cm_id);
- ib_drain_qp(queue->qp);
+ if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
+ return;
+
+ rdma_destroy_id(queue->cm_id);
+ nvme_rdma_destroy_queue_ib(queue);
+ mutex_destroy(&queue->queue_lock);
}

-static void nvme_rdma_stop_queue(struct nvme_ctrl *nctrl, int qid)
+static int nvme_rdma_alloc_admin_queue(struct nvme_ctrl *nctrl)
{
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
- struct nvme_rdma_queue *queue = &ctrl->queues[qid];
+ struct nvme_rdma_queue *queue = &ctrl->queues[0];
+ bool pi_capable = false;
+ int ret;

- mutex_lock(&queue->queue_lock);
- if (test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags))
- __nvme_rdma_stop_queue(queue);
- mutex_unlock(&queue->queue_lock);
+ queue->cmnd_capsule_len = sizeof(struct nvme_command);
+ queue->queue_size = NVME_AQ_DEPTH;
+ queue->pi_support = false;
+
+ ret = __nvme_rdma_alloc_queue(ctrl, queue);
+ if (ret)
+ return ret;
+
+ ctrl->device = queue->device;
+ nctrl->numa_node = ibdev_to_node(ctrl->device->dev);
+
+ /* T10-PI support */
+ if (ctrl->device->dev->attrs.kernel_cap_flags &
+ IBK_INTEGRITY_HANDOVER)
+ pi_capable = true;
+
+ ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev,
+ pi_capable);
+
+ /*
+ * Bind the async event SQE DMA mapping to the admin queue lifetime.
+ * It's safe, since any change in the underlying RDMA device will issue
+ * error recovery and queue re-creation.
+ */
+ ret = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe,
+ sizeof(struct nvme_command), DMA_TO_DEVICE);
+ if (ret) {
+ __nvme_rdma_free_queue(ctrl, queue);
+ return ret;
+ }
+
+ ctrl->ctrl.max_segments = ctrl->max_fr_pages;
+ ctrl->ctrl.max_hw_sectors = ctrl->max_fr_pages << (ilog2(SZ_4K) - 9);
+ if (pi_capable)
+ ctrl->ctrl.max_integrity_segments = ctrl->max_fr_pages;
+ else
+ ctrl->ctrl.max_integrity_segments = 0;
+
+ return 0;
}

-static void nvme_rdma_free_queue(struct nvme_ctrl *nctrl, int qid)
+static int nvme_rdma_alloc_io_queue(struct nvme_ctrl *nctrl, int qid)
{
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
struct nvme_rdma_queue *queue = &ctrl->queues[qid];

- if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
- return;
+ queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
+ queue->queue_size = ctrl->ctrl.sqsize + 1;
+ if (ctrl->ctrl.max_integrity_segments)
+ queue->pi_support = true;
+ else
+ queue->pi_support = false;

- rdma_destroy_id(queue->cm_id);
- nvme_rdma_destroy_queue_ib(queue);
- mutex_destroy(&queue->queue_lock);
+ return __nvme_rdma_alloc_queue(ctrl, queue);
}

-static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl)
+static void nvme_rdma_free_admin_queue(struct nvme_ctrl *nctrl)
{
- int i;
+ struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
+ struct nvme_rdma_queue *queue = &ctrl->queues[0];
+
+ if (ctrl->async_event_sqe.data) {
+ cancel_work_sync(&ctrl->ctrl.async_event_work);
+ nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
+ sizeof(struct nvme_command), DMA_TO_DEVICE);
+ ctrl->async_event_sqe.data = NULL;
+ }

- for (i = 1; i < ctrl->ctrl.queue_count; i++)
- nvme_rdma_free_queue(&ctrl->ctrl, i);
+ __nvme_rdma_free_queue(ctrl, queue);
}

-static void nvme_rdma_stop_io_queues(struct nvme_rdma_ctrl *ctrl)
+static void nvme_rdma_free_io_queue(struct nvme_ctrl *nctrl, int qid)
{
- int i;
+ struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
+ struct nvme_rdma_queue *queue = &ctrl->queues[qid];

- for (i = 1; i < ctrl->ctrl.queue_count; i++)
- nvme_rdma_stop_queue(&ctrl->ctrl, i);
+ __nvme_rdma_free_queue(ctrl, queue);
+}
+
+static void __nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
+{
+ mutex_lock(&queue->queue_lock);
+ if (test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags)) {
+ rdma_disconnect(queue->cm_id);
+ ib_drain_qp(queue->qp);
+ }
+ mutex_unlock(&queue->queue_lock);
}

-static int nvme_rdma_start_queue(struct nvme_ctrl *nctrl, int idx)
+static int nvme_rdma_start_admin_queue(struct nvme_ctrl *nctrl)
{
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
- struct nvme_rdma_queue *queue = &ctrl->queues[idx];
+ struct nvme_rdma_queue *queue = &ctrl->queues[0];
int ret;

- if (idx)
- ret = nvmf_connect_io_queue(nctrl, idx);
- else
- ret = nvmf_connect_admin_queue(nctrl);
+ ret = nvmf_connect_admin_queue(nctrl);

if (!ret) {
set_bit(NVME_RDMA_Q_LIVE, &queue->flags);
@@ -701,58 +740,74 @@ static int nvme_rdma_start_queue(struct nvme_ctrl *nctrl, int idx)
if (test_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
__nvme_rdma_stop_queue(queue);
dev_info(ctrl->ctrl.device,
- "failed to connect queue: %d ret=%d\n", idx, ret);
+ "failed to connect queue: %d ret=%d\n", 0, ret);
}
return ret;
}

-static int nvme_rdma_start_io_queues(struct nvme_rdma_ctrl *ctrl,
- int first, int last)
+static int nvme_rdma_start_io_queue(struct nvme_ctrl *nctrl, int idx)
{
- int i, ret = 0;
+ struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
+ struct nvme_rdma_queue *queue = &ctrl->queues[idx];
+ int ret;

- for (i = first; i < last; i++) {
- ret = nvme_rdma_start_queue(&ctrl->ctrl, i);
- if (ret)
- goto out_stop_queues;
+ ret = nvmf_connect_io_queue(nctrl, idx);
+ if (!ret) {
+ set_bit(NVME_RDMA_Q_LIVE, &queue->flags);
+ } else {
+ if (test_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
+ __nvme_rdma_stop_queue(queue);
+ dev_info(ctrl->ctrl.device,
+ "failed to connect queue: %d ret=%d\n", idx, ret);
}
+ return ret;
+}

- return 0;
+static void nvme_rdma_stop_admin_queue(struct nvme_ctrl *nctrl)
+{
+ struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
+ struct nvme_rdma_queue *queue = &ctrl->queues[0];

-out_stop_queues:
- for (i--; i >= first; i--)
- nvme_rdma_stop_queue(&ctrl->ctrl, i);
- return ret;
+ __nvme_rdma_stop_queue(queue);
}

-static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl)
+static void nvme_rdma_stop_io_queue(struct nvme_ctrl *nctrl, int qid)
{
- struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
+ struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
+ struct nvme_rdma_queue *queue = &ctrl->queues[qid];
+
+ __nvme_rdma_stop_queue(queue);
+}
+
+static unsigned int nvme_rdma_nr_io_queues(struct nvme_ctrl *ctrl)
+{
+ struct ib_device *ibdev = to_rdma_ctrl(ctrl)->device->dev;
+ struct nvmf_ctrl_options *opts = ctrl->opts;
+ unsigned int nr_io_queues;
+
+ nr_io_queues = min_t(unsigned int, ibdev->num_comp_vectors,
+ min(opts->nr_io_queues, num_online_cpus()));
+ nr_io_queues += min_t(unsigned int, ibdev->num_comp_vectors,
+ min(opts->nr_write_queues, num_online_cpus()));
+ nr_io_queues += min(opts->nr_poll_queues, num_online_cpus());
+
+ return nr_io_queues;
+}
+
+static void nvme_rdma_set_io_queues(struct nvme_ctrl *nctrl,
+ unsigned int nr_io_queues)
+{
+ struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
struct ib_device *ibdev = ctrl->device->dev;
- unsigned int nr_io_queues, nr_default_queues;
+ struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
+ unsigned int nr_default_queues;
unsigned int nr_read_queues, nr_poll_queues;
- int i, ret;

nr_read_queues = min_t(unsigned int, ibdev->num_comp_vectors,
min(opts->nr_io_queues, num_online_cpus()));
nr_default_queues = min_t(unsigned int, ibdev->num_comp_vectors,
min(opts->nr_write_queues, num_online_cpus()));
nr_poll_queues = min(opts->nr_poll_queues, num_online_cpus());
- nr_io_queues = nr_read_queues + nr_default_queues + nr_poll_queues;
-
- ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
- if (ret)
- return ret;
-
- if (nr_io_queues == 0) {
- dev_err(ctrl->ctrl.device,
- "unable to set any I/O queues\n");
- return -ENOMEM;
- }
-
- ctrl->ctrl.queue_count = nr_io_queues + 1;
- dev_info(ctrl->ctrl.device,
- "creating %d I/O queues.\n", nr_io_queues);

if (opts->nr_write_queues && nr_read_queues < nr_io_queues) {
/*
@@ -781,20 +836,6 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl)
ctrl->io_queues[HCTX_TYPE_POLL] =
min(nr_poll_queues, nr_io_queues);
}
-
- for (i = 1; i < ctrl->ctrl.queue_count; i++) {
- ret = nvme_rdma_alloc_queue(&ctrl->ctrl, i);
- if (ret)
- goto out_free_queues;
- }
-
- return 0;
-
-out_free_queues:
- for (i--; i >= 1; i--)
- nvme_rdma_free_queue(&ctrl->ctrl, i);
-
- return ret;
}

static int nvme_rdma_alloc_tag_set(struct nvme_ctrl *ctrl)
@@ -812,231 +853,6 @@ static int nvme_rdma_alloc_tag_set(struct nvme_ctrl *ctrl)
cmd_size);
}

-static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl)
-{
- if (ctrl->async_event_sqe.data) {
- cancel_work_sync(&ctrl->ctrl.async_event_work);
- nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
- sizeof(struct nvme_command), DMA_TO_DEVICE);
- ctrl->async_event_sqe.data = NULL;
- }
- nvme_rdma_free_queue(&ctrl->ctrl, 0);
-}
-
-static int nvme_rdma_init_queue(struct nvme_ctrl *nctrl, int qid)
-{
- struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
- bool pi_capable = false;
- int error;
-
- if (qid != 0)
- /* only admin queue needs additional work. */
- return 0;
-
-
- ctrl->device = ctrl->queues[0].device;
- ctrl->ctrl.numa_node = ibdev_to_node(ctrl->device->dev);
-
- /* T10-PI support */
- if (ctrl->device->dev->attrs.kernel_cap_flags &
- IBK_INTEGRITY_HANDOVER)
- pi_capable = true;
-
- ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev,
- pi_capable);
-
- /*
- * Bind the async event SQE DMA mapping to the admin queue lifetime.
- * It's safe, since any chage in the underlying RDMA device will issue
- * error recovery and queue re-creation.
- */
- error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe,
- sizeof(struct nvme_command), DMA_TO_DEVICE);
- if (error)
- return error;
-
- ctrl->ctrl.max_segments = ctrl->max_fr_pages;
- ctrl->ctrl.max_hw_sectors = ctrl->max_fr_pages << (ilog2(SZ_4K) - 9);
- if (pi_capable)
- ctrl->ctrl.max_integrity_segments = ctrl->max_fr_pages;
- else
- ctrl->ctrl.max_integrity_segments = 0;
-
- return 0;
-}
-
-static void nvme_rdma_deinit_queue(struct nvme_ctrl *nctrl, int qid)
-{
- struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
-
- if (qid != 0)
- return;
-
- if (ctrl->async_event_sqe.data) {
- nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
- sizeof(struct nvme_command), DMA_TO_DEVICE);
- ctrl->async_event_sqe.data = NULL;
- }
-}
-
-static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
- bool new)
-{
- int error;
-
- error = nvme_rdma_alloc_queue(&ctrl->ctrl, 0);
- if (error)
- return error;
-
- error = nvme_rdma_init_queue(&ctrl->ctrl, 0);
- if (error)
- goto out_free_queue;
-
- if (new) {
- error = nvme_alloc_admin_tag_set(&ctrl->ctrl,
- &ctrl->admin_tag_set, &nvme_rdma_admin_mq_ops,
- sizeof(struct nvme_rdma_request) +
- NVME_RDMA_DATA_SGL_SIZE);
- if (error)
- goto out_deinit_admin_queue;
-
- }
-
- error = nvme_rdma_start_queue(&ctrl->ctrl, 0);
- if (error)
- goto out_remove_admin_tag_set;
-
- error = nvme_enable_ctrl(&ctrl->ctrl);
- if (error)
- goto out_stop_queue;
-
- nvme_unquiesce_admin_queue(&ctrl->ctrl);
-
- error = nvme_init_ctrl_finish(&ctrl->ctrl, false);
- if (error)
- goto out_quiesce_queue;
-
- return 0;
-
-out_quiesce_queue:
- nvme_quiesce_admin_queue(&ctrl->ctrl);
- blk_sync_queue(ctrl->ctrl.admin_q);
-out_stop_queue:
- nvme_rdma_stop_queue(&ctrl->ctrl, 0);
- nvme_cancel_admin_tagset(&ctrl->ctrl);
-out_remove_admin_tag_set:
- if (new)
- nvme_remove_admin_tag_set(&ctrl->ctrl);
-out_deinit_admin_queue:
- nvme_rdma_deinit_queue(&ctrl->ctrl, 0);
-out_free_queue:
- nvme_rdma_free_queue(&ctrl->ctrl, 0);
- return error;
-}
-
-static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
-{
- int ret, nr_queues;
-
- ret = nvme_rdma_alloc_io_queues(ctrl);
- if (ret)
- return ret;
-
- if (new) {
- ret = nvme_rdma_alloc_tag_set(&ctrl->ctrl);
- if (ret)
- goto out_free_io_queues;
- }
-
- /*
- * Only start IO queues for which we have allocated the tagset
- * and limitted it to the available queues. On reconnects, the
- * queue number might have changed.
- */
- nr_queues = min(ctrl->tag_set.nr_hw_queues + 1, ctrl->ctrl.queue_count);
- ret = nvme_rdma_start_io_queues(ctrl, 1, nr_queues);
- if (ret)
- goto out_cleanup_tagset;
-
- if (!new) {
- nvme_unquiesce_io_queues(&ctrl->ctrl);
- if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) {
- /*
- * If we timed out waiting for freeze we are likely to
- * be stuck. Fail the controller initialization just
- * to be safe.
- */
- ret = -ENODEV;
- goto out_wait_freeze_timed_out;
- }
- blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset,
- ctrl->ctrl.queue_count - 1);
- nvme_unfreeze(&ctrl->ctrl);
- }
-
- /*
- * If the number of queues has increased (reconnect case)
- * start all new queues now.
- */
- ret = nvme_rdma_start_io_queues(ctrl, nr_queues,
- ctrl->tag_set.nr_hw_queues + 1);
- if (ret)
- goto out_wait_freeze_timed_out;
-
- return 0;
-
-out_wait_freeze_timed_out:
- nvme_quiesce_io_queues(&ctrl->ctrl);
- nvme_sync_io_queues(&ctrl->ctrl);
- nvme_rdma_stop_io_queues(ctrl);
-out_cleanup_tagset:
- nvme_cancel_tagset(&ctrl->ctrl);
- if (new)
- nvme_remove_io_tag_set(&ctrl->ctrl);
-out_free_io_queues:
- nvme_rdma_free_io_queues(ctrl);
- return ret;
-}
-
-static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
- bool remove)
-{
- nvme_quiesce_admin_queue(&ctrl->ctrl);
- blk_sync_queue(ctrl->ctrl.admin_q);
- nvme_rdma_stop_queue(&ctrl->ctrl, 0);
- nvme_cancel_admin_tagset(&ctrl->ctrl);
- if (remove) {
- nvme_unquiesce_admin_queue(&ctrl->ctrl);
- nvme_remove_admin_tag_set(&ctrl->ctrl);
- }
- nvme_rdma_destroy_admin_queue(ctrl);
-}
-
-static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
- bool remove)
-{
- if (ctrl->ctrl.queue_count > 1) {
- nvme_start_freeze(&ctrl->ctrl);
- nvme_quiesce_io_queues(&ctrl->ctrl);
- nvme_sync_io_queues(&ctrl->ctrl);
- nvme_rdma_stop_io_queues(ctrl);
- nvme_cancel_tagset(&ctrl->ctrl);
- if (remove) {
- nvme_unquiesce_io_queues(&ctrl->ctrl);
- nvme_remove_io_tag_set(&ctrl->ctrl);
- }
- nvme_rdma_free_io_queues(ctrl);
- }
-}
-
-static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
-{
- struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
-
- flush_work(&ctrl->ctrl.err_work);
- cancel_delayed_work_sync(&ctrl->ctrl.connect_work);
-}
-
static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
@@ -1054,169 +870,13 @@ static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
kfree(ctrl);
}

-static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
-{
- /* If we are resetting/deleting then do nothing */
- if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) {
- WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW ||
- ctrl->ctrl.state == NVME_CTRL_LIVE);
- return;
- }
-
- if (nvmf_should_reconnect(&ctrl->ctrl)) {
- dev_info(ctrl->ctrl.device, "Reconnecting in %d seconds...\n",
- ctrl->ctrl.opts->reconnect_delay);
- queue_delayed_work(nvme_wq, &ctrl->ctrl.connect_work,
- ctrl->ctrl.opts->reconnect_delay * HZ);
- } else {
- nvme_delete_ctrl(&ctrl->ctrl);
- }
-}
-
-static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
-{
- int ret;
- bool changed;
-
- ret = nvme_rdma_configure_admin_queue(ctrl, new);
- if (ret)
- return ret;
-
- if (ctrl->ctrl.icdoff) {
- ret = -EOPNOTSUPP;
- dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
- goto destroy_admin;
- }
-
- if (!(ctrl->ctrl.sgls & (1 << 2))) {
- ret = -EOPNOTSUPP;
- dev_err(ctrl->ctrl.device,
- "Mandatory keyed sgls are not supported!\n");
- goto destroy_admin;
- }
-
- if (ctrl->ctrl.opts->queue_size > ctrl->ctrl.sqsize + 1) {
- dev_warn(ctrl->ctrl.device,
- "queue_size %zu > ctrl sqsize %u, clamping down\n",
- ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1);
- }
-
- if (ctrl->ctrl.sqsize + 1 > NVME_RDMA_MAX_QUEUE_SIZE) {
- dev_warn(ctrl->ctrl.device,
- "ctrl sqsize %u > max queue size %u, clamping down\n",
- ctrl->ctrl.sqsize + 1, NVME_RDMA_MAX_QUEUE_SIZE);
- ctrl->ctrl.sqsize = NVME_RDMA_MAX_QUEUE_SIZE - 1;
- }
-
- if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
- dev_warn(ctrl->ctrl.device,
- "sqsize %u > ctrl maxcmd %u, clamping down\n",
- ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd);
- ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1;
- }
-
- if (ctrl->ctrl.sgls & (1 << 20))
- ctrl->use_inline_data = true;
-
- if (ctrl->ctrl.queue_count > 1) {
- ret = nvme_rdma_configure_io_queues(ctrl, new);
- if (ret)
- goto destroy_admin;
- }
-
- changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
- if (!changed) {
- /*
- * state change failure is ok if we started ctrl delete,
- * unless we're during creation of a new controller to
- * avoid races with teardown flow.
- */
- WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
- ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
- WARN_ON_ONCE(new);
- ret = -EINVAL;
- goto destroy_io;
- }
-
- nvme_start_ctrl(&ctrl->ctrl);
- return 0;
-
-destroy_io:
- if (ctrl->ctrl.queue_count > 1) {
- nvme_quiesce_io_queues(&ctrl->ctrl);
- nvme_sync_io_queues(&ctrl->ctrl);
- nvme_rdma_stop_io_queues(ctrl);
- nvme_cancel_tagset(&ctrl->ctrl);
- if (new)
- nvme_remove_io_tag_set(&ctrl->ctrl);
- nvme_rdma_free_io_queues(ctrl);
- }
-destroy_admin:
- nvme_quiesce_admin_queue(&ctrl->ctrl);
- blk_sync_queue(ctrl->ctrl.admin_q);
- nvme_rdma_stop_queue(&ctrl->ctrl, 0);
- nvme_cancel_admin_tagset(&ctrl->ctrl);
- if (new)
- nvme_remove_admin_tag_set(&ctrl->ctrl);
- nvme_rdma_destroy_admin_queue(ctrl);
- return ret;
-}
-
-static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
-{
- struct nvme_ctrl *nctrl = container_of(to_delayed_work(work),
- struct nvme_ctrl, connect_work);
- struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
-
- ++ctrl->ctrl.nr_reconnects;
-
- if (nvme_rdma_setup_ctrl(ctrl, false))
- goto requeue;
-
- dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n",
- ctrl->ctrl.nr_reconnects);
-
- ctrl->ctrl.nr_reconnects = 0;
-
- return;
-
-requeue:
- dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
- ctrl->ctrl.nr_reconnects);
- nvme_rdma_reconnect_or_remove(ctrl);
-}
-
-static void nvme_rdma_error_recovery_work(struct work_struct *work)
+static int nvme_rdma_alloc_admin_tag_set(struct nvme_ctrl *ctrl)
{
- struct nvme_ctrl *nctrl = container_of(work,
- struct nvme_ctrl, err_work);
- struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);

- nvme_stop_keep_alive(&ctrl->ctrl);
- flush_work(&ctrl->ctrl.async_event_work);
- nvme_rdma_teardown_io_queues(ctrl, false);
- nvme_unquiesce_io_queues(&ctrl->ctrl);
- nvme_rdma_teardown_admin_queue(ctrl, false);
- nvme_unquiesce_admin_queue(&ctrl->ctrl);
- nvme_auth_stop(&ctrl->ctrl);
-
- if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
- /* state change failure is ok if we started ctrl delete */
- WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
- ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
- return;
- }
-
- nvme_rdma_reconnect_or_remove(ctrl);
-}
-
-static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
-{
- if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
- return;
-
- dev_warn(ctrl->ctrl.device, "starting error recovery\n");
- queue_work(nvme_reset_wq, &ctrl->ctrl.err_work);
+ return nvme_alloc_admin_tag_set(ctrl, &to_rdma_ctrl(ctrl)->admin_tag_set,
+ &nvme_rdma_admin_mq_ops,
+ sizeof(struct nvme_rdma_request) +
+ NVME_RDMA_DATA_SGL_SIZE);
}

static void nvme_rdma_end_request(struct nvme_rdma_request *req)
@@ -1240,7 +900,7 @@ static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
"%s for CQE 0x%p failed with status %s (%d)\n",
op, wc->wr_cqe,
ib_wc_status_msg(wc->status), wc->status);
- nvme_rdma_error_recovery(ctrl);
+ nvmf_error_recovery(&ctrl->ctrl);
}

static void nvme_rdma_memreg_done(struct ib_cq *cq, struct ib_wc *wc)
@@ -1759,7 +1419,7 @@ static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
dev_err(queue->ctrl->ctrl.device,
"got bad command_id %#x on QP %#x\n",
cqe->command_id, queue->qp->qp_num);
- nvme_rdma_error_recovery(queue->ctrl);
+ nvmf_error_recovery(&queue->ctrl->ctrl);
return;
}
req = blk_mq_rq_to_pdu(rq);
@@ -1773,7 +1433,7 @@ static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
dev_err(queue->ctrl->ctrl.device,
"Bogus remote invalidation for rkey %#x\n",
req->mr ? req->mr->rkey : 0);
- nvme_rdma_error_recovery(queue->ctrl);
+ nvmf_error_recovery(&queue->ctrl->ctrl);
}
} else if (req->mr) {
int ret;
@@ -1783,7 +1443,7 @@ static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
dev_err(queue->ctrl->ctrl.device,
"Queueing INV WR for rkey %#x failed (%d)\n",
req->mr->rkey, ret);
- nvme_rdma_error_recovery(queue->ctrl);
+ nvmf_error_recovery(&queue->ctrl->ctrl);
}
/* the local invalidation completion will end the request */
return;
@@ -1810,7 +1470,7 @@ static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
if (unlikely(wc->byte_len < len)) {
dev_err(queue->ctrl->ctrl.device,
"Unexpected nvme completion length(%d)\n", wc->byte_len);
- nvme_rdma_error_recovery(queue->ctrl);
+ nvmf_error_recovery(&queue->ctrl->ctrl);
return;
}

@@ -1980,7 +1640,7 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
case RDMA_CM_EVENT_TIMEWAIT_EXIT:
dev_dbg(queue->ctrl->ctrl.device,
"disconnect received - connection closed\n");
- nvme_rdma_error_recovery(queue->ctrl);
+ nvmf_error_recovery(&queue->ctrl->ctrl);
break;
case RDMA_CM_EVENT_DEVICE_REMOVAL:
/* device removal is handled via the ib_client API */
@@ -1988,7 +1648,7 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
default:
dev_err(queue->ctrl->ctrl.device,
"Unexpected RDMA CM event (%d)\n", ev->event);
- nvme_rdma_error_recovery(queue->ctrl);
+ nvmf_error_recovery(&queue->ctrl->ctrl);
break;
}

@@ -2006,7 +1666,7 @@ static void nvme_rdma_complete_timed_out(struct request *rq)
struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
struct nvme_rdma_queue *queue = req->queue;

- nvme_rdma_stop_queue(ctrl, nvme_rdma_queue_id(queue));
+ nvme_rdma_stop_io_queue(ctrl, nvme_rdma_queue_id(queue));
nvmf_complete_timed_out_request(rq);
}

@@ -2041,7 +1701,7 @@ static enum blk_eh_timer_return nvme_rdma_timeout(struct request *rq)
* LIVE state should trigger the normal error recovery which will
* handle completing this request.
*/
- nvme_rdma_error_recovery(ctrl);
+ nvmf_error_recovery(&ctrl->ctrl);
return BLK_EH_RESET_TIMER;
}

@@ -2242,41 +1902,9 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
.timeout = nvme_rdma_timeout,
};

-static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
-{
- nvme_rdma_teardown_io_queues(ctrl, shutdown);
- nvme_quiesce_admin_queue(&ctrl->ctrl);
- nvme_disable_ctrl(&ctrl->ctrl, shutdown);
- nvme_rdma_teardown_admin_queue(ctrl, shutdown);
-}
-
static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl)
{
- nvme_rdma_shutdown_ctrl(to_rdma_ctrl(ctrl), true);
-}
-
-static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
-{
- struct nvme_rdma_ctrl *ctrl =
- container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work);
-
- nvme_stop_ctrl(&ctrl->ctrl);
- nvme_rdma_shutdown_ctrl(ctrl, false);
-
- if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
- /* state change failure should never happen */
- WARN_ON_ONCE(1);
- return;
- }
-
- if (nvme_rdma_setup_ctrl(ctrl, false))
- goto out_fail;
-
- return;
-
-out_fail:
- ++ctrl->ctrl.nr_reconnects;
- nvme_rdma_reconnect_or_remove(ctrl);
+ nvmf_teardown_ctrl(ctrl, true);
}

static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
@@ -2290,7 +1918,7 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
.submit_async_event = nvme_rdma_submit_async_event,
.delete_ctrl = nvme_rdma_delete_ctrl,
.get_address = nvmf_get_address,
- .stop_ctrl = nvme_rdma_stop_ctrl,
+ .stop_ctrl = nvmf_stop_ctrl,
};

/*
@@ -2322,6 +1950,21 @@ nvme_rdma_existing_controller(struct nvmf_ctrl_options *opts)
return found;
}

+static struct nvme_fabrics_ops nvme_rdma_fabrics_ops = {
+ .alloc_admin_queue = nvme_rdma_alloc_admin_queue,
+ .free_admin_queue = nvme_rdma_free_admin_queue,
+ .start_admin_queue = nvme_rdma_start_admin_queue,
+ .stop_admin_queue = nvme_rdma_stop_admin_queue,
+ .alloc_io_queue = nvme_rdma_alloc_io_queue,
+ .free_io_queue = nvme_rdma_free_io_queue,
+ .start_io_queue = nvme_rdma_start_io_queue,
+ .stop_io_queue = nvme_rdma_stop_io_queue,
+ .alloc_admin_tag_set = nvme_rdma_alloc_admin_tag_set,
+ .alloc_tag_set = nvme_rdma_alloc_tag_set,
+ .nr_io_queues = nvme_rdma_nr_io_queues,
+ .set_io_queues = nvme_rdma_set_io_queues,
+};
+
static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
struct nvmf_ctrl_options *opts)
{
@@ -2333,6 +1976,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
if (!ctrl)
return ERR_PTR(-ENOMEM);
ctrl->ctrl.opts = opts;
+ ctrl->ctrl.fabrics_ops = &nvme_rdma_fabrics_ops;
INIT_LIST_HEAD(&ctrl->list);

if (!(opts->mask & NVMF_OPT_TRSVCID)) {
@@ -2369,10 +2013,9 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
}

INIT_DELAYED_WORK(&ctrl->ctrl.connect_work,
- nvme_rdma_reconnect_ctrl_work);
- INIT_WORK(&ctrl->ctrl.err_work, nvme_rdma_error_recovery_work);
- INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work);
-
+ nvmf_reconnect_ctrl_work);
+ INIT_WORK(&ctrl->ctrl.err_work, nvmf_error_recovery_work);
+ INIT_WORK(&ctrl->ctrl.reset_work, nvmf_reset_ctrl_work);
ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
opts->nr_poll_queues + 1;
ctrl->ctrl.sqsize = opts->queue_size - 1;
@@ -2392,7 +2035,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING);
WARN_ON_ONCE(!changed);

- ret = nvme_rdma_setup_ctrl(ctrl, true);
+ ret = nvmf_setup_ctrl(&ctrl->ctrl, true);
if (ret)
goto out_uninit_ctrl;

--
2.40.0