Re: [PATCH RFC V2 3/5] blk-mq: Facilitate a shared sbitmap per tagset

From: John Garry
Date: Thu Nov 21 2019 - 05:24:22 EST


int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
+ struct blk_mq_tag_set *tag_set = q->tag_set;
struct blk_mq_hw_ctx *hctx;
struct elevator_queue *eq;
unsigned int i;
@@ -537,6 +538,19 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
blk_mq_debugfs_register_sched_hctx(q, hctx);
}
+ if (blk_mq_is_sbitmap_shared(tag_set)) {
+ if (!blk_mq_init_sched_shared_sbitmap(tag_set, q->nr_requests)) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ queue_for_each_hw_ctx(q, hctx, i) {
+ struct blk_mq_tags *tags = hctx->sched_tags;
+
+ tags->pbitmap_tags = &tag_set->sched_shared_bitmap_tags;
+ tags->pbreserved_tags = &tag_set->sched_shared_breserved_tags;

This kind of sharing is wrong: sched tags should be request-queue wide
instead of tagset wide, and each request queue has its own independent
scheduler queue.

Right, so if we get a scheduler tag we still need to get a driver tag, and this would be the "shared" tag.

That makes things simpler then.


+ }
+ }
+
return 0;
err:
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 42792942b428..6625bebb46c3 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -35,9 +35,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
*/
void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
{
- sbitmap_queue_wake_all(&tags->bitmap_tags);
+ sbitmap_queue_wake_all(tags->pbitmap_tags);
if (include_reserve)
- sbitmap_queue_wake_all(&tags->breserved_tags);
+ sbitmap_queue_wake_all(tags->pbreserved_tags);
}

[...]


mutex_init(&set->tag_list_lock);
INIT_LIST_HEAD(&set->tag_list);
@@ -3137,6 +3151,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
{
struct blk_mq_tag_set *set = q->tag_set;
struct blk_mq_hw_ctx *hctx;
+ bool sched_tags = false;
int i, ret;
if (!set)
@@ -3160,6 +3175,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
false);
} else {
+ sched_tags = true;
ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
nr, true);
}
@@ -3169,8 +3185,41 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
q->elevator->type->ops.depth_updated(hctx);
}
- if (!ret)
+ /*
+ * if ret is 0, all queues should have been updated to the same depth
+ * if not, then maybe some have been updated - yuk, need to handle this for shared sbitmap...
+ * if some are updated, we should probably roll back the change altogether. FIXME
+ */
+ if (!ret) {
+ if (blk_mq_is_sbitmap_shared(set)) {
+ if (sched_tags) {
+ sbitmap_queue_free(&set->sched_shared_bitmap_tags);
+ sbitmap_queue_free(&set->sched_shared_breserved_tags);
+ if (!blk_mq_init_sched_shared_sbitmap(set, nr))
+ return -ENOMEM; /* fixup error handling */
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ hctx->sched_tags->pbitmap_tags = &set->sched_shared_bitmap_tags;
+ hctx->sched_tags->pbreserved_tags = &set->sched_shared_breserved_tags;
+ }
+ } else {
+ sbitmap_queue_free(&set->shared_bitmap_tags);
+ sbitmap_queue_free(&set->shared_breserved_tags);
+ if (!blk_mq_init_shared_sbitmap(set))
+ return -ENOMEM; /* fixup error handling */

No, we can't re-allocate driver tags here, which are shared by all LUNs.
And you should see that 'can_grow' is set to false for driver tags
in blk_mq_update_nr_requests(), which can only touch per-request-queue
data, not tagset-wide data.

Yeah, I see that. We should just resize the driver tags bitmap.

Personally I think the mainline code is a little loose here: if we could grow driver tags, then blk_mq_tag_set.tags would be out of sync with hctx->tags. Maybe that should be made more explicit in the code.

BTW, do you have anything to say about this (modified slightly) comment:

/*
* if ret != 0, q->nr_requests would not be updated, yet the depth
* for some hctx sched tags may have changed - is that the right thing
* to do?
*/

Thanks,
John