Re: [PATCH 3/3] nvme: complete request in work queue on CPU with flooded interrupts

From: Sagi Grimberg
Date: Wed Aug 21 2019 - 17:55:09 EST



> Sagi,
>
> Here are the test results.
>
> Benchmark command:
> fio --bs=4k --ioengine=libaio --iodepth=64 --filename=/dev/nvme0n1:/dev/nvme1n1:/dev/nvme2n1:/dev/nvme3n1:/dev/nvme4n1:/dev/nvme5n1:/dev/nvme6n1:/dev/nvme7n1:/dev/nvme8n1:/dev/nvme9n1 --direct=1 --runtime=90 --numjobs=80 --rw=randread --name=test --group_reporting --gtod_reduce=1
>
> With your patch: 1720k IOPS
> With threaded interrupts: 1320k IOPS
> With just interrupts: 3720k IOPS
>
> Interrupts are the fastest, but we need to find a way to throttle them.

Is this the workload that generates the flood?
If so, I did not expect the perf difference to be this large...

If completions keep pounding on the CPU, I would expect irqpoll
to simply keep running forever and polling the CQs. There is no
fundamental reason why polling would be faster in interrupt context;
what matters could be:
1. we reschedule the softirq more often than we need to
2. we don't reap enough completions in every poll round, which
triggers rearming the interrupt, and then when it fires we reschedule
another softirq... (see the sketch below)
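
To make (2) a bit more concrete, this is roughly the driver-side flow
that irq_poll assumes. Just a hand-wavy sketch, not the actual nvme code
from this series; the foo_* helpers (foo_disable_device_irq,
foo_reap_completions, foo_enable_device_irq) are made up:
--
#include <linux/interrupt.h>
#include <linux/irq_poll.h>

/* Hypothetical per-queue context, only what the sketch needs. */
struct foo_queue {
        struct irq_poll iop;
};

/* Hard interrupt: mask the device interrupt and hand off to irqpoll. */
static irqreturn_t foo_irq(int irq, void *data)
{
        struct foo_queue *q = data;

        foo_disable_device_irq(q);      /* assumed device-specific helper */
        irq_poll_sched(&q->iop);        /* raises IRQ_POLL_SOFTIRQ */
        return IRQ_HANDLED;
}

/* ->poll() callback, called from irq_poll_softirq() with the queue weight. */
static int foo_poll(struct irq_poll *iop, int budget)
{
        struct foo_queue *q = container_of(iop, struct foo_queue, iop);
        int reaped = foo_reap_completions(q, budget);   /* assumed helper */

        if (reaped < budget) {
                /*
                 * CQ looked empty, so stop polling and rearm the device
                 * interrupt.  Under a sustained flood this is the costly
                 * round trip from (2): the next completion fires a hard
                 * interrupt that only schedules yet another softirq pass.
                 */
                irq_poll_complete(iop);
                foo_enable_device_irq(q);
        }
        return reaped;
}
--
The queue registers this with irq_poll_init(&q->iop, weight, foo_poll),
so if weight is small relative to the completion rate we end up in (2)
on every pass.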

Maybe we need to take care of some irq_poll optimizations?

Does this (untested) patch make any difference?
--
diff --git a/lib/irq_poll.c b/lib/irq_poll.c
index 2f17b488d58e..0e94183eba15 100644
--- a/lib/irq_poll.c
+++ b/lib/irq_poll.c
@@ -12,7 +12,8 @@
 #include <linux/irq_poll.h>
 #include <linux/delay.h>
 
-static unsigned int irq_poll_budget __read_mostly = 256;
+static unsigned int irq_poll_budget __read_mostly = 3000;
+unsigned int __read_mostly irqpoll_budget_usecs = 2000;
 
 static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll);
 
@@ -77,32 +78,26 @@ EXPORT_SYMBOL(irq_poll_complete);
 
 static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
 {
-        struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll);
-        int rearm = 0, budget = irq_poll_budget;
-        unsigned long start_time = jiffies;
+        struct list_head *irqpoll_list = this_cpu_ptr(&blk_cpu_iopoll);
+        unsigned int budget = irq_poll_budget;
+        unsigned long time_limit =
+                jiffies + usecs_to_jiffies(irqpoll_budget_usecs);
+        LIST_HEAD(list);
 
         local_irq_disable();
+        list_splice_init(irqpoll_list, &list);
+        local_irq_enable();
 
-        while (!list_empty(list)) {
+        while (!list_empty(&list)) {
                 struct irq_poll *iop;
                 int work, weight;
 
-                /*
-                 * If softirq window is exhausted then punt.
-                 */
-                if (budget <= 0 || time_after(jiffies, start_time)) {
-                        rearm = 1;
-                        break;
-                }
-
-                local_irq_enable();
-
                 /* Even though interrupts have been re-enabled, this
                  * access is safe because interrupts can only add new
                  * entries to the tail of this list, and only ->poll()
                  * calls can remove this head entry from the list.
                  */
-                iop = list_entry(list->next, struct irq_poll, list);
+                iop = list_first_entry(&list, struct irq_poll, list);
 
                 weight = iop->weight;
                 work = 0;
@@ -111,8 +106,6 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
 
                 budget -= work;
 
-                local_irq_disable();
-
                 /*
                  * Drivers must not modify the iopoll state, if they
                  * consume their assigned weight (or more, some drivers can't
@@ -125,11 +118,21 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
                         if (test_bit(IRQ_POLL_F_DISABLE, &iop->state))
                                 __irq_poll_complete(iop);
                         else
-                                list_move_tail(&iop->list, list);
+                                list_move_tail(&iop->list, &list);
                 }
+
+                /*
+                 * If softirq window is exhausted then punt.
+                 */
+                if (budget <= 0 || time_after_eq(jiffies, time_limit))
+                        break;
         }
 
-        if (rearm)
+        local_irq_disable();
+
+        list_splice_tail_init(irqpoll_list, &list);
+        list_splice(&list, irqpoll_list);
+        if (!list_empty(irqpoll_list))
                 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
 
         local_irq_enable();
--