Re: [PATCH v2 1/1] dmaengine: switchtec-dma: Introduce Switchtec DMA engine PCI driver

From: Christoph Hellwig
Date: Mon Apr 10 2023 - 12:43:06 EST


On Mon, Apr 03, 2023 at 11:06:28AM -0700, Kelvin Cao wrote:
> +#define HALT_RETRY 100
> +static int halt_channel(struct switchtec_dma_chan *swdma_chan)
> +{
> + u32 status;
> + struct chan_hw_regs __iomem *chan_hw = swdma_chan->mmio_chan_hw;
> + int retry = HALT_RETRY;
> + struct pci_dev *pdev;
> + int ret;
> +
> + rcu_read_lock();
> + pdev = rcu_dereference(swdma_chan->swdma_dev->pdev);
> + if (!pdev) {
> + ret = -ENODEV;
> + goto unlock_and_exit;
> + }

This whole RCU critical section around every access to ->pdev scheme
looks a bit bothersome to me. This means that all the low-level
PCI ops are done in RCU critical section. Is this something you
came up with or is it copied from other drivers?

Normally we'd do an unregistration from the dmaengine subsystem
first, which might do a RCU synchronization at a high level,
and then we're sure that none of the methods gets called again
on the unregistered device.

Can't this driver (and the dmaengine core) support an operation
mode where you set a shutdown flag at the beginning
> +
> + spin_lock(&swdma_chan->hw_ctrl_lock);
> + writeb(SWITCHTEC_CHAN_CTRL_HALT, &chan_hw->ctrl);
> +
> + ret = -EIO;
> + do {
> + status = readl(&chan_hw->status);
> +
> + if (status & SWITCHTEC_CHAN_STS_HALTED) {
> + ret = 0;
> + break;
> + }
> +
> + udelay(1000);
> + } while (retry--);
> + spin_unlock(&swdma_chan->hw_ctrl_lock);

Why is the lock needed while reading the status and waiting
for it with long delays?

> + spin_lock(&swdma_chan->hw_ctrl_lock);
> + ctrl = SWITCHTEC_CHAN_CTRL_RESET;
> + ctrl |= SWITCHTEC_CHAN_CTRL_ERR_PAUSE;
> + writel(ctrl, &chan_hw->ctrl);
> +
> + udelay(1000);
> +
> + ctrl = SWITCHTEC_CHAN_CTRL_ERR_PAUSE;
> + writel(ctrl, &chan_hw->ctrl);
> + spin_unlock(&swdma_chan->hw_ctrl_lock);

This looks broken. PCIe MMIO Write TLPs an might not never
reach the device until you do a read from it. So instead of
the udelay you probably need to do a read* operation to some
register that has no side effects.

Also what is the point of the ctrl local variable? Just passing
the values directly would be shorted and easier to read.

> + spin_lock(&swdma_chan->hw_ctrl_lock);
> + writeb(SWITCHTEC_CHAN_CTRL_PAUSE, &chan_hw->ctrl);
> + spin_unlock(&swdma_chan->hw_ctrl_lock);
> +
> + rcu_read_unlock();
> +
> + /* wait 60ms to ensure no pending CEs */
> + msleep(60);

Without a previous read* to flush the posted writes you're not
actually waiting 60ms here from the device POV.

> + spin_lock(&swdma_chan->hw_ctrl_lock);
> + writeb(SWITCHTEC_CHAN_CTRL_PAUSE, &chan_hw->ctrl);
> +
> + ret = -EIO;
> + do {
> + status = readl(&chan_hw->status);
> +
> + if (status & SWITCHTEC_CHAN_STS_PAUSED) {
> + ret = 0;
> + break;
> + }
> +
> + udelay(1000);
> + } while (retry--);
> + spin_unlock(&swdma_chan->hw_ctrl_lock);

Same comment about waiting under a spinlock here.

> + do {
> + status = readl(&chan_hw->status);
> +
> + if (!(status & SWITCHTEC_CHAN_STS_PAUSED)) {
> + ret = 0;
> + break;
> + }
> +
> + udelay(1000);
> + } while (retry--);
> + spin_unlock(&swdma_chan->hw_ctrl_lock);

.. and here.

It might also be useful to have a helper to wait for a certain
bit to be set or not set in the status register rather than
duplicating the code.

> +static void switchtec_dma_synchronize(struct dma_chan *chan)
> +{
> + struct pci_dev *pdev;
> + struct switchtec_dma_chan *swdma_chan = to_switchtec_dma_chan(chan);
> + int rc;
> +
> + rcu_read_lock();
> + pdev = rcu_dereference(swdma_chan->swdma_dev->pdev);
> + if (pdev)
> + synchronize_irq(swdma_chan->irq);
> + rcu_read_unlock();

synchronize_irq can sleep and must not be called in a RCU critical
section.

> + if (type >= UNKNOWN_TRANSACTION)
> + goto err_unlock;
> +
> + if (type == MEMCPY)
> + if (len > SWITCHTEC_DESC_MAX_SIZE)
> + goto err_unlock;
> +
> + if ((type == WIMM) && (len == 8))

Maybe do a switch on the type to make the code more readable?

Also no need for the inner braces in the last line here.

> +static irqreturn_t switchtec_dma_isr(int irq, void *chan)
> +{
> + struct switchtec_dma_chan *swdma_chan = chan;
> +
> + if (swdma_chan->comp_ring_active)
> + tasklet_schedule(&swdma_chan->desc_task);
> +
> + return IRQ_HANDLED;
> +}
> +
> +static irqreturn_t switchtec_dma_chan_status_isr(int irq, void *dma)
> +{
> + struct switchtec_dma_dev *swdma_dev = dma;
> +
> + tasklet_schedule(&swdma_dev->chan_status_task);

What speaks against simply using threaded irqs here instead of the
tasklets?

> + swdma_chan->hw_cq = dma_alloc_coherent(swdma_dev->dma_dev.dev, size,
> + &swdma_chan->dma_addr_cq,
> + GFP_KERNEL);
> + if (!swdma_chan->hw_cq) {
> + rc = -ENOMEM;
> + goto free_and_exit;
> + }
> +
> + memset(swdma_chan->hw_cq, 0, size);

dma_alloc_coherent always returns zeroed memory, no need to
zero it again.

> +
> + /* reset host phase tag */
> + swdma_chan->phase_tag = 0;
> +
> + size = sizeof(*swdma_chan->desc_ring);
> + swdma_chan->desc_ring = kcalloc(SWITCHTEC_DMA_RING_SIZE, size,
> + GFP_KERNEL);
> + if (!swdma_chan->desc_ring) {
> + rc = -ENOMEM;
> + goto free_and_exit;
> + }
> +
> + memset(swdma_chan->desc_ring, 0, SWITCHTEC_DMA_RING_SIZE * size);

kcalloc also already zeroes the memory.

> + writew((__force u16)cpu_to_le16(SWITCHTEC_DMA_SQ_SIZE),
> + &swdma_chan->mmio_chan_fw->sq_size);
> + writew((__force u16)cpu_to_le16(SWITCHTEC_DMA_CQ_SIZE),
> + &swdma_chan->mmio_chan_fw->cq_size);

This looks broken to me, writew always expects cpu endian arguments
and byte swaps on big endian systems.

And if it isn't bogus: __force casts need a good detailed comment
explaining them.

> +void switchtec_chan_kobject_del(struct switchtec_dma_chan *swdma_chan);

This function doesn't actually seem to exists, and thus doesn't need
a forward declaration.

> + if (!swdma_dev)
> + return -ENOMEM;
> +
> + bar = pcim_iomap_table(pdev)[0];
> + swdma_dev->bar = bar;
> +
> + swdma_dev->mmio_dmac_ver = bar + SWITCHTEC_DMAC_VERSION_OFFSET;
> + swdma_dev->mmio_dmac_cap = bar + SWITCHTEC_DMAC_CAPABILITY_OFFSET;
> + swdma_dev->mmio_dmac_status = bar + SWITCHTEC_DMAC_STATUS_OFFSET;
> + swdma_dev->mmio_dmac_ctrl = bar + SWITCHTEC_DMAC_CONTROL_OFFSET;
> + swdma_dev->mmio_chan_hw_all = bar + SWITCHTEC_DMAC_CHAN_CTRL_OFFSET;
> + swdma_dev->mmio_chan_fw_all = bar + SWITCHTEC_DMAC_CHAN_CFG_STS_OFFSET;

These are all either unused or only used once or twice. I'd drop
all the extra pointers and just do the little bit of arithmetics
in the callers.

But I find the whole pcim_iomap_table concept very confusing to the
reader of the driver, and given that it doesn't really use many
devm or pcim routines I'd suggest removing them all and sticking to
one well understood way of manging resource lifetimes.

> + rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
> + if (rc)
> + rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
> + if (rc)
> + return rc;

dma_set_mask_and_coherent for a smaller mask will never succeed when
trying to set it to a larger one failed. So you can remove the second
call here.

> +#define MICROSEMI_VENDOR_ID 0x11f8

This is already in include/linux/pci_ids.h as
PCI_VENDOR_ID_MICROSEMI, please use that.

> +
> +#define SWITCHTEC_PCI_DEVICE(device_id) \
> + { \
> + .vendor = MICROSEMI_VENDOR_ID, \
> + .device = device_id, \
> + .subvendor = PCI_ANY_ID, \
> + .subdevice = PCI_ANY_ID, \
> + .class = PCI_CLASS_SYSTEM_OTHER << 8, \
> + .class_mask = 0xFFFFFFFF, \
> + }

Please just use

PCI_DEVICE(PCI_VENDOR_ID_MICROSEMI, 0x1234),

instead.

> + SWITCHTEC_PCI_DEVICE(0x4000), //PFX 100XG4

and please use normal /* */ comments.