Re: [PATCH net-next v2] drivers: net: virtio_net: Implement a dev_watchdog handler

From: Julio Faracco
Date: Fri Nov 22 2019 - 08:00:14 EST


Hi Michael,

Em sex., 22 de nov. de 2019 às 07:31, Michael S. Tsirkin
<mst@xxxxxxxxxx> escreveu:
>
> On Thu, Nov 21, 2019 at 10:36:36PM -0300, Julio Faracco wrote:
> > Driver virtio_net is not handling error events for TX provided by
> > dev_watchdog. This event is reached when transmission queue is having
> > problems to transmit packets. This could happen for any reason. To
> > enable it, driver should have .ndo_tx_timeout implemented.
> >
> > This commit brings back virtnet_reset method to recover TX queues from a
> > error state. That function is called by schedule_work method and it puts
> > the reset function into work queue.
> >
> > As the error cause is unknown at this moment, it would be better to
> > reset all queues.
> >
> > Signed-off-by: Julio Faracco <jcfaracco@xxxxxxxxx>
> > Signed-off-by: Daiane Mendes <dnmendes76@xxxxxxxxx>
> > Cc: Jason Wang <jasowang@xxxxxxxxxx>
> > ---
> > v1-v2: Tag `net-next` was included to indentify where patch would be
> > applied.
> > ---
> > drivers/net/virtio_net.c | 95 +++++++++++++++++++++++++++++++++++++++-
> > 1 file changed, 94 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 4d7d5434cc5d..31890d77eaf2 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -75,6 +75,7 @@ struct virtnet_sq_stats {
> > u64 xdp_tx;
> > u64 xdp_tx_drops;
> > u64 kicks;
> > + u64 tx_timeouts;
> > };
> >
> > struct virtnet_rq_stats {
> > @@ -98,6 +99,7 @@ static const struct virtnet_stat_desc virtnet_sq_stats_desc[] = {
> > { "xdp_tx", VIRTNET_SQ_STAT(xdp_tx) },
> > { "xdp_tx_drops", VIRTNET_SQ_STAT(xdp_tx_drops) },
> > { "kicks", VIRTNET_SQ_STAT(kicks) },
> > + { "tx_timeouts", VIRTNET_SQ_STAT(tx_timeouts) },
> > };
> >
> > static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
> > @@ -211,6 +213,9 @@ struct virtnet_info {
> > /* Work struct for config space updates */
> > struct work_struct config_work;
> >
> > + /* Work struct for resetting the virtio-net driver. */
> > + struct work_struct reset_work;
> > +
> > /* Does the affinity hint is set for virtqueues? */
> > bool affinity_hint_set;
> >
> > @@ -1721,7 +1726,7 @@ static void virtnet_stats(struct net_device *dev,
> > int i;
> >
> > for (i = 0; i < vi->max_queue_pairs; i++) {
> > - u64 tpackets, tbytes, rpackets, rbytes, rdrops;
> > + u64 tpackets, tbytes, terrors, rpackets, rbytes, rdrops;
> > struct receive_queue *rq = &vi->rq[i];
> > struct send_queue *sq = &vi->sq[i];
> >
> > @@ -1729,6 +1734,7 @@ static void virtnet_stats(struct net_device *dev,
> > start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
> > tpackets = sq->stats.packets;
> > tbytes = sq->stats.bytes;
> > + terrors = sq->stats.tx_timeouts;
> > } while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));
> >
> > do {
> > @@ -1743,6 +1749,7 @@ static void virtnet_stats(struct net_device *dev,
> > tot->rx_bytes += rbytes;
> > tot->tx_bytes += tbytes;
> > tot->rx_dropped += rdrops;
> > + tot->tx_errors += terrors;
> > }
> >
> > tot->tx_dropped = dev->stats.tx_dropped;
> > @@ -2578,6 +2585,33 @@ static int virtnet_set_features(struct net_device *dev,
> > return 0;
> > }
> >
> > +static void virtnet_tx_timeout(struct net_device *dev)
> > +{
> > + struct virtnet_info *vi = netdev_priv(dev);
> > + u32 i;
> > +
> > + netdev_warn(dev, "TX timeout stats:\n");
> > + /* find the stopped queue the same way dev_watchdog() does */
> > + for (i = 0; i < vi->curr_queue_pairs; i++) {
> > + struct send_queue *sq = &vi->sq[i];
> > +
> > + if (!netif_xmit_stopped(netdev_get_tx_queue(dev, i))) {
> > + netdev_warn(dev, " Available send queue: %d, sq: %s, vq: %d, name: %s\n",
> > + i, sq->name, sq->vq->index, sq->vq->name);
>
> What does this mean?
>
> > + continue;
> > + }
> > +
> > + u64_stats_update_begin(&sq->stats.syncp);
> > + sq->stats.tx_timeouts++;
> > + u64_stats_update_end(&sq->stats.syncp);
> > +
> > + netdev_warn(dev, " Unavailable send queue: %d, sq: %s, vq: %d, name: %s\n",
> > + i, sq->name, sq->vq->index, sq->vq->name);
> > + }
>
> Can we make the warning less cryptic?
> I wonder why don't we get the sq from timeout directly?
> Would seem cleaner.

I need your help with the debugging information. Which fields should
the warning report when a TX timeout occurs?
Alternatively, we can keep the default WARN_ONCE, but we still need a
minimal handler implemented.
We cannot get the timeout directly, because the handler has to be
implemented in order to receive it at all.
The net core scheduler checks whether this handler is implemented before
it will raise a TX timeout. See:

void __netdev_watchdog_up(struct net_device *dev)
{
if (dev->netdev_ops->ndo_tx_timeout) {
if (dev->watchdog_timeo <= 0)
dev->watchdog_timeo = 5*HZ;
if (!mod_timer(&dev->watchdog_timer,
round_jiffies(jiffies + dev->watchdog_timeo)))
dev_hold(dev);
}
}

>
> > +
> > + schedule_work(&vi->reset_work);
> > +}
> > +
> > static const struct net_device_ops virtnet_netdev = {
> > .ndo_open = virtnet_open,
> > .ndo_stop = virtnet_close,
> > @@ -2593,6 +2627,7 @@ static const struct net_device_ops virtnet_netdev = {
> > .ndo_features_check = passthru_features_check,
> > .ndo_get_phys_port_name = virtnet_get_phys_port_name,
> > .ndo_set_features = virtnet_set_features,
> > + .ndo_tx_timeout = virtnet_tx_timeout,
> > };
> >
> > static void virtnet_config_changed_work(struct work_struct *work)
> > @@ -2982,6 +3017,62 @@ static int virtnet_validate(struct virtio_device *vdev)
> > return 0;
> > }
> >
> > +static void _remove_vq_common(struct virtnet_info *vi)
> > +{
> > + vi->vdev->config->reset(vi->vdev);
> > +
> > + /* Free unused buffers in both send and recv, if any. */
> > + free_unused_bufs(vi);
> > +
> > + _free_receive_bufs(vi);
> > +
> > + free_receive_page_frags(vi);
> > +
> > + virtnet_del_vqs(vi);
> > +}
> > +
> > +static int _virtnet_reset(struct virtnet_info *vi)
> > +{
> > + struct virtio_device *vdev = vi->vdev;
> > + int ret;
> > +
> > + virtio_config_disable(vdev);
> > + vdev->failed = vdev->config->get_status(vdev) & VIRTIO_CONFIG_S_FAILED;
> > +
> > + virtnet_freeze_down(vdev);
> > + _remove_vq_common(vi);
> > +
> > + virtio_add_status(vdev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
> > + virtio_add_status(vdev, VIRTIO_CONFIG_S_DRIVER);
> > +
> > + ret = virtio_finalize_features(vdev);
> > + if (ret)
> > + goto err;
> > +
> > + ret = virtnet_restore_up(vdev);
> > + if (ret)
> > + goto err;
> > +
> > + ret = _virtnet_set_queues(vi, vi->curr_queue_pairs);
> > + if (ret)
> > + goto err;
> > +
> > + virtio_add_status(vdev, VIRTIO_CONFIG_S_DRIVER_OK);
> > + virtio_config_enable(vdev);
>
>
> Is this enough? E.g. all RX mode programming has been lost.

IMHO virtio-net performs well; it can take days before you see a TX timeout.
If timeouts are happening frequently, there is something wrong with the device.
So, again IMHO, I don't think it would be too problematic.

>
>
>
> > + return 0;
> > +err:
> > + virtio_add_status(vdev, VIRTIO_CONFIG_S_FAILED);
> > + return ret;
> > +}
> > +
> > +static void virtnet_reset(struct work_struct *work)
> > +{
> > + struct virtnet_info *vi =
> > + container_of(work, struct virtnet_info, reset_work);
> > +
> > + _virtnet_reset(vi);
> > +}
> > +
> > static int virtnet_probe(struct virtio_device *vdev)
> > {
> > int i, err = -ENOMEM;
> > @@ -3011,6 +3102,7 @@ static int virtnet_probe(struct virtio_device *vdev)
> > dev->netdev_ops = &virtnet_netdev;
> > dev->features = NETIF_F_HIGHDMA;
> >
> > + dev->watchdog_timeo = 5 * HZ;
> > dev->ethtool_ops = &virtnet_ethtool_ops;
> > SET_NETDEV_DEV(dev, &vdev->dev);
> >
>
> Is there a way to make this tuneable from ethtool?

Yes and no.
You can do that, obviously, but it is not a field that is commonly tuned,
if you compare with other drivers.

>
> > @@ -3068,6 +3160,7 @@ static int virtnet_probe(struct virtio_device *vdev)
> > vdev->priv = vi;
> >
> > INIT_WORK(&vi->config_work, virtnet_config_changed_work);
> > + INIT_WORK(&vi->reset_work, virtnet_reset);
> >
> > /* If we can receive ANY GSO packets, we must allocate large ones. */
> > if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
> > --
> > 2.17.1
>