RE: [PATCH net-next 2/2] intel: ixgbevf: Support Windows hosts (Hyper-V)

From: KY Srinivasan
Date: Thu Apr 14 2016 - 22:49:37 EST




> -----Original Message-----
> From: Alexander Duyck [mailto:alexander.duyck@xxxxxxxxx]
> Sent: Thursday, April 14, 2016 4:18 PM
> To: KY Srinivasan <kys@xxxxxxxxxxxxx>
> Cc: David Miller <davem@xxxxxxxxxxxxx>; Netdev
> <netdev@xxxxxxxxxxxxxxx>; linux-kernel@xxxxxxxxxxxxxxx;
> devel@xxxxxxxxxxxxxxxxxxxxxx; olaf@xxxxxxxxx; Robo Bot
> <apw@xxxxxxxxxxxxx>; Jason Wang <jasowang@xxxxxxxxxx>;
> eli@xxxxxxxxxxxx; jackm@xxxxxxxxxxxx; yevgenyp@xxxxxxxxxxxx; John
> Ronciak <john.ronciak@xxxxxxxxx>; intel-wired-lan <intel-wired-
> lan@xxxxxxxxxxxxxxxx>
> Subject: Re: [PATCH net-next 2/2] intel: ixgbevf: Support Windows hosts
> (Hyper-V)
>
> On Thu, Apr 14, 2016 at 4:55 PM, K. Y. Srinivasan <kys@xxxxxxxxxxxxx>
> wrote:
> > On Hyper-V, the VF/PF communication is a via software mediated path
> > as opposed to the hardware mailbox. Make the necessary
> > adjustments to support Hyper-V.
> >
> > Signed-off-by: K. Y. Srinivasan <kys@xxxxxxxxxxxxx>
> > ---
> > drivers/net/ethernet/intel/ixgbevf/ixgbevf.h | 11 ++
> > drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 56 ++++++---
> > drivers/net/ethernet/intel/ixgbevf/mbx.c | 12 ++
> > drivers/net/ethernet/intel/ixgbevf/vf.c | 138
> +++++++++++++++++++++
> > drivers/net/ethernet/intel/ixgbevf/vf.h | 2 +
> > 5 files changed, 201 insertions(+), 18 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
> b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
> > index 5ac60ee..f8d2a0b 100644
> > --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
> > +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
> > @@ -460,9 +460,13 @@ enum ixbgevf_state_t {
> >
> > enum ixgbevf_boards {
> > board_82599_vf,
> > + board_82599_vf_hv,
> > board_X540_vf,
> > + board_X540_vf_hv,
> > board_X550_vf,
> > + board_X550_vf_hv,
> > board_X550EM_x_vf,
> > + board_X550EM_x_vf_hv,
> > };
> >
> > enum ixgbevf_xcast_modes {
> > @@ -477,6 +481,13 @@ extern const struct ixgbevf_info
> ixgbevf_X550_vf_info;
> > extern const struct ixgbevf_info ixgbevf_X550EM_x_vf_info;
> > extern const struct ixgbe_mbx_operations ixgbevf_mbx_ops;
> >
> > +
> > +extern const struct ixgbevf_info ixgbevf_82599_vf_hv_info;
> > +extern const struct ixgbevf_info ixgbevf_X540_vf_hv_info;
> > +extern const struct ixgbevf_info ixgbevf_X550_vf_hv_info;
> > +extern const struct ixgbevf_info ixgbevf_X550EM_x_vf_hv_info;
> > +extern const struct ixgbe_mbx_operations ixgbevf_hv_mbx_ops;
> > +
> > /* needed by ethtool.c */
> > extern const char ixgbevf_driver_name[];
> > extern const char ixgbevf_driver_version[];
> > diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> > index 007cbe0..4a0ffac 100644
> > --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> > +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> > @@ -49,6 +49,7 @@
> > #include <linux/if.h>
> > #include <linux/if_vlan.h>
> > #include <linux/prefetch.h>
> > +#include <asm/mshyperv.h>
> >
> > #include "ixgbevf.h"
> >
> > @@ -62,10 +63,14 @@ static char ixgbevf_copyright[] =
> > "Copyright (c) 2009 - 2015 Intel Corporation.";
> >
> > static const struct ixgbevf_info *ixgbevf_info_tbl[] = {
> > - [board_82599_vf] = &ixgbevf_82599_vf_info,
> > - [board_X540_vf] = &ixgbevf_X540_vf_info,
> > - [board_X550_vf] = &ixgbevf_X550_vf_info,
> > - [board_X550EM_x_vf] = &ixgbevf_X550EM_x_vf_info,
> > + [board_82599_vf] = &ixgbevf_82599_vf_info,
> > + [board_82599_vf_hv] = &ixgbevf_82599_vf_hv_info,
> > + [board_X540_vf] = &ixgbevf_X540_vf_info,
> > + [board_X540_vf_hv] = &ixgbevf_X540_vf_hv_info,
> > + [board_X550_vf] = &ixgbevf_X550_vf_info,
> > + [board_X550_vf_hv] = &ixgbevf_X550_vf_hv_info,
> > + [board_X550EM_x_vf] = &ixgbevf_X550EM_x_vf_info,
> > + [board_X550EM_x_vf_hv] = &ixgbevf_X550EM_x_vf_hv_info,
> > };
> >
> > /* ixgbevf_pci_tbl - PCI Device ID Table
> > @@ -78,9 +83,13 @@ static const struct ixgbevf_info *ixgbevf_info_tbl[] =
> {
> > */
> > static const struct pci_device_id ixgbevf_pci_tbl[] = {
> > {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_VF), board_82599_vf },
> > + {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_82599_VF_HV),
> board_82599_vf_hv },
> > {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X540_VF), board_X540_vf },
> > + {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X540_VF_HV),
> board_X540_vf_hv },
> > {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550_VF), board_X550_vf },
> > + {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550_VF_HV),
> board_X550_vf_hv },
> > {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_X_VF),
> board_X550EM_x_vf },
> > + {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_X_VF_HV),
> board_X550EM_x_vf_hv},
> > /* required last entry */
> > {0, }
> > };
> > @@ -1809,12 +1818,13 @@ static int ixgbevf_vlan_rx_add_vid(struct
> net_device *netdev,
> > {
> > struct ixgbevf_adapter *adapter = netdev_priv(netdev);
> > struct ixgbe_hw *hw = &adapter->hw;
> > - int err;
> > + int err = 0;
> >
> > spin_lock_bh(&adapter->mbx_lock);
> >
> > /* add VID to filter table */
> > - err = hw->mac.ops.set_vfta(hw, vid, 0, true);
> > + if (hw->mac.ops.set_vfta)
> > + err = hw->mac.ops.set_vfta(hw, vid, 0, true);
> >
> > spin_unlock_bh(&adapter->mbx_lock);
> >
> > @@ -1835,12 +1845,13 @@ static int ixgbevf_vlan_rx_kill_vid(struct
> net_device *netdev,
> > {
> > struct ixgbevf_adapter *adapter = netdev_priv(netdev);
> > struct ixgbe_hw *hw = &adapter->hw;
> > - int err;
> > + int err = 0;
> >
> > spin_lock_bh(&adapter->mbx_lock);
> >
> > /* remove VID from filter table */
> > - err = hw->mac.ops.set_vfta(hw, vid, 0, false);
> > + if (hw->mac.ops.set_vfta)
> > + err = hw->mac.ops.set_vfta(hw, vid, 0, false);
> >
> > spin_unlock_bh(&adapter->mbx_lock);
> >
> > @@ -1873,14 +1884,16 @@ static int ixgbevf_write_uc_addr_list(struct
> net_device *netdev)
> > struct netdev_hw_addr *ha;
> >
> > netdev_for_each_uc_addr(ha, netdev) {
> > - hw->mac.ops.set_uc_addr(hw, ++count, ha->addr);
> > + if (hw->mac.ops.set_uc_addr)
> > + hw->mac.ops.set_uc_addr(hw, ++count, ha->addr);
> > udelay(200);
> > }
> > } else {
> > /* If the list is empty then send message to PF driver to
> > * clear all MAC VLANs on this VF.
> > */
> > - hw->mac.ops.set_uc_addr(hw, 0, NULL);
> > + if (hw->mac.ops.set_uc_addr)
> > + hw->mac.ops.set_uc_addr(hw, 0, NULL);
> > }
> >
> > return count;
>
> So if I am understanding this correctly it looks like you cannot read
> or write any addresses for this device. Would I be correct in
> assuming that you get to have the one unicast address that is provided
> by the PF and that is it?

That is what I have been told by the Windows folks at Intel.
>
> If so we may want to look at possibly returning some sort of error on
> these calls so that we can configure the device to indicate that we
> cannot support any of these filters.

I will do that. So, I am going to check the device ID and return an error.
Would IXGBE_NOT_IMPLEMENTED be appropriate?

>
> > @@ -1908,10 +1921,13 @@ static void ixgbevf_set_rx_mode(struct
> net_device *netdev)
> >
> > spin_lock_bh(&adapter->mbx_lock);
> >
> > - hw->mac.ops.update_xcast_mode(hw, netdev, xcast_mode);
> > + if (hw->mac.ops.update_mc_addr_list)
> > + if (hw->mac.ops.update_xcast_mode)
> > + hw->mac.ops.update_xcast_mode(hw, netdev, xcast_mode);
> >
> > /* reprogram multicast list */
> > - hw->mac.ops.update_mc_addr_list(hw, netdev);
> > + if (hw->mac.ops.update_mc_addr_list)
> > + hw->mac.ops.update_mc_addr_list(hw, netdev);
> >
> > ixgbevf_write_uc_addr_list(netdev);
> >
> > @@ -2074,10 +2090,13 @@ static void ixgbevf_up_complete(struct
> ixgbevf_adapter *adapter)
> >
> > spin_lock_bh(&adapter->mbx_lock);
> >
> > - if (is_valid_ether_addr(hw->mac.addr))
> > - hw->mac.ops.set_rar(hw, 0, hw->mac.addr, 0);
> > - else
> > - hw->mac.ops.set_rar(hw, 0, hw->mac.perm_addr, 0);
> > + if (is_valid_ether_addr(hw->mac.addr)) {
> > + if (hw->mac.ops.set_rar)
> > + hw->mac.ops.set_rar(hw, 0, hw->mac.addr, 0);
> > + } else {
> > + if (hw->mac.ops.set_rar)
> > + hw->mac.ops.set_rar(hw, 0, hw->mac.perm_addr, 0);
> > + }
> >
> > spin_unlock_bh(&adapter->mbx_lock);
> >
>
> Same here. We shouldn't let the user set a MAC address that we cannot
> support. We should be returning an error.

Yes; I will return an error.

>
> > @@ -3672,14 +3691,15 @@ static int ixgbevf_set_mac(struct net_device
> *netdev, void *p)
> > struct ixgbevf_adapter *adapter = netdev_priv(netdev);
> > struct ixgbe_hw *hw = &adapter->hw;
> > struct sockaddr *addr = p;
> > - int err;
> > + int err = 0;
> >
> > if (!is_valid_ether_addr(addr->sa_data))
> > return -EADDRNOTAVAIL;
> >
> > spin_lock_bh(&adapter->mbx_lock);
> >
> > - err = hw->mac.ops.set_rar(hw, 0, addr->sa_data, 0);
> > + if (hw->mac.ops.set_rar)
> > + err = hw->mac.ops.set_rar(hw, 0, addr->sa_data, 0);
> >
> > spin_unlock_bh(&adapter->mbx_lock);
> >
>
> Specifically here. If hw->mac.ops.set_rar is NULL we cannot allow any
> MAC address change so we should probably return "-EADDRNOTAVAIL".

Will do.
>
> > diff --git a/drivers/net/ethernet/intel/ixgbevf/mbx.c
> b/drivers/net/ethernet/intel/ixgbevf/mbx.c
> > index dc68fea..298a0da 100644
> > --- a/drivers/net/ethernet/intel/ixgbevf/mbx.c
> > +++ b/drivers/net/ethernet/intel/ixgbevf/mbx.c
> > @@ -346,3 +346,15 @@ const struct ixgbe_mbx_operations
> ixgbevf_mbx_ops = {
> > .check_for_rst = ixgbevf_check_for_rst_vf,
> > };
> >
> > +/**
> > + * Mailbox operations when running on Hyper-V.
> > + * On Hyper-V, PF/VF communiction is not through the
> > + * hardware mailbox; this communication is through
> > + * a software mediated path.
> > + * Most mail box operations are noop while running on
> > + * Hyper-V.
> > + */
> > +const struct ixgbe_mbx_operations ixgbevf_hv_mbx_ops = {
> > + .init_params = ixgbevf_init_mbx_params_vf,
> > + .check_for_rst = ixgbevf_check_for_rst_vf,
> > +};
>
> I'd say if you aren't going to use a mailbox then don't initialize it.
> The code was based off of the same code that runs the ixgbe driver.
> It should be able to function without a mailbox provided the necessary
> calls are updated in the ixgbe_mac_operations that are used by the
> hyperv VF.
>
Ok; I will change the code.

> > diff --git a/drivers/net/ethernet/intel/ixgbevf/vf.c
> b/drivers/net/ethernet/intel/ixgbevf/vf.c
> > index 4d613a4..92397fd 100644
> > --- a/drivers/net/ethernet/intel/ixgbevf/vf.c
> > +++ b/drivers/net/ethernet/intel/ixgbevf/vf.c
> > @@ -27,6 +27,13 @@
> > #include "vf.h"
> > #include "ixgbevf.h"
> >
> > +/*
> > + * On Hyper-V, to reset, we need to read from this offset
> > + * from the PCI config space. This is the mechanism used on
> > + * Hyper-V to support PF/VF communication.
> > + */
> > +#define IXGBE_HV_RESET_OFFSET 0x201
> > +
> > /**
> > * ixgbevf_start_hw_vf - Prepare hardware for Tx/Rx
> > * @hw: pointer to hardware structure
> > @@ -126,6 +133,23 @@ static s32 ixgbevf_reset_hw_vf(struct ixgbe_hw
> *hw)
> > }
> >
> > /**
> > + * Hyper-V variant; the VF/PF communication is through the PCI
> > + * config space.
> > + */
> > +static s32 ixgbevf_hv_reset_hw_vf(struct ixgbe_hw *hw)
> > +{
> > + int i;
> > + struct ixgbevf_adapter *adapter = hw->back;
> > +
> > + for (i = 0; i < 6; i++)
> > + pci_read_config_byte(adapter->pdev,
> > + (i + IXGBE_HV_RESET_OFFSET),
> > + &hw->mac.perm_addr[i]);
> > +
> > + return 0;
> > +}
> > +
>
> This bit can be problematic. What about the case where PCI_MMCONFIG
> is not defined. In such a case it will kill this function without
> reporting an error other than maybe a MAC address that is all 0s or
> all FF's.
>
> You might want to add some sort of check here with some message if
> such a situation occurs just so it can be easier to debug.

I am a little confused here. This function will only execute when we are handling Hyper-V
device IDs (while running on Hyper-V). Hyper-V PCI pass-through driver will support the
config space access.

>
> > +/**
> > * ixgbevf_stop_hw_vf - Generic stop Tx/Rx units
> > * @hw: pointer to hardware structure
> > *
> > @@ -656,6 +680,68 @@ out:
> > }
> >
> > /**
> > + * Hyper-V variant; there is no mailbox communication.
> > + */
> > +static s32 ixgbevf_hv_check_mac_link_vf(struct ixgbe_hw *hw,
> > + ixgbe_link_speed *speed,
> > + bool *link_up,
> > + bool autoneg_wait_to_complete)
> > +{
> > + struct ixgbe_mbx_info *mbx = &hw->mbx;
> > + struct ixgbe_mac_info *mac = &hw->mac;
> > + s32 ret_val = 0;
> > + u32 links_reg;
> > +
> > + /* If we were hit with a reset drop the link */
> > + if (!mbx->ops.check_for_rst(hw) || !mbx->timeout)
> > + mac->get_link_status = true;
> > +
> > + if (!mac->get_link_status)
> > + goto out;
> > +
> > + /* if link status is down no point in checking to see if pf is up */
> > + links_reg = IXGBE_READ_REG(hw, IXGBE_VFLINKS);
> > + if (!(links_reg & IXGBE_LINKS_UP))
> > + goto out;
> > +
> > + /* for SFP+ modules and DA cables on 82599 it can take up to 500usecs
> > + * before the link status is correct
> > + */
> > + if (mac->type == ixgbe_mac_82599_vf) {
> > + int i;
> > +
> > + for (i = 0; i < 5; i++) {
> > + udelay(100);
> > + links_reg = IXGBE_READ_REG(hw, IXGBE_VFLINKS);
> > +
> > + if (!(links_reg & IXGBE_LINKS_UP))
> > + goto out;
> > + }
> > + }
> > +
> > + switch (links_reg & IXGBE_LINKS_SPEED_82599) {
> > + case IXGBE_LINKS_SPEED_10G_82599:
> > + *speed = IXGBE_LINK_SPEED_10GB_FULL;
> > + break;
> > + case IXGBE_LINKS_SPEED_1G_82599:
> > + *speed = IXGBE_LINK_SPEED_1GB_FULL;
> > + break;
> > + case IXGBE_LINKS_SPEED_100_82599:
> > + *speed = IXGBE_LINK_SPEED_100_FULL;
> > + break;
> > + }
> > +
> > + /* if we passed all the tests above then the link is up and we no
> > + * longer need to check for link
> > + */
> > + mac->get_link_status = false;
> > +
> > +out:
> > + *link_up = !mac->get_link_status;
> > + return ret_val;
> > +}
> > +
>
> How does this handle the PF resetting? Seems like you are going to be
> generating Tx hangs in such a case.

I am not sure how the Windows PF driver communicates this information.
Do you have any suggestions for this.

>
> > +/**
> > * ixgbevf_rlpml_set_vf - Set the maximum receive packet length
> > * @hw: pointer to the HW structure
> > * @max_size: value to assign to max frame size
> > @@ -663,6 +749,19 @@ out:
> > void ixgbevf_rlpml_set_vf(struct ixgbe_hw *hw, u16 max_size)
> > {
> > u32 msgbuf[2];
> > + u32 reg;
> > +
> > + if (x86_hyper == &x86_hyper_ms_hyperv) {
> > + /*
> > + * If we are on Hyper-V, we implement
> > + * this functionality differently.
> > + */
> > + reg = IXGBE_READ_REG(hw, IXGBE_VFRXDCTL(0));
> > + /* CRC == 4 */
> > + reg |= ((max_size + 4) | IXGBE_RXDCTL_RLPML_EN);
> > + IXGBE_WRITE_REG(hw, IXGBE_VFRXDCTL(0), reg);
> > + return;
> > + }
> >
> > msgbuf[0] = IXGBE_VF_SET_LPE;
> > msgbuf[1] = max_size;
>
> You would be better off just implementing a hyperv version of this
> function and avoiding the mailbox call entirely.
Ok; will do.

>
> > @@ -679,6 +778,16 @@ int ixgbevf_negotiate_api_version(struct
> ixgbe_hw *hw, int api)
> > int err;
> > u32 msg[3];
> >
> > + if (x86_hyper == &x86_hyper_ms_hyperv) {
> > + /*
> > + * Hyper-V only supports api version ixgbe_mbox_api_10
> > + */
> > + if (api != ixgbe_mbox_api_10)
> > + return IXGBE_ERR_INVALID_ARGUMENT;
> > +
> > + return 0;
> > + }
> > +
> > /* Negotiate the mailbox API version */
> > msg[0] = IXGBE_VF_API_NEGOTIATE;
> > msg[1] = api;
>
> Same here. Just implement a hyperv version of this function instead
> of splicing into the existing call. Also you will need to wrap this
> code up so that we can build on all architectures, not just x86.

Ok; will do.
>
> > @@ -776,22 +885,51 @@ static const struct ixgbe_mac_operations
> ixgbevf_mac_ops = {
> > .set_vfta = ixgbevf_set_vfta_vf,
> > };
> >
> > +static const struct ixgbe_mac_operations ixgbevf_hv_mac_ops = {
> > + .init_hw = ixgbevf_init_hw_vf,
> > + .reset_hw = ixgbevf_hv_reset_hw_vf,
> > + .start_hw = ixgbevf_start_hw_vf,
> > + .get_mac_addr = ixgbevf_get_mac_addr_vf,
> > + .stop_adapter = ixgbevf_stop_hw_vf,
> > + .setup_link = ixgbevf_setup_mac_link_vf,
> > + .check_link = ixgbevf_hv_check_mac_link_vf,
> > +};
>
> You might want to consider implementing a set_rar call that will
> return an error if you try to change the address to anything that is
> not the permanent addr.

Ok; will do.
>
> > const struct ixgbevf_info ixgbevf_82599_vf_info = {
> > .mac = ixgbe_mac_82599_vf,
> > .mac_ops = &ixgbevf_mac_ops,
> > };
> >
> > +const struct ixgbevf_info ixgbevf_82599_vf_hv_info = {
> > + .mac = ixgbe_mac_82599_vf,
> > + .mac_ops = &ixgbevf_hv_mac_ops,
> > +};
> > +
> > const struct ixgbevf_info ixgbevf_X540_vf_info = {
> > .mac = ixgbe_mac_X540_vf,
> > .mac_ops = &ixgbevf_mac_ops,
> > };
> >
> > +const struct ixgbevf_info ixgbevf_X540_vf_hv_info = {
> > + .mac = ixgbe_mac_X540_vf,
> > + .mac_ops = &ixgbevf_hv_mac_ops,
> > +};
> > +
> > const struct ixgbevf_info ixgbevf_X550_vf_info = {
> > .mac = ixgbe_mac_X550_vf,
> > .mac_ops = &ixgbevf_mac_ops,
> > };
> >
> > +const struct ixgbevf_info ixgbevf_X550_vf_hv_info = {
> > + .mac = ixgbe_mac_X550_vf,
> > + .mac_ops = &ixgbevf_hv_mac_ops,
> > +};
> > +
> > const struct ixgbevf_info ixgbevf_X550EM_x_vf_info = {
> > .mac = ixgbe_mac_X550EM_x_vf,
> > .mac_ops = &ixgbevf_mac_ops,
> > };
> > +
> > +const struct ixgbevf_info ixgbevf_X550EM_x_vf_hv_info = {
> > + .mac = ixgbe_mac_X550EM_x_vf,
> > + .mac_ops = &ixgbevf_hv_mac_ops,
> > +};
> > diff --git a/drivers/net/ethernet/intel/ixgbevf/vf.h
> b/drivers/net/ethernet/intel/ixgbevf/vf.h
> > index ef9f773..7242a97 100644
> > --- a/drivers/net/ethernet/intel/ixgbevf/vf.h
> > +++ b/drivers/net/ethernet/intel/ixgbevf/vf.h
> > @@ -32,7 +32,9 @@
> > #include <linux/interrupt.h>
> > #include <linux/if_ether.h>
> > #include <linux/netdevice.h>
> > +#include <asm/hypervisor.h>
> >
> > +#include <asm/hypervisor.h>
>
> I don't think you need to include this twice. Also this is a arch
> specific header file. You might want to move this to vf.c since that
> is where you are using the code it provides. Then you can probably
> wrap it in an X86 build check so that you don't break the build on
> other architectures.

I will fix this. Thank you for your detailed comments.

Regards,

K. Y