Re: [net-next rfc V2 7/8] macvtap: add TUNSETQUEUE ioctl

From: Michael S. Tsirkin
Date: Tue Jun 04 2013 - 03:05:15 EST


On Tue, Jun 04, 2013 at 01:54:56PM +0800, Jason Wang wrote:
> On 06/03/2013 07:09 PM, Michael S. Tsirkin wrote:
> > On Mon, Jun 03, 2013 at 01:20:58PM +0800, Jason Wang wrote:
> >> On 06/02/2013 07:22 PM, Michael S. Tsirkin wrote:
> >>> On Fri, May 31, 2013 at 05:53:24PM +0800, Jason Wang wrote:
> >>>> This patch adds TUNSETQUEUE ioctl to let userspace can temporarily disable or
> >>>> enable a queue of macvtap. This is used to be compatible at API layer of tuntap
> >>>> to simplify the userspace to manage the queues.
> >>>>
> >>>> This is done by split the taps array into three different areas:
> >>>>
> >>>> - [0, numvtaps) : enabled taps
> >>>> - [numvtaps, numvtaps + numdisabled) : disabled taps
> >>>> - [numvtaps + numdisabled, MAX_MAXVTAP_QUEUES) : unused slots
> >>>>
> >>>> When a tap were enabled and disabled, it was moved to another area.
> >>>>
> >>>> Signed-off-by: Jason Wang <jasowang@xxxxxxxxxx>
> >>> This seems a bit tricky. Can we just move the tap out of the
> >>> array?
> >> Certainly yes.
> >>> the only reason we have the array is for fast
> >>> lookup on xmit.
> >>> What's the reason to keep disabled queues there?
> >> It saves us some space and make code simpler.
> >>> To be able to track all queues for cleanups, all we need is
> >>> a linked list of all queues (enabled and disabled).
> >> Yes, but you need iterate both arrays and linked list which won't be
> >> simpler than keeping them in place.
> > No, my idea is to keep all taps in the list.
> >
> > If you need all taps, walks the list.
> > If you need active taps, look them up in the array.
> >
> > Reasonable?
>
> Looks so, will change in next version.
> >
> >>>> ---
> >>>> drivers/net/macvtap.c | 167 ++++++++++++++++++++++++++++++++++++++++----
> >>>> include/linux/if_macvlan.h | 7 ++
> >>>> 2 files changed, 159 insertions(+), 15 deletions(-)
> >>>>
> >>>> diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
> >>>> index eac49cb..03b781c 100644
> >>>> --- a/drivers/net/macvtap.c
> >>>> +++ b/drivers/net/macvtap.c
> >>>> @@ -85,32 +85,126 @@ static const struct proto_ops macvtap_socket_ops;
> >>>> */
> >>>> static DEFINE_SPINLOCK(macvtap_lock);
> >>>>
> >>>> -static int macvtap_set_queue(struct net_device *dev, struct file *file,
> >>>> +static void macvtap_swap_slot(struct macvlan_dev *vlan, int a, int b)
> >>>> +{
> >>>> + struct macvtap_queue *q1, *q2;
> >>>> +
> >>>> + if (a == b)
> >>>> + return;
> >>>> +
> >>>> + q1 = rcu_dereference_protected(vlan->taps[a],
> >>>> + lockdep_is_held(&macvtap_lock));
> >>>> + q2 = rcu_dereference_protected(vlan->taps[b],
> >>>> + lockdep_is_held(&macvtap_lock));
> >>>> +
> >>>> + BUG_ON(q1 == NULL || q2 == NULL);
> >>>> +
> >>>> + rcu_assign_pointer(vlan->taps[a], q2);
> >>>> + rcu_assign_pointer(vlan->taps[b], q1);
> >>>> +
> >>>> + q1->queue_index = b;
> >>>> + q2->queue_index = a;
> >>>> +}
> >>>> +
> >>>> +static int macvtap_enable_queue(struct net_device *dev, struct file *file,
> >>>> struct macvtap_queue *q)
> >>>> {
> >>>> struct macvlan_dev *vlan = netdev_priv(dev);
> >>>> + int err = -EINVAL;
> >>>> + int total;
> >>>> +
> >>>> + spin_lock(&macvtap_lock);
> >>>> + total = vlan->numvtaps + vlan->numdisabled;
> >>>> +
> >>>> + if (q->queue_index < vlan->numvtaps)
> >>>> + goto out;
> >>>> +
> >>>> + err = 0;
> >>>> +
> >>>> + BUG_ON(q->queue_index >= total);
> >>>> + macvtap_swap_slot(vlan, q->queue_index, vlan->numvtaps);
> >>>> +
> >>>> + /* Make sure the pointers were seen before indices. */
> >>>> + wmb();
> >>> Which indices? We only care about numvtaps right?
> >>> So let's just say so.
> >> ok
> >>> Why is this wmb and not smp_wmb()?
> >> will correct it.
> >>>> +
> >>>> + vlan->numdisabled--;
> >>>> + vlan->numvtaps++;
> >>>> +out:
> >>>> + spin_unlock(&macvtap_lock);
> >>>> + return err;
> >>>> +}
> >>>> +
> >>>> +static int macvtap_set_queue(struct net_device *dev, struct file *file,
> >>>> + struct macvtap_queue *q)
> >>>> +{
> >>>> + struct macvlan_dev *vlan = netdev_priv(dev);
> >>>> int err = -EBUSY;
> >>>> + int total;
> >>>>
> >>>> spin_lock(&macvtap_lock);
> >>>> - if (vlan->numvtaps == MAX_MACVTAP_QUEUES)
> >>>> +
> >>>> + total = vlan->numvtaps + vlan->numdisabled;
> >>>> + if (total == MAX_MACVTAP_QUEUES)
> >>>> goto out;
> >>>>
> >>>> err = 0;
> >>>> +
> >>>> rcu_assign_pointer(q->vlan, vlan);
> >>>> - rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
> >>>> + rcu_assign_pointer(vlan->taps[total], q);
> >>>> sock_hold(&q->sk);
> >>>>
> >>>> q->file = file;
> >>>> - q->queue_index = vlan->numvtaps;
> >>>> + q->queue_index = total;
> >>>> file->private_data = q;
> >>>> + if (vlan->numdisabled)
> >>>> + macvtap_swap_slot(vlan, vlan->numvtaps, total);
> >>>>
> >>>> - vlan->numvtaps++;
> >>>> + /* Make sure the pointers were seen before indices. */
> >>>> + wmb();
> >>>>
> >>>> + vlan->numvtaps++;
> >>>> out:
> >>>> spin_unlock(&macvtap_lock);
> >>>> return err;
> >>>> }
> >>>>
> >>>> +static int macvtap_disable_queue(struct macvtap_queue *q)
> >>>> +{
> >>>> + struct macvlan_dev *vlan;
> >>>> + int err = -EINVAL;
> >>>> +
> >>>> + spin_lock(&macvtap_lock);
> >>>> + vlan = rcu_dereference_protected(q->vlan,
> >>>> + lockdep_is_held(&macvtap_lock));
> >>>> +
> >>>> + if (vlan) {
> >>>> + int total = vlan->numvtaps + vlan->numdisabled;
> >>>> + int index = q->queue_index;
> >>>> +
> >>>> + BUG_ON(q->queue_index >= total);
> >>>> + if (q->queue_index >= vlan->numvtaps)
> >>>> + goto out;
> >>>> +
> >>>> + err = 0;
> >>>> + macvtap_swap_slot(vlan, index, total - 1);
> >>>> + if (vlan->numdisabled)
> >>>> + /* If there's disabled taps, the above swap will cause
> >>>> + * a disabled tap to be moved to enabled area. So
> >>>> + * another swap is needed to keep the right order.
> >>>> + */
> >>>> + macvtap_swap_slot(vlan, index, vlan->numvtaps - 1);
> >>>> +
> >>>> + /* make sure the pointers were seen before indices */
> >>>> + wmb();
> >>> Hmm this looks questionable. The code near rmb first
> >>> checks numvtaps then dereferences the queue.
> >>> So it might see a new queue but old value of numvtaps.
> >> Right, barriers here were just best effort to reduce the possibility of
> >> wrong queue selection when changing the number of queues.
> > If this is an optimization, I'd say benchmark it and
> > see if it helps performance.
> >
> > I don't expect it to have any effect really.
> > In fact, the re-ordering of queues that this patch does
> > is likely to cause packet reorering and hurt performance
> > more.
>
> Yes, so I will remove the barriers.
>
> The re-ordering seems to be the easiest way to do fast lookup of active
> queues. We could use indirection to avoid the re-ordering of queues,
> it's hard to eliminate OOO packets. If we don't depends on changing the
> number of queues frequently, we're ok.
> >
> > I'm guessing the only thing we need for correctness
> > is ACCESS_ONCE on numvtaps?
>
> Did't see how it help.

For this loop:
while (unlikely(rxq >= numvtaps))
rxq -= numvtaps;

you better read numvtaps with ACCESS_ONCE otherwise
compiler can re-read numvtaps and it would
change during loop execution.
rxq can then become negative.

> >
> >>>> +
> >>>> + vlan->numvtaps--;
> >>>> + vlan->numdisabled++;
> >>>> + }
> >>>> +
> >>>> +out:
> >>>> + spin_unlock(&macvtap_lock);
> >>>> + return err;
> >>>> +}
> >>>> /*
> >>>> * The file owning the queue got closed, give up both
> >>>> * the reference that the files holds as well as the
> >>>> @@ -121,25 +215,38 @@ out:
> >>>> */
> >>>> static void macvtap_put_queue(struct macvtap_queue *q)
> >>>> {
> >>>> - struct macvtap_queue *nq;
> >>>> struct macvlan_dev *vlan;
> >>>>
> >>>> spin_lock(&macvtap_lock);
> >>>> vlan = rcu_dereference_protected(q->vlan,
> >>>> lockdep_is_held(&macvtap_lock));
> >>>> +
> >>>> if (vlan) {
> >>>> + int total = vlan->numvtaps + vlan->numdisabled;
> >>>> int index = q->queue_index;
> >>>> - BUG_ON(index >= vlan->numvtaps);
> >>>> + bool disabled = q->queue_index >= vlan->numvtaps;
> >>>> +
> >>>> + BUG_ON(q->queue_index >= total);
> >>>> + macvtap_swap_slot(vlan, index, total - 1);
> >>>> + if (!disabled && vlan->numdisabled)
> >>>> + /* If there's disabled taps, the above swap will cause
> >>>> + * a disabled tap to be moved to enabled area. So
> >>>> + * another swap is needed to keep the right order.
> >>>> + */
> >>>> + macvtap_swap_slot(vlan, index, vlan->numvtaps - 1);
> >>>> +
> >>>> + RCU_INIT_POINTER(vlan->taps[total - 1], NULL);
> >>>> + RCU_INIT_POINTER(q->vlan, NULL);
> >>>> + sock_put(&q->sk);
> >>>>
> >>>> - nq = rcu_dereference_protected(vlan->taps[vlan->numvtaps - 1],
> >>>> - lockdep_is_held(&macvtap_lock));
> >>>> - rcu_assign_pointer(vlan->taps[index], nq);
> >>>> - nq->queue_index = index;
> >>>> + /* Make sure the pointers were seen before indices */
> >>> Here it's one pointer, right?
> >> Right.
> >>>> + wmb();
> >>> Same issue as above, looks even more worrying
> >>> as queue is freed here.
> >> The read side were protected by rcu_read_lock(), so no worries here.
> > Okay so basically numvtaps is just a hint,
> > it can be wrong and nothing too bad happens?
> >
> > OK, but let's document this.
>
> Sure.
> >
> >>>>
> >>>> - RCU_INIT_POINTER(q->vlan, NULL);
> >>>> + if (disabled)
> >>>> + vlan->numdisabled--;
> >>>> + else
> >>>> + vlan->numvtaps--;
> >>>>
> >>>> - sock_put(&q->sk);
> >>>> - --vlan->numvtaps;
> >>>> }
> >>>>
> >>>> spin_unlock(&macvtap_lock);
> >>>> @@ -166,6 +273,9 @@ static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
> >>>> if (!numvtaps)
> >>>> goto out;
> >>>>
> >>>> + /* Check taps after numvtaps were exposed. */
> >>>> + rmb();
> >>>> +
> >>> Except this doesn't seem to handle case where taps are going away ...
> >> We're protected by rcu read lock here so even though it choose the queue
> >> which is going to be destroyed temporarily, the socket won't be freed
> >> before the packets were queued in the socket.
> >>>> /* Check if we can use flow to select a queue */
> >>>> rxq = skb_get_rxhash(skb);
> >>>> if (rxq) {
> >>>> @@ -201,7 +311,7 @@ static void macvtap_del_queues(struct net_device *dev)
> >>>>
> >>>> /* macvtap_put_queue can free some slots, so go through all slots */
> >>>> spin_lock(&macvtap_lock);
> >>>> - for (i = 0; i < vlan->numvtaps; i++) {
> >>>> + for (i = 0; i < vlan->numvtaps + vlan->numdisabled; i++) {
> >>>> q = rcu_dereference_protected(vlan->taps[i],
> >>>> lockdep_is_held(&macvtap_lock));
> >>>> BUG_ON(q == NULL);
> >>>> @@ -211,6 +321,7 @@ static void macvtap_del_queues(struct net_device *dev)
> >>>> }
> >>>> /* guarantee that any future macvtap_set_queue will fail */
> >>>> vlan->numvtaps = MAX_MACVTAP_QUEUES;
> >>>> + vlan->numdisabled = 0;
> >>>> spin_unlock(&macvtap_lock);
> >>>>
> >>>> synchronize_rcu();
> >>>> @@ -927,6 +1038,27 @@ static int macvtap_set_iff(struct file *file, struct ifreq __user *ifr_u)
> >>>> return 0;
> >>>> }
> >>>>
> >>>> +static int macvtap_ioctl_set_queue(struct file *file, unsigned int flags)
> >>>> +{
> >>>> + struct macvtap_queue *q = file->private_data;
> >>>> + struct macvlan_dev *vlan;
> >>>> + int ret = -EINVAL;
> >>>> +
> >>>> + vlan = macvtap_get_vlan(q);
> >>>> + if (!vlan)
> >>>> + goto done;
> >>>> +
> >>>> + if (flags & IFF_ATTACH_QUEUE)
> >>>> + ret = macvtap_enable_queue(vlan->dev, file, q);
> >>>> + else if (flags & IFF_DETACH_QUEUE)
> >>>> + ret = macvtap_disable_queue(q);
> >>>> +
> >>>> + macvtap_put_vlan(vlan);
> >>>> +
> >>>> +done:
> >>>> + return ret;
> >>>> +}
> >>>> +
> >>>> /*
> >>>> * provide compatibility with generic tun/tap interface
> >>>> */
> >>>> @@ -959,6 +1091,11 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
> >>>> macvtap_put_vlan(vlan);
> >>>> return ret;
> >>>>
> >>>> + case TUNSETQUEUE:
> >>>> + if (get_user(u, &ifr->ifr_flags))
> >>>> + return -EFAULT;
> >>>> + return macvtap_ioctl_set_queue(file, u);
> >>>> +
> >>>> case TUNGETFEATURES:
> >>>> if (put_user(IFF_TAP | IFF_NO_PI | IFF_VNET_HDR, up))
> >>>> return -EFAULT;
> >>>> diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
> >>>> index 62d8bda..d528f38 100644
> >>>> --- a/include/linux/if_macvlan.h
> >>>> +++ b/include/linux/if_macvlan.h
> >>>> @@ -69,8 +69,15 @@ struct macvlan_dev {
> >>>> u16 flags;
> >>>> int (*receive)(struct sk_buff *skb);
> >>>> int (*forward)(struct net_device *dev, struct sk_buff *skb);
> >>>> + /* This array tracks all taps (include disabled ones) and will be
> >>>> + * reshuffled to keep the following order:
> >>>> + * [0, numvtaps) : enabled taps,
> >>>> + * [numvtaps, numvtaps + numdisabled) : disabled taps,
> >>>> + * [numvtaps + numdisabled, MAX_MACVTAP_QUEUES) : unused slots
> >>>> + */
> >>>> struct macvtap_queue *taps[MAX_MACVTAP_QUEUES];
> >>>> int numvtaps;
> >>>> + int numdisabled;
> >>>> int minor;
> >>>> };
> >>>>
> >>>> --
> >>>> 1.7.1
> >>> --
> >>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> >>> the body of a message to majordomo@xxxxxxxxxxxxxxx
> >>> More majordomo info at http://vger.kernel.org/majordomo-info.html
> >>> Please read the FAQ at http://www.tux.org/lkml/
> > --
> > To unsubscribe from this list: send the line "unsubscribe netdev" in
> > the body of a message to majordomo@xxxxxxxxxxxxxxx
> > More majordomo info at http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/