Re: [PATCH 5/5] tun: vringfd xmit support.

From: pradeep singh rautela
Date: Fri Apr 18 2008 - 07:46:40 EST


On Fri, Apr 18, 2008 at 10:13 AM, Rusty Russell <rusty@xxxxxxxxxxxxxxx> wrote:
> This patch modifies tun to allow a vringfd to specify the send
> buffer. The user does a write to push out packets from the buffer.
>
> Again we use the 'struct virtio_net_hdr' to allow userspace to send
> GSO packets. In this case, it can hint how much to copy, and the
> other pages will be made into skb fragments.
>
> Signed-off-by: Rusty Russell <rusty@xxxxxxxxxxxxxxx>
> ---
> drivers/net/tun.c | 410 +++++++++++++++++++++++++++++++++++++++++--------
> include/linux/if_tun.h | 1
> 2 files changed, 351 insertions(+), 60 deletions(-)
>
> diff -r f797ec115d1b drivers/net/tun.c
> --- a/drivers/net/tun.c Fri Apr 18 05:58:40 2008 +1000
> +++ b/drivers/net/tun.c Fri Apr 18 06:07:21 2008 +1000
> @@ -65,6 +65,8 @@
> #include <linux/vring.h>
> #include <linux/virtio_net.h>
> #include <linux/file.h>
> +#include <linux/spinlock.h>
> +#include <linux/kthread.h>
> #include <net/net_namespace.h>
>
> #include <asm/system.h>
> @@ -102,8 +104,8 @@ struct tun_struct {
> u32 chr_filter[2];
> u32 net_filter[2];
>
> - struct vring_info *inring;
> - struct file *infile;
> + struct vring_info *inring, *outring;
> + struct file *infile, *outfile;
>
> #ifdef TUN_DEBUG
> int debug;
> @@ -258,6 +261,169 @@ static void tun_net_init(struct net_devi
> dev->tx_queue_len = TUN_READQ_SIZE; /* We prefer our own queue length */
> break;
> }
> +}
> +
> +/* We don't consolidate consecutive iovecs, so huge iovecs can break here.
> + * Users will learn not to do that. */
> +static int get_user_skb_frags(const struct iovec *iv, size_t len,
> + struct skb_frag_struct *f)
> +{
> + unsigned int i, j, num_pg = 0;
> + int err;
> + struct page *pages[MAX_SKB_FRAGS];
> +
> + down_read(&current->mm->mmap_sem);
> + while (len) {
> + int n, npages;
> + unsigned long base, len;
> + base = (unsigned long)iv->iov_base;
> + len = (unsigned long)iv->iov_len;
> +
> + if (len == 0) {
> + iv++;
> + continue;
> + }
> +
> + /* How many pages will this take? */
> + npages = 1 + (base + len - 1)/PAGE_SIZE - base/PAGE_SIZE;

Hi Rusty,
A trivial suggestion: how about
npages = 1 + (len - 1)/PAGE_SIZE?

Thanks,
--Pradeep
> + if (unlikely(num_pg + npages > MAX_SKB_FRAGS)) {
> + err = -ENOSPC;
> + goto fail;
> + }
> + n = get_user_pages(current, current->mm, base, npages,
> + 0, 0, pages, NULL);
> + if (unlikely(n < 0)) {
> + err = n;
> + goto fail;
> + }
> +
> + /* Transfer pages to the frag array */
> + for (j = 0; j < n; j++) {
> + f[num_pg].page = pages[j];
> + if (j == 0) {
> + f[num_pg].page_offset = offset_in_page(base);
> + f[num_pg].size = min(len, PAGE_SIZE -
> + f[num_pg].page_offset);
> + } else {
> + f[num_pg].page_offset = 0;
> + f[num_pg].size = min(len, PAGE_SIZE);
> + }
> + len -= f[num_pg].size;
> + base += f[num_pg].size;
> + num_pg++;
> + }
> +
> + if (unlikely(n != npages)) {
> + err = -EFAULT;
> + goto fail;
> + }
> + }
> + up_read(&current->mm->mmap_sem);
> + return num_pg;
> +
> +fail:
> + for (i = 0; i < num_pg; i++)
> + put_page(f[i].page);
> + up_read(&current->mm->mmap_sem);
> + return err;
> +}
> +
> +/* We actually store this at the head of the skb. */
> +struct skb_tun_hdr {
> + struct list_head list;
> + struct tun_struct *tun;
> + unsigned int id;
> + unsigned int len;
> +};
> +
> +/* Get packet from user space buffer. copylen is a hint as to how
> + * much to copy (rest is pinned). */
> +static struct sk_buff *get_user_skb(struct tun_struct *tun, struct iovec *iv,
> + size_t copylen, size_t len)
> +{
> + struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) };
> + struct sk_buff *skb;
> + size_t align = 0, extra = 0;
> + int err;
> +
> + if (!(tun->flags & TUN_NO_PI)) {
> + if (len < sizeof(pi)) {
> + err = -EINVAL;
> + goto fail;
> + }
> + len -= sizeof(pi);
> +
> + if (memcpy_fromiovec((void *)&pi, iv, sizeof(pi))) {
> + err = -EFAULT;
> + goto fail;
> + }
> + if (copylen > len)
> + copylen = len;
> + }
> +
> + if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) {
> + align = NET_IP_ALIGN;
> + if (unlikely(copylen < ETH_HLEN)) {
> + if (len < ETH_HLEN) {
> + err = -EINVAL;
> + goto fail;
> + }
> + copylen = ETH_HLEN;
> + }
> + }
> +
> + /* Allocate extra header if we need */
> + if (copylen != len)
> + extra = sizeof(struct skb_tun_hdr);
> +
> + skb = alloc_skb(extra + copylen + align, GFP_KERNEL);
> + if (!skb) {
> + err = -ENOMEM;
> + goto fail;
> + }
> +
> + if (extra + align)
> + skb_reserve(skb, extra + align);
> +
> + if (memcpy_fromiovec(skb_put(skb, copylen), iv, copylen)) {
> + err = -EFAULT;
> + goto free_skb;
> + }
> +
> + switch (tun->flags & TUN_TYPE_MASK) {
> + case TUN_TUN_DEV:
> + skb_reset_mac_header(skb);
> + skb->protocol = pi.proto;
> + skb->dev = tun->dev;
> + break;
> + case TUN_TAP_DEV:
> + skb->protocol = eth_type_trans(skb, tun->dev);
> + break;
> + };
> +
> + if (tun->flags & TUN_NOCHECKSUM)
> + skb->ip_summed = CHECKSUM_UNNECESSARY;
> +
> + /* Anything left gets put into frags. */
> + if (extra) {
> + struct skb_shared_info *sinfo = skb_shinfo(skb);
> + int err = get_user_skb_frags(iv, len - copylen, sinfo->frags);
> + if (err < 0)
> + goto free_skb;
> + sinfo->nr_frags = err;
> + }
> + tun->dev->last_rx = jiffies;
> +
> + tun->dev->stats.rx_packets++;
> + tun->dev->stats.rx_bytes += len;
> +
> + return skb;
> +
> +free_skb:
> + kfree_skb(skb);
> +fail:
> + tun->dev->stats.rx_dropped++;
> + return ERR_PTR(err);
> }
>
> #if defined(CONFIG_VRING) || defined(CONFIG_VRING_MODULE)
> @@ -355,6 +521,132 @@ static struct vring_ops recvops = {
> .pull = pull_recv_skbs,
> };
>
> +static DEFINE_SPINLOCK(finished_lock);
> +static LIST_HEAD(shinfo_finished_list);
> +static struct task_struct *shinfo_finisher;
> +
> +static void used_buffer(struct skb_tun_hdr *tunh)
> +{
> + /* Woot, something happened. */
> + vring_wake(tunh->tun->outring);
> +
> + /* Release device. Keeping this reference blocks file close. */
> + dev_put(tunh->tun->dev);
> +
> + /* tunh == skb->head. */
> + kfree(tunh);
> +}
> +
> +static int do_shinfo_finisher(void *unused)
> +{
> + LIST_HEAD(list);
> + struct skb_tun_hdr *i;
> +
> + while (!kthread_should_stop()) {
> + set_current_state(TASK_INTERRUPTIBLE);
> +
> + spin_lock_irq(&finished_lock);
> + list_splice_init(&list, &shinfo_finished_list);
> + spin_unlock_irq(&finished_lock);
> +
> + if (list_empty(&list)) {
> + schedule();
> + continue;
> + }
> +
> + list_for_each_entry(i, &list, list) {
> + vring_used_buffer(i->tun->outring, i->id, i->len);
> + used_buffer(i);
> + }
> + }
> + return 0;
> +}
> +
> +/* We are done with this skb data: put it in the used pile. */
> +static void shinfo_finished(struct skb_shared_info *sinfo)
> +{
> + struct skb_tun_hdr *tunh = (void *)skb_shinfo_to_head(sinfo);
> + unsigned long flags;
> +
> + spin_lock_irqsave(&finished_lock, flags);
> + list_add(&tunh->list, &shinfo_finished_list);
> + spin_unlock_irqrestore(&finished_lock, flags);
> +
> + wake_up_process(shinfo_finisher);
> +}
> +
> +static int xmit_packets(void *_tun)
> +{
> + struct tun_struct *tun = _tun;
> + struct iovec iov[1+MAX_SKB_FRAGS];
> + unsigned int iovnum = ARRAY_SIZE(iov);
> + int id, err, wake = 0;
> + unsigned long len;
> +
> + while ((id = vring_get_buffer(tun->outring, NULL, NULL, NULL,
> + iov, &iovnum, &len)) > 0) {
> + struct virtio_net_hdr h;
> + struct sk_buff *skb;
> + struct skb_shared_info *shinfo;
> +
> + if (unlikely(len < sizeof(h)))
> + return -EINVAL;
> +
> + err = memcpy_fromiovec((void *)&h, iov, sizeof(h));
> + if (unlikely(err))
> + return -EFAULT;
> +
> + len -= sizeof(h);
> + if (h.hdr_len > len)
> + return -EINVAL;
> +
> + /* Without GSO, we copy entire packet. */
> + if (h.gso_type == VIRTIO_NET_HDR_GSO_NONE)
> + h.hdr_len = len;
> +
> + skb = get_user_skb(tun, iov, h.hdr_len, len);
> + if (IS_ERR(skb))
> + return PTR_ERR(skb);
> +
> + if ((h.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
> + !skb_partial_csum_set(skb, h.csum_start, h.csum_offset)) {
> + kfree_skb(skb);
> + return -EINVAL;
> + }
> +
> + /* If it has fragments, set up destructor for later. */
> + shinfo = skb_shinfo(skb);
> + if (skb_shinfo(skb)->nr_frags) {
> + struct skb_tun_hdr *tunh = (void *)skb->head;
> + shinfo->destructor = shinfo_finished;
> + tunh->id = id;
> + tunh->len = sizeof(h) + skb->len;
> + } else {
> + vring_used_buffer(tun->outring, id, sizeof(h)+skb->len);
> + wake = 1;
> + }
> + netif_rx_ni(skb);
> + }
> +
> + if (wake)
> + vring_wake(tun->outring);
> +
> + /* 0 or error. */
> + return id;
> +}
> +
> +static struct vring_ops xmitops = {
> + .push = xmit_packets,
> +};
> +
> +static int init_vring(void)
> +{
> + shinfo_finisher = kthread_run(do_shinfo_finisher, NULL, "tun");
> + if (IS_ERR(shinfo_finisher))
> + return PTR_ERR(shinfo_finisher);
> + return 0;
> +}
> +
> static int set_recv_vring(struct tun_struct *tun, int fd)
> {
> int err;
> @@ -391,9 +685,47 @@ static void unset_vrings(struct tun_stru
> vring_unset_ops(tun->inring);
> fput(tun->infile);
> }
> + if (tun->outring) {
> + vring_unset_ops(tun->outring);
> + fput(tun->outfile);
> + }
> +}
> +
> +static int set_xmit_vring(struct tun_struct *tun, int fd)
> +{
> + int err;
> +
> + if (tun->outring)
> + return -EBUSY;
> +
> + tun->outfile = fget(fd);
> + if (!tun->outfile)
> + return -EBADF;
> +
> + tun->outring = vring_get(tun->outfile);
> + if (!tun->outring) {
> + err = -EBADF;
> + goto put;
> + }
> +
> + err = vring_set_ops(tun->outring, &xmitops, tun);
> + if (err) {
> + tun->outring = NULL;
> + goto put;
> + }
> + return 0;
> +
> +put:
> + fput(tun->outfile);
> + tun->outfile = NULL;
> + return err;
> }
> #else /* ... !CONFIG_VRING */
> static int set_recv_vring(struct tun_struct *tun, int fd)
> +{
> + return -ENOTTY;
> +}
> +static int set_xmit_vring(struct tun_struct *tun, int fd)
> {
> return -ENOTTY;
> }
> @@ -424,74 +756,26 @@ static unsigned int tun_chr_poll(struct
> return mask;
> }
>
> -/* Get packet from user space buffer */
> -static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, size_t count)
> -{
> - struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) };
> - struct sk_buff *skb;
> - size_t len = count, align = 0;
> -
> - if (!(tun->flags & TUN_NO_PI)) {
> - if ((len -= sizeof(pi)) > count)
> - return -EINVAL;
> -
> - if(memcpy_fromiovec((void *)&pi, iv, sizeof(pi)))
> - return -EFAULT;
> - }
> -
> - if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) {
> - align = NET_IP_ALIGN;
> - if (unlikely(len < ETH_HLEN))
> - return -EINVAL;
> - }
> -
> - if (!(skb = alloc_skb(len + align, GFP_KERNEL))) {
> - tun->dev->stats.rx_dropped++;
> - return -ENOMEM;
> - }
> -
> - if (align)
> - skb_reserve(skb, align);
> - if (memcpy_fromiovec(skb_put(skb, len), iv, len)) {
> - tun->dev->stats.rx_dropped++;
> - kfree_skb(skb);
> - return -EFAULT;
> - }
> -
> - switch (tun->flags & TUN_TYPE_MASK) {
> - case TUN_TUN_DEV:
> - skb_reset_mac_header(skb);
> - skb->protocol = pi.proto;
> - skb->dev = tun->dev;
> - break;
> - case TUN_TAP_DEV:
> - skb->protocol = eth_type_trans(skb, tun->dev);
> - break;
> - };
> -
> - if (tun->flags & TUN_NOCHECKSUM)
> - skb->ip_summed = CHECKSUM_UNNECESSARY;
> -
> - netif_rx_ni(skb);
> - tun->dev->last_rx = jiffies;
> -
> - tun->dev->stats.rx_packets++;
> - tun->dev->stats.rx_bytes += len;
> -
> - return count;
> -}
> -
> static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
> unsigned long count, loff_t pos)
> {
> struct tun_struct *tun = iocb->ki_filp->private_data;
> + size_t len;
> + struct sk_buff *skb;
>
> if (!tun)
> return -EBADFD;
>
> DBG(KERN_INFO "%s: tun_chr_write %ld\n", tun->dev->name, count);
>
> - return tun_get_user(tun, (struct iovec *) iv, iov_length(iv, count));
> + len = iov_length(iv, count);
> +
> + skb = get_user_skb(tun, (struct iovec *)iv, len, len);
> + if (IS_ERR(skb))
> + return PTR_ERR(skb);
> +
> + netif_rx_ni(skb);
> + return len;
> }
>
> /* Put packet to the user space buffer */
> @@ -831,6 +1115,9 @@ static int tun_chr_ioctl(struct inode *i
> case TUNSETRECVVRING:
> return set_recv_vring(tun, arg);
>
> + case TUNSETXMITVRING:
> + return set_xmit_vring(tun, arg);
> +
> case SIOCGIFFLAGS:
> ifr.ifr_flags = tun->if_flags;
> if (copy_to_user( argp, &ifr, sizeof ifr))
> @@ -1078,6 +1365,12 @@ static int __init tun_init(void)
> ret = misc_register(&tun_miscdev);
> if (ret)
> printk(KERN_ERR "tun: Can't register misc device %d\n", TUN_MINOR);
> + else {
> + ret = init_vring();
> + if (ret)
> + misc_deregister(&tun_miscdev);
> + }
> +
> return ret;
> }
>
> diff -r f797ec115d1b include/linux/if_tun.h
> --- a/include/linux/if_tun.h Fri Apr 18 05:58:40 2008 +1000
> +++ b/include/linux/if_tun.h Fri Apr 18 06:07:21 2008 +1000
> @@ -43,6 +43,7 @@
> #define TUNSETLINK _IOW('T', 205, int)
> #define TUNSETGROUP _IOW('T', 206, int)
> #define TUNSETRECVVRING _IOW('T', 207, int)
> +#define TUNSETXMITVRING _IOW('T', 208, int)
>
> /* TUNSETIFF ifr flags */
> #define IFF_TUN 0x0001
> _______________________________________________
> Virtualization mailing list
> Virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx
> https://lists.linux-foundation.org/mailman/listinfo/virtualization
>



--
Pradeep Singh Rautela
http://eagain.wordpress.com
http://emptydomain.googlepages.com
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/