Re: [PATCH net-next] hyperv: Add support for Virtual Receive SideScaling (vRSS)

From: Tom Herbert
Date: Thu Dec 19 2013 - 16:43:13 EST


Patch is below. This version did most pre-computation of the variants
I built, but results in largest table (40*256*4 bytes), This gives
performance roughly comparable with jhash (roughly same as jhash for
IPv4, about 30% more cycles for IPv6). I have the simpler less memory
intensive versions also if you're interested, these are 10x worse
cycles so I wouldn't want those in critical path.
----
Introduce Toeplitz hash functions. Toeplitz is a hash used primarily in
NICs to performan RSS flow steering. This is a software implemenation
of that. In order to make the hash calculation efficient, we precompute
the possible hash values for each inidividual byte of input. The input
length is up to 40 bytes, so we make an array of cache[40][256].

The implemenation was verified against MSDN "Verify RSS hash" sample
values.

Signed-off-by: Tom Herbert <therbert@xxxxxxxxxx>
---
include/linux/netdevice.h | 3 +++
include/linux/toeplitz.h | 27 +++++++++++++++++++
lib/Kconfig | 3 +++
lib/Makefile | 2 ++
lib/toeplitz.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++
net/Kconfig | 5 ++++
net/core/dev.c | 11 ++++++++
7 files changed, 117 insertions(+)
create mode 100644 include/linux/toeplitz.h
create mode 100644 lib/toeplitz.c

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3de49ac..546caf2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -38,6 +38,7 @@
#include <linux/dmaengine.h>
#include <linux/workqueue.h>
#include <linux/dynamic_queue_limits.h>
+#include <linux/toeplitz.h>

#include <linux/ethtool.h>
#include <net/net_namespace.h>
@@ -195,6 +196,8 @@ struct net_device_stats {
extern struct static_key rps_needed;
#endif

+extern struct toeplitz *toeplitz_net;
+
struct neighbour;
struct neigh_parms;
struct sk_buff;
diff --git a/include/linux/toeplitz.h b/include/linux/toeplitz.h
new file mode 100644
index 0000000..bc0b8e8
--- /dev/null
+++ b/include/linux/toeplitz.h
@@ -0,0 +1,27 @@
+#ifndef __LINUX_TOEPLITZ_H
+#define __LINUX_TOEPLITZ_H
+
+#define TOEPLITZ_KEY_LEN 40
+
+struct toeplitz {
+ u8 key_vals[TOEPLITZ_KEY_LEN];
+ u32 key_cache[TOEPLITZ_KEY_LEN][256];
+};
+
+static inline unsigned int
+toeplitz_hash(const unsigned char *bytes,
+ struct toeplitz *toeplitz, int n)
+{
+ int i;
+ unsigned int result = 0;
+
+ for (i = 0; i < n; i++)
+ result ^= toeplitz->key_cache[i][bytes[i]];
+
+ return result;
+};
+
+extern struct toeplitz *toeplitz_alloc(void);
+extern void toeplitz_init(struct toeplitz *toeplitz, u8 *key_vals);
+
+#endif /* __LINUX_TOEPLITZ_H */
diff --git a/lib/Kconfig b/lib/Kconfig
index b3c8be0..463b2b1 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -359,6 +359,9 @@ config CPU_RMAP
config DQL
bool

+config TOEPLITZ
+ bool
+
#
# Netlink attribute parsing support is select'ed if needed
#
diff --git a/lib/Makefile b/lib/Makefile
index f3bb2cb..a28349b 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -133,6 +133,8 @@ obj-$(CONFIG_CORDIC) += cordic.o

obj-$(CONFIG_DQL) += dynamic_queue_limits.o

+obj-$(CONFIG_TOEPLITZ) += toeplitz.o
+
obj-$(CONFIG_MPILIB) += mpi/
obj-$(CONFIG_SIGNATURE) += digsig.o

diff --git a/lib/toeplitz.c b/lib/toeplitz.c
new file mode 100644
index 0000000..0951dd9
--- /dev/null
+++ b/lib/toeplitz.c
@@ -0,0 +1,66 @@
+/*
+ * Toeplitz hash implemenation. See include/linux/toeplitz.h
+ *
+ * Copyright (c) 2011, Tom Herbert <therbert@xxxxxxxxxx>
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/toeplitz.h>
+
+struct toeplitz *toeplitz_alloc(void)
+{
+ return kmalloc(sizeof(struct toeplitz), GFP_KERNEL);
+}
+
+static u32 toeplitz_get_kval(unsigned char *key, int idx)
+{
+ u32 v, r;
+ int off, rem;
+
+ off = idx / 8;
+ rem = idx % 8;
+
+ v = (((unsigned int)key[off]) << 24) +
+ (((unsigned int)key[off + 1]) << 16) +
+ (((unsigned int)key[off + 2]) << 8) +
+ (((unsigned int)key[off + 3]));
+
+ r = v << rem | (unsigned int)key[off + 4] >> (8 - rem);
+ return r;
+}
+
+static inline int idx8(int idx)
+{
+#ifdef __LITTLE_ENDIAN
+ idx = (idx / 8) * 8 + (8 - (idx % 8 + 1));
+#endif
+ return idx;
+}
+
+void toeplitz_init(struct toeplitz *toeplitz, u8 *key_vals)
+{
+ int i;
+ unsigned long a, j;
+ unsigned int result = 0;
+
+ /* Set up key val table */
+ if (key_vals)
+ for (i = 0; i < TOEPLITZ_KEY_LEN; i++)
+ toeplitz->key_vals[i] = key_vals[i];
+ else
+ prandom_bytes(toeplitz->key_vals, TOEPLITZ_KEY_LEN);
+
+ /* Set up key cache table */
+ for (i = 0; i < TOEPLITZ_KEY_LEN; i++) {
+ for (j = 0; j < 256; j++) {
+ result = 0;
+ for (a = find_first_bit(&j, 8); a < 8;
+ a = find_next_bit(&j, 8, a + 1))
+ result ^= toeplitz_get_kval(
+ toeplitz->key_vals, idx8(a + (i * 8)));
+ toeplitz->key_cache[i][j] = result;
+ }
+ }
+}
diff --git a/net/Kconfig b/net/Kconfig
index b50dacc..860c9fa 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -254,6 +254,11 @@ config BQL
select DQL
default y

+config NET_TOEPLITZ
+ boolean
+ select TOEPLITZ
+ default n
+
config BPF_JIT
bool "enable BPF Just In Time compiler"
depends on HAVE_BPF_JIT
diff --git a/net/core/dev.c b/net/core/dev.c
index 5c713f2..074f530 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6633,6 +6633,9 @@ static struct pernet_operations __net_initdata
default_device_ops = {
.exit_batch = default_device_exit_batch,
};

+struct toeplitz *toeplitz_net;
+EXPORT_SYMBOL(toeplitz_net);
+
/*
* Initialize the DEV module. At boot time this walks the device list and
* unhooks any devices that fail to initialise (normally hardware not
@@ -6656,6 +6659,14 @@ static int __init net_dev_init(void)
if (netdev_kobject_init())
goto out;

+#ifdef CONFIG_NET_TOEPLITZ
+ toeplitz_net = toeplitz_alloc();
+ if (!toeplitz_net)
+ goto out;
+
+ toeplitz_init(toeplitz_net, NULL);
+#endif
+
INIT_LIST_HEAD(&ptype_all);
for (i = 0; i < PTYPE_HASH_SIZE; i++)
INIT_LIST_HEAD(&ptype_base[i]);

On Thu, Dec 19, 2013 at 12:49 PM, Haiyang Zhang <haiyangz@xxxxxxxxxxxxx> wrote:
>
>
>> -----Original Message-----
>> From: Tom Herbert [mailto:therbert@xxxxxxxxxx]
>> Sent: Thursday, December 19, 2013 2:59 PM
>> To: Haiyang Zhang
>> Cc: Daniel Borkmann; Ben Hutchings; davem@xxxxxxxxxxxxx;
>> netdev@xxxxxxxxxxxxxxx; KY Srinivasan; olaf@xxxxxxxxx;
>> jasowang@xxxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx; driverdev-
>> devel@xxxxxxxxxxxxxxxxxxxxxx
>> Subject: Re: [PATCH net-next] hyperv: Add support for Virtual Receive Side
>> Scaling (vRSS)
>>
>> I posted an implementation of library functions for Toeplitz (see [PATCH 1/2]
>> net: Toeplitz library functions). This includes some pre-computation of the
>> table to get reasonable performance in the host. Please take a look.
>>
>> On the other hand, if you're computing a hash in the host, do you really need
>> Toeplitz, flow_dissector already supports a good hash computation and can
>> parse many more packets than just plain UDP/TCP.
>> We probably only should be doing Toeplitz in the host if we need to match
>> HW computed values.
>
> The Hyper-V host requires the guest to select channel based on Toeplitz hash, so
> we need to compute it on the guest.
>
> Regarding the Toeplitz function, do you mean this patch?
> http://patchwork.ozlabs.org/patch/277344/
> This doesn't contain the implementation. Could you point me to the actual code?
>
> Thanks,
> - Haiyang
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/