[RFC PATCH net-next v1 3/3] net: route: replace route hints with input_dst_cache

From: Leone Fernando
Date: Thu Jan 25 2024 - 08:16:20 EST


Replace route hints with cached dsts - ip_rcv_finish_core will first try
to use the cache and only then fall back to the demux or perform a full
lookup.

Only add newly found dsts to the cache after all the checks have passed
successfully to avoid adding a dropped packet's dst to the cache.

Multicast dsts are not added to the dst_cache as it will require additional
checks and multicast packets are rarer and a slower path anyway.

A check was added to ip_route_use_dst_cache that prevents forwarding
packets received by devices for which forwarding is disabled.

Signed-off-by: Leone Fernando <leone4fernando@xxxxxxxxx>
---
include/net/route.h | 6 ++---
net/ipv4/ip_input.c | 58 ++++++++++++++++++++++++---------------------
net/ipv4/route.c | 36 +++++++++++++++++++++-------
3 files changed, 61 insertions(+), 39 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index 980ab474eabd..a5a2f55947d6 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -189,9 +189,9 @@ int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
struct in_device *in_dev, u32 *itag);
int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src,
u8 tos, struct net_device *devin);
-int ip_route_use_hint(struct sk_buff *skb, __be32 dst, __be32 src,
- u8 tos, struct net_device *devin,
- const struct sk_buff *hint);
+int ip_route_use_dst_cache(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ u8 tos, struct net_device *dev,
+ struct dst_entry *dst);

static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
u8 tos, struct net_device *devin)
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 5e9c8156656a..35c8b122d62f 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -305,30 +305,44 @@ static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev)
return true;
}

-static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph,
- const struct sk_buff *hint)
+static bool ip_can_add_dst_cache(struct sk_buff *skb, __u16 rt_type)
{
- return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr &&
- ip_hdr(hint)->tos == iph->tos;
+ return skb_valid_dst(skb) &&
+ rt_type != RTN_BROADCAST &&
+ rt_type != RTN_MULTICAST &&
+ !(IPCB(skb)->flags & IPSKB_MULTIPATH);
+}
+
+static bool ip_can_use_dst_cache(const struct net *net, struct sk_buff *skb)
+{
+ return !skb_dst(skb) && !fib4_has_custom_rules(net);
}

int tcp_v4_early_demux(struct sk_buff *skb);
int udp_v4_early_demux(struct sk_buff *skb);
static int ip_rcv_finish_core(struct net *net, struct sock *sk,
- struct sk_buff *skb, struct net_device *dev,
- const struct sk_buff *hint)
+ struct sk_buff *skb, struct net_device *dev)
{
+ struct dst_cache *dst_cache = net_generic(net, dst_cache_net_id);
const struct iphdr *iph = ip_hdr(skb);
+ struct dst_entry *dst;
int err, drop_reason;
struct rtable *rt;
+ bool do_cache;

drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;

- if (ip_can_use_hint(skb, iph, hint)) {
- err = ip_route_use_hint(skb, iph->daddr, iph->saddr, iph->tos,
- dev, hint);
- if (unlikely(err))
- goto drop_error;
+ do_cache = ip_can_use_dst_cache(net, skb);
+ if (do_cache) {
+ dst = dst_cache_input_get_noref(dst_cache, skb);
+ if (dst) {
+ err = ip_route_use_dst_cache(skb, iph->daddr,
+ iph->saddr, iph->tos,
+ dev, dst);
+ if (unlikely(err))
+ goto drop_error;
+ do_cache = false;
+ }
}

if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) &&
@@ -418,6 +432,9 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk,
}
}

+ if (do_cache && ip_can_add_dst_cache(skb, rt->rt_type))
+ dst_cache_input_add(dst_cache, skb);
+
return NET_RX_SUCCESS;

drop:
@@ -444,7 +461,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
if (!skb)
return NET_RX_SUCCESS;

- ret = ip_rcv_finish_core(net, sk, skb, dev, NULL);
+ ret = ip_rcv_finish_core(net, sk, skb, dev);
if (ret != NET_RX_DROP)
ret = dst_input(skb);
return ret;
@@ -581,21 +598,11 @@ static void ip_sublist_rcv_finish(struct list_head *head)
}
}

-static struct sk_buff *ip_extract_route_hint(const struct net *net,
- struct sk_buff *skb, int rt_type)
-{
- if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST ||
- IPCB(skb)->flags & IPSKB_MULTIPATH)
- return NULL;
-
- return skb;
-}
-
static void ip_list_rcv_finish(struct net *net, struct sock *sk,
struct list_head *head)
{
- struct sk_buff *skb, *next, *hint = NULL;
struct dst_entry *curr_dst = NULL;
+ struct sk_buff *skb, *next;
struct list_head sublist;

INIT_LIST_HEAD(&sublist);
@@ -610,14 +617,11 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
skb = l3mdev_ip_rcv(skb);
if (!skb)
continue;
- if (ip_rcv_finish_core(net, sk, skb, dev, hint) == NET_RX_DROP)
+ if (ip_rcv_finish_core(net, sk, skb, dev) == NET_RX_DROP)
continue;

dst = skb_dst(skb);
if (curr_dst != dst) {
- hint = ip_extract_route_hint(net, skb,
- ((struct rtable *)dst)->rt_type);
-
/* dispatch old sublist */
if (!list_empty(&sublist))
ip_sublist_rcv_finish(&sublist);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 7c5e68117ee2..3f1977f9b25c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2157,14 +2157,14 @@ static int ip_mkroute_input(struct sk_buff *skb,

/* Implements all the saddr-related checks as ip_route_input_slow(),
* assuming daddr is valid and the destination is not a local broadcast one.
- * Uses the provided hint instead of performing a route lookup.
+ * Uses the provided dst from dst_cache instead of performing a route lookup.
*/
-int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
- u8 tos, struct net_device *dev,
- const struct sk_buff *hint)
+int ip_route_use_dst_cache(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ u8 tos, struct net_device *dev,
+ struct dst_entry *dst)
{
struct in_device *in_dev = __in_dev_get_rcu(dev);
- struct rtable *rt = skb_rtable(hint);
+ struct rtable *rt = (struct rtable *)dst;
struct net *net = dev_net(dev);
int err = -EINVAL;
u32 tag = 0;
@@ -2178,21 +2178,39 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
goto martian_source;

- if (rt->rt_type != RTN_LOCAL)
- goto skip_validate_source;
+ if (ipv4_is_loopback(daddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
+ goto martian_destination;

+ if (rt->rt_type != RTN_LOCAL) {
+ if (!IN_DEV_FORWARD(in_dev)) {
+ err = -EHOSTUNREACH;
+ goto out_err;
+ }
+ goto skip_validate_source;
+ }
tos &= IPTOS_RT_MASK;
err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
if (err < 0)
goto martian_source;

skip_validate_source:
- skb_dst_copy(skb, hint);
+ skb_dst_set_noref(skb, dst);
return 0;

martian_source:
ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
+out_err:
return err;
+
+martian_destination:
+ RT_CACHE_STAT_INC(in_martian_dst);
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+ if (IN_DEV_LOG_MARTIANS(in_dev))
+ net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
+ &daddr, &saddr, dev->name);
+#endif
+ err = -EINVAL;
+ goto out_err;
}

/* get device for dst_alloc with local routes */
@@ -2213,7 +2231,7 @@ static struct net_device *ip_rt_get_dev(struct net *net,
* addresses, because every properly looped back packet
* must have correct destination already attached by output routine.
* Changes in the enforced policies must be applied also to
- * ip_route_use_hint().
+ * ip_route_use_dst_cache().
*
* Such approach solves two big problems:
* 1. Not simplex devices are handled properly.
--
2.34.1