[RFC 2/4] net/fib: Provide fib_balance_budget sysctl

From: Dmitry Safonov
Date: Tue Mar 26 2019 - 11:30:51 EST


Unfortunately, MAX_WORK at this moment is broken: during
trie_rebalance() climbing upwards resize() is being called for each
tnode. Which makes the limit useless the bigger trie gets: at each level
there are 10 attempts to balance tnode and childs, resulting in
O(10^n + 10^{n-1} + ... 10^{n-k}) complexity to balance trie, where k - is
the level where we changed the trie originally (adding/removing
alias/route).

Which results in the following price of removing one route under big
trie (similar for some other single routes that results in reallocating
tnodes by hitting the threshold limits for halve/inflate resize):

Before:
Basic info: size of leaf: 40 bytes, size of tnode: 40 bytes.
Main:
Aver depth: 1.99
Max depth: 2
Leaves: 77825
Prefixes: 77825
Internal nodes: 77
9: 1 10: 76
Pointers: 78336
Null ptrs: 435
Total size: 7912 kB
Local:
[omitted]

After:
Basic info: size of leaf: 40 bytes, size of tnode: 40 bytes.
Main:
Aver depth: 3.00
Max depth: 3
Leaves: 77824
Prefixes: 77824
Internal nodes: 20491
1: 2048 2: 18432 6: 1 11: 10
Pointers: 98368
Null ptrs: 54
Total size: 8865 kB
Local:
[omitted]

Provide a sysctl to control amount of pending balancing work.
(by default unlimited as it was)

Cc: linux-doc@xxxxxxxxxxxxxxx
Cc: Jonathan Corbet <corbet@xxxxxxx>
Fixes: ff181ed8768f ("fib_trie: Push assignment of child to parent down
into inflate/halve")
Signed-off-by: Dmitry Safonov <dima@xxxxxxxxxx>
---
Documentation/networking/ip-sysctl.txt | 6 +++
include/net/ip.h | 1 +
net/ipv4/fib_trie.c | 60 +++++++++++++++-----------
net/ipv4/sysctl_net_ipv4.c | 7 +++
4 files changed, 49 insertions(+), 25 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index acdfb5d2bcaa..fb71dacff4dd 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -81,6 +81,12 @@ fib_multipath_hash_policy - INTEGER
0 - Layer 3
1 - Layer 4

+fib_balance_budget - UNSIGNED INTEGER
+ Limits the number of resize attempts during balancing fib trie
+ on adding/removing new routes.
+ Possible values:
+ Default: UINT_MAX (0xFFFFFFFF)
+
ip_forward_update_priority - INTEGER
Whether to update SKB priority from "TOS" field in IPv4 header after it
is forwarded. The new SKB priority is mapped from TOS field value
diff --git a/include/net/ip.h b/include/net/ip.h
index be3cad9c2e4c..305d0e43088b 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -421,6 +421,7 @@ static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
return min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU);
}

+extern unsigned int fib_balance_budget;
struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
int fc_mx_len,
struct netlink_ext_ack *extack);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index ad7d56c421cb..d90cf9dfd443 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -182,8 +182,10 @@ struct trie {
#endif
};

-static struct key_vector *resize(struct trie *t, struct key_vector *tn);
+static struct key_vector *resize(struct trie *t, struct key_vector *tn,
+ unsigned int *budget);
static size_t tnode_free_size;
+unsigned int fib_balance_budget = UINT_MAX;

/*
* synchronize_rcu after call_rcu for that many pages; it should be especially
@@ -506,7 +508,8 @@ static void tnode_free(struct key_vector *tn)

static struct key_vector *replace(struct trie *t,
struct key_vector *oldtnode,
- struct key_vector *tn)
+ struct key_vector *tn,
+ unsigned int *budget)
{
struct key_vector *tp = node_parent(oldtnode);
unsigned long i;
@@ -522,19 +525,19 @@ static struct key_vector *replace(struct trie *t,
tnode_free(oldtnode);

/* resize children now that oldtnode is freed */
- for (i = child_length(tn); i;) {
+ for (i = child_length(tn); i && *budget;) {
struct key_vector *inode = get_child(tn, --i);

/* resize child node */
if (tnode_full(tn, inode))
- tn = resize(t, inode);
+ tn = resize(t, inode, budget);
}

return tp;
}

-static struct key_vector *inflate(struct trie *t,
- struct key_vector *oldtnode)
+static struct key_vector *inflate(struct trie *t, struct key_vector *oldtnode,
+ unsigned int *budget)
{
struct key_vector *tn;
unsigned long i;
@@ -621,7 +624,7 @@ static struct key_vector *inflate(struct trie *t,
}

/* setup the parent pointers into and out of this node */
- return replace(t, oldtnode, tn);
+ return replace(t, oldtnode, tn, budget);
nomem:
/* all pointers should be clean so we are done */
tnode_free(tn);
@@ -629,8 +632,8 @@ static struct key_vector *inflate(struct trie *t,
return NULL;
}

-static struct key_vector *halve(struct trie *t,
- struct key_vector *oldtnode)
+static struct key_vector *halve(struct trie *t, struct key_vector *oldtnode,
+ unsigned int *budget)
{
struct key_vector *tn;
unsigned long i;
@@ -676,7 +679,7 @@ static struct key_vector *halve(struct trie *t,
}

/* setup the parent pointers into and out of this node */
- return replace(t, oldtnode, tn);
+ return replace(t, oldtnode, tn, budget);
nomem:
/* all pointers should be clean so we are done */
tnode_free(tn);
@@ -843,15 +846,15 @@ static inline bool should_collapse(struct key_vector *tn)
return used < 2;
}

-#define MAX_WORK 10
-static struct key_vector *resize(struct trie *t, struct key_vector *tn)
+static struct key_vector *resize(struct trie *t, struct key_vector *tn,
+ unsigned int *budget)
{
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats __percpu *stats = t->stats;
#endif
struct key_vector *tp = node_parent(tn);
unsigned long cindex = get_index(tn->key, tp);
- int max_work = MAX_WORK;
+ bool inflated = false;

pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
tn, inflate_threshold, halve_threshold);
@@ -865,8 +868,8 @@ static struct key_vector *resize(struct trie *t, struct key_vector *tn)
/* Double as long as the resulting node has a number of
* nonempty nodes that are above the threshold.
*/
- while (should_inflate(tp, tn) && max_work) {
- tp = inflate(t, tn);
+ while (should_inflate(tp, tn) && *budget) {
+ tp = inflate(t, tn, budget);
if (!tp) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->resize_node_skipped);
@@ -874,22 +877,25 @@ static struct key_vector *resize(struct trie *t, struct key_vector *tn)
break;
}

- max_work--;
+ (*budget)--;
+ inflated = true;
tn = get_child(tp, cindex);
}

/* update parent in case inflate failed */
tp = node_parent(tn);

- /* Return if at least one inflate is run */
- if (max_work != MAX_WORK)
+ /* Return if at least one inflate is run:
+ * microoptimization to not recalculate thresholds
+ */
+ if (inflated)
return tp;

/* Halve as long as the number of empty children in this
* node is above threshold.
*/
- while (should_halve(tp, tn) && max_work) {
- tp = halve(t, tn);
+ while (should_halve(tp, tn) && *budget) {
+ tp = halve(t, tn, budget);
if (!tp) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->resize_node_skipped);
@@ -897,7 +903,7 @@ static struct key_vector *resize(struct trie *t, struct key_vector *tn)
break;
}

- max_work--;
+ (*budget)--;
tn = get_child(tp, cindex);
}

@@ -1005,8 +1011,10 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,

static void trie_rebalance(struct trie *t, struct key_vector *tn)
{
- while (!IS_TRIE(tn))
- tn = resize(t, tn);
+ unsigned int budget = fib_balance_budget;
+
+ while (budget && !IS_TRIE(tn))
+ tn = resize(t, tn, &budget);
}

static int fib_insert_node(struct trie *t, struct key_vector *tp,
@@ -1784,6 +1792,7 @@ struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
void fib_table_flush_external(struct fib_table *tb)
{
struct trie *t = (struct trie *)tb->tb_data;
+ unsigned int budget = fib_balance_budget;
struct key_vector *pn = t->kv;
unsigned long cindex = 1;
struct hlist_node *tmp;
@@ -1806,7 +1815,7 @@ void fib_table_flush_external(struct fib_table *tb)
update_suffix(pn);

/* resize completed node */
- pn = resize(t, pn);
+ pn = resize(t, pn, &budget);
cindex = get_index(pkey, pn);

continue;
@@ -1853,6 +1862,7 @@ void fib_table_flush_external(struct fib_table *tb)
int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all)
{
struct trie *t = (struct trie *)tb->tb_data;
+ unsigned int budget = fib_balance_budget;
struct key_vector *pn = t->kv;
unsigned long cindex = 1;
struct hlist_node *tmp;
@@ -1876,7 +1886,7 @@ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all)
update_suffix(pn);

/* resize completed node */
- pn = resize(t, pn);
+ pn = resize(t, pn, &budget);
cindex = get_index(pkey, pn);

continue;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index ba0fc4b18465..d7274cc442af 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -443,6 +443,13 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+ {
+ .procname = "fib_balance_budget",
+ .data = &fib_balance_budget,
+ .maxlen = sizeof(fib_balance_budget),
+ .mode = 0644,
+ .proc_handler = proc_douintvec,
+ },
{
.procname = "inet_peer_threshold",
.data = &inet_peer_threshold,
--
2.21.0