[SRU][N:linux-bluefield][PATCH v1 2/3] UBUNTU: SAUCE: ipv4: Flush the FIB once on multiple nexthop removal

Stav Aviram saviram at nvidia.com
Thu May 7 11:20:17 UTC 2026


From: Cosmin Ratiu <cratiu at nvidia.com>

BugLink: https://bugs.launchpad.net/bugs/2151772

When a device is going down or when a net namespace is deleted, all
nexthops on it are removed, and for each nexthop being removed the FIB
table is flushed, which does a full trie traversal looking for entries
marked RTNH_F_DEAD and removing them. This is O(N x R), with N being
number of dev nexthops and R being number of IPv4 routes.

The RTNL is held the entire time.

When there are many nexthops to be removed and many routing entries,
this can result in the RTNL being held for multiple minutes, which
causes unhappiness in other processes trying to acquire the RTNL (e.g.
systemd-networkd for DHCP renewals).

In a complicated deployment with multiple vxlan devices, each having
16K nexthops and a total of 128K ipv4 routes, this is exactly what
happens:

nexthop_flush_dev()                # loops over 16K nexthops
  -> remove_nexthop()
    -> __remove_nexthop()
      -> __remove_nexthop_fib()    # marks fi->fib_flags |= RTNH_F_DEAD
        -> fib_flush()             # for EACH nexthop!
	  -> fib_table_flush()     # walks the ENTIRE FIB, 128K entries

This patch makes use of the previously added FIB flushing signal to only
do a single FIB flush after all nexthops to be removed are marked as
RTNH_F_DEAD:
- __remove_nexthop_fib() no longer flushes the FIB.
- nexthop_flush_dev() and flush_all_nexthops() now keep track whether
  any nexthop was removed and trigger a FIB flush at the end.
- a new wrapper is defined, remove_one_nexthop() which calls
  remove_nexthop() and flushes if necessary. This is intended for places
  which must remove a single nexthop and shouldn't worry about the need
  to trigger a FIB flush. For now, the only caller is rtm_del_nexthop().
- The two direct callers of __remove_nexthop() get a WARN_ON_ONCE, since
  the nh about to be removed should not have any FIB entries referencing
  it when replacing or inserting a new one.

This dramatically improves performance from O(N x R) to O(N + R).

Releasing a nexthop reference in remove_nexthop() now no longer frees
it. Instead, it is deleted when the last fib_info pointing to it gets
freed via free_fib_info_rcu(). All routing code is already careful not
to take into consideration routes marked with RTNH_F_DEAD.

Tested with:
DEV=eth2
ip link set up dev $DEV
ip link add testnh0 link $DEV type macvlan mode bridge
ip addr add 198.51.100.1/24 dev testnh0
ip link set testnh0 up

seq 1 65536 | \
sed 's/.*/nexthop add id & via 198.51.100.2 dev testnh0/' | \
ip -batch -

i=1
for a in $(seq 0 255); do
  for b in $(seq 0 255); do
    echo "route add 10.${a}.${b}.0/32 nhid $i"
    i=$((i + 1))
  done
done | ip -batch -

time ip link set testnh0 down
ip link del testnh0

Without this patch:
real	0m32.601s
user	0m0.000s
sys	0m32.511s

With this patch:
real	0m0.209s
user	0m0.000s
sys	0m0.153s

Signed-off-by: Cosmin Ratiu <cratiu at nvidia.com>
Signed-off-by: Stav Aviram <saviram at nvidia.com>
---
 net/ipv4/nexthop.c | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index d746dcc45c93..7fe0af313423 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -1881,8 +1881,6 @@ static bool __remove_nexthop_fib(struct net *net, struct nexthop *nh)
 
 	list_for_each_entry(fi, &nh->fi_list, nh_list)
 		fi->fib_flags |= RTNH_F_DEAD;
-	if (need_flush)
-		fib_flush(net);
 
 	/* ip6_del_rt removes the entry from this list hence the _safe */
 	list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) {
@@ -1935,6 +1933,13 @@ static bool remove_nexthop(struct net *net, struct nexthop *nh,
 	return need_flush;
 }
 
+static void remove_one_nexthop(struct net *net, struct nexthop *nh,
+			       struct nl_info *nlinfo)
+{
+	if (remove_nexthop(net, nh, nlinfo))
+		fib_flush(net);
+}
+
 /* if any FIB entries reference this nexthop, any dst entries
  * need to be regenerated
  */
@@ -2314,7 +2319,7 @@ static int replace_nexthop(struct net *net, struct nexthop *old,
 	if (!err) {
 		nh_rt_cache_flush(net, old, new);
 
-		__remove_nexthop(net, new, NULL);
+		WARN_ON_ONCE(__remove_nexthop(net, new, NULL));
 		nexthop_put(new);
 	}
 
@@ -2421,6 +2426,7 @@ static void nexthop_flush_dev(struct net_device *dev, unsigned long event)
 	unsigned int hash = nh_dev_hashfn(dev->ifindex);
 	struct net *net = dev_net(dev);
 	struct hlist_head *head = &net->nexthop.devhash[hash];
+	bool need_flush = false;
 	struct hlist_node *n;
 	struct nh_info *nhi;
 
@@ -2432,22 +2438,28 @@ static void nexthop_flush_dev(struct net_device *dev, unsigned long event)
 		    (event == NETDEV_DOWN || event == NETDEV_CHANGE))
 			continue;
 
-		remove_nexthop(net, nhi->nh_parent, NULL);
+		need_flush |= remove_nexthop(net, nhi->nh_parent, NULL);
 	}
+
+	if (need_flush)
+		fib_flush(net);
 }
 
 /* rtnl; called when net namespace is deleted */
 static void flush_all_nexthops(struct net *net)
 {
 	struct rb_root *root = &net->nexthop.rb_root;
+	bool need_flush = false;
 	struct rb_node *node;
 	struct nexthop *nh;
 
 	while ((node = rb_first(root))) {
 		nh = rb_entry(node, struct nexthop, rb_node);
-		remove_nexthop(net, nh, NULL);
+		need_flush |= remove_nexthop(net, nh, NULL);
 		cond_resched();
 	}
+	if (need_flush)
+		fib_flush(net);
 }
 
 static struct nexthop *nexthop_create_group(struct net *net,
@@ -2713,7 +2725,7 @@ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
 
 	err = insert_nexthop(net, nh, cfg, extack);
 	if (err) {
-		__remove_nexthop(net, nh, NULL);
+		WARN_ON_ONCE(__remove_nexthop(net, nh, NULL));
 		nexthop_put(nh);
 		nh = ERR_PTR(err);
 	}
@@ -3047,12 +3059,12 @@ static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
 		return err;
 
 	nh = nexthop_find_by_id(net, id);
-	if (!nh)
-		return -ENOENT;
-
-	remove_nexthop(net, nh, &nlinfo);
+	if (nh)
+		remove_one_nexthop(net, nh, &nlinfo);
+	else
+		err = -ENOENT;
 
-	return 0;
+	return err;
 }
 
 /* rtnl */
-- 
2.38.1




More information about the kernel-team mailing list