aboutsummaryrefslogtreecommitdiff
path: root/net/ipv4/fib_trie.c
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2016-09-28 04:48:20 -0400
committerDavid S. Miller <davem@davemloft.net>2016-09-28 04:48:20 -0400
commit9c5982fe260a28e84d167e894123dc342e76c39f (patch)
treebdd2565cbf373e53c32a47ff30c11e4de0028ce7 /net/ipv4/fib_trie.c
parenteb523f42d77a43f80bb9c57a34fbdc8406c7b075 (diff)
parentfd41b0eaa06a8a0516f9e0b0a5889035bf423784 (diff)
downloadlinux-9c5982fe260a28e84d167e894123dc342e76c39f.tar.gz
linux-9c5982fe260a28e84d167e894123dc342e76c39f.tar.bz2
linux-9c5982fe260a28e84d167e894123dc342e76c39f.zip
Merge branch 'fib-offload-notifications'
Jiri Pirko says: ==================== fib offload: switch to notifier The goal of this patchset is to allow drivers to propagate all prefixes configured in the kernel down to HW. This is necessary for routing to work as expected. If we don't do that, HW might incorrectly forward prefixes known to the kernel. Take an example when a default route is set in switch HW and there is an IP address set on a management (non-switch) port. Currently, only FIB entries related to the switch port netdev are offloaded using switchdev ops. This model is not extendable, so the first patch introduces a replacement: a notifier to propagate FIB entry additions and removals to whoever is interested. The second patch introduces a couple of helpers to deal with the RTNH_F_OFFLOAD flag. Currently it is set in switchdev core. There the assumption is that only one offload device exists. But for the FIB notifier, we assume multiple offload devices. So the patch introduces a per-FIB-entry reference counter, and the helpers use it in order to achieve this: 0 means RTNH_F_OFFLOAD is not set and no device offloads this entry; n means RTNH_F_OFFLOAD is set and the entry is offloaded by n devices. Patches 3 and 4 convert mlxsw and rocker to adopt this new way, registering one notifier block for each ASIC instance. Both of these patches also implement an internal "abort" mechanism. Using switchdev ops, "abort" is called by switchdev core whenever there is an error during FIB entry add offload. This leads to removal of all offloaded entries on the system by the fib_trie code. Now the new notifier assumes the driver takes care of the abort action. Here's why: 1) The fact that one HW cannot offload an entry does not mean that the others can't do it. So let only one entity abort and leave the rest to work happily. 2) The driver knows what to do in order to properly abort. For example, currently abort is broken for mlxsw, as for Spectrum there is a need to set a 0.0.0.0/0 trap in the RALUE register.
The fifth patch removes the old, no longer used FIB offload infrastructure. The last patch reflects the changes in the switchdev documentation file. --- v2->v3: -patch 3/6: -fixed offload inc/dec to be done in fib4_entry_init/fini and only in case !trap as suggested by Ido v1->v2: -patch 3/6: -fixed lpm tree setup and binding for abort as pointed out by Ido -do nexthop checks as suggested by Ido -fix use after free during abort -patch 6/6: -fixed texts as suggested by Ido ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/fib_trie.c')
-rw-r--r--net/ipv4/fib_trie.c166
1 files changed, 60 insertions, 106 deletions
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 241f27bbd7ad..31cef3602585 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -73,6 +73,7 @@
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
+#include <linux/notifier.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -80,10 +81,47 @@
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
-#include <net/switchdev.h>
#include <trace/events/fib.h>
#include "fib_lookup.h"
+static BLOCKING_NOTIFIER_HEAD(fib_chain);
+
+int register_fib_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&fib_chain, nb);
+}
+EXPORT_SYMBOL(register_fib_notifier);
+
+int unregister_fib_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&fib_chain, nb);
+}
+EXPORT_SYMBOL(unregister_fib_notifier);
+
+int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
+ struct fib_notifier_info *info)
+{
+ info->net = net;
+ return blocking_notifier_call_chain(&fib_chain, event_type, info);
+}
+
+static int call_fib_entry_notifiers(struct net *net,
+ enum fib_event_type event_type, u32 dst,
+ int dst_len, struct fib_info *fi,
+ u8 tos, u8 type, u32 tb_id, u32 nlflags)
+{
+ struct fib_entry_notifier_info info = {
+ .dst = dst,
+ .dst_len = dst_len,
+ .fi = fi,
+ .tos = tos,
+ .type = type,
+ .tb_id = tb_id,
+ .nlflags = nlflags,
+ };
+ return call_fib_notifiers(net, event_type, &info.info);
+}
+
#define MAX_STAT_DEPTH 32
#define KEYLENGTH (8*sizeof(t_key))
@@ -1076,7 +1114,8 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp,
}
/* Caller must hold RTNL. */
-int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
+int fib_table_insert(struct net *net, struct fib_table *tb,
+ struct fib_config *cfg)
{
struct trie *t = (struct trie *)tb->tb_data;
struct fib_alias *fa, *new_fa;
@@ -1175,17 +1214,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
new_fa->tb_id = tb->tb_id;
new_fa->fa_default = -1;
- err = switchdev_fib_ipv4_add(key, plen, fi,
- new_fa->fa_tos,
- cfg->fc_type,
- cfg->fc_nlflags,
- tb->tb_id);
- if (err) {
- switchdev_fib_ipv4_abort(fi);
- kmem_cache_free(fn_alias_kmem, new_fa);
- goto out;
- }
-
hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
alias_free_mem_rcu(fa);
@@ -1193,6 +1221,11 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
fib_release_info(fi_drop);
if (state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net);
+
+ call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD,
+ key, plen, fi,
+ new_fa->fa_tos, cfg->fc_type,
+ tb->tb_id, cfg->fc_nlflags);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
tb->tb_id, &cfg->fc_nlinfo, nlflags);
@@ -1228,30 +1261,22 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
new_fa->tb_id = tb->tb_id;
new_fa->fa_default = -1;
- /* (Optionally) offload fib entry to switch hardware. */
- err = switchdev_fib_ipv4_add(key, plen, fi, tos, cfg->fc_type,
- cfg->fc_nlflags, tb->tb_id);
- if (err) {
- switchdev_fib_ipv4_abort(fi);
- goto out_free_new_fa;
- }
-
/* Insert new entry to the list. */
err = fib_insert_alias(t, tp, l, new_fa, fa, key);
if (err)
- goto out_sw_fib_del;
+ goto out_free_new_fa;
if (!plen)
tb->tb_num_default++;
rt_cache_flush(cfg->fc_nlinfo.nl_net);
+ call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, key, plen, fi, tos,
+ cfg->fc_type, tb->tb_id, cfg->fc_nlflags);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
&cfg->fc_nlinfo, nlflags);
succeeded:
return 0;
-out_sw_fib_del:
- switchdev_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id);
out_free_new_fa:
kmem_cache_free(fn_alias_kmem, new_fa);
out:
@@ -1490,7 +1515,8 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp,
}
/* Caller must hold RTNL. */
-int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
+int fib_table_delete(struct net *net, struct fib_table *tb,
+ struct fib_config *cfg)
{
struct trie *t = (struct trie *) tb->tb_data;
struct fib_alias *fa, *fa_to_delete;
@@ -1543,9 +1569,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
if (!fa_to_delete)
return -ESRCH;
- switchdev_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos,
- cfg->fc_type, tb->tb_id);
-
+ call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen,
+ fa_to_delete->fa_info, tos, cfg->fc_type,
+ tb->tb_id, 0);
rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
&cfg->fc_nlinfo, 0);
@@ -1734,82 +1760,8 @@ out:
return NULL;
}
-/* Caller must hold RTNL */
-void fib_table_flush_external(struct fib_table *tb)
-{
- struct trie *t = (struct trie *)tb->tb_data;
- struct key_vector *pn = t->kv;
- unsigned long cindex = 1;
- struct hlist_node *tmp;
- struct fib_alias *fa;
-
- /* walk trie in reverse order */
- for (;;) {
- unsigned char slen = 0;
- struct key_vector *n;
-
- if (!(cindex--)) {
- t_key pkey = pn->key;
-
- /* cannot resize the trie vector */
- if (IS_TRIE(pn))
- break;
-
- /* resize completed node */
- pn = resize(t, pn);
- cindex = get_index(pkey, pn);
-
- continue;
- }
-
- /* grab the next available node */
- n = get_child(pn, cindex);
- if (!n)
- continue;
-
- if (IS_TNODE(n)) {
- /* record pn and cindex for leaf walking */
- pn = n;
- cindex = 1ul << n->bits;
-
- continue;
- }
-
- hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
- struct fib_info *fi = fa->fa_info;
-
- /* if alias was cloned to local then we just
- * need to remove the local copy from main
- */
- if (tb->tb_id != fa->tb_id) {
- hlist_del_rcu(&fa->fa_list);
- alias_free_mem_rcu(fa);
- continue;
- }
-
- /* record local slen */
- slen = fa->fa_slen;
-
- if (!fi || !(fi->fib_flags & RTNH_F_OFFLOAD))
- continue;
-
- switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen,
- fi, fa->fa_tos, fa->fa_type,
- tb->tb_id);
- }
-
- /* update leaf slen */
- n->slen = slen;
-
- if (hlist_empty(&n->leaf)) {
- put_child_root(pn, n->key, NULL);
- node_free(n);
- }
- }
-}
-
/* Caller must hold RTNL. */
-int fib_table_flush(struct fib_table *tb)
+int fib_table_flush(struct net *net, struct fib_table *tb)
{
struct trie *t = (struct trie *)tb->tb_data;
struct key_vector *pn = t->kv;
@@ -1858,9 +1810,11 @@ int fib_table_flush(struct fib_table *tb)
continue;
}
- switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen,
- fi, fa->fa_tos, fa->fa_type,
- tb->tb_id);
+ call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
+ n->key,
+ KEYLENGTH - fa->fa_slen,
+ fi, fa->fa_tos, fa->fa_type,
+ tb->tb_id, 0);
hlist_del_rcu(&fa->fa_list);
fib_release_info(fa->fa_info);
alias_free_mem_rcu(fa);