diff options
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/Makefile | 2 | ||||
-rw-r--r-- | net/core/bpf_sk_storage.c | 6 | ||||
-rw-r--r-- | net/core/dev.c | 143 | ||||
-rw-r--r-- | net/core/dev.h | 123 | ||||
-rw-r--r-- | net/core/dev_ioctl.c | 6 | ||||
-rw-r--r-- | net/core/fib_notifier.c | 2 | ||||
-rw-r--r-- | net/core/fib_rules.c | 34 | ||||
-rw-r--r-- | net/core/filter.c | 154 | ||||
-rw-r--r-- | net/core/lwt_bpf.c | 11 | ||||
-rw-r--r-- | net/core/neighbour.c | 360 | ||||
-rw-r--r-- | net/core/net-sysfs.c | 4 | ||||
-rw-r--r-- | net/core/net_namespace.c | 36 | ||||
-rw-r--r-- | net/core/netdev-genl-gen.c | 23 | ||||
-rw-r--r-- | net/core/netdev-genl-gen.h | 1 | ||||
-rw-r--r-- | net/core/netdev-genl.c | 75 | ||||
-rw-r--r-- | net/core/netpoll.c | 49 | ||||
-rw-r--r-- | net/core/page_pool.c | 2 | ||||
-rw-r--r-- | net/core/pktgen.c | 2 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 1025 | ||||
-rw-r--r-- | net/core/rtnl_net_debug.c | 125 | ||||
-rw-r--r-- | net/core/skb_fault_injection.c | 106 | ||||
-rw-r--r-- | net/core/skbuff.c | 8 | ||||
-rw-r--r-- | net/core/skmsg.c | 4 | ||||
-rw-r--r-- | net/core/sock.c | 76 | ||||
-rw-r--r-- | net/core/sysctl_net_core.c | 56 |
25 files changed, 1615 insertions, 818 deletions
diff --git a/net/core/Makefile b/net/core/Makefile index c3ebbaf9c81e..d9326600e289 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -45,3 +45,5 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o obj-$(CONFIG_OF) += of_net.o obj-$(CONFIG_NET_TEST) += net_test.o obj-$(CONFIG_NET_DEVMEM) += devmem.o +obj-$(CONFIG_DEBUG_NET_SMALL_RTNL) += rtnl_net_debug.o +obj-$(CONFIG_FAIL_SKB_REALLOC) += skb_fault_injection.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index bc01b3aa6b0f..2f4ed83a75ae 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -106,7 +106,7 @@ static long bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, if (sock) { sdata = bpf_local_storage_update( sock->sk, (struct bpf_local_storage_map *)map, value, - map_flags, GFP_ATOMIC); + map_flags, false, GFP_ATOMIC); sockfd_put(sock); return PTR_ERR_OR_ZERO(sdata); } @@ -137,7 +137,7 @@ bpf_sk_storage_clone_elem(struct sock *newsk, { struct bpf_local_storage_elem *copy_selem; - copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, GFP_ATOMIC); + copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, false, GFP_ATOMIC); if (!copy_selem) return NULL; @@ -243,7 +243,7 @@ BPF_CALL_5(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, refcount_inc_not_zero(&sk->sk_refcnt)) { sdata = bpf_local_storage_update( sk, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST, gfp_flags); + BPF_NOEXIST, false, gfp_flags); /* sk must be a fullsock (guaranteed by verifier), * so sock_gen_put() is unnecessary. */ diff --git a/net/core/dev.c b/net/core/dev.c index 8453e14d301b..13d00fc10f55 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2949,6 +2949,8 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) if (dev->num_tc) netif_setup_tc(dev, txq); + net_shaper_set_real_num_tx_queues(dev, txq); + dev_qdisc_change_real_num_tx(dev, txq); dev->real_num_tx_queues = txq; @@ -6234,12 +6236,12 @@ bool napi_complete_done(struct napi_struct *n, int work_done) if (work_done) { if (n->gro_bitmask) - timeout = READ_ONCE(n->dev->gro_flush_timeout); - n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs); + timeout = napi_get_gro_flush_timeout(n); + n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n); } if (n->defer_hard_irqs_count > 0) { n->defer_hard_irqs_count--; - timeout = READ_ONCE(n->dev->gro_flush_timeout); + timeout = napi_get_gro_flush_timeout(n); if (timeout) ret = false; } @@ -6373,8 +6375,8 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (flags & NAPI_F_PREFER_BUSY_POLL) { - napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); - timeout = READ_ONCE(napi->dev->gro_flush_timeout); + napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi); + timeout = napi_get_gro_flush_timeout(napi); if (napi->defer_hard_irqs_count && timeout) { hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); skip_schedule = true; @@ -6505,8 +6507,62 @@ void napi_busy_loop(unsigned int napi_id, } EXPORT_SYMBOL(napi_busy_loop); +void napi_suspend_irqs(unsigned int napi_id) +{ + struct napi_struct *napi; + + rcu_read_lock(); + napi = napi_by_id(napi_id); + if (napi) { + unsigned long timeout = napi_get_irq_suspend_timeout(napi); + + if (timeout) + hrtimer_start(&napi->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); + } + rcu_read_unlock(); +} + +void napi_resume_irqs(unsigned int napi_id) +{ + struct napi_struct *napi; + + rcu_read_lock(); + napi = napi_by_id(napi_id); + if (napi) { + /* If irq_suspend_timeout is set to 0 between the call to + * napi_suspend_irqs and now, the original value still + * determines the safety timeout as intended and napi_watchdog + * will resume irq processing. + */ + if (napi_get_irq_suspend_timeout(napi)) { + local_bh_disable(); + napi_schedule(napi); + local_bh_enable(); + } + } + rcu_read_unlock(); +} + #endif /* CONFIG_NET_RX_BUSY_POLL */ +static void __napi_hash_add_with_id(struct napi_struct *napi, + unsigned int napi_id) +{ + napi->napi_id = napi_id; + hlist_add_head_rcu(&napi->napi_hash_node, + &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); +} + +static void napi_hash_add_with_id(struct napi_struct *napi, + unsigned int napi_id) +{ + spin_lock(&napi_hash_lock); + WARN_ON_ONCE(napi_by_id(napi_id)); + __napi_hash_add_with_id(napi, napi_id); + spin_unlock(&napi_hash_lock); +} + static void napi_hash_add(struct napi_struct *napi) { if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state)) @@ -6519,10 +6575,8 @@ static void napi_hash_add(struct napi_struct *napi) if (unlikely(++napi_gen_id < MIN_NAPI_ID)) napi_gen_id = MIN_NAPI_ID; } while (napi_by_id(napi_gen_id)); - napi->napi_id = napi_gen_id; - hlist_add_head_rcu(&napi->napi_hash_node, - &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + __napi_hash_add_with_id(napi, napi_gen_id); spin_unlock(&napi_hash_lock); } @@ -6645,6 +6699,30 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, } EXPORT_SYMBOL(netif_queue_set_napi); +static void napi_restore_config(struct napi_struct *n) +{ + n->defer_hard_irqs = n->config->defer_hard_irqs; + n->gro_flush_timeout = n->config->gro_flush_timeout; + n->irq_suspend_timeout = n->config->irq_suspend_timeout; + /* a NAPI ID might be stored in the config, if so use it. if not, use + * napi_hash_add to generate one for us. It will be saved to the config + * in napi_disable. + */ + if (n->config->napi_id) + napi_hash_add_with_id(n, n->config->napi_id); + else + napi_hash_add(n); +} + +static void napi_save_config(struct napi_struct *n) +{ + n->config->defer_hard_irqs = n->defer_hard_irqs; + n->config->gro_flush_timeout = n->gro_flush_timeout; + n->config->irq_suspend_timeout = n->irq_suspend_timeout; + n->config->napi_id = n->napi_id; + napi_hash_del(n); +} + void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { @@ -6672,7 +6750,13 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, set_bit(NAPI_STATE_SCHED, &napi->state); set_bit(NAPI_STATE_NPSVC, &napi->state); list_add_rcu(&napi->dev_list, &dev->napi_list); - napi_hash_add(napi); + + /* default settings from sysfs are applied to all NAPIs. any per-NAPI + * configuration will be loaded in napi_enable + */ + napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs)); + napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout)); + napi_get_frags_check(napi); /* Create kthread for this napi if dev->threaded is set. * Clear dev->threaded if kthread creation failed so that @@ -6704,6 +6788,11 @@ void napi_disable(struct napi_struct *n) hrtimer_cancel(&n->timer); + if (n->config) + napi_save_config(n); + else + napi_hash_del(n); + clear_bit(NAPI_STATE_DISABLE, &n->state); } EXPORT_SYMBOL(napi_disable); @@ -6719,6 +6808,11 @@ void napi_enable(struct napi_struct *n) { unsigned long new, val = READ_ONCE(n->state); + if (n->config) + napi_restore_config(n); + else + napi_hash_add(n); + do { BUG_ON(!test_bit(NAPI_STATE_SCHED, &val)); @@ -6748,7 +6842,11 @@ void __netif_napi_del(struct napi_struct *napi) if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) return; - napi_hash_del(napi); + if (napi->config) { + napi->index = -1; + napi->config = NULL; + } + list_del_rcu(&napi->dev_list); napi_free_frags(napi); @@ -11060,8 +11158,8 @@ void netdev_sw_irq_coalesce_default_on(struct net_device *dev) WARN_ON(dev->reg_state == NETREG_REGISTERED); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { - dev->gro_flush_timeout = 20000; - dev->napi_defer_hard_irqs = 1; + netdev_set_gro_flush_timeout(dev, 20000); + netdev_set_defer_hard_irqs(dev, 1); } } EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on); @@ -11085,6 +11183,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, unsigned int txqs, unsigned int rxqs) { struct net_device *dev; + size_t napi_config_sz; + unsigned int maxqs; BUG_ON(strlen(name) >= sizeof(dev->name)); @@ -11098,6 +11198,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, return NULL; } + maxqs = max(txqs, rxqs); + dev = kvzalloc(struct_size(dev, priv, sizeof_priv), GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!dev) @@ -11151,6 +11253,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, hash_init(dev->qdisc_hash); #endif + mutex_init(&dev->lock); + dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); @@ -11172,6 +11276,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, if (!dev->ethtool) goto free_all; + napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config)); + dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT); + if (!dev->napi_config) + goto free_all; + strscpy(dev->name, name); dev->name_assign_type = name_assign_type; dev->group = INIT_NETDEV_GROUP; @@ -11221,6 +11330,8 @@ void free_netdev(struct net_device *dev) return; } + mutex_destroy(&dev->lock); + kfree(dev->ethtool); netif_free_tx_queues(dev); netif_free_rx_queues(dev); @@ -11233,6 +11344,8 @@ void free_netdev(struct net_device *dev) list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) netif_napi_del(p); + kvfree(dev->napi_config); + ref_tracker_dir_exit(&dev->refcnt_tracker); #ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); @@ -11430,6 +11543,8 @@ void unregister_netdevice_many_notify(struct list_head *head, mutex_destroy(&dev->ethtool->rss_lock); + net_shaper_flush_netdev(dev); + if (skb) rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh); @@ -11998,8 +12113,6 @@ static void __init net_dev_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx); - CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout); - CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler); @@ -12011,7 +12124,7 @@ static void __init net_dev_struct_check(void) #ifdef CONFIG_NET_XGRESS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress); #endif - CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104); + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92); } /* diff --git a/net/core/dev.h b/net/core/dev.h index 5654325c5b71..d043dee25a68 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -35,6 +35,16 @@ void dev_addr_flush(struct net_device *dev); int dev_addr_init(struct net_device *dev); void dev_addr_check(struct net_device *dev); +#if IS_ENABLED(CONFIG_NET_SHAPER) +void net_shaper_flush_netdev(struct net_device *dev); +void net_shaper_set_real_num_tx_queues(struct net_device *dev, + unsigned int txq); +#else +static inline void net_shaper_flush_netdev(struct net_device *dev) {} +static inline void net_shaper_set_real_num_tx_queues(struct net_device *dev, + unsigned int txq) {} +#endif + /* sysctls not referred to from outside net/core/ */ extern int netdev_unregister_timeout_secs; extern int weight_p; @@ -138,6 +148,119 @@ static inline void netif_set_gro_ipv4_max_size(struct net_device *dev, WRITE_ONCE(dev->gro_ipv4_max_size, size); } +/** + * napi_get_defer_hard_irqs - get the NAPI's defer_hard_irqs + * @n: napi struct to get the defer_hard_irqs field from + * + * Return: the per-NAPI value of the defar_hard_irqs field. + */ +static inline u32 napi_get_defer_hard_irqs(const struct napi_struct *n) +{ + return READ_ONCE(n->defer_hard_irqs); +} + +/** + * napi_set_defer_hard_irqs - set the defer_hard_irqs for a napi + * @n: napi_struct to set the defer_hard_irqs field + * @defer: the value the field should be set to + */ +static inline void napi_set_defer_hard_irqs(struct napi_struct *n, u32 defer) +{ + WRITE_ONCE(n->defer_hard_irqs, defer); +} + +/** + * netdev_set_defer_hard_irqs - set defer_hard_irqs for all NAPIs of a netdev + * @netdev: the net_device for which all NAPIs will have defer_hard_irqs set + * @defer: the defer_hard_irqs value to set + */ +static inline void netdev_set_defer_hard_irqs(struct net_device *netdev, + u32 defer) +{ + unsigned int count = max(netdev->num_rx_queues, + netdev->num_tx_queues); + struct napi_struct *napi; + int i; + + WRITE_ONCE(netdev->napi_defer_hard_irqs, defer); + list_for_each_entry(napi, &netdev->napi_list, dev_list) + napi_set_defer_hard_irqs(napi, defer); + + for (i = 0; i < count; i++) + netdev->napi_config[i].defer_hard_irqs = defer; +} + +/** + * napi_get_gro_flush_timeout - get the gro_flush_timeout + * @n: napi struct to get the gro_flush_timeout from + * + * Return: the per-NAPI value of the gro_flush_timeout field. + */ +static inline unsigned long +napi_get_gro_flush_timeout(const struct napi_struct *n) +{ + return READ_ONCE(n->gro_flush_timeout); +} + +/** + * napi_set_gro_flush_timeout - set the gro_flush_timeout for a napi + * @n: napi struct to set the gro_flush_timeout + * @timeout: timeout value to set + * + * napi_set_gro_flush_timeout sets the per-NAPI gro_flush_timeout + */ +static inline void napi_set_gro_flush_timeout(struct napi_struct *n, + unsigned long timeout) +{ + WRITE_ONCE(n->gro_flush_timeout, timeout); +} + +/** + * netdev_set_gro_flush_timeout - set gro_flush_timeout of a netdev's NAPIs + * @netdev: the net_device for which all NAPIs will have gro_flush_timeout set + * @timeout: the timeout value to set + */ +static inline void netdev_set_gro_flush_timeout(struct net_device *netdev, + unsigned long timeout) +{ + unsigned int count = max(netdev->num_rx_queues, + netdev->num_tx_queues); + struct napi_struct *napi; + int i; + + WRITE_ONCE(netdev->gro_flush_timeout, timeout); + list_for_each_entry(napi, &netdev->napi_list, dev_list) + napi_set_gro_flush_timeout(napi, timeout); + + for (i = 0; i < count; i++) + netdev->napi_config[i].gro_flush_timeout = timeout; +} + +/** + * napi_get_irq_suspend_timeout - get the irq_suspend_timeout + * @n: napi struct to get the irq_suspend_timeout from + * + * Return: the per-NAPI value of the irq_suspend_timeout field. + */ +static inline unsigned long +napi_get_irq_suspend_timeout(const struct napi_struct *n) +{ + return READ_ONCE(n->irq_suspend_timeout); +} + +/** + * napi_set_irq_suspend_timeout - set the irq_suspend_timeout for a napi + * @n: napi struct to set the irq_suspend_timeout + * @timeout: timeout value to set + * + * napi_set_irq_suspend_timeout sets the per-NAPI irq_suspend_timeout + */ +static inline void napi_set_irq_suspend_timeout(struct napi_struct *n, + unsigned long timeout) +{ + WRITE_ONCE(n->irq_suspend_timeout, timeout); +} + int rps_cpumask_housekeeping(struct cpumask *mask); #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 473c437b6b53..46d43b950471 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -64,7 +64,7 @@ int dev_ifconf(struct net *net, struct ifconf __user *uifc) } /* Loop over the interfaces, and write an info block for each. */ - rtnl_lock(); + rtnl_net_lock(net); for_each_netdev(net, dev) { if (!pos) done = inet_gifconf(dev, NULL, 0, size); @@ -72,12 +72,12 @@ int dev_ifconf(struct net *net, struct ifconf __user *uifc) done = inet_gifconf(dev, pos + total, len - total, size); if (done < 0) { - rtnl_unlock(); + rtnl_net_unlock(net); return -EFAULT; } total += done; } - rtnl_unlock(); + rtnl_net_unlock(net); return put_user(total, &uifc->ifc_len); } diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c index fc96259807b6..5cdca49b1d7c 100644 --- a/net/core/fib_notifier.c +++ b/net/core/fib_notifier.c @@ -43,7 +43,6 @@ static unsigned int fib_seq_sum(struct net *net) struct fib_notifier_ops *ops; unsigned int fib_seq = 0; - rtnl_lock(); rcu_read_lock(); list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { if (!try_module_get(ops->owner)) @@ -52,7 +51,6 @@ static unsigned int fib_seq_sum(struct net *net) module_put(ops->owner); } rcu_read_unlock(); - rtnl_unlock(); return fib_seq; } diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 154a2681f55c..34185d138c95 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -101,7 +101,8 @@ static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, u32 pid); -static struct fib_rules_ops *lookup_rules_ops(struct net *net, int family) +static struct fib_rules_ops *lookup_rules_ops(const struct net *net, + int family) { struct fib_rules_ops *ops; @@ -370,7 +371,9 @@ static int call_fib_rule_notifiers(struct net *net, .rule = rule, }; - ops->fib_rules_seq++; + ASSERT_RTNL(); + /* Paired with READ_ONCE() in fib_rules_seq() */ + WRITE_ONCE(ops->fib_rules_seq, ops->fib_rules_seq + 1); return call_fib_notifiers(net, event_type, &info.info); } @@ -397,17 +400,16 @@ int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, } EXPORT_SYMBOL_GPL(fib_rules_dump); -unsigned int fib_rules_seq_read(struct net *net, int family) +unsigned int fib_rules_seq_read(const struct net *net, int family) { unsigned int fib_rules_seq; struct fib_rules_ops *ops; - ASSERT_RTNL(); - ops = lookup_rules_ops(net, family); if (!ops) return 0; - fib_rules_seq = ops->fib_rules_seq; + /* Paired with WRITE_ONCE() in call_fib_rule_notifiers() */ + fib_rules_seq = READ_ONCE(ops->fib_rules_seq); rules_ops_put(ops); return fib_rules_seq; @@ -556,8 +558,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, nlrule->pref = fib_default_rule_pref(ops); } - nlrule->proto = tb[FRA_PROTOCOL] ? - nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC; + nlrule->proto = nla_get_u8_default(tb[FRA_PROTOCOL], RTPROT_UNSPEC); if (tb[FRA_IIFNAME]) { struct net_device *dev; @@ -1289,13 +1290,18 @@ static struct pernet_operations fib_rules_net_ops = { .exit = fib_rules_net_exit, }; +static const struct rtnl_msg_handler fib_rules_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWRULE, .doit = fib_nl_newrule}, + {.msgtype = RTM_DELRULE, .doit = fib_nl_delrule}, + {.msgtype = RTM_GETRULE, .dumpit = fib_nl_dumprule, + .flags = RTNL_FLAG_DUMP_UNLOCKED}, +}; + static int __init fib_rules_init(void) { int err; - rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, - RTNL_FLAG_DUMP_UNLOCKED); + + rtnl_register_many(fib_rules_rtnl_msg_handlers); err = register_pernet_subsys(&fib_rules_net_ops); if (err < 0) @@ -1310,9 +1316,7 @@ static int __init fib_rules_init(void) fail_unregister: unregister_pernet_subsys(&fib_rules_net_ops); fail: - rtnl_unregister(PF_UNSPEC, RTM_NEWRULE); - rtnl_unregister(PF_UNSPEC, RTM_DELRULE); - rtnl_unregister(PF_UNSPEC, RTM_GETRULE); + rtnl_unregister_many(fib_rules_rtnl_msg_handlers); return err; } diff --git a/net/core/filter.c b/net/core/filter.c index e31ee8be2de0..6625b3f563a4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1654,18 +1654,6 @@ void sk_reuseport_prog_free(struct bpf_prog *prog) bpf_prog_destroy(prog); } -struct bpf_scratchpad { - union { - __be32 diff[MAX_BPF_STACK / sizeof(__be32)]; - u8 buff[MAX_BPF_STACK]; - }; - local_lock_t bh_lock; -}; - -static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp) = { - .bh_lock = INIT_LOCAL_LOCK(bh_lock), -}; - static inline int __bpf_try_make_writable(struct sk_buff *skb, unsigned int write_len) { @@ -2022,11 +2010,6 @@ static const struct bpf_func_proto bpf_l4_csum_replace_proto = { BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, __be32 *, to, u32, to_size, __wsum, seed) { - struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); - u32 diff_size = from_size + to_size; - int i, j = 0; - __wsum ret; - /* This is quite flexible, some examples: * * from_size == 0, to_size > 0, seed := csum --> pushing data @@ -2035,19 +2018,19 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, * * Even for diffing, from_size and to_size don't need to be equal. */ - if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) || - diff_size > sizeof(sp->diff))) - return -EINVAL; - local_lock_nested_bh(&bpf_sp.bh_lock); - for (i = 0; i < from_size / sizeof(__be32); i++, j++) - sp->diff[j] = ~from[i]; - for (i = 0; i < to_size / sizeof(__be32); i++, j++) - sp->diff[j] = to[i]; + __wsum ret = seed; - ret = csum_partial(sp->diff, diff_size, seed); - local_unlock_nested_bh(&bpf_sp.bh_lock); - return ret; + if (from_size && to_size) + ret = csum_sub(csum_partial(to, to_size, ret), + csum_partial(from, from_size, 0)); + else if (to_size) + ret = csum_partial(to, to_size, ret); + + else if (from_size) + ret = ~csum_partial(from, from_size, ~ret); + + return csum_from32to16((__force unsigned int)ret); } static const struct bpf_func_proto bpf_csum_diff_proto = { @@ -2249,7 +2232,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, rcu_read_unlock(); return ret; } - rcu_read_unlock_bh(); + rcu_read_unlock(); if (dst) IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); out_drop: @@ -2372,7 +2355,7 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, struct flowi4 fl4 = { .flowi4_flags = FLOWI_FLAG_ANYSRC, .flowi4_mark = skb->mark, - .flowi4_tos = ip4h->tos & INET_DSCP_MASK, + .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)), .flowi4_oif = dev->ifindex, .flowi4_proto = ip4h->protocol, .daddr = ip4h->daddr, @@ -2621,18 +2604,16 @@ BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes) static void sk_msg_reset_curr(struct sk_msg *msg) { - u32 i = msg->sg.start; - u32 len = 0; - - do { - len += sk_msg_elem(msg, i)->length; - sk_msg_iter_var_next(i); - if (len >= msg->sg.size) - break; - } while (i != msg->sg.end); + if (!msg->sg.size) { + msg->sg.curr = msg->sg.start; + msg->sg.copybreak = 0; + } else { + u32 i = msg->sg.end; - msg->sg.curr = i; - msg->sg.copybreak = 0; + sk_msg_iter_var_prev(i); + msg->sg.curr = i; + msg->sg.copybreak = msg->sg.data[i].length; + } } static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { @@ -2795,7 +2776,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, sk_msg_iter_var_next(i); } while (i != msg->sg.end); - if (start >= offset + l) + if (start > offset + l) return -EINVAL; space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); @@ -2820,6 +2801,8 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, raw = page_address(page); + if (i == msg->sg.end) + sk_msg_iter_var_prev(i); psge = sk_msg_elem(msg, i); front = start - offset; back = psge->length - front; @@ -2836,7 +2819,13 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, } put_page(sg_page(psge)); - } else if (start - offset) { + new = i; + goto place_new; + } + + if (start - offset) { + if (i == msg->sg.end) + sk_msg_iter_var_prev(i); psge = sk_msg_elem(msg, i); rsge = sk_msg_elem_cpy(msg, i); @@ -2847,39 +2836,44 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, sk_msg_iter_var_next(i); sg_unmark_end(psge); sg_unmark_end(&rsge); - sk_msg_iter_next(msg, end); } /* Slot(s) to place newly allocated data */ + sk_msg_iter_next(msg, end); new = i; + sk_msg_iter_var_next(i); + + if (i == msg->sg.end) { + if (!rsge.length) + goto place_new; + sk_msg_iter_next(msg, end); + goto place_new; + } /* Shift one or two slots as needed */ - if (!copy) { - sge = sk_msg_elem_cpy(msg, i); + sge = sk_msg_elem_cpy(msg, new); + sg_unmark_end(&sge); + nsge = sk_msg_elem_cpy(msg, i); + if (rsge.length) { sk_msg_iter_var_next(i); - sg_unmark_end(&sge); + nnsge = sk_msg_elem_cpy(msg, i); sk_msg_iter_next(msg, end); + } - nsge = sk_msg_elem_cpy(msg, i); + while (i != msg->sg.end) { + msg->sg.data[i] = sge; + sge = nsge; + sk_msg_iter_var_next(i); if (rsge.length) { - sk_msg_iter_var_next(i); + nsge = nnsge; nnsge = sk_msg_elem_cpy(msg, i); - } - - while (i != msg->sg.end) { - msg->sg.data[i] = sge; - sge = nsge; - sk_msg_iter_var_next(i); - if (rsge.length) { - nsge = nnsge; - nnsge = sk_msg_elem_cpy(msg, i); - } else { - nsge = sk_msg_elem_cpy(msg, i); - } + } else { + nsge = sk_msg_elem_cpy(msg, i); } } +place_new: /* Place newly allocated data buffer */ sk_mem_charge(msg->sk, len); msg->sg.size += len; @@ -2908,8 +2902,10 @@ static const struct bpf_func_proto bpf_msg_push_data_proto = { static void sk_msg_shift_left(struct sk_msg *msg, int i) { + struct scatterlist *sge = sk_msg_elem(msg, i); int prev; + put_page(sg_page(sge)); do { prev = i; sk_msg_iter_var_next(i); @@ -2946,6 +2942,9 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, if (unlikely(flags)) return -EINVAL; + if (unlikely(len == 0)) + return 0; + /* First find the starting scatterlist element */ i = msg->sg.start; do { @@ -2958,7 +2957,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, } while (i != msg->sg.end); /* Bounds checks: start and pop must be inside message */ - if (start >= offset + l || last >= msg->sg.size) + if (start >= offset + l || last > msg->sg.size) return -EINVAL; space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); @@ -2987,12 +2986,12 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, */ if (start != offset) { struct scatterlist *nsge, *sge = sk_msg_elem(msg, i); - int a = start; + int a = start - offset; int b = sge->length - pop - a; sk_msg_iter_var_next(i); - if (pop < sge->length - a) { + if (b > 0) { if (space) { sge->length = a; sk_msg_shift_right(msg, i); @@ -3011,7 +3010,6 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, if (unlikely(!page)) return -ENOMEM; - sge->length = a; orig = sg_page(sge); from = sg_virt(sge); to = page_address(page); @@ -3021,7 +3019,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, put_page(orig); } pop = 0; - } else if (pop >= sge->length - a) { + } else { pop -= (sge->length - a); sge->length = a; } @@ -3055,7 +3053,6 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, pop -= sge->length; sk_msg_shift_left(msg, i); } - sk_msg_iter_var_next(i); } sk_mem_uncharge(msg->sk, len - pop); @@ -5140,6 +5137,17 @@ static u64 __bpf_get_netns_cookie(struct sock *sk) return net->net_cookie; } +BPF_CALL_1(bpf_get_netns_cookie, struct sk_buff *, skb) +{ + return __bpf_get_netns_cookie(skb && skb->sk ? skb->sk : NULL); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_proto = { + .func = bpf_get_netns_cookie, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx) { return __bpf_get_netns_cookie(ctx); @@ -6768,8 +6776,6 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk * sock refcnt is decremented to prevent a request_sock leak. */ - if (!sk_fullsock(sk2)) - sk2 = NULL; if (sk2 != sk) { sock_gen_put(sk); /* Ensure there is no need to bump sk2 refcnt */ @@ -6816,8 +6822,6 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk * sock refcnt is decremented to prevent a request_sock leak. */ - if (!sk_fullsock(sk2)) - sk2 = NULL; if (sk2 != sk) { sock_gen_put(sk); /* Ensure there is no need to bump sk2 refcnt */ @@ -7266,7 +7270,7 @@ BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk) { sk = sk_to_full_sk(sk); - if (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE)) + if (sk && sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE)) return (unsigned long)sk; return (unsigned long)NULL; @@ -8199,6 +8203,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_under_cgroup_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_fib_lookup: @@ -10231,10 +10237,6 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, } \ } while (0) -#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF) \ - SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( \ - S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF) - static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 1a14f915b7a4..ae74634310a3 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -10,6 +10,7 @@ #include <linux/bpf.h> #include <net/lwtunnel.h> #include <net/gre.h> +#include <net/ip.h> #include <net/ip6_route.h> #include <net/ipv6_stubs.h> #include <net/inet_dscp.h> @@ -87,16 +88,18 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, static int bpf_lwt_input_reroute(struct sk_buff *skb) { + enum skb_drop_reason reason; int err = -EINVAL; if (skb->protocol == htons(ETH_P_IP)) { struct net_device *dev = skb_dst(skb)->dev; - struct iphdr *iph = ip_hdr(skb); + const struct iphdr *iph = ip_hdr(skb); dev_hold(dev); skb_dst_drop(skb); - err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, dev); + reason = ip_route_input_noref(skb, iph->daddr, iph->saddr, + ip4h_dscp(iph), dev); + err = reason ? -EINVAL : 0; dev_put(dev); } else if (skb->protocol == htons(ETH_P_IPV6)) { skb_dst_drop(skb); @@ -206,7 +209,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) fl4.flowi4_oif = oif; fl4.flowi4_mark = skb->mark; fl4.flowi4_uid = sock_net_uid(net, sk); - fl4.flowi4_tos = iph->tos & INET_DSCP_MASK; + fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; fl4.flowi4_proto = iph->protocol; fl4.daddr = iph->daddr; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 77b819cd995b..89656d180bc6 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -14,7 +14,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/slab.h> -#include <linux/kmemleak.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/module.h> @@ -61,6 +60,25 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, static const struct seq_operations neigh_stat_seq_ops; #endif +static struct hlist_head *neigh_get_dev_table(struct net_device *dev, int family) +{ + int i; + + switch (family) { + default: + DEBUG_NET_WARN_ON_ONCE(1); + fallthrough; /* to avoid panic by null-ptr-deref */ + case AF_INET: + i = NEIGH_ARP_TABLE; + break; + case AF_INET6: + i = NEIGH_ND_TABLE; + break; + } + + return &dev->neighbours[i]; +} + /* Neighbour hash table buckets are protected with rwlock tbl->lock. @@ -205,18 +223,14 @@ static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, } } -static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np, - struct neigh_table *tbl) +bool neigh_remove_one(struct neighbour *n) { bool retval = false; write_lock(&n->lock); if (refcount_read(&n->refcnt) == 1) { - struct neighbour *neigh; - - neigh = rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock)); - rcu_assign_pointer(*np, neigh); + hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); retval = true; } @@ -226,29 +240,6 @@ static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np, return retval; } -bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) -{ - struct neigh_hash_table *nht; - void *pkey = ndel->primary_key; - u32 hash_val; - struct neighbour *n; - struct neighbour __rcu **np; - - nht = rcu_dereference_protected(tbl->nht, - lockdep_is_held(&tbl->lock)); - hash_val = tbl->hash(pkey, ndel->dev, nht->hash_rnd); - hash_val = hash_val >> (32 - nht->hash_shift); - - np = &nht->hash_buckets[hash_val]; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock)))) { - if (n == ndel) - return neigh_del(n, np, tbl); - np = &n->next; - } - return false; -} - static int neigh_forced_gc(struct neigh_table *tbl) { int max_clean = atomic_read(&tbl->gc_entries) - @@ -276,7 +267,7 @@ static int neigh_forced_gc(struct neigh_table *tbl) remove = true; write_unlock(&n->lock); - if (remove && neigh_remove_one(n, tbl)) + if (remove && neigh_remove_one(n)) shrunk++; if (shrunk >= max_clean) break; @@ -380,54 +371,42 @@ static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net, static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { - int i; - struct neigh_hash_table *nht; + struct hlist_head *dev_head; + struct hlist_node *tmp; + struct neighbour *n; - nht = rcu_dereference_protected(tbl->nht, - lockdep_is_held(&tbl->lock)); + dev_head = neigh_get_dev_table(dev, tbl->family); - for (i = 0; i < (1 << nht->hash_shift); i++) { - struct neighbour *n; - struct neighbour __rcu **np = &nht->hash_buckets[i]; + hlist_for_each_entry_safe(n, tmp, dev_head, dev_list) { + if (skip_perm && n->nud_state & NUD_PERMANENT) + continue; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock))) != NULL) { - if (dev && n->dev != dev) { - np = &n->next; - continue; - } - if (skip_perm && n->nud_state & NUD_PERMANENT) { - np = &n->next; - continue; - } - rcu_assign_pointer(*np, - rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock))); - write_lock(&n->lock); - neigh_del_timer(n); - neigh_mark_dead(n); - if (refcount_read(&n->refcnt) != 1) { - /* The most unpleasant situation. - We must destroy neighbour entry, - but someone still uses it. - - The destroy will be delayed until - the last user releases us, but - we must kill timers etc. and move - it to safe state. - */ - __skb_queue_purge(&n->arp_queue); - n->arp_queue_len_bytes = 0; - WRITE_ONCE(n->output, neigh_blackhole); - if (n->nud_state & NUD_VALID) - n->nud_state = NUD_NOARP; - else - n->nud_state = NUD_NONE; - neigh_dbg(2, "neigh %p is stray\n", n); - } - write_unlock(&n->lock); - neigh_cleanup_and_release(n); + hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); + write_lock(&n->lock); + neigh_del_timer(n); + neigh_mark_dead(n); + if (refcount_read(&n->refcnt) != 1) { + /* The most unpleasant situation. + * We must destroy neighbour entry, + * but someone still uses it. + * + * The destroy will be delayed until + * the last user releases us, but + * we must kill timers etc. and move + * it to safe state. + */ + __skb_queue_purge(&n->arp_queue); + n->arp_queue_len_bytes = 0; + WRITE_ONCE(n->output, neigh_blackhole); + if (n->nud_state & NUD_VALID) + n->nud_state = NUD_NOARP; + else + n->nud_state = NUD_NONE; + neigh_dbg(2, "neigh %p is stray\n", n); } + write_unlock(&n->lock); + neigh_cleanup_and_release(n); } } @@ -530,27 +509,21 @@ static void neigh_get_hash_rnd(u32 *x) static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) { - size_t size = (1 << shift) * sizeof(struct neighbour *); + size_t size = (1 << shift) * sizeof(struct hlist_head); + struct hlist_head *hash_heads; struct neigh_hash_table *ret; - struct neighbour __rcu **buckets; int i; ret = kmalloc(sizeof(*ret), GFP_ATOMIC); if (!ret) return NULL; - if (size <= PAGE_SIZE) { - buckets = kzalloc(size, GFP_ATOMIC); - } else { - buckets = (struct neighbour __rcu **) - __get_free_pages(GFP_ATOMIC | __GFP_ZERO, - get_order(size)); - kmemleak_alloc(buckets, size, 1, GFP_ATOMIC); - } - if (!buckets) { + + hash_heads = kvzalloc(size, GFP_ATOMIC); + if (!hash_heads) { kfree(ret); return NULL; } - ret->hash_buckets = buckets; + ret->hash_heads = hash_heads; ret->hash_shift = shift; for (i = 0; i < NEIGH_NUM_HASH_RND; i++) neigh_get_hash_rnd(&ret->hash_rnd[i]); @@ -562,15 +535,8 @@ static void neigh_hash_free_rcu(struct rcu_head *head) struct neigh_hash_table *nht = container_of(head, struct neigh_hash_table, rcu); - size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *); - struct neighbour __rcu **buckets = nht->hash_buckets; - if (size <= PAGE_SIZE) { - kfree(buckets); - } else { - kmemleak_free(buckets); - free_pages((unsigned long)buckets, get_order(size)); - } + kvfree(nht->hash_heads); kfree(nht); } @@ -589,24 +555,17 @@ static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, return old_nht; for (i = 0; i < (1 << old_nht->hash_shift); i++) { - struct neighbour *n, *next; + struct hlist_node *tmp; + struct neighbour *n; - for (n = rcu_dereference_protected(old_nht->hash_buckets[i], - lockdep_is_held(&tbl->lock)); - n != NULL; - n = next) { + neigh_for_each_in_bucket_safe(n, tmp, &old_nht->hash_heads[i]) { hash = tbl->hash(n->primary_key, n->dev, new_nht->hash_rnd); hash >>= (32 - new_nht->hash_shift); - next = rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock)); - rcu_assign_pointer(n->next, - rcu_dereference_protected( - new_nht->hash_buckets[hash], - lockdep_is_held(&tbl->lock))); - rcu_assign_pointer(new_nht->hash_buckets[hash], n); + hlist_del_rcu(&n->hash); + hlist_add_head_rcu(&n->hash, &new_nht->hash_heads[hash]); } } @@ -693,11 +652,7 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, goto out_tbl_unlock; } - for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val], - lockdep_is_held(&tbl->lock)); - n1 != NULL; - n1 = rcu_dereference_protected(n1->next, - lockdep_is_held(&tbl->lock))) { + neigh_for_each_in_bucket(n1, &nht->hash_heads[hash_val]) { if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) { if (want_ref) neigh_hold(n1); @@ -713,10 +668,11 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, list_add_tail(&n->managed_list, &n->tbl->managed_list); if (want_ref) neigh_hold(n); - rcu_assign_pointer(n->next, - rcu_dereference_protected(nht->hash_buckets[hash_val], - lockdep_is_held(&tbl->lock))); - rcu_assign_pointer(nht->hash_buckets[hash_val], n); + hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]); + + hlist_add_head_rcu(&n->dev_list, + neigh_get_dev_table(dev, tbl->family)); + write_unlock_bh(&tbl->lock); neigh_dbg(2, "neigh %p is created\n", n); rc = n; @@ -948,10 +904,10 @@ static void neigh_connect(struct neighbour *neigh) static void neigh_periodic_work(struct work_struct *work) { struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); + struct neigh_hash_table *nht; + struct hlist_node *tmp; struct neighbour *n; - struct neighbour __rcu **np; unsigned int i; - struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); @@ -976,10 +932,7 @@ static void neigh_periodic_work(struct work_struct *work) goto out; for (i = 0 ; i < (1 << nht->hash_shift); i++) { - np = &nht->hash_buckets[i]; - - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock))) != NULL) { + neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) { unsigned int state; write_lock(&n->lock); @@ -988,7 +941,7 @@ static void neigh_periodic_work(struct work_struct *work) if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) || (n->flags & NTF_EXT_LEARNED)) { write_unlock(&n->lock); - goto next_elt; + continue; } if (time_before(n->used, n->confirmed) && @@ -999,18 +952,14 @@ static void neigh_periodic_work(struct work_struct *work) (state == NUD_FAILED || !time_in_range_open(jiffies, n->used, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { - rcu_assign_pointer(*np, - rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock))); + hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); write_unlock(&n->lock); neigh_cleanup_and_release(n); continue; } write_unlock(&n->lock); - -next_elt: - np = &n->next; } /* * It's fine to release lock here, even if hash table @@ -1957,7 +1906,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, NETLINK_CB(skb).portid, extack); write_lock_bh(&tbl->lock); neigh_release(neigh); - neigh_remove_one(neigh, tbl); + neigh_remove_one(neigh); write_unlock_bh(&tbl->lock); out: @@ -2728,9 +2677,8 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, for (h = s_h; h < (1 << nht->hash_shift); h++) { if (h > s_h) s_idx = 0; - for (n = rcu_dereference(nht->hash_buckets[h]), idx = 0; - n != NULL; - n = rcu_dereference(n->next)) { + idx = 0; + neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[h]) { if (idx < s_idx || !net_eq(dev_net(n->dev), net)) goto next; if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || @@ -2876,6 +2824,7 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) err = neigh_valid_dump_req(nlh, cb->strict_check, &filter, cb->extack); if (err < 0 && cb->strict_check) return err; + err = 0; s_t = cb->args[0]; @@ -3097,9 +3046,7 @@ void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void for (chain = 0; chain < (1 << nht->hash_shift); chain++) { struct neighbour *n; - for (n = rcu_dereference(nht->hash_buckets[chain]); - n != NULL; - n = rcu_dereference(n->next)) + neigh_for_each_in_bucket(n, &nht->hash_heads[chain]) cb(n, cookie); } read_unlock_bh(&tbl->lock); @@ -3111,29 +3058,25 @@ EXPORT_SYMBOL(neigh_for_each); void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)) { - int chain; struct neigh_hash_table *nht; + int chain; nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (chain = 0; chain < (1 << nht->hash_shift); chain++) { + struct hlist_node *tmp; struct neighbour *n; - struct neighbour __rcu **np; - np = &nht->hash_buckets[chain]; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock))) != NULL) { + neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[chain]) { int release; write_lock(&n->lock); release = cb(n); if (release) { - rcu_assign_pointer(*np, - rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock))); + hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); - } else - np = &n->next; + } write_unlock(&n->lock); if (release) neigh_cleanup_and_release(n); @@ -3190,43 +3133,53 @@ EXPORT_SYMBOL(neigh_xmit); #ifdef CONFIG_PROC_FS -static struct neighbour *neigh_get_first(struct seq_file *seq) +static struct neighbour *neigh_get_valid(struct seq_file *seq, + struct neighbour *n, + loff_t *pos) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); + + if (!net_eq(dev_net(n->dev), net)) + return NULL; + + if (state->neigh_sub_iter) { + loff_t fakep = 0; + void *v; + + v = state->neigh_sub_iter(state, n, pos ? pos : &fakep); + if (!v) + return NULL; + if (pos) + return v; + } + + if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) + return n; + + if (READ_ONCE(n->nud_state) & ~NUD_NOARP) + return n; + + return NULL; +} + +static struct neighbour *neigh_get_first(struct seq_file *seq) +{ + struct neigh_seq_state *state = seq->private; struct neigh_hash_table *nht = state->nht; - struct neighbour *n = NULL; - int bucket; + struct neighbour *n, *tmp; state->flags &= ~NEIGH_SEQ_IS_PNEIGH; - for (bucket = 0; bucket < (1 << nht->hash_shift); bucket++) { - n = rcu_dereference(nht->hash_buckets[bucket]); - while (n) { - if (!net_eq(dev_net(n->dev), net)) - goto next; - if (state->neigh_sub_iter) { - loff_t fakep = 0; - void *v; - - v = state->neigh_sub_iter(state, n, &fakep); - if (!v) - goto next; - } - if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) - break; - if (READ_ONCE(n->nud_state) & ~NUD_NOARP) - break; -next: - n = rcu_dereference(n->next); + while (++state->bucket < (1 << nht->hash_shift)) { + neigh_for_each_in_bucket(n, &nht->hash_heads[state->bucket]) { + tmp = neigh_get_valid(seq, n, NULL); + if (tmp) + return tmp; } - - if (n) - break; } - state->bucket = bucket; - return n; + return NULL; } static struct neighbour *neigh_get_next(struct seq_file *seq, @@ -3234,46 +3187,28 @@ static struct neighbour *neigh_get_next(struct seq_file *seq, loff_t *pos) { struct neigh_seq_state *state = seq->private; - struct net *net = seq_file_net(seq); - struct neigh_hash_table *nht = state->nht; + struct neighbour *tmp; if (state->neigh_sub_iter) { void *v = state->neigh_sub_iter(state, n, pos); + if (v) return n; } - n = rcu_dereference(n->next); - - while (1) { - while (n) { - if (!net_eq(dev_net(n->dev), net)) - goto next; - if (state->neigh_sub_iter) { - void *v = state->neigh_sub_iter(state, n, pos); - if (v) - return n; - goto next; - } - if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) - break; - if (READ_ONCE(n->nud_state) & ~NUD_NOARP) - break; -next: - n = rcu_dereference(n->next); + hlist_for_each_entry_continue(n, hash) { + tmp = neigh_get_valid(seq, n, pos); + if (tmp) { + n = tmp; + goto out; } - - if (n) - break; - - if (++state->bucket >= (1 << nht->hash_shift)) - break; - - n = rcu_dereference(nht->hash_buckets[state->bucket]); } + n = neigh_get_first(seq); +out: if (n && pos) --(*pos); + return n; } @@ -3376,7 +3311,7 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl struct neigh_seq_state *state = seq->private; state->tbl = tbl; - state->bucket = 0; + state->bucket = -1; state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH); rcu_read_lock(); @@ -3886,17 +3821,18 @@ EXPORT_SYMBOL(neigh_sysctl_unregister); #endif /* CONFIG_SYSCTL */ +static const struct rtnl_msg_handler neigh_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWNEIGH, .doit = neigh_add}, + {.msgtype = RTM_DELNEIGH, .doit = neigh_delete}, + {.msgtype = RTM_GETNEIGH, .doit = neigh_get, .dumpit = neigh_dump_info, + .flags = RTNL_FLAG_DUMP_UNLOCKED}, + {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info}, + {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set}, +}; + static int __init neigh_init(void) { - rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETNEIGH, neigh_get, neigh_dump_info, - RTNL_FLAG_DUMP_UNLOCKED); - - rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info, - 0); - rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, 0); - + rtnl_register_many(neigh_rtnl_msg_handlers); return 0; } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 05cf5347f25e..2d9afc6e2161 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -409,7 +409,7 @@ NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec); static int change_gro_flush_timeout(struct net_device *dev, unsigned long val) { - WRITE_ONCE(dev->gro_flush_timeout, val); + netdev_set_gro_flush_timeout(dev, val); return 0; } @@ -429,7 +429,7 @@ static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val if (val > S32_MAX) return -ERANGE; - WRITE_ONCE(dev->napi_defer_hard_irqs, val); + netdev_set_defer_hard_irqs(dev, (u32)val); return 0; } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index e39479f1c9a4..ae34ac818cda 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -56,7 +56,6 @@ static bool init_net_initialized; * outside. */ DECLARE_RWSEM(pernet_ops_rwsem); -EXPORT_SYMBOL_GPL(pernet_ops_rwsem); #define MIN_PERNET_OPS_ID \ ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *)) @@ -317,6 +316,7 @@ static __net_init void preinit_net_sysctl(struct net *net) */ net->core.sysctl_optmem_max = 128 * 1024; net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; + net->core.sysctl_tstamp_allow_data = 1; } /* init code that must occur even if setup_net() is not called. */ @@ -334,6 +334,12 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_ idr_init(&net->netns_ids); spin_lock_init(&net->nsid_lock); mutex_init(&net->ipv4.ra_mutex); + +#ifdef CONFIG_DEBUG_NET_SMALL_RTNL + mutex_init(&net->rtnl_mutex); + lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL); +#endif + preinit_net_sysctl(net); } @@ -694,20 +700,18 @@ EXPORT_SYMBOL_GPL(get_net_ns); struct net *get_net_ns_by_fd(int fd) { - struct fd f = fdget(fd); - struct net *net = ERR_PTR(-EINVAL); + CLASS(fd, f)(fd); - if (!fd_file(f)) + if (fd_empty(f)) return ERR_PTR(-EBADF); if (proc_ns_file(fd_file(f))) { struct ns_common *ns = get_proc_ns(file_inode(fd_file(f))); if (ns->ops == &netns_operations) - net = get_net(container_of(ns, struct net, ns)); + return get_net(container_of(ns, struct net, ns)); } - fdput(f); - return net; + return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(get_net_ns_by_fd); #endif @@ -1153,13 +1157,23 @@ static void __init netns_ipv4_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_early_demux); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_l3mdev_accept); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_reordering); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_rmem); - CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 18); + CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 22); } #endif +static const struct rtnl_msg_handler net_ns_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWNSID, .doit = rtnl_net_newid, + .flags = RTNL_FLAG_DOIT_UNLOCKED}, + {.msgtype = RTM_GETNSID, .doit = rtnl_net_getid, + .dumpit = rtnl_net_dumpid, + .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, +}; + void __init net_ns_init(void) { struct net_generic *ng; @@ -1197,11 +1211,7 @@ void __init net_ns_init(void) if (register_pernet_subsys(&net_ns_ops)) panic("Could not register network namespace subsystems"); - rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL, - RTNL_FLAG_DOIT_UNLOCKED); - rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid, - RTNL_FLAG_DOIT_UNLOCKED | - RTNL_FLAG_DUMP_UNLOCKED); + rtnl_register_many(net_ns_rtnl_msg_handlers); } static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list) diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index b28424ae06d5..a89cbd8d87c3 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -14,12 +14,16 @@ /* Integer value ranges */ static const struct netlink_range_validation netdev_a_page_pool_id_range = { .min = 1ULL, - .max = 4294967295ULL, + .max = U32_MAX, }; static const struct netlink_range_validation netdev_a_page_pool_ifindex_range = { .min = 1ULL, - .max = 2147483647ULL, + .max = S32_MAX, +}; + +static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range = { + .max = S32_MAX, }; /* Common nested types */ @@ -87,6 +91,14 @@ static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1] [NETDEV_A_DMABUF_QUEUES] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy), }; +/* NETDEV_CMD_NAPI_SET - do */ +static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT + 1] = { + [NETDEV_A_NAPI_ID] = { .type = NLA_U32, }, + [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range), + [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, }, + [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, }, +}; + /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -171,6 +183,13 @@ static const struct genl_split_ops netdev_nl_ops[] = { .maxattr = NETDEV_A_DMABUF_FD, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, + { + .cmd = NETDEV_CMD_NAPI_SET, + .doit = netdev_nl_napi_set_doit, + .policy = netdev_napi_set_nl_policy, + .maxattr = NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index 8cda334fd042..e09dd7539ff2 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -33,6 +33,7 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info); enum { NETDEV_NLGRP_MGMT, diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 1cb954f2d39e..9527dd46e4dc 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -24,7 +24,7 @@ struct netdev_nl_dump_ctx { static struct netdev_nl_dump_ctx *netdev_dump_ctx(struct netlink_callback *cb) { - NL_ASSERT_DUMP_CTX_FITS(struct netdev_nl_dump_ctx); + NL_ASSERT_CTX_FITS(struct netdev_nl_dump_ctx); return (struct netdev_nl_dump_ctx *)cb->ctx; } @@ -161,6 +161,9 @@ static int netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, const struct genl_info *info) { + unsigned long irq_suspend_timeout; + unsigned long gro_flush_timeout; + u32 napi_defer_hard_irqs; void *hdr; pid_t pid; @@ -189,6 +192,21 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, goto nla_put_failure; } + napi_defer_hard_irqs = napi_get_defer_hard_irqs(napi); + if (nla_put_s32(rsp, NETDEV_A_NAPI_DEFER_HARD_IRQS, + napi_defer_hard_irqs)) + goto nla_put_failure; + + irq_suspend_timeout = napi_get_irq_suspend_timeout(napi); + if (nla_put_uint(rsp, NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, + irq_suspend_timeout)) + goto nla_put_failure; + + gro_flush_timeout = napi_get_gro_flush_timeout(napi); + if (nla_put_uint(rsp, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, + gro_flush_timeout)) + goto nla_put_failure; + genlmsg_end(rsp, hdr); return 0; @@ -215,6 +233,7 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; rtnl_lock(); + rcu_read_lock(); napi = napi_by_id(napi_id); if (napi) { @@ -224,6 +243,7 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) err = -ENOENT; } + rcu_read_unlock(); rtnl_unlock(); if (err) @@ -292,6 +312,59 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) } static int +netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info) +{ + u64 irq_suspend_timeout = 0; + u64 gro_flush_timeout = 0; + u32 defer = 0; + + if (info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]) { + defer = nla_get_u32(info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]); + napi_set_defer_hard_irqs(napi, defer); + } + + if (info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]) { + irq_suspend_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]); + napi_set_irq_suspend_timeout(napi, irq_suspend_timeout); + } + + if (info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]) { + gro_flush_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]); + napi_set_gro_flush_timeout(napi, gro_flush_timeout); + } + + return 0; +} + +int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct napi_struct *napi; + unsigned int napi_id; + int err; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID)) + return -EINVAL; + + napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]); + + rtnl_lock(); + rcu_read_lock(); + + napi = napi_by_id(napi_id); + if (napi) { + err = netdev_nl_napi_set_config(napi, info); + } else { + NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]); + err = -ENOENT; + } + + rcu_read_unlock(); + rtnl_unlock(); + + return err; +} + +static int netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { diff --git a/net/core/netpoll.c b/net/core/netpoll.c index aa49b92e9194..2e459b9d88eb 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -45,9 +45,6 @@ #define MAX_UDP_CHUNK 1460 #define MAX_SKBS 32 - -static struct sk_buff_head skb_pool; - #define USEC_PER_POLL 50 #define MAX_SKB_SIZE \ @@ -234,20 +231,23 @@ void netpoll_poll_enable(struct net_device *dev) up(&ni->dev_lock); } -static void refill_skbs(void) +static void refill_skbs(struct netpoll *np) { + struct sk_buff_head *skb_pool; struct sk_buff *skb; unsigned long flags; - spin_lock_irqsave(&skb_pool.lock, flags); - while (skb_pool.qlen < MAX_SKBS) { + skb_pool = &np->skb_pool; + + spin_lock_irqsave(&skb_pool->lock, flags); + while (skb_pool->qlen < MAX_SKBS) { skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC); if (!skb) break; - __skb_queue_tail(&skb_pool, skb); + __skb_queue_tail(skb_pool, skb); } - spin_unlock_irqrestore(&skb_pool.lock, flags); + spin_unlock_irqrestore(&skb_pool->lock, flags); } static void zap_completion_queue(void) @@ -284,12 +284,12 @@ static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve) struct sk_buff *skb; zap_completion_queue(); - refill_skbs(); + refill_skbs(np); repeat: skb = alloc_skb(len, GFP_ATOMIC); if (!skb) - skb = skb_dequeue(&skb_pool); + skb = skb_dequeue(&np->skb_pool); if (!skb) { if (++count < 10) { @@ -531,6 +531,14 @@ static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr) return -1; } +static void skb_pool_flush(struct netpoll *np) +{ + struct sk_buff_head *skb_pool; + + skb_pool = &np->skb_pool; + skb_queue_purge_reason(skb_pool, SKB_CONSUMED); +} + int netpoll_parse_options(struct netpoll *np, char *opt) { char *cur=opt, *delim; @@ -626,7 +634,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) goto out; } - if (!ndev->npinfo) { + if (!rcu_access_pointer(ndev->npinfo)) { npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); if (!npinfo) { err = -ENOMEM; @@ -641,7 +649,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) ops = ndev->netdev_ops; if (ops->ndo_netpoll_setup) { - err = ops->ndo_netpoll_setup(ndev, npinfo); + err = ops->ndo_netpoll_setup(ndev); if (err) goto free_npinfo; } @@ -673,6 +681,8 @@ int netpoll_setup(struct netpoll *np) struct in_device *in_dev; int err; + skb_queue_head_init(&np->skb_pool); + rtnl_lock(); if (np->dev_name[0]) { struct net *net = current->nsproxy->net_ns; @@ -773,14 +783,16 @@ put_noaddr: } /* fill up the skb queue */ - refill_skbs(); + refill_skbs(np); err = __netpoll_setup(np, ndev); if (err) - goto put; + goto flush; rtnl_unlock(); return 0; +flush: + skb_pool_flush(np); put: DEBUG_NET_WARN_ON_ONCE(np->dev); if (ip_overwritten) @@ -792,13 +804,6 @@ unlock: } EXPORT_SYMBOL(netpoll_setup); -static int __init netpoll_init(void) -{ - skb_queue_head_init(&skb_pool); - return 0; -} -core_initcall(netpoll_init); - static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head) { struct netpoll_info *npinfo = @@ -835,6 +840,8 @@ void __netpoll_cleanup(struct netpoll *np) call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info); } else RCU_INIT_POINTER(np->dev->npinfo, NULL); + + skb_pool_flush(np); } EXPORT_SYMBOL_GPL(__netpoll_cleanup); diff --git a/net/core/page_pool.c b/net/core/page_pool.c index a813d30d2135..f89cf93f6eb4 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -950,6 +950,7 @@ netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, if (netmem && *offset + size > max_size) { netmem = page_pool_drain_frag(pool, netmem); if (netmem) { + recycle_stat_inc(pool, cached); alloc_stat_inc(pool, fast); goto frag_reset; } @@ -974,7 +975,6 @@ frag_reset: pool->frag_users++; pool->frag_offset = *offset + size; - alloc_stat_inc(pool, fast); return netmem; } EXPORT_SYMBOL(page_pool_alloc_frag_netmem); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 34f68ef74b8f..7e23cacbe66e 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2285,7 +2285,7 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) s64 remaining; struct hrtimer_sleeper t; - hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_setup_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); hrtimer_set_expires(&t.timer, spin_until); remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer)); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2ba5cd965d3f..dd142f444659 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -179,6 +179,166 @@ bool lockdep_rtnl_is_held(void) EXPORT_SYMBOL(lockdep_rtnl_is_held); #endif /* #ifdef CONFIG_PROVE_LOCKING */ +#ifdef CONFIG_DEBUG_NET_SMALL_RTNL +void __rtnl_net_lock(struct net *net) +{ + ASSERT_RTNL(); + + mutex_lock(&net->rtnl_mutex); +} +EXPORT_SYMBOL(__rtnl_net_lock); + +void __rtnl_net_unlock(struct net *net) +{ + ASSERT_RTNL(); + + mutex_unlock(&net->rtnl_mutex); +} +EXPORT_SYMBOL(__rtnl_net_unlock); + +void rtnl_net_lock(struct net *net) +{ + rtnl_lock(); + __rtnl_net_lock(net); +} +EXPORT_SYMBOL(rtnl_net_lock); + +void rtnl_net_unlock(struct net *net) +{ + __rtnl_net_unlock(net); + rtnl_unlock(); +} +EXPORT_SYMBOL(rtnl_net_unlock); + +int rtnl_net_trylock(struct net *net) +{ + int ret = rtnl_trylock(); + + if (ret) + __rtnl_net_lock(net); + + return ret; +} +EXPORT_SYMBOL(rtnl_net_trylock); + +static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b) +{ + if (net_eq(net_a, net_b)) + return 0; + + /* always init_net first */ + if (net_eq(net_a, &init_net)) + return -1; + + if (net_eq(net_b, &init_net)) + return 1; + + /* otherwise lock in ascending order */ + return net_a < net_b ? -1 : 1; +} + +int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) +{ + const struct net *net_a, *net_b; + + net_a = container_of(a, struct net, rtnl_mutex.dep_map); + net_b = container_of(b, struct net, rtnl_mutex.dep_map); + + return rtnl_net_cmp_locks(net_a, net_b); +} + +bool rtnl_net_is_locked(struct net *net) +{ + return rtnl_is_locked() && mutex_is_locked(&net->rtnl_mutex); +} +EXPORT_SYMBOL(rtnl_net_is_locked); + +bool lockdep_rtnl_net_is_held(struct net *net) +{ + return lockdep_rtnl_is_held() && lockdep_is_held(&net->rtnl_mutex); +} +EXPORT_SYMBOL(lockdep_rtnl_net_is_held); +#else +static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b) +{ + /* No need to swap */ + return -1; +} +#endif + +struct rtnl_nets { + /* ->newlink() needs to freeze 3 netns at most; + * 2 for the new device, 1 for its peer. + */ + struct net *net[3]; + unsigned char len; +}; + +static void rtnl_nets_init(struct rtnl_nets *rtnl_nets) +{ + memset(rtnl_nets, 0, sizeof(*rtnl_nets)); +} + +static void rtnl_nets_destroy(struct rtnl_nets *rtnl_nets) +{ + int i; + + for (i = 0; i < rtnl_nets->len; i++) { + put_net(rtnl_nets->net[i]); + rtnl_nets->net[i] = NULL; + } + + rtnl_nets->len = 0; +} + +/** + * rtnl_nets_add - Add netns to be locked before ->newlink(). + * + * @rtnl_nets: rtnl_nets pointer passed to ->get_peer_net(). + * @net: netns pointer with an extra refcnt held. + * + * The extra refcnt is released in rtnl_nets_destroy(). + */ +static void rtnl_nets_add(struct rtnl_nets *rtnl_nets, struct net *net) +{ + int i; + + DEBUG_NET_WARN_ON_ONCE(rtnl_nets->len == ARRAY_SIZE(rtnl_nets->net)); + + for (i = 0; i < rtnl_nets->len; i++) { + switch (rtnl_net_cmp_locks(rtnl_nets->net[i], net)) { + case 0: + put_net(net); + return; + case 1: + swap(rtnl_nets->net[i], net); + } + } + + rtnl_nets->net[i] = net; + rtnl_nets->len++; +} + +static void rtnl_nets_lock(struct rtnl_nets *rtnl_nets) +{ + int i; + + rtnl_lock(); + + for (i = 0; i < rtnl_nets->len; i++) + __rtnl_net_lock(rtnl_nets->net[i]); +} + +static void rtnl_nets_unlock(struct rtnl_nets *rtnl_nets) +{ + int i; + + for (i = 0; i < rtnl_nets->len; i++) + __rtnl_net_unlock(rtnl_nets->net[i]); + + rtnl_unlock(); +} + static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) @@ -269,64 +429,13 @@ unlock: } /** - * rtnl_register_module - Register a rtnetlink message type - * - * @owner: module registering the hook (THIS_MODULE) - * @protocol: Protocol family or PF_UNSPEC - * @msgtype: rtnetlink message type - * @doit: Function pointer called for each request message - * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message - * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions - * - * Like rtnl_register, but for use by removable modules. - */ -int rtnl_register_module(struct module *owner, - int protocol, int msgtype, - rtnl_doit_func doit, rtnl_dumpit_func dumpit, - unsigned int flags) -{ - return rtnl_register_internal(owner, protocol, msgtype, - doit, dumpit, flags); -} -EXPORT_SYMBOL_GPL(rtnl_register_module); - -/** - * rtnl_register - Register a rtnetlink message type - * @protocol: Protocol family or PF_UNSPEC - * @msgtype: rtnetlink message type - * @doit: Function pointer called for each request message - * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message - * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions - * - * Registers the specified function pointers (at least one of them has - * to be non-NULL) to be called whenever a request message for the - * specified protocol family and message type is received. - * - * The special protocol family PF_UNSPEC may be used to define fallback - * function pointers for the case when no entry for the specific protocol - * family exists. - */ -void rtnl_register(int protocol, int msgtype, - rtnl_doit_func doit, rtnl_dumpit_func dumpit, - unsigned int flags) -{ - int err; - - err = rtnl_register_internal(NULL, protocol, msgtype, doit, dumpit, - flags); - if (err) - pr_err("Unable to register rtnetlink message handler, " - "protocol = %d, message type = %d\n", protocol, msgtype); -} - -/** * rtnl_unregister - Unregister a rtnetlink message type * @protocol: Protocol family or PF_UNSPEC * @msgtype: rtnetlink message type * * Returns 0 on success or a negative error code. */ -int rtnl_unregister(int protocol, int msgtype) +static int rtnl_unregister(int protocol, int msgtype) { struct rtnl_link __rcu **tab; struct rtnl_link *link; @@ -349,7 +458,6 @@ int rtnl_unregister(int protocol, int msgtype) return 0; } -EXPORT_SYMBOL_GPL(rtnl_unregister); /** * rtnl_unregister_all - Unregister all rtnetlink message type of a protocol @@ -384,6 +492,26 @@ void rtnl_unregister_all(int protocol) } EXPORT_SYMBOL_GPL(rtnl_unregister_all); +/** + * __rtnl_register_many - Register rtnetlink message types + * @handlers: Array of struct rtnl_msg_handlers + * @n: The length of @handlers + * + * Registers the specified function pointers (at least one of them has + * to be non-NULL) to be called whenever a request message for the + * specified protocol family and message type is received. + * + * The special protocol family PF_UNSPEC may be used to define fallback + * function pointers for the case when no entry for the specific protocol + * family exists. + * + * When one element of @handlers fails to register, + * 1) built-in: panics. + * 2) modules : the previous successful registrations are unwinded + * and an error is returned. + * + * Use rtnl_register_many(). + */ int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n) { const struct rtnl_msg_handler *handler; @@ -394,6 +522,10 @@ int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n) handler->msgtype, handler->doit, handler->dumpit, handler->flags); if (err) { + if (!handler->owner) + panic("Unable to register rtnetlink message " + "handlers, %pS\n", handlers); + __rtnl_unregister_many(handlers, i); break; } @@ -413,46 +545,33 @@ void __rtnl_unregister_many(const struct rtnl_msg_handler *handlers, int n) } EXPORT_SYMBOL_GPL(__rtnl_unregister_many); +static DEFINE_MUTEX(link_ops_mutex); static LIST_HEAD(link_ops); -static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) +static struct rtnl_link_ops *rtnl_link_ops_get(const char *kind, int *srcu_index) { - const struct rtnl_link_ops *ops; + struct rtnl_link_ops *ops; + + rcu_read_lock(); - list_for_each_entry(ops, &link_ops, list) { - if (!strcmp(ops->kind, kind)) - return ops; + list_for_each_entry_rcu(ops, &link_ops, list) { + if (!strcmp(ops->kind, kind)) { + *srcu_index = srcu_read_lock(&ops->srcu); + goto unlock; + } } - return NULL; -} -/** - * __rtnl_link_register - Register rtnl_link_ops with rtnetlink. - * @ops: struct rtnl_link_ops * to register - * - * The caller must hold the rtnl_mutex. This function should be used - * by drivers that create devices during module initialization. It - * must be called before registering the devices. - * - * Returns 0 on success or a negative error code. - */ -int __rtnl_link_register(struct rtnl_link_ops *ops) -{ - if (rtnl_link_ops_get(ops->kind)) - return -EEXIST; + ops = NULL; +unlock: + rcu_read_unlock(); - /* The check for alloc/setup is here because if ops - * does not have that filled up, it is not possible - * to use the ops for creating device. So do not - * fill up dellink as well. That disables rtnl_dellink. - */ - if ((ops->alloc || ops->setup) && !ops->dellink) - ops->dellink = unregister_netdevice_queue; + return ops; +} - list_add_tail(&ops->list, &link_ops); - return 0; +static void rtnl_link_ops_put(struct rtnl_link_ops *ops, int srcu_index) +{ + srcu_read_unlock(&ops->srcu, srcu_index); } -EXPORT_SYMBOL_GPL(__rtnl_link_register); /** * rtnl_link_register - Register rtnl_link_ops with rtnetlink. @@ -462,6 +581,7 @@ EXPORT_SYMBOL_GPL(__rtnl_link_register); */ int rtnl_link_register(struct rtnl_link_ops *ops) { + struct rtnl_link_ops *tmp; int err; /* Sanity-check max sizes to avoid stack buffer overflow. */ @@ -469,9 +589,31 @@ int rtnl_link_register(struct rtnl_link_ops *ops) ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)) return -EINVAL; - rtnl_lock(); - err = __rtnl_link_register(ops); - rtnl_unlock(); + /* The check for alloc/setup is here because if ops + * does not have that filled up, it is not possible + * to use the ops for creating device. So do not + * fill up dellink as well. That disables rtnl_dellink. + */ + if ((ops->alloc || ops->setup) && !ops->dellink) + ops->dellink = unregister_netdevice_queue; + + err = init_srcu_struct(&ops->srcu); + if (err) + return err; + + mutex_lock(&link_ops_mutex); + + list_for_each_entry(tmp, &link_ops, list) { + if (!strcmp(ops->kind, tmp->kind)) { + err = -EEXIST; + goto unlock; + } + } + + list_add_tail_rcu(&ops->list, &link_ops); +unlock: + mutex_unlock(&link_ops_mutex); + return err; } EXPORT_SYMBOL_GPL(rtnl_link_register); @@ -488,25 +630,6 @@ static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) unregister_netdevice_many(&list_kill); } -/** - * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. - * @ops: struct rtnl_link_ops * to unregister - * - * The caller must hold the rtnl_mutex and guarantee net_namespace_list - * integrity (hold pernet_ops_rwsem for writing to close the race - * with setup_net() and cleanup_net()). - */ -void __rtnl_link_unregister(struct rtnl_link_ops *ops) -{ - struct net *net; - - for_each_net(net) { - __rtnl_kill_links(net, ops); - } - list_del(&ops->list); -} -EXPORT_SYMBOL_GPL(__rtnl_link_unregister); - /* Return with the rtnl_lock held when there are no network * devices unregistering in any network namespace. */ @@ -535,10 +658,22 @@ static void rtnl_lock_unregistering_all(void) */ void rtnl_link_unregister(struct rtnl_link_ops *ops) { + struct net *net; + + mutex_lock(&link_ops_mutex); + list_del_rcu(&ops->list); + mutex_unlock(&link_ops_mutex); + + synchronize_srcu(&ops->srcu); + cleanup_srcu_struct(&ops->srcu); + /* Close the race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); rtnl_lock_unregistering_all(); - __rtnl_link_unregister(ops); + + for_each_net(net) + __rtnl_kill_links(net, ops); + rtnl_unlock(); up_write(&pernet_ops_rwsem); } @@ -595,31 +730,51 @@ static size_t rtnl_link_get_size(const struct net_device *dev) static LIST_HEAD(rtnl_af_ops); -static const struct rtnl_af_ops *rtnl_af_lookup(const int family) +static struct rtnl_af_ops *rtnl_af_lookup(const int family, int *srcu_index) { - const struct rtnl_af_ops *ops; + struct rtnl_af_ops *ops; ASSERT_RTNL(); - list_for_each_entry(ops, &rtnl_af_ops, list) { - if (ops->family == family) - return ops; + rcu_read_lock(); + + list_for_each_entry_rcu(ops, &rtnl_af_ops, list) { + if (ops->family == family) { + *srcu_index = srcu_read_lock(&ops->srcu); + goto unlock; + } } - return NULL; + ops = NULL; +unlock: + rcu_read_unlock(); + + return ops; +} + +static void rtnl_af_put(struct rtnl_af_ops *ops, int srcu_index) +{ + srcu_read_unlock(&ops->srcu, srcu_index); } /** * rtnl_af_register - Register rtnl_af_ops with rtnetlink. * @ops: struct rtnl_af_ops * to register * - * Returns 0 on success or a negative error code. + * Return: 0 on success or a negative error code. */ -void rtnl_af_register(struct rtnl_af_ops *ops) +int rtnl_af_register(struct rtnl_af_ops *ops) { + int err = init_srcu_struct(&ops->srcu); + + if (err) + return err; + rtnl_lock(); list_add_tail_rcu(&ops->list, &rtnl_af_ops); rtnl_unlock(); + + return 0; } EXPORT_SYMBOL_GPL(rtnl_af_register); @@ -634,6 +789,8 @@ void rtnl_af_unregister(struct rtnl_af_ops *ops) rtnl_unlock(); synchronize_rcu(); + synchronize_srcu(&ops->srcu); + cleanup_srcu_struct(&ops->srcu); } EXPORT_SYMBOL_GPL(rtnl_af_unregister); @@ -1147,6 +1304,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */ + rtnl_devlink_port_size(dev) + rtnl_dpll_pin_size(dev) + + nla_total_size(8) /* IFLA_MAX_PACING_OFFLOAD_HORIZON */ + 0; } @@ -1896,6 +2054,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, READ_ONCE(dev->tso_max_size)) || nla_put_u32(skb, IFLA_TSO_MAX_SEGS, READ_ONCE(dev->tso_max_segs)) || + nla_put_uint(skb, IFLA_MAX_PACING_OFFLOAD_HORIZON, + READ_ONCE(dev->max_pacing_offload_horizon)) || #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, READ_ONCE(dev->num_rx_queues)) || @@ -2004,6 +2164,7 @@ nla_put_failure: } static const struct nla_policy ifla_policy[IFLA_MAX+1] = { + [IFLA_UNSPEC] = { .strict_start_type = IFLA_DPLL_PIN }, [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 }, [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, @@ -2112,10 +2273,11 @@ static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { [IFLA_XDP_PROG_ID] = { .type = NLA_U32 }, }; -static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla) +static struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla, + int *ops_srcu_index) { - const struct rtnl_link_ops *ops = NULL; struct nlattr *linfo[IFLA_INFO_MAX + 1]; + struct rtnl_link_ops *ops = NULL; if (nla_parse_nested_deprecated(linfo, IFLA_INFO_MAX, nla, ifla_info_policy, NULL) < 0) return NULL; @@ -2124,7 +2286,7 @@ static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla char kind[MODULE_NAME_LEN]; nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); - ops = rtnl_link_ops_get(kind); + ops = rtnl_link_ops_get(kind, ops_srcu_index); } return ops; @@ -2244,8 +2406,8 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { - const struct rtnl_link_ops *kind_ops = NULL; struct netlink_ext_ack *extack = cb->extack; + struct rtnl_link_ops *kind_ops = NULL; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int flags = NLM_F_MULTI; @@ -2256,6 +2418,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) struct net *tgt_net = net; u32 ext_filter_mask = 0; struct net_device *dev; + int ops_srcu_index; int master_idx = 0; int netnsid = -1; int err, i; @@ -2289,7 +2452,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) master_idx = nla_get_u32(tb[i]); break; case IFLA_LINKINFO: - kind_ops = linkinfo_to_kind_ops(tb[i]); + kind_ops = linkinfo_to_kind_ops(tb[i], &ops_srcu_index); break; default: if (cb->strict_check) { @@ -2315,6 +2478,10 @@ walk_entries: if (err < 0) break; } + + if (kind_ops) + rtnl_link_ops_put(kind_ops, ops_srcu_index); + cb->seq = tgt_net->dev_base_seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); if (netnsid >= 0) @@ -2345,9 +2512,10 @@ int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer, } EXPORT_SYMBOL(rtnl_nla_parse_ifinfomsg); -struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) +static struct net *rtnl_link_get_net_ifla(struct nlattr *tb[]) { - struct net *net; + struct net *net = NULL; + /* Examine the link attributes and figure out which * network namespace we are talking about. */ @@ -2355,8 +2523,17 @@ struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); else if (tb[IFLA_NET_NS_FD]) net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD])); - else + + return net; +} + +struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) +{ + struct net *net = rtnl_link_get_net_ifla(tb); + + if (!net) net = get_net(src_net); + return net; } EXPORT_SYMBOL(rtnl_link_get_net); @@ -2496,20 +2673,24 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], int rem, err; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { - const struct rtnl_af_ops *af_ops; + struct rtnl_af_ops *af_ops; + int af_ops_srcu_index; - af_ops = rtnl_af_lookup(nla_type(af)); + af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index); if (!af_ops) return -EAFNOSUPPORT; if (!af_ops->set_link_af) - return -EOPNOTSUPP; - - if (af_ops->validate_link_af) { + err = -EOPNOTSUPP; + else if (af_ops->validate_link_af) err = af_ops->validate_link_af(dev, af, extack); - if (err < 0) - return err; - } + else + err = 0; + + rtnl_af_put(af_ops, af_ops_srcu_index); + + if (err < 0) + return err; } } @@ -2800,8 +2981,8 @@ static int do_set_proto_down(struct net_device *dev, #define DO_SETLINK_MODIFIED 0x01 /* notify flag means notify + modified. */ #define DO_SETLINK_NOTIFY 0x03 -static int do_setlink(const struct sk_buff *skb, - struct net_device *dev, struct ifinfomsg *ifm, +static int do_setlink(const struct sk_buff *skb, struct net_device *dev, + struct net *tgt_net, struct ifinfomsg *ifm, struct netlink_ext_ack *extack, struct nlattr **tb, int status) { @@ -2809,32 +2990,25 @@ static int do_setlink(const struct sk_buff *skb, char ifname[IFNAMSIZ]; int err; + err = validate_linkmsg(dev, tb, extack); + if (err < 0) + goto errout; + if (tb[IFLA_IFNAME]) nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else ifname[0] = '\0'; - if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) { + if (!net_eq(tgt_net, dev_net(dev))) { const char *pat = ifname[0] ? ifname : NULL; - struct net *net; int new_ifindex; - net = rtnl_link_get_net_capable(skb, dev_net(dev), - tb, CAP_NET_ADMIN); - if (IS_ERR(net)) { - err = PTR_ERR(net); - goto errout; - } - - if (tb[IFLA_NEW_IFINDEX]) - new_ifindex = nla_get_s32(tb[IFLA_NEW_IFINDEX]); - else - new_ifindex = 0; + new_ifindex = nla_get_s32_default(tb[IFLA_NEW_IFINDEX], 0); - err = __dev_change_net_namespace(dev, net, pat, new_ifindex); - put_net(net); + err = __dev_change_net_namespace(dev, tgt_net, pat, new_ifindex); if (err) goto errout; + status |= DO_SETLINK_MODIFIED; } @@ -3093,11 +3267,18 @@ static int do_setlink(const struct sk_buff *skb, int rem; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { - const struct rtnl_af_ops *af_ops; + struct rtnl_af_ops *af_ops; + int af_ops_srcu_index; - BUG_ON(!(af_ops = rtnl_af_lookup(nla_type(af)))); + af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index); + if (!af_ops) { + err = -EAFNOSUPPORT; + goto errout; + } err = af_ops->set_link_af(dev, af, extack); + rtnl_af_put(af_ops, af_ops_srcu_index); + if (err < 0) goto errout; @@ -3194,11 +3375,13 @@ static struct net_device *rtnl_dev_get(struct net *net, static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct ifinfomsg *ifm = nlmsg_data(nlh); struct net *net = sock_net(skb->sk); - struct ifinfomsg *ifm; - struct net_device *dev; - int err; struct nlattr *tb[IFLA_MAX+1]; + struct net_device *dev = NULL; + struct rtnl_nets rtnl_nets; + struct net *tgt_net; + int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); @@ -3209,25 +3392,31 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; - err = -EINVAL; - ifm = nlmsg_data(nlh); + tgt_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); + if (IS_ERR(tgt_net)) { + err = PTR_ERR(tgt_net); + goto errout; + } + + rtnl_nets_init(&rtnl_nets); + rtnl_nets_add(&rtnl_nets, get_net(net)); + rtnl_nets_add(&rtnl_nets, tgt_net); + + rtnl_nets_lock(&rtnl_nets); + if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(net, tb); else - goto errout; + err = -EINVAL; - if (dev == NULL) { + if (dev) + err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0); + else if (!err) err = -ENODEV; - goto errout; - } - err = validate_linkmsg(dev, tb, extack); - if (err < 0) - goto errout; - - err = do_setlink(skb, dev, ifm, extack, tb, 0); + rtnl_nets_unlock(&rtnl_nets); errout: return err; } @@ -3287,14 +3476,14 @@ EXPORT_SYMBOL_GPL(rtnl_delete_link); static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct ifinfomsg *ifm = nlmsg_data(nlh); struct net *net = sock_net(skb->sk); u32 portid = NETLINK_CB(skb).portid; - struct net *tgt_net = net; - struct net_device *dev = NULL; - struct ifinfomsg *ifm; struct nlattr *tb[IFLA_MAX+1]; - int err; + struct net_device *dev = NULL; + struct net *tgt_net = net; int netnsid = -1; + int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); @@ -3312,27 +3501,24 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, return PTR_ERR(tgt_net); } - err = -EINVAL; - ifm = nlmsg_data(nlh); + rtnl_net_lock(tgt_net); + if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(tgt_net, tb); + + if (dev) + err = rtnl_delete_link(dev, portid, nlh); + else if (ifm->ifi_index > 0 || tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + err = -ENODEV; else if (tb[IFLA_GROUP]) err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP])); else - goto out; - - if (!dev) { - if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME] || ifm->ifi_index > 0) - err = -ENODEV; - - goto out; - } + err = -EINVAL; - err = rtnl_delete_link(dev, portid, nlh); + rtnl_net_unlock(tgt_net); -out: if (netnsid >= 0) put_net(tgt_net); @@ -3459,21 +3645,90 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, } EXPORT_SYMBOL(rtnl_create_link); +struct rtnl_newlink_tbs { + struct nlattr *tb[IFLA_MAX + 1]; + struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; + struct nlattr *attr[RTNL_MAX_TYPE + 1]; + struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; +}; + +static int rtnl_changelink(const struct sk_buff *skb, struct nlmsghdr *nlh, + const struct rtnl_link_ops *ops, + struct net_device *dev, struct net *tgt_net, + struct rtnl_newlink_tbs *tbs, + struct nlattr **data, + struct netlink_ext_ack *extack) +{ + struct nlattr ** const linkinfo = tbs->linkinfo; + struct nlattr ** const tb = tbs->tb; + int status = 0; + int err; + + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + + if (nlh->nlmsg_flags & NLM_F_REPLACE) + return -EOPNOTSUPP; + + if (linkinfo[IFLA_INFO_DATA]) { + if (!ops || ops != dev->rtnl_link_ops || !ops->changelink) + return -EOPNOTSUPP; + + err = ops->changelink(dev, tb, data, extack); + if (err < 0) + return err; + + status |= DO_SETLINK_NOTIFY; + } + + if (linkinfo[IFLA_INFO_SLAVE_DATA]) { + const struct rtnl_link_ops *m_ops = NULL; + struct nlattr **slave_data = NULL; + struct net_device *master_dev; + + master_dev = netdev_master_upper_dev_get(dev); + if (master_dev) + m_ops = master_dev->rtnl_link_ops; + + if (!m_ops || !m_ops->slave_changelink) + return -EOPNOTSUPP; + + if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) + return -EINVAL; + + if (m_ops->slave_maxtype) { + err = nla_parse_nested_deprecated(tbs->slave_attr, + m_ops->slave_maxtype, + linkinfo[IFLA_INFO_SLAVE_DATA], + m_ops->slave_policy, extack); + if (err < 0) + return err; + + slave_data = tbs->slave_attr; + } + + err = m_ops->slave_changelink(master_dev, dev, tb, slave_data, extack); + if (err < 0) + return err; + + status |= DO_SETLINK_NOTIFY; + } + + return do_setlink(skb, dev, tgt_net, nlmsg_data(nlh), extack, tb, status); +} + static int rtnl_group_changelink(const struct sk_buff *skb, - struct net *net, int group, - struct ifinfomsg *ifm, - struct netlink_ext_ack *extack, - struct nlattr **tb) + struct net *net, struct net *tgt_net, + int group, struct ifinfomsg *ifm, + struct netlink_ext_ack *extack, + struct nlattr **tb) { struct net_device *dev, *aux; int err; for_each_netdev_safe(net, dev, aux) { if (dev->group == group) { - err = validate_linkmsg(dev, tb, extack); - if (err < 0) - return err; - err = do_setlink(skb, dev, ifm, extack, tb, 0); + err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0); if (err < 0) return err; } @@ -3484,6 +3739,7 @@ static int rtnl_group_changelink(const struct sk_buff *skb, static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, const struct rtnl_link_ops *ops, + struct net *tgt_net, struct net *link_net, const struct nlmsghdr *nlh, struct nlattr **tb, struct nlattr **data, struct netlink_ext_ack *extack) @@ -3491,7 +3747,6 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, unsigned char name_assign_type = NET_NAME_USER; struct net *net = sock_net(skb->sk); u32 portid = NETLINK_CB(skb).portid; - struct net *dest_net, *link_net; struct net_device *dev; char ifname[IFNAMSIZ]; int err; @@ -3506,27 +3761,7 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, name_assign_type = NET_NAME_ENUM; } - dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); - if (IS_ERR(dest_net)) - return PTR_ERR(dest_net); - - if (tb[IFLA_LINK_NETNSID]) { - int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); - - link_net = get_net_ns_by_id(dest_net, id); - if (!link_net) { - NL_SET_ERR_MSG(extack, "Unknown network namespace id"); - err = -EINVAL; - goto out; - } - err = -EPERM; - if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) - goto out; - } else { - link_net = NULL; - } - - dev = rtnl_create_link(link_net ? : dest_net, ifname, + dev = rtnl_create_link(link_net ? : tgt_net, ifname, name_assign_type, ops, tb, extack); if (IS_ERR(dev)) { err = PTR_ERR(dev); @@ -3548,7 +3783,7 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, if (err < 0) goto out_unregister; if (link_net) { - err = dev_change_net_namespace(dev, dest_net, ifname); + err = dev_change_net_namespace(dev, tgt_net, ifname); if (err < 0) goto out_unregister; } @@ -3558,9 +3793,6 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, goto out_unregister; } out: - if (link_net) - put_net(link_net); - put_net(dest_net); return err; out_unregister: if (ops->newlink) { @@ -3574,41 +3806,49 @@ out_unregister: goto out; } -struct rtnl_newlink_tbs { +static int rtnl_add_peer_net(struct rtnl_nets *rtnl_nets, + const struct rtnl_link_ops *ops, + struct nlattr *data[], + struct netlink_ext_ack *extack) +{ struct nlattr *tb[IFLA_MAX + 1]; - struct nlattr *attr[RTNL_MAX_TYPE + 1]; - struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; -}; + struct net *net; + int err; + + if (!data || !data[ops->peer_type]) + return 0; + + err = rtnl_nla_parse_ifinfomsg(tb, data[ops->peer_type], extack); + if (err < 0) + return err; + + if (ops->validate) { + err = ops->validate(tb, NULL, extack); + if (err < 0) + return err; + } + + net = rtnl_link_get_net_ifla(tb); + if (IS_ERR(net)) + return PTR_ERR(net); + if (net) + rtnl_nets_add(rtnl_nets, net); + + return 0; +} static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + const struct rtnl_link_ops *ops, + struct net *tgt_net, struct net *link_net, struct rtnl_newlink_tbs *tbs, + struct nlattr **data, struct netlink_ext_ack *extack) { - struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; struct nlattr ** const tb = tbs->tb; - const struct rtnl_link_ops *m_ops; - struct net_device *master_dev; struct net *net = sock_net(skb->sk); - const struct rtnl_link_ops *ops; - struct nlattr **slave_data; - char kind[MODULE_NAME_LEN]; struct net_device *dev; struct ifinfomsg *ifm; - struct nlattr **data; bool link_specified; - int err; - -#ifdef CONFIG_MODULES -replay: -#endif - err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, - ifla_policy, extack); - if (err < 0) - return err; - - err = rtnl_ensure_unique_netns(tb, extack, false); - if (err < 0) - return err; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) { @@ -3625,151 +3865,148 @@ replay: dev = NULL; } - master_dev = NULL; - m_ops = NULL; - if (dev) { - master_dev = netdev_master_upper_dev_get(dev); - if (master_dev) - m_ops = master_dev->rtnl_link_ops; + if (dev) + return rtnl_changelink(skb, nlh, ops, dev, tgt_net, tbs, data, extack); + + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + /* No dev found and NLM_F_CREATE not set. Requested dev does not exist, + * or it's for a group + */ + if (link_specified || !tb[IFLA_GROUP]) + return -ENODEV; + + return rtnl_group_changelink(skb, net, tgt_net, + nla_get_u32(tb[IFLA_GROUP]), + ifm, extack, tb); } + if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) + return -EOPNOTSUPP; + + if (!ops) { + NL_SET_ERR_MSG(extack, "Unknown device type"); + return -EOPNOTSUPP; + } + + return rtnl_newlink_create(skb, ifm, ops, tgt_net, link_net, nlh, tb, data, extack); +} + +static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr **tb, **linkinfo, **data = NULL; + struct net *tgt_net, *link_net = NULL; + struct rtnl_link_ops *ops = NULL; + struct rtnl_newlink_tbs *tbs; + struct rtnl_nets rtnl_nets; + int ops_srcu_index; + int ret; + + tbs = kmalloc(sizeof(*tbs), GFP_KERNEL); + if (!tbs) + return -ENOMEM; + + tb = tbs->tb; + ret = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), tb, + IFLA_MAX, ifla_policy, extack); + if (ret < 0) + goto free; + + ret = rtnl_ensure_unique_netns(tb, extack, false); + if (ret < 0) + goto free; + + linkinfo = tbs->linkinfo; if (tb[IFLA_LINKINFO]) { - err = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX, + ret = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX, tb[IFLA_LINKINFO], ifla_info_policy, NULL); - if (err < 0) - return err; - } else - memset(linkinfo, 0, sizeof(linkinfo)); + if (ret < 0) + goto free; + } else { + memset(linkinfo, 0, sizeof(tbs->linkinfo)); + } if (linkinfo[IFLA_INFO_KIND]) { + char kind[MODULE_NAME_LEN]; + nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); - ops = rtnl_link_ops_get(kind); - } else { - kind[0] = '\0'; - ops = NULL; + ops = rtnl_link_ops_get(kind, &ops_srcu_index); +#ifdef CONFIG_MODULES + if (!ops) { + request_module("rtnl-link-%s", kind); + ops = rtnl_link_ops_get(kind, &ops_srcu_index); + } +#endif } - data = NULL; + rtnl_nets_init(&rtnl_nets); + if (ops) { - if (ops->maxtype > RTNL_MAX_TYPE) - return -EINVAL; + if (ops->maxtype > RTNL_MAX_TYPE) { + ret = -EINVAL; + goto put_ops; + } if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { - err = nla_parse_nested_deprecated(tbs->attr, ops->maxtype, + ret = nla_parse_nested_deprecated(tbs->attr, ops->maxtype, linkinfo[IFLA_INFO_DATA], ops->policy, extack); - if (err < 0) - return err; + if (ret < 0) + goto put_ops; + data = tbs->attr; } + if (ops->validate) { - err = ops->validate(tb, data, extack); - if (err < 0) - return err; + ret = ops->validate(tb, data, extack); + if (ret < 0) + goto put_ops; } - } - - slave_data = NULL; - if (m_ops) { - if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) - return -EINVAL; - if (m_ops->slave_maxtype && - linkinfo[IFLA_INFO_SLAVE_DATA]) { - err = nla_parse_nested_deprecated(tbs->slave_attr, - m_ops->slave_maxtype, - linkinfo[IFLA_INFO_SLAVE_DATA], - m_ops->slave_policy, - extack); - if (err < 0) - return err; - slave_data = tbs->slave_attr; + if (ops->peer_type) { + ret = rtnl_add_peer_net(&rtnl_nets, ops, data, extack); + if (ret < 0) + goto put_ops; } } - if (dev) { - int status = 0; + tgt_net = rtnl_link_get_net_capable(skb, sock_net(skb->sk), tb, CAP_NET_ADMIN); + if (IS_ERR(tgt_net)) { + ret = PTR_ERR(tgt_net); + goto put_net; + } - if (nlh->nlmsg_flags & NLM_F_EXCL) - return -EEXIST; - if (nlh->nlmsg_flags & NLM_F_REPLACE) - return -EOPNOTSUPP; + rtnl_nets_add(&rtnl_nets, tgt_net); - err = validate_linkmsg(dev, tb, extack); - if (err < 0) - return err; - - if (linkinfo[IFLA_INFO_DATA]) { - if (!ops || ops != dev->rtnl_link_ops || - !ops->changelink) - return -EOPNOTSUPP; + if (tb[IFLA_LINK_NETNSID]) { + int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); - err = ops->changelink(dev, tb, data, extack); - if (err < 0) - return err; - status |= DO_SETLINK_NOTIFY; + link_net = get_net_ns_by_id(tgt_net, id); + if (!link_net) { + NL_SET_ERR_MSG(extack, "Unknown network namespace id"); + ret = -EINVAL; + goto put_net; } - if (linkinfo[IFLA_INFO_SLAVE_DATA]) { - if (!m_ops || !m_ops->slave_changelink) - return -EOPNOTSUPP; + rtnl_nets_add(&rtnl_nets, link_net); - err = m_ops->slave_changelink(master_dev, dev, tb, - slave_data, extack); - if (err < 0) - return err; - status |= DO_SETLINK_NOTIFY; + if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + goto put_net; } - - return do_setlink(skb, dev, ifm, extack, tb, status); - } - - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { - /* No dev found and NLM_F_CREATE not set. Requested dev does not exist, - * or it's for a group - */ - if (link_specified) - return -ENODEV; - if (tb[IFLA_GROUP]) - return rtnl_group_changelink(skb, net, - nla_get_u32(tb[IFLA_GROUP]), - ifm, extack, tb); - return -ENODEV; } - if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) - return -EOPNOTSUPP; + rtnl_nets_lock(&rtnl_nets); + ret = __rtnl_newlink(skb, nlh, ops, tgt_net, link_net, tbs, data, extack); + rtnl_nets_unlock(&rtnl_nets); - if (!ops) { -#ifdef CONFIG_MODULES - if (kind[0]) { - __rtnl_unlock(); - request_module("rtnl-link-%s", kind); - rtnl_lock(); - ops = rtnl_link_ops_get(kind); - if (ops) - goto replay; - } -#endif - NL_SET_ERR_MSG(extack, "Unknown device type"); - return -EOPNOTSUPP; - } - - return rtnl_newlink_create(skb, ifm, ops, nlh, tb, data, extack); -} - -static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) -{ - struct rtnl_newlink_tbs *tbs; - int ret; - - tbs = kmalloc(sizeof(*tbs), GFP_KERNEL); - if (!tbs) - return -ENOMEM; - - ret = __rtnl_newlink(skb, nlh, tbs, extack); +put_net: + rtnl_nets_destroy(&rtnl_nets); +put_ops: + if (ops) + rtnl_link_ops_put(ops, ops_srcu_index); +free: kfree(tbs); return ret; } @@ -4341,9 +4578,10 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops = br_dev->netdev_ops; + bool notified = false; err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid, - nlh->nlmsg_flags, extack); + nlh->nlmsg_flags, ¬ified, extack); if (err) goto out; else @@ -4352,16 +4590,18 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, /* Embedded bridge, macvlan, and any other device support */ if ((ndm->ndm_flags & NTF_SELF)) { + bool notified = false; + if (dev->netdev_ops->ndo_fdb_add) err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags, - extack); + ¬ified, extack); else err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags); - if (!err) { + if (!err && !notified) { rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH, ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; @@ -4461,11 +4701,13 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); + bool notified = false; ops = br_dev->netdev_ops; if (!del_bulk) { if (ops->ndo_fdb_del) - err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack); + err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, + ¬ified, extack); } else { if (ops->ndo_fdb_del_bulk) err = ops->ndo_fdb_del_bulk(nlh, dev, extack); @@ -4479,10 +4721,13 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, /* Embedded bridge, macvlan, and any other device support */ if (ndm->ndm_flags & NTF_SELF) { + bool notified = false; + ops = dev->netdev_ops; if (!del_bulk) { if (ops->ndo_fdb_del) - err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack); + err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, + ¬ified, extack); else err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid); } else { @@ -4493,7 +4738,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, } if (!err) { - if (!del_bulk) + if (!del_bulk && !notified) rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH, ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; @@ -6198,7 +6443,7 @@ static int rtnl_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb) int idx, s_idx; int err; - NL_ASSERT_DUMP_CTX_FITS(struct rtnl_mdb_dump_ctx); + NL_ASSERT_CTX_FITS(struct rtnl_mdb_dump_ctx); if (cb->strict_check) { err = rtnl_mdb_valid_dump_req(cb->nlh, cb->extack); @@ -6765,6 +7010,41 @@ static struct pernet_operations rtnetlink_net_ops = { .exit = rtnetlink_net_exit, }; +static const struct rtnl_msg_handler rtnetlink_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWLINK, .doit = rtnl_newlink, + .flags = RTNL_FLAG_DOIT_PERNET}, + {.msgtype = RTM_DELLINK, .doit = rtnl_dellink, + .flags = RTNL_FLAG_DOIT_PERNET_WIP}, + {.msgtype = RTM_GETLINK, .doit = rtnl_getlink, + .dumpit = rtnl_dump_ifinfo, .flags = RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, + {.msgtype = RTM_SETLINK, .doit = rtnl_setlink, + .flags = RTNL_FLAG_DOIT_PERNET_WIP}, + {.msgtype = RTM_GETADDR, .dumpit = rtnl_dump_all}, + {.msgtype = RTM_GETROUTE, .dumpit = rtnl_dump_all}, + {.msgtype = RTM_GETNETCONF, .dumpit = rtnl_dump_all}, + {.msgtype = RTM_GETSTATS, .doit = rtnl_stats_get, + .dumpit = rtnl_stats_dump}, + {.msgtype = RTM_SETSTATS, .doit = rtnl_stats_set}, + {.msgtype = RTM_NEWLINKPROP, .doit = rtnl_newlinkprop}, + {.msgtype = RTM_DELLINKPROP, .doit = rtnl_dellinkprop}, + {.protocol = PF_BRIDGE, .msgtype = RTM_GETLINK, + .dumpit = rtnl_bridge_getlink}, + {.protocol = PF_BRIDGE, .msgtype = RTM_DELLINK, + .doit = rtnl_bridge_dellink}, + {.protocol = PF_BRIDGE, .msgtype = RTM_SETLINK, + .doit = rtnl_bridge_setlink}, + {.protocol = PF_BRIDGE, .msgtype = RTM_NEWNEIGH, .doit = rtnl_fdb_add}, + {.protocol = PF_BRIDGE, .msgtype = RTM_DELNEIGH, .doit = rtnl_fdb_del, + .flags = RTNL_FLAG_BULK_DEL_SUPPORTED}, + {.protocol = PF_BRIDGE, .msgtype = RTM_GETNEIGH, .doit = rtnl_fdb_get, + .dumpit = rtnl_fdb_dump}, + {.protocol = PF_BRIDGE, .msgtype = RTM_NEWMDB, .doit = rtnl_mdb_add}, + {.protocol = PF_BRIDGE, .msgtype = RTM_DELMDB, .doit = rtnl_mdb_del, + .flags = RTNL_FLAG_BULK_DEL_SUPPORTED}, + {.protocol = PF_BRIDGE, .msgtype = RTM_GETMDB, .doit = rtnl_mdb_get, + .dumpit = rtnl_mdb_dump}, +}; + void __init rtnetlink_init(void) { if (register_pernet_subsys(&rtnetlink_net_ops)) @@ -6772,34 +7052,5 @@ void __init rtnetlink_init(void) register_netdevice_notifier(&rtnetlink_dev_notifier); - rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, - rtnl_dump_ifinfo, RTNL_FLAG_DUMP_SPLIT_NLM_DONE); - rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, 0); - - rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, 0); - rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, 0); - rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, 0); - - rtnl_register(PF_UNSPEC, RTM_NEWLINKPROP, rtnl_newlinkprop, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELLINKPROP, rtnl_dellinkprop, NULL, 0); - - rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0); - rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, - RTNL_FLAG_BULK_DEL_SUPPORTED); - rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0); - - rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, 0); - rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, 0); - rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, 0); - - rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump, - 0); - rtnl_register(PF_UNSPEC, RTM_SETSTATS, rtnl_stats_set, NULL, 0); - - rtnl_register(PF_BRIDGE, RTM_GETMDB, rtnl_mdb_get, rtnl_mdb_dump, 0); - rtnl_register(PF_BRIDGE, RTM_NEWMDB, rtnl_mdb_add, NULL, 0); - rtnl_register(PF_BRIDGE, RTM_DELMDB, rtnl_mdb_del, NULL, - RTNL_FLAG_BULK_DEL_SUPPORTED); + rtnl_register_many(rtnetlink_rtnl_msg_handlers); } diff --git a/net/core/rtnl_net_debug.c b/net/core/rtnl_net_debug.c new file mode 100644 index 000000000000..f406045cbd0e --- /dev/null +++ b/net/core/rtnl_net_debug.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Copyright Amazon.com Inc. or its affiliates. */ + +#include <linux/init.h> +#include <linux/netdevice.h> +#include <linux/notifier.h> +#include <linux/rtnetlink.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +static int rtnl_net_debug_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + enum netdev_cmd cmd = event; + + /* Keep enum and don't add default to trigger -Werror=switch */ + switch (cmd) { + case NETDEV_UP: + case NETDEV_DOWN: + case NETDEV_REBOOT: + case NETDEV_CHANGE: + case NETDEV_REGISTER: + case NETDEV_UNREGISTER: + case NETDEV_CHANGEMTU: + case NETDEV_CHANGEADDR: + case NETDEV_PRE_CHANGEADDR: + case NETDEV_GOING_DOWN: + case NETDEV_CHANGENAME: + case NETDEV_FEAT_CHANGE: + case NETDEV_BONDING_FAILOVER: + case NETDEV_PRE_UP: + case NETDEV_PRE_TYPE_CHANGE: + case NETDEV_POST_TYPE_CHANGE: + case NETDEV_POST_INIT: + case NETDEV_PRE_UNINIT: + case NETDEV_RELEASE: + case NETDEV_NOTIFY_PEERS: + case NETDEV_JOIN: + case NETDEV_CHANGEUPPER: + case NETDEV_RESEND_IGMP: + case NETDEV_PRECHANGEMTU: + case NETDEV_CHANGEINFODATA: + case NETDEV_BONDING_INFO: + case NETDEV_PRECHANGEUPPER: + case NETDEV_CHANGELOWERSTATE: + case NETDEV_UDP_TUNNEL_PUSH_INFO: + case NETDEV_UDP_TUNNEL_DROP_INFO: + case NETDEV_CHANGE_TX_QUEUE_LEN: + case NETDEV_CVLAN_FILTER_PUSH_INFO: + case NETDEV_CVLAN_FILTER_DROP_INFO: + case NETDEV_SVLAN_FILTER_PUSH_INFO: + case NETDEV_SVLAN_FILTER_DROP_INFO: + case NETDEV_OFFLOAD_XSTATS_ENABLE: + case NETDEV_OFFLOAD_XSTATS_DISABLE: + case NETDEV_OFFLOAD_XSTATS_REPORT_USED: + case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA: + case NETDEV_XDP_FEAT_CHANGE: + ASSERT_RTNL(); + break; + + /* Once an event fully supports RTNL_NET, move it here + * and remove "if (0)" below. + * + * case NETDEV_XXX: + * ASSERT_RTNL_NET(net); + * break; + */ + } + + /* Just to avoid unused-variable error for dev and net. */ + if (0) + ASSERT_RTNL_NET(net); + + return NOTIFY_DONE; +} + +static int rtnl_net_debug_net_id; + +static int __net_init rtnl_net_debug_net_init(struct net *net) +{ + struct notifier_block *nb; + + nb = net_generic(net, rtnl_net_debug_net_id); + nb->notifier_call = rtnl_net_debug_event; + + return register_netdevice_notifier_net(net, nb); +} + +static void __net_exit rtnl_net_debug_net_exit(struct net *net) +{ + struct notifier_block *nb; + + nb = net_generic(net, rtnl_net_debug_net_id); + unregister_netdevice_notifier_net(net, nb); +} + +static struct pernet_operations rtnl_net_debug_net_ops __net_initdata = { + .init = rtnl_net_debug_net_init, + .exit = rtnl_net_debug_net_exit, + .id = &rtnl_net_debug_net_id, + .size = sizeof(struct notifier_block), +}; + +static struct notifier_block rtnl_net_debug_block = { + .notifier_call = rtnl_net_debug_event, +}; + +static int __init rtnl_net_debug_init(void) +{ + int ret; + + ret = register_pernet_device(&rtnl_net_debug_net_ops); + if (ret) + return ret; + + ret = register_netdevice_notifier(&rtnl_net_debug_block); + if (ret) + unregister_pernet_subsys(&rtnl_net_debug_net_ops); + + return ret; +} + +subsys_initcall(rtnl_net_debug_init); diff --git a/net/core/skb_fault_injection.c b/net/core/skb_fault_injection.c new file mode 100644 index 000000000000..4235db6bdfad --- /dev/null +++ b/net/core/skb_fault_injection.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/debugfs.h> +#include <linux/fault-inject.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> + +static struct { + struct fault_attr attr; + char devname[IFNAMSIZ]; + bool filtered; +} skb_realloc = { + .attr = FAULT_ATTR_INITIALIZER, + .filtered = false, +}; + +static bool should_fail_net_realloc_skb(struct sk_buff *skb) +{ + struct net_device *net = skb->dev; + + if (skb_realloc.filtered && + strncmp(net->name, skb_realloc.devname, IFNAMSIZ)) + /* device name filter set, but names do not match */ + return false; + + if (!should_fail(&skb_realloc.attr, 1)) + return false; + + return true; +} +ALLOW_ERROR_INJECTION(should_fail_net_realloc_skb, TRUE); + +void skb_might_realloc(struct sk_buff *skb) +{ + if (!should_fail_net_realloc_skb(skb)) + return; + + pskb_expand_head(skb, 0, 0, GFP_ATOMIC); +} +EXPORT_SYMBOL(skb_might_realloc); + +static int __init fail_skb_realloc_setup(char *str) +{ + return setup_fault_attr(&skb_realloc.attr, str); +} +__setup("fail_skb_realloc=", fail_skb_realloc_setup); + +static void reset_settings(void) +{ + skb_realloc.filtered = false; + memset(&skb_realloc.devname, 0, IFNAMSIZ); +} + +static ssize_t devname_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + ssize_t ret; + + reset_settings(); + ret = simple_write_to_buffer(&skb_realloc.devname, IFNAMSIZ, + ppos, buffer, count); + if (ret < 0) + return ret; + + skb_realloc.devname[IFNAMSIZ - 1] = '\0'; + /* Remove a possible \n at the end of devname */ + strim(skb_realloc.devname); + + if (strnlen(skb_realloc.devname, IFNAMSIZ)) + skb_realloc.filtered = true; + + return count; +} + +static ssize_t devname_read(struct file *file, + char __user *buffer, + size_t size, loff_t *ppos) +{ + if (!skb_realloc.filtered) + return 0; + + return simple_read_from_buffer(buffer, size, ppos, &skb_realloc.devname, + strlen(skb_realloc.devname)); +} + +static const struct file_operations devname_ops = { + .write = devname_write, + .read = devname_read, +}; + +static int __init fail_skb_realloc_debugfs(void) +{ + umode_t mode = S_IFREG | 0600; + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_skb_realloc", NULL, + &skb_realloc.attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + debugfs_create_file("devname", mode, dir, NULL, &devname_ops); + + return 0; +} + +late_initcall(fail_skb_realloc_debugfs); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 74149dc4ee31..6841e61a6bd0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -753,14 +753,14 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, if (in_hardirq() || irqs_disabled()) { nc = this_cpu_ptr(&netdev_alloc_cache); data = page_frag_alloc(nc, len, gfp_mask); - pfmemalloc = nc->pfmemalloc; + pfmemalloc = page_frag_cache_is_pfmemalloc(nc); } else { local_bh_disable(); local_lock_nested_bh(&napi_alloc_cache.bh_lock); nc = this_cpu_ptr(&napi_alloc_cache.page); data = page_frag_alloc(nc, len, gfp_mask); - pfmemalloc = nc->pfmemalloc; + pfmemalloc = page_frag_cache_is_pfmemalloc(nc); local_unlock_nested_bh(&napi_alloc_cache.bh_lock); local_bh_enable(); @@ -850,7 +850,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) len = SKB_HEAD_ALIGN(len); data = page_frag_alloc(&nc->page, len, gfp_mask); - pfmemalloc = nc->page.pfmemalloc; + pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); } local_unlock_nested_bh(&napi_alloc_cache.bh_lock); @@ -5506,7 +5506,7 @@ static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) { bool ret; - if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly)) + if (likely(tsonly || READ_ONCE(sock_net(sk)->core.sysctl_tstamp_allow_data))) return true; read_lock_bh(&sk->sk_callback_lock); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index b1dcbd3be89e..e90fbab703b2 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1117,9 +1117,9 @@ static void sk_psock_strp_data_ready(struct sock *sk) if (tls_sw_has_ctx_rx(sk)) { psock->saved_data_ready(sk); } else { - write_lock_bh(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); strp_data_ready(&psock->strp); - write_unlock_bh(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); } } rcu_read_unlock(); diff --git a/net/core/sock.c b/net/core/sock.c index 039be95c40cf..74729d20cd00 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -286,8 +286,6 @@ EXPORT_SYMBOL(sysctl_rmem_max); __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; -int sysctl_tstamp_allow_data __read_mostly = 1; - DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); EXPORT_SYMBOL_GPL(memalloc_socks_key); @@ -822,14 +820,11 @@ EXPORT_SYMBOL(sock_set_sndtimeo); static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) { + sock_valbool_flag(sk, SOCK_RCVTSTAMP, val); + sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns); if (val) { sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); - sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); - sock_set_flag(sk, SOCK_RCVTSTAMP); sock_enable_timestamp(sk, SOCK_TIMESTAMP); - } else { - sock_reset_flag(sk, SOCK_RCVTSTAMP); - sock_reset_flag(sk, SOCK_RCVTSTAMPNS); } } @@ -1052,32 +1047,34 @@ static int sock_reserve_memory(struct sock *sk, int bytes) #ifdef CONFIG_PAGE_POOL -/* This is the number of tokens that the user can SO_DEVMEM_DONTNEED in - * 1 syscall. The limit exists to limit the amount of memory the kernel - * allocates to copy these tokens. +/* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED + * in 1 syscall. The limit exists to limit the amount of memory the kernel + * allocates to copy these tokens, and to prevent looping over the frags for + * too long. */ #define MAX_DONTNEED_TOKENS 128 +#define MAX_DONTNEED_FRAGS 1024 static noinline_for_stack int sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) { unsigned int num_tokens, i, j, k, netmem_num = 0; struct dmabuf_token *tokens; + int ret = 0, num_frags = 0; netmem_ref netmems[16]; - int ret = 0; if (!sk_is_tcp(sk)) return -EBADF; - if (optlen % sizeof(struct dmabuf_token) || + if (optlen % sizeof(*tokens) || optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS) return -EINVAL; - tokens = kvmalloc_array(optlen, sizeof(*tokens), GFP_KERNEL); + num_tokens = optlen / sizeof(*tokens); + tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL); if (!tokens) return -ENOMEM; - num_tokens = optlen / sizeof(struct dmabuf_token); if (copy_from_sockptr(tokens, optval, optlen)) { kvfree(tokens); return -EFAULT; @@ -1086,24 +1083,28 @@ sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) xa_lock_bh(&sk->sk_user_frags); for (i = 0; i < num_tokens; i++) { for (j = 0; j < tokens[i].token_count; j++) { + if (++num_frags > MAX_DONTNEED_FRAGS) + goto frag_limit_reached; + netmem_ref netmem = (__force netmem_ref)__xa_erase( &sk->sk_user_frags, tokens[i].token_start + j); - if (netmem && - !WARN_ON_ONCE(!netmem_is_net_iov(netmem))) { - netmems[netmem_num++] = netmem; - if (netmem_num == ARRAY_SIZE(netmems)) { - xa_unlock_bh(&sk->sk_user_frags); - for (k = 0; k < netmem_num; k++) - WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); - netmem_num = 0; - xa_lock_bh(&sk->sk_user_frags); - } - ret++; + if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem))) + continue; + + netmems[netmem_num++] = netmem; + if (netmem_num == ARRAY_SIZE(netmems)) { + xa_unlock_bh(&sk->sk_user_frags); + for (k = 0; k < netmem_num; k++) + WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); + netmem_num = 0; + xa_lock_bh(&sk->sk_user_frags); } + ret++; } } +frag_limit_reached: xa_unlock_bh(&sk->sk_user_frags); for (k = 0; k < netmem_num; k++) WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); @@ -2594,14 +2595,11 @@ void __sock_wfree(struct sk_buff *skb) void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); - skb->sk = sk; #ifdef CONFIG_INET - if (unlikely(!sk_fullsock(sk))) { - skb->destructor = sock_edemux; - sock_hold(sk); - return; - } + if (unlikely(!sk_fullsock(sk))) + return skb_set_owner_edemux(skb, sk); #endif + skb->sk = sk; skb->destructor = sock_wfree; skb_set_hash_from_sk(skb, sk); /* @@ -2899,6 +2897,8 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, { u32 tsflags; + BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31)); + switch (cmsg->cmsg_type) { case SO_MARK: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && @@ -2927,6 +2927,17 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, return -EINVAL; sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); break; + case SCM_TS_OPT_ID: + if (sk_is_tcp(sk)) + return -EINVAL; + tsflags = READ_ONCE(sk->sk_tsflags); + if (!(tsflags & SOF_TIMESTAMPING_OPT_ID)) + return -EINVAL; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg); + sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID; + break; /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ case SCM_RIGHTS: case SCM_CREDENTIALS: @@ -3819,9 +3830,6 @@ void sk_common_release(struct sock *sk) sk->sk_prot->unhash(sk); - if (sk->sk_socket) - sk->sk_socket->sk = NULL; - /* * In this point socket cannot receive new packets, but it is possible * that some packets are in flight because some CPU runs receiver and diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 86a2476678c4..cb8d32e5c14e 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -51,29 +51,45 @@ int sysctl_devconf_inherit_init_net __read_mostly; EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); #if IS_ENABLED(CONFIG_NET_FLOW_LIMIT) || IS_ENABLED(CONFIG_RPS) -static void dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos, - struct cpumask *mask) +static int dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos, + struct cpumask *mask) { - char kbuf[128]; + char *kbuf; int len; if (*ppos || !*lenp) { *lenp = 0; - return; + return 0; + } + + /* CPUs are displayed as a hex bitmap + a comma between each groups of 8 + * nibbles (except the last one which has a newline instead). + * Guesstimate the buffer size at the group granularity level. + */ + len = min(DIV_ROUND_UP(nr_cpumask_bits, 32) * (8 + 1), *lenp); + kbuf = kmalloc(len, GFP_KERNEL); + if (!kbuf) { + *lenp = 0; + return -ENOMEM; } - len = min(sizeof(kbuf) - 1, *lenp); len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask)); if (!len) { *lenp = 0; - return; + goto free_buf; } - if (len < *lenp) - kbuf[len++] = '\n'; + /* scnprintf writes a trailing null char not counted in the returned + * length, override it with a newline. + */ + kbuf[len++] = '\n'; memcpy(buffer, kbuf, len); *lenp = len; *ppos += len; + +free_buf: + kfree(kbuf); + return 0; } #endif @@ -117,8 +133,8 @@ static int rps_default_mask_sysctl(const struct ctl_table *table, int write, if (err) goto done; } else { - dump_cpumask(buffer, lenp, ppos, - net->core.rps_default_mask ? : cpu_none_mask); + err = dump_cpumask(buffer, lenp, ppos, + net->core.rps_default_mask ? : cpu_none_mask); } done: @@ -247,7 +263,7 @@ write_unlock: } rcu_read_unlock(); - dump_cpumask(buffer, lenp, ppos, mask); + ret = dump_cpumask(buffer, lenp, ppos, mask); } done: @@ -491,15 +507,6 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "tstamp_allow_data", - .data = &sysctl_tstamp_allow_data, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE - }, #ifdef CONFIG_RPS { .procname = "rps_sock_flow_entries", @@ -665,6 +672,15 @@ static struct ctl_table netns_core_table[] = { .extra2 = SYSCTL_ONE, .proc_handler = proc_dou8vec_minmax, }, + { + .procname = "tstamp_allow_data", + .data = &init_net.core.sysctl_tstamp_allow_data, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, /* sysctl_core_net_init() will set the values after this * to readonly in network namespaces */ |