diff options
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/Makefile | 2 | ||||
-rw-r--r-- | net/core/dev.c | 105 | ||||
-rw-r--r-- | net/core/devlink.c | 1603 | ||||
-rw-r--r-- | net/core/ethtool.c | 267 | ||||
-rw-r--r-- | net/core/filter.c | 596 | ||||
-rw-r--r-- | net/core/flow_dissector.c | 92 | ||||
-rw-r--r-- | net/core/flow_offload.c | 153 | ||||
-rw-r--r-- | net/core/lwt_bpf.c | 265 | ||||
-rw-r--r-- | net/core/lwtunnel.c | 16 | ||||
-rw-r--r-- | net/core/neighbour.c | 11 | ||||
-rw-r--r-- | net/core/net-sysfs.c | 12 | ||||
-rw-r--r-- | net/core/net-traces.c | 8 | ||||
-rw-r--r-- | net/core/page_pool.c | 22 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 12 | ||||
-rw-r--r-- | net/core/scm.c | 27 | ||||
-rw-r--r-- | net/core/skbuff.c | 4 | ||||
-rw-r--r-- | net/core/skmsg.c | 3 | ||||
-rw-r--r-- | net/core/sock.c | 180 | ||||
-rw-r--r-- | net/core/sysctl_net_core.c | 18 |
19 files changed, 2571 insertions, 825 deletions
diff --git a/net/core/Makefile b/net/core/Makefile index fccd31e0e7f7..f97d6254e564 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -11,7 +11,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \ - fib_notifier.o xdp.o + fib_notifier.o xdp.o flow_offload.o obj-y += net-sysfs.o obj-$(CONFIG_PAGE_POOL) += page_pool.o diff --git a/net/core/dev.c b/net/core/dev.c index 82f20022259d..2b67f2aa59dd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3421,7 +3421,7 @@ static void qdisc_pkt_len_init(struct sk_buff *skb) /* To get more precise estimation of bytes sent on wire, * we add to pkt_len the headers size of all segments */ - if (shinfo->gso_size) { + if (shinfo->gso_size && skb_transport_header_was_set(skb)) { unsigned int hdr_len; u16 gso_segs = shinfo->gso_segs; @@ -7878,6 +7878,63 @@ int dev_get_phys_port_name(struct net_device *dev, EXPORT_SYMBOL(dev_get_phys_port_name); /** + * dev_get_port_parent_id - Get the device's port parent identifier + * @dev: network device + * @ppid: pointer to a storage for the port's parent identifier + * @recurse: allow/disallow recursion to lower devices + * + * Get the devices's port parent identifier + */ +int dev_get_port_parent_id(struct net_device *dev, + struct netdev_phys_item_id *ppid, + bool recurse) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct netdev_phys_item_id first = { }; + struct net_device *lower_dev; + struct list_head *iter; + int err = -EOPNOTSUPP; + + if (ops->ndo_get_port_parent_id) + return ops->ndo_get_port_parent_id(dev, ppid); + + if (!recurse) + return err; + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + err = dev_get_port_parent_id(lower_dev, ppid, recurse); + if (err) + break; + if (!first.id_len) + first = *ppid; + else if (memcmp(&first, ppid, sizeof(*ppid))) + return -ENODATA; + } + + return err; +} +EXPORT_SYMBOL(dev_get_port_parent_id); + +/** + * netdev_port_same_parent_id - Indicate if two network devices have + * the same port parent identifier + * @a: first network device + * @b: second network device + */ +bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) +{ + struct netdev_phys_item_id a_id = { }; + struct netdev_phys_item_id b_id = { }; + + if (dev_get_port_parent_id(a, &a_id, true) || + dev_get_port_parent_id(b, &b_id, true)) + return false; + + return netdev_phys_item_id_same(&a_id, &b_id); +} +EXPORT_SYMBOL(netdev_port_same_parent_id); + +/** * dev_change_proto_down - update protocol port state information * @dev: device * @proto_down: new value @@ -7897,6 +7954,25 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down); +/** + * dev_change_proto_down_generic - generic implementation for + * ndo_change_proto_down that sets carrier according to + * proto_down. + * + * @dev: device + * @proto_down: new value + */ +int dev_change_proto_down_generic(struct net_device *dev, bool proto_down) +{ + if (proto_down) + netif_carrier_off(dev); + else + netif_carrier_on(dev); + dev->proto_down = proto_down; + return 0; +} +EXPORT_SYMBOL(dev_change_proto_down_generic); + u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, enum bpf_netdev_command cmd) { @@ -7976,35 +8052,41 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, enum bpf_netdev_command query; struct bpf_prog *prog = NULL; bpf_op_t bpf_op, bpf_chk; + bool offload; int err; ASSERT_RTNL(); - query = flags & XDP_FLAGS_HW_MODE ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG; + offload = flags & XDP_FLAGS_HW_MODE; + query = offload ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG; bpf_op = bpf_chk = ops->ndo_bpf; - if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) + if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) { + NL_SET_ERR_MSG(extack, "underlying driver does not support XDP in native mode"); return -EOPNOTSUPP; + } if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE)) bpf_op = generic_xdp_install; if (bpf_op == bpf_chk) bpf_chk = generic_xdp_install; if (fd >= 0) { - if (__dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG) || - __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG_HW)) + if (!offload && __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG)) { + NL_SET_ERR_MSG(extack, "native and generic XDP can't be active at the same time"); return -EEXIST; + } if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && - __dev_xdp_query(dev, bpf_op, query)) + __dev_xdp_query(dev, bpf_op, query)) { + NL_SET_ERR_MSG(extack, "XDP program already attached"); return -EBUSY; + } prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, bpf_op == ops->ndo_bpf); if (IS_ERR(prog)) return PTR_ERR(prog); - if (!(flags & XDP_FLAGS_HW_MODE) && - bpf_prog_is_dev_bound(prog->aux)) { + if (!offload && bpf_prog_is_dev_bound(prog->aux)) { NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported"); bpf_prog_put(prog); return -EINVAL; @@ -8152,7 +8234,7 @@ static netdev_features_t netdev_sync_upper_features(struct net_device *lower, netdev_features_t feature; int feature_bit; - for_each_netdev_feature(&upper_disables, feature_bit) { + for_each_netdev_feature(upper_disables, feature_bit) { feature = __NETIF_F_BIT(feature_bit); if (!(upper->wanted_features & feature) && (features & feature)) { @@ -8172,7 +8254,7 @@ static void netdev_sync_lower_features(struct net_device *upper, netdev_features_t feature; int feature_bit; - for_each_netdev_feature(&upper_disables, feature_bit) { + for_each_netdev_feature(upper_disables, feature_bit) { feature = __NETIF_F_BIT(feature_bit); if (!(features & feature) && (lower->features & feature)) { netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", @@ -8712,6 +8794,9 @@ int init_dummy_netdev(struct net_device *dev) set_bit(__LINK_STATE_PRESENT, &dev->state); set_bit(__LINK_STATE_START, &dev->state); + /* napi_busy_loop stats accounting wants this */ + dev_net_set(dev, &init_net); + /* Note : We dont allocate pcpu_refcnt for dummy devices, * because users of this 'device' dont need to change * its refcount. diff --git a/net/core/devlink.c b/net/core/devlink.c index 60248a53c0ad..4f31ddc883e7 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -81,6 +81,7 @@ struct devlink_dpipe_header devlink_dpipe_header_ipv6 = { EXPORT_SYMBOL(devlink_dpipe_header_ipv6); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); +EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr); static LIST_HEAD(devlink_list); @@ -115,6 +116,8 @@ static struct devlink *devlink_get_from_attrs(struct net *net, busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]); devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]); + lockdep_assert_held(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { if (strcmp(devlink->dev->bus->name, busname) == 0 && strcmp(dev_name(devlink->dev), devname) == 0 && @@ -932,6 +935,9 @@ static int devlink_nl_sb_pool_fill(struct sk_buff *msg, struct devlink *devlink, if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE, pool_info.threshold_type)) goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_POOL_CELL_SIZE, + pool_info.cell_size)) + goto nla_put_failure; genlmsg_end(msg, hdr); return 0; @@ -2656,6 +2662,27 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) return devlink->ops->reload(devlink, info->extack); } +static int devlink_nl_cmd_flash_update(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + const char *file_name, *component; + struct nlattr *nla_component; + + if (!devlink->ops->flash_update) + return -EOPNOTSUPP; + + if (!info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME]) + return -EINVAL; + file_name = nla_data(info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME]); + + nla_component = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT]; + component = nla_component ? nla_data(nla_component) : NULL; + + return devlink->ops->flash_update(devlink, file_name, component, + info->extack); +} + static const struct devlink_param devlink_param_generic[] = { { .id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET, @@ -2843,11 +2870,13 @@ nla_put_failure: } static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, + unsigned int port_index, struct devlink_param_item *param_item, enum devlink_command cmd, u32 portid, u32 seq, int flags) { union devlink_param_value param_value[DEVLINK_PARAM_CMODE_MAX + 1]; + bool param_value_set[DEVLINK_PARAM_CMODE_MAX + 1] = {}; const struct devlink_param *param = param_item->param; struct devlink_param_gset_ctx ctx; struct nlattr *param_values_list; @@ -2866,12 +2895,15 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, return -EOPNOTSUPP; param_value[i] = param_item->driverinit_value; } else { + if (!param_item->published) + continue; ctx.cmode = i; err = devlink_param_get(devlink, param, &ctx); if (err) return err; param_value[i] = ctx.val; } + param_value_set[i] = true; } hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); @@ -2880,6 +2912,13 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, if (devlink_nl_put_handle(msg, devlink)) goto genlmsg_cancel; + + if (cmd == DEVLINK_CMD_PORT_PARAM_GET || + cmd == DEVLINK_CMD_PORT_PARAM_NEW || + cmd == DEVLINK_CMD_PORT_PARAM_DEL) + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, port_index)) + goto genlmsg_cancel; + param_attr = nla_nest_start(msg, DEVLINK_ATTR_PARAM); if (!param_attr) goto genlmsg_cancel; @@ -2899,7 +2938,7 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, goto param_nest_cancel; for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) { - if (!devlink_param_cmode_is_supported(param, i)) + if (!param_value_set[i]) continue; err = devlink_nl_param_value_fill_one(msg, param->type, i, param_value[i]); @@ -2922,18 +2961,22 @@ genlmsg_cancel: } static void devlink_param_notify(struct devlink *devlink, + unsigned int port_index, struct devlink_param_item *param_item, enum devlink_command cmd) { struct sk_buff *msg; int err; - WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL); + WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL && + cmd != DEVLINK_CMD_PORT_PARAM_NEW && + cmd != DEVLINK_CMD_PORT_PARAM_DEL); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; - err = devlink_nl_param_fill(msg, devlink, param_item, cmd, 0, 0, 0); + err = devlink_nl_param_fill(msg, devlink, port_index, param_item, cmd, + 0, 0, 0); if (err) { nlmsg_free(msg); return; @@ -2962,7 +3005,7 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, idx++; continue; } - err = devlink_nl_param_fill(msg, devlink, param_item, + err = devlink_nl_param_fill(msg, devlink, 0, param_item, DEVLINK_CMD_PARAM_GET, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, @@ -3051,7 +3094,7 @@ devlink_param_value_get_from_info(const struct devlink_param *param, } static struct devlink_param_item * -devlink_param_get_from_info(struct devlink *devlink, +devlink_param_get_from_info(struct list_head *param_list, struct genl_info *info) { char *param_name; @@ -3060,7 +3103,7 @@ devlink_param_get_from_info(struct devlink *devlink, return NULL; param_name = nla_data(info->attrs[DEVLINK_ATTR_PARAM_NAME]); - return devlink_param_find_by_name(&devlink->param_list, param_name); + return devlink_param_find_by_name(param_list, param_name); } static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb, @@ -3071,7 +3114,7 @@ static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb, struct sk_buff *msg; int err; - param_item = devlink_param_get_from_info(devlink, info); + param_item = devlink_param_get_from_info(&devlink->param_list, info); if (!param_item) return -EINVAL; @@ -3079,7 +3122,7 @@ static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb, if (!msg) return -ENOMEM; - err = devlink_nl_param_fill(msg, devlink, param_item, + err = devlink_nl_param_fill(msg, devlink, 0, param_item, DEVLINK_CMD_PARAM_GET, info->snd_portid, info->snd_seq, 0); if (err) { @@ -3090,10 +3133,12 @@ static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } -static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, - struct genl_info *info) +static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink, + unsigned int port_index, + struct list_head *param_list, + struct genl_info *info, + enum devlink_command cmd) { - struct devlink *devlink = info->user_ptr[0]; enum devlink_param_type param_type; struct devlink_param_gset_ctx ctx; enum devlink_param_cmode cmode; @@ -3102,7 +3147,7 @@ static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, union devlink_param_value value; int err = 0; - param_item = devlink_param_get_from_info(devlink, info); + param_item = devlink_param_get_from_info(param_list, info); if (!param_item) return -EINVAL; param = param_item->param; @@ -3142,17 +3187,28 @@ static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, return err; } - devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); + devlink_param_notify(devlink, port_index, param_item, cmd); return 0; } +static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + + return __devlink_nl_cmd_param_set_doit(devlink, 0, &devlink->param_list, + info, DEVLINK_CMD_PARAM_NEW); +} + static int devlink_param_register_one(struct devlink *devlink, - const struct devlink_param *param) + unsigned int port_index, + struct list_head *param_list, + const struct devlink_param *param, + enum devlink_command cmd) { struct devlink_param_item *param_item; - if (devlink_param_find_by_name(&devlink->param_list, - param->name)) + if (devlink_param_find_by_name(param_list, param->name)) return -EEXIST; if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT)) @@ -3165,24 +3221,111 @@ static int devlink_param_register_one(struct devlink *devlink, return -ENOMEM; param_item->param = param; - list_add_tail(¶m_item->list, &devlink->param_list); - devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); + list_add_tail(¶m_item->list, param_list); + devlink_param_notify(devlink, port_index, param_item, cmd); return 0; } static void devlink_param_unregister_one(struct devlink *devlink, - const struct devlink_param *param) + unsigned int port_index, + struct list_head *param_list, + const struct devlink_param *param, + enum devlink_command cmd) { struct devlink_param_item *param_item; - param_item = devlink_param_find_by_name(&devlink->param_list, - param->name); + param_item = devlink_param_find_by_name(param_list, param->name); WARN_ON(!param_item); - devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_DEL); + devlink_param_notify(devlink, port_index, param_item, cmd); list_del(¶m_item->list); kfree(param_item); } +static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink_param_item *param_item; + struct devlink_port *devlink_port; + struct devlink *devlink; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + mutex_lock(&devlink->lock); + list_for_each_entry(devlink_port, &devlink->port_list, list) { + list_for_each_entry(param_item, + &devlink_port->param_list, list) { + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_param_fill(msg, + devlink_port->devlink, + devlink_port->index, param_item, + DEVLINK_CMD_PORT_PARAM_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + mutex_unlock(&devlink->lock); + goto out; + } + idx++; + } + } + mutex_unlock(&devlink->lock); + } +out: + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink_param_item *param_item; + struct sk_buff *msg; + int err; + + param_item = devlink_param_get_from_info(&devlink_port->param_list, + info); + if (!param_item) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_param_fill(msg, devlink_port->devlink, + devlink_port->index, param_item, + DEVLINK_CMD_PORT_PARAM_GET, + info->snd_portid, info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_port_param_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_port *devlink_port = info->user_ptr[0]; + + return __devlink_nl_cmd_param_set_doit(devlink_port->devlink, + devlink_port->index, + &devlink_port->param_list, info, + DEVLINK_CMD_PORT_PARAM_NEW); +} + static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg, struct devlink *devlink, struct devlink_snapshot *snapshot) @@ -3504,44 +3647,56 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { u64 ret_offset, start_offset, end_offset = 0; - struct nlattr *attrs[DEVLINK_ATTR_MAX + 1]; const struct genl_ops *ops = cb->data; struct devlink_region *region; struct nlattr *chunks_attr; const char *region_name; struct devlink *devlink; + struct nlattr **attrs; bool dump = true; void *hdr; int err; start_offset = *((u64 *)&cb->args[0]); + attrs = kmalloc_array(DEVLINK_ATTR_MAX + 1, sizeof(*attrs), GFP_KERNEL); + if (!attrs) + return -ENOMEM; + err = nlmsg_parse(cb->nlh, GENL_HDRLEN + devlink_nl_family.hdrsize, attrs, DEVLINK_ATTR_MAX, ops->policy, cb->extack); if (err) - goto out; + goto out_free; + mutex_lock(&devlink_mutex); devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); - if (IS_ERR(devlink)) - goto out; + if (IS_ERR(devlink)) { + err = PTR_ERR(devlink); + goto out_dev; + } - mutex_lock(&devlink_mutex); mutex_lock(&devlink->lock); if (!attrs[DEVLINK_ATTR_REGION_NAME] || - !attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) + !attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) { + err = -EINVAL; goto out_unlock; + } region_name = nla_data(attrs[DEVLINK_ATTR_REGION_NAME]); region = devlink_region_get_by_name(devlink, region_name); - if (!region) + if (!region) { + err = -EINVAL; goto out_unlock; + } hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, DEVLINK_CMD_REGION_READ); - if (!hdr) + if (!hdr) { + err = -EMSGSIZE; goto out_unlock; + } err = devlink_nl_put_handle(skb, devlink); if (err) @@ -3552,8 +3707,10 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto nla_put_failure; chunks_attr = nla_nest_start(skb, DEVLINK_ATTR_REGION_CHUNKS); - if (!chunks_attr) + if (!chunks_attr) { + err = -EMSGSIZE; goto nla_put_failure; + } if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] && attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) { @@ -3576,8 +3733,10 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto nla_put_failure; /* Check if there was any progress done to prevent infinite loop */ - if (ret_offset == start_offset) + if (ret_offset == start_offset) { + err = -EINVAL; goto nla_put_failure; + } *((u64 *)&cb->args[0]) = ret_offset; @@ -3585,6 +3744,7 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, genlmsg_end(skb, hdr); mutex_unlock(&devlink->lock); mutex_unlock(&devlink_mutex); + kfree(attrs); return skb->len; @@ -3592,487 +3752,629 @@ nla_put_failure: genlmsg_cancel(skb, hdr); out_unlock: mutex_unlock(&devlink->lock); +out_dev: mutex_unlock(&devlink_mutex); -out: +out_free: + kfree(attrs); + return err; +} + +struct devlink_info_req { + struct sk_buff *msg; +}; + +int devlink_info_driver_name_put(struct devlink_info_req *req, const char *name) +{ + return nla_put_string(req->msg, DEVLINK_ATTR_INFO_DRIVER_NAME, name); +} +EXPORT_SYMBOL_GPL(devlink_info_driver_name_put); + +int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn) +{ + return nla_put_string(req->msg, DEVLINK_ATTR_INFO_SERIAL_NUMBER, sn); +} +EXPORT_SYMBOL_GPL(devlink_info_serial_number_put); + +static int devlink_info_version_put(struct devlink_info_req *req, int attr, + const char *version_name, + const char *version_value) +{ + struct nlattr *nest; + int err; + + nest = nla_nest_start(req->msg, attr); + if (!nest) + return -EMSGSIZE; + + err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_NAME, + version_name); + if (err) + goto nla_put_failure; + + err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_VALUE, + version_value); + if (err) + goto nla_put_failure; + + nla_nest_end(req->msg, nest); + + return 0; + +nla_put_failure: + nla_nest_cancel(req->msg, nest); + return err; +} + +int devlink_info_version_fixed_put(struct devlink_info_req *req, + const char *version_name, + const char *version_value) +{ + return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_FIXED, + version_name, version_value); +} +EXPORT_SYMBOL_GPL(devlink_info_version_fixed_put); + +int devlink_info_version_stored_put(struct devlink_info_req *req, + const char *version_name, + const char *version_value) +{ + return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED, + version_name, version_value); +} +EXPORT_SYMBOL_GPL(devlink_info_version_stored_put); + +int devlink_info_version_running_put(struct devlink_info_req *req, + const char *version_name, + const char *version_value) +{ + return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING, + version_name, version_value); +} +EXPORT_SYMBOL_GPL(devlink_info_version_running_put); + +static int +devlink_nl_info_fill(struct sk_buff *msg, struct devlink *devlink, + enum devlink_command cmd, u32 portid, + u32 seq, int flags, struct netlink_ext_ack *extack) +{ + struct devlink_info_req req; + void *hdr; + int err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + err = -EMSGSIZE; + if (devlink_nl_put_handle(msg, devlink)) + goto err_cancel_msg; + + req.msg = msg; + err = devlink->ops->info_get(devlink, &req, extack); + if (err) + goto err_cancel_msg; + + genlmsg_end(msg, hdr); return 0; + +err_cancel_msg: + genlmsg_cancel(msg, hdr); + return err; } -#define DEVLINK_HEALTH_BUFFER_SIZE (4096 - GENL_HDRLEN) -#define DEVLINK_HEALTH_BUFFER_DATA_SIZE (DEVLINK_HEALTH_BUFFER_SIZE / 2) -#define DEVLINK_HEALTH_SIZE_TO_BUFFERS(size) DIV_ROUND_UP(size, DEVLINK_HEALTH_BUFFER_DATA_SIZE) -#define DEVLINK_HEALTH_BUFFER_MAX_CHUNK 1024 +static int devlink_nl_cmd_info_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct sk_buff *msg; + int err; -struct devlink_health_buffer { - void *data; - u64 offset; - u64 bytes_left; - u64 bytes_left_metadata; - u64 max_nested_depth; - u64 curr_nest; -}; + if (!devlink->ops || !devlink->ops->info_get) + return -EOPNOTSUPP; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; -struct devlink_health_buffer_desc { + err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, + info->snd_portid, info->snd_seq, 0, + info->extack); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink *devlink; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + if (idx < start) { + idx++; + continue; + } + + mutex_lock(&devlink->lock); + err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + cb->extack); + mutex_unlock(&devlink->lock); + if (err) + break; + idx++; + } + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +struct devlink_fmsg_item { + struct list_head list; int attrtype; - u16 len; u8 nla_type; - u8 nest_end; + u16 len; int value[0]; }; -static void -devlink_health_buffers_reset(struct devlink_health_buffer **buffers_list, - u64 num_of_buffers) -{ - u64 i; - - for (i = 0; i < num_of_buffers; i++) { - memset(buffers_list[i]->data, 0, DEVLINK_HEALTH_BUFFER_SIZE); - buffers_list[i]->offset = 0; - buffers_list[i]->bytes_left = DEVLINK_HEALTH_BUFFER_DATA_SIZE; - buffers_list[i]->bytes_left_metadata = - DEVLINK_HEALTH_BUFFER_DATA_SIZE; - buffers_list[i]->max_nested_depth = 0; - buffers_list[i]->curr_nest = 0; - } -} - -static void -devlink_health_buffers_destroy(struct devlink_health_buffer **buffers_list, - u64 size); +struct devlink_fmsg { + struct list_head item_list; +}; -static struct devlink_health_buffer ** -devlink_health_buffers_create(u64 size) +static struct devlink_fmsg *devlink_fmsg_alloc(void) { - struct devlink_health_buffer **buffers_list; - u64 num_of_buffers = DEVLINK_HEALTH_SIZE_TO_BUFFERS(size); - u64 i; + struct devlink_fmsg *fmsg; - buffers_list = kcalloc(num_of_buffers, - sizeof(struct devlink_health_buffer *), - GFP_KERNEL); - if (!buffers_list) + fmsg = kzalloc(sizeof(*fmsg), GFP_KERNEL); + if (!fmsg) return NULL; - for (i = 0; i < num_of_buffers; i++) { - struct devlink_health_buffer *buffer; - void *data; + INIT_LIST_HEAD(&fmsg->item_list); - buffer = kzalloc(sizeof(*buffer), GFP_KERNEL); - data = kzalloc(DEVLINK_HEALTH_BUFFER_SIZE, GFP_KERNEL); - if (!buffer || !data) { - kfree(buffer); - kfree(data); - goto buffers_cleanup; - } - buffers_list[i] = buffer; - buffer->data = data; + return fmsg; +} + +static void devlink_fmsg_free(struct devlink_fmsg *fmsg) +{ + struct devlink_fmsg_item *item, *tmp; + + list_for_each_entry_safe(item, tmp, &fmsg->item_list, list) { + list_del(&item->list); + kfree(item); } - devlink_health_buffers_reset(buffers_list, num_of_buffers); + kfree(fmsg); +} - return buffers_list; +static int devlink_fmsg_nest_common(struct devlink_fmsg *fmsg, + int attrtype) +{ + struct devlink_fmsg_item *item; -buffers_cleanup: - devlink_health_buffers_destroy(buffers_list, --i); - kfree(buffers_list); - return NULL; + item = kzalloc(sizeof(*item), GFP_KERNEL); + if (!item) + return -ENOMEM; + + item->attrtype = attrtype; + list_add_tail(&item->list, &fmsg->item_list); + + return 0; } -static void -devlink_health_buffers_destroy(struct devlink_health_buffer **buffers_list, - u64 num_of_buffers) +int devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg) { - u64 i; + return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_OBJ_NEST_START); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_start); - for (i = 0; i < num_of_buffers; i++) { - kfree(buffers_list[i]->data); - kfree(buffers_list[i]); - } +static int devlink_fmsg_nest_end(struct devlink_fmsg *fmsg) +{ + return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_NEST_END); } -void -devlink_health_buffer_offset_inc(struct devlink_health_buffer *buffer, - int len) +int devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg) { - buffer->offset += len; + return devlink_fmsg_nest_end(fmsg); } +EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_end); -/* In order to store a nest, need two descriptors, for start and end */ -#define DEVLINK_HEALTH_BUFFER_NEST_SIZE (sizeof(struct devlink_health_buffer_desc) * 2) +#define DEVLINK_FMSG_MAX_SIZE (GENLMSG_DEFAULT_SIZE - GENL_HDRLEN - NLA_HDRLEN) -int devlink_health_buffer_verify_len(struct devlink_health_buffer *buffer, - int len, int metadata_len) +static int devlink_fmsg_put_name(struct devlink_fmsg *fmsg, const char *name) { - if (len > DEVLINK_HEALTH_BUFFER_DATA_SIZE) - return -EINVAL; + struct devlink_fmsg_item *item; - if (buffer->bytes_left < len || - buffer->bytes_left_metadata < metadata_len) + if (strlen(name) + 1 > DEVLINK_FMSG_MAX_SIZE) + return -EMSGSIZE; + + item = kzalloc(sizeof(*item) + strlen(name) + 1, GFP_KERNEL); + if (!item) return -ENOMEM; + item->nla_type = NLA_NUL_STRING; + item->len = strlen(name) + 1; + item->attrtype = DEVLINK_ATTR_FMSG_OBJ_NAME; + memcpy(&item->value, name, item->len); + list_add_tail(&item->list, &fmsg->item_list); + return 0; } -static struct devlink_health_buffer_desc * -devlink_health_buffer_get_desc_from_offset(struct devlink_health_buffer *buffer) +int devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name) { - return buffer->data + buffer->offset; + int err; + + err = devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_PAIR_NEST_START); + if (err) + return err; + + err = devlink_fmsg_put_name(fmsg, name); + if (err) + return err; + + return 0; } +EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_start); -int -devlink_health_buffer_nest_start(struct devlink_health_buffer *buffer, - int attrtype) +int devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg) +{ + return devlink_fmsg_nest_end(fmsg); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_end); + +int devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg, + const char *name) { - struct devlink_health_buffer_desc *desc; int err; - err = devlink_health_buffer_verify_len(buffer, 0, - DEVLINK_HEALTH_BUFFER_NEST_SIZE); + err = devlink_fmsg_pair_nest_start(fmsg, name); if (err) return err; - if (attrtype != DEVLINK_ATTR_HEALTH_BUFFER_OBJECT && - attrtype != DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR && - attrtype != DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE && - attrtype != DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_ARRAY) - return -EINVAL; + err = devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_ARR_NEST_START); + if (err) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_start); + +int devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg) +{ + int err; + + err = devlink_fmsg_nest_end(fmsg); + if (err) + return err; - desc = devlink_health_buffer_get_desc_from_offset(buffer); + err = devlink_fmsg_nest_end(fmsg); + if (err) + return err; - desc->attrtype = attrtype; - buffer->bytes_left_metadata -= DEVLINK_HEALTH_BUFFER_NEST_SIZE; - devlink_health_buffer_offset_inc(buffer, sizeof(*desc)); + return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_end); - buffer->curr_nest++; - buffer->max_nested_depth = max(buffer->max_nested_depth, - buffer->curr_nest); +static int devlink_fmsg_put_value(struct devlink_fmsg *fmsg, + const void *value, u16 value_len, + u8 value_nla_type) +{ + struct devlink_fmsg_item *item; + + if (value_len > DEVLINK_FMSG_MAX_SIZE) + return -EMSGSIZE; + + item = kzalloc(sizeof(*item) + value_len, GFP_KERNEL); + if (!item) + return -ENOMEM; + + item->nla_type = value_nla_type; + item->len = value_len; + item->attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA; + memcpy(&item->value, value, item->len); + list_add_tail(&item->list, &fmsg->item_list); return 0; } -EXPORT_SYMBOL_GPL(devlink_health_buffer_nest_start); -enum devlink_health_buffer_nest_end_cancel { - DEVLINK_HEALTH_BUFFER_NEST_END = 1, - DEVLINK_HEALTH_BUFFER_NEST_CANCEL, -}; +int devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value) +{ + return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_FLAG); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_bool_put); -static void -devlink_health_buffer_nest_end_cancel(struct devlink_health_buffer *buffer, - enum devlink_health_buffer_nest_end_cancel nest) +int devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value) { - struct devlink_health_buffer_desc *desc; + return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U8); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_u8_put); - WARN_ON(!buffer->curr_nest); - buffer->curr_nest--; +int devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value) +{ + return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U32); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_u32_put); - desc = devlink_health_buffer_get_desc_from_offset(buffer); - desc->nest_end = nest; - devlink_health_buffer_offset_inc(buffer, sizeof(*desc)); +int devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value) +{ + return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U64); } +EXPORT_SYMBOL_GPL(devlink_fmsg_u64_put); -void devlink_health_buffer_nest_end(struct devlink_health_buffer *buffer) +int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value) { - devlink_health_buffer_nest_end_cancel(buffer, - DEVLINK_HEALTH_BUFFER_NEST_END); + return devlink_fmsg_put_value(fmsg, value, strlen(value) + 1, + NLA_NUL_STRING); } -EXPORT_SYMBOL_GPL(devlink_health_buffer_nest_end); +EXPORT_SYMBOL_GPL(devlink_fmsg_string_put); -void devlink_health_buffer_nest_cancel(struct devlink_health_buffer *buffer) +int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, + u16 value_len) { - devlink_health_buffer_nest_end_cancel(buffer, - DEVLINK_HEALTH_BUFFER_NEST_CANCEL); + return devlink_fmsg_put_value(fmsg, value, value_len, NLA_BINARY); } -EXPORT_SYMBOL_GPL(devlink_health_buffer_nest_cancel); +EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put); -int -devlink_health_buffer_put_object_name(struct devlink_health_buffer *buffer, - char *name) +int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name, + bool value) { - struct devlink_health_buffer_desc *desc; int err; - err = devlink_health_buffer_verify_len(buffer, strlen(name) + 1, - sizeof(*desc)); + err = devlink_fmsg_pair_nest_start(fmsg, name); if (err) return err; - desc = devlink_health_buffer_get_desc_from_offset(buffer); - desc->attrtype = DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_NAME; - desc->nla_type = NLA_NUL_STRING; - desc->len = strlen(name) + 1; - memcpy(&desc->value, name, desc->len); - devlink_health_buffer_offset_inc(buffer, sizeof(*desc) + desc->len); + err = devlink_fmsg_bool_put(fmsg, value); + if (err) + return err; - buffer->bytes_left_metadata -= sizeof(*desc); - buffer->bytes_left -= (strlen(name) + 1); + err = devlink_fmsg_pair_nest_end(fmsg); + if (err) + return err; return 0; } -EXPORT_SYMBOL_GPL(devlink_health_buffer_put_object_name); +EXPORT_SYMBOL_GPL(devlink_fmsg_bool_pair_put); -static int -devlink_health_buffer_put_value(struct devlink_health_buffer *buffer, - u8 nla_type, void *value, int len) +int devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name, + u8 value) { - struct devlink_health_buffer_desc *desc; int err; - err = devlink_health_buffer_verify_len(buffer, len, sizeof(*desc)); + err = devlink_fmsg_pair_nest_start(fmsg, name); if (err) return err; - desc = devlink_health_buffer_get_desc_from_offset(buffer); - desc->attrtype = DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA; - desc->nla_type = nla_type; - desc->len = len; - memcpy(&desc->value, value, len); - devlink_health_buffer_offset_inc(buffer, sizeof(*desc) + desc->len); + err = devlink_fmsg_u8_put(fmsg, value); + if (err) + return err; - buffer->bytes_left_metadata -= sizeof(*desc); - buffer->bytes_left -= len; + err = devlink_fmsg_pair_nest_end(fmsg); + if (err) + return err; return 0; } +EXPORT_SYMBOL_GPL(devlink_fmsg_u8_pair_put); -int -devlink_health_buffer_put_value_u8(struct devlink_health_buffer *buffer, - u8 value) +int devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name, + u32 value) { int err; - err = devlink_health_buffer_put_value(buffer, NLA_U8, &value, - sizeof(value)); + err = devlink_fmsg_pair_nest_start(fmsg, name); if (err) return err; - return 0; -} -EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_u8); - -int -devlink_health_buffer_put_value_u32(struct devlink_health_buffer *buffer, - u32 value) -{ - int err; + err = devlink_fmsg_u32_put(fmsg, value); + if (err) + return err; - err = devlink_health_buffer_put_value(buffer, NLA_U32, &value, - sizeof(value)); + err = devlink_fmsg_pair_nest_end(fmsg); if (err) return err; return 0; } -EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_u32); +EXPORT_SYMBOL_GPL(devlink_fmsg_u32_pair_put); -int -devlink_health_buffer_put_value_u64(struct devlink_health_buffer *buffer, - u64 value) +int devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name, + u64 value) { int err; - err = devlink_health_buffer_put_value(buffer, NLA_U64, &value, - sizeof(value)); + err = devlink_fmsg_pair_nest_start(fmsg, name); + if (err) + return err; + + err = devlink_fmsg_u64_put(fmsg, value); + if (err) + return err; + + err = devlink_fmsg_pair_nest_end(fmsg); if (err) return err; return 0; } -EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_u64); +EXPORT_SYMBOL_GPL(devlink_fmsg_u64_pair_put); -int -devlink_health_buffer_put_value_string(struct devlink_health_buffer *buffer, - char *name) +int devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name, + const char *value) { int err; - if (strlen(name) + 1 > DEVLINK_HEALTH_BUFFER_MAX_CHUNK) - return -EINVAL; + err = devlink_fmsg_pair_nest_start(fmsg, name); + if (err) + return err; - err = devlink_health_buffer_put_value(buffer, NLA_NUL_STRING, name, - strlen(name) + 1); + err = devlink_fmsg_string_put(fmsg, value); + if (err) + return err; + + err = devlink_fmsg_pair_nest_end(fmsg); if (err) return err; return 0; } -EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_string); +EXPORT_SYMBOL_GPL(devlink_fmsg_string_pair_put); -int -devlink_health_buffer_put_value_data(struct devlink_health_buffer *buffer, - void *data, int len) +int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, + const void *value, u16 value_len) { int err; - if (len > DEVLINK_HEALTH_BUFFER_MAX_CHUNK) - return -EINVAL; + err = devlink_fmsg_pair_nest_start(fmsg, name); + if (err) + return err; - err = devlink_health_buffer_put_value(buffer, NLA_BINARY, data, len); + err = devlink_fmsg_binary_put(fmsg, value, value_len); + if (err) + return err; + + err = devlink_fmsg_pair_nest_end(fmsg); if (err) return err; return 0; } -EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_data); +EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_put); static int -devlink_health_buffer_fill_data(struct sk_buff *skb, - struct devlink_health_buffer_desc *desc) +devlink_fmsg_item_fill_type(struct devlink_fmsg_item *msg, struct sk_buff *skb) { - int err = -EINVAL; - - switch (desc->nla_type) { + switch (msg->nla_type) { + case NLA_FLAG: case NLA_U8: - err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA, - *(u8 *)desc->value); - break; case NLA_U32: - err = nla_put_u32(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA, - *(u32 *)desc->value); - break; case NLA_U64: - err = nla_put_u64_64bit(skb, - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA, - *(u64 *)desc->value, DEVLINK_ATTR_PAD); - break; case NLA_NUL_STRING: - err = nla_put_string(skb, - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA, - (char *)&desc->value); - break; case NLA_BINARY: - err = nla_put(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA, - desc->len, (void *)&desc->value); - break; + return nla_put_u8(skb, DEVLINK_ATTR_FMSG_OBJ_VALUE_TYPE, + msg->nla_type); + default: + return -EINVAL; } - - return err; } static int -devlink_health_buffer_fill_type(struct sk_buff *skb, - struct devlink_health_buffer_desc *desc) +devlink_fmsg_item_fill_data(struct devlink_fmsg_item *msg, struct sk_buff *skb) { - int err = -EINVAL; + int attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA; + u8 tmp; + + switch (msg->nla_type) { + case NLA_FLAG: + /* Always provide flag data, regardless of its value */ + tmp = *(bool *) msg->value; - switch (desc->nla_type) { + return nla_put_u8(skb, attrtype, tmp); case NLA_U8: - err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE, - NLA_U8); - break; + return nla_put_u8(skb, attrtype, *(u8 *) msg->value); case NLA_U32: - err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE, - NLA_U32); - break; + return nla_put_u32(skb, attrtype, *(u32 *) msg->value); case NLA_U64: - err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE, - NLA_U64); - break; + return nla_put_u64_64bit(skb, attrtype, *(u64 *) msg->value, + DEVLINK_ATTR_PAD); case NLA_NUL_STRING: - err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE, - NLA_NUL_STRING); - break; + return nla_put_string(skb, attrtype, (char *) &msg->value); case NLA_BINARY: - err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE, - NLA_BINARY); - break; + return nla_put(skb, attrtype, msg->len, (void *) &msg->value); + default: + return -EINVAL; } - - return err; -} - -static inline struct devlink_health_buffer_desc * -devlink_health_buffer_get_next_desc(struct devlink_health_buffer_desc *desc) -{ - return (void *)&desc->value + desc->len; } static int -devlink_health_buffer_prepare_skb(struct sk_buff *skb, - struct devlink_health_buffer *buffer) +devlink_fmsg_prepare_skb(struct devlink_fmsg *fmsg, struct sk_buff *skb, + int *start) { - struct devlink_health_buffer_desc *last_desc, *desc; - struct nlattr **buffer_nlattr; - int err; + struct devlink_fmsg_item *item; + struct nlattr *fmsg_nlattr; int i = 0; + int err; - buffer_nlattr = kcalloc(buffer->max_nested_depth, - sizeof(*buffer_nlattr), GFP_KERNEL); - if (!buffer_nlattr) - return -EINVAL; + fmsg_nlattr = nla_nest_start(skb, DEVLINK_ATTR_FMSG); + if (!fmsg_nlattr) + return -EMSGSIZE; - last_desc = devlink_health_buffer_get_desc_from_offset(buffer); - desc = buffer->data; - while (desc != last_desc) { - switch (desc->attrtype) { - case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT: - case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR: - case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE: - case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_ARRAY: - buffer_nlattr[i] = nla_nest_start(skb, desc->attrtype); - if (!buffer_nlattr[i]) - goto nla_put_failure; + list_for_each_entry(item, &fmsg->item_list, list) { + if (i < *start) { i++; + continue; + } + + switch (item->attrtype) { + case DEVLINK_ATTR_FMSG_OBJ_NEST_START: + case DEVLINK_ATTR_FMSG_PAIR_NEST_START: + case DEVLINK_ATTR_FMSG_ARR_NEST_START: + case DEVLINK_ATTR_FMSG_NEST_END: + err = nla_put_flag(skb, item->attrtype); break; - case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA: - err = devlink_health_buffer_fill_data(skb, desc); + case DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA: + err = devlink_fmsg_item_fill_type(item, skb); if (err) - goto nla_put_failure; - err = devlink_health_buffer_fill_type(skb, desc); - if (err) - goto nla_put_failure; + break; + err = devlink_fmsg_item_fill_data(item, skb); break; - case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_NAME: - err = nla_put_string(skb, desc->attrtype, - (char *)&desc->value); - if (err) - goto nla_put_failure; + case DEVLINK_ATTR_FMSG_OBJ_NAME: + err = nla_put_string(skb, item->attrtype, + (char *) &item->value); break; default: - WARN_ON(!desc->nest_end); - WARN_ON(i <= 0); - if (desc->nest_end == DEVLINK_HEALTH_BUFFER_NEST_END) - nla_nest_end(skb, buffer_nlattr[--i]); - else - nla_nest_cancel(skb, buffer_nlattr[--i]); + err = -EINVAL; break; } - desc = devlink_health_buffer_get_next_desc(desc); + if (!err) + *start = ++i; + else + break; } - return 0; - -nla_put_failure: - kfree(buffer_nlattr); + nla_nest_end(skb, fmsg_nlattr); return err; } -static int -devlink_health_buffer_snd(struct genl_info *info, - enum devlink_command cmd, int flags, - struct devlink_health_buffer **buffers_array, - u64 num_of_buffers) +static int devlink_fmsg_snd(struct devlink_fmsg *fmsg, + struct genl_info *info, + enum devlink_command cmd, int flags) { - struct sk_buff *skb; struct nlmsghdr *nlh; + struct sk_buff *skb; + bool last = false; + int index = 0; void *hdr; int err; - u64 i; - for (i = 0; i < num_of_buffers; i++) { - /* Skip buffer if driver did not fill it up with any data */ - if (!buffers_array[i]->offset) - continue; + while (!last) { + int tmp_index = index; skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, - &devlink_nl_family, NLM_F_MULTI, cmd); - if (!hdr) + &devlink_nl_family, flags | NLM_F_MULTI, cmd); + if (!hdr) { + err = -EMSGSIZE; goto nla_put_failure; + } - err = devlink_health_buffer_prepare_skb(skb, buffers_array[i]); - if (err) + err = devlink_fmsg_prepare_skb(fmsg, skb, &index); + if (!err) + last = true; + else if (err != -EMSGSIZE || tmp_index == index) goto nla_put_failure; genlmsg_end(skb, hdr); @@ -4086,31 +4388,28 @@ devlink_health_buffer_snd(struct genl_info *info, return -ENOMEM; nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, NLMSG_DONE, 0, flags | NLM_F_MULTI); - err = genlmsg_reply(skb, info); - if (err) - return err; + if (!nlh) { + err = -EMSGSIZE; + goto nla_put_failure; + } - return 0; + return genlmsg_reply(skb, info); nla_put_failure: - err = -EIO; nlmsg_free(skb); return err; } struct devlink_health_reporter { struct list_head list; - struct devlink_health_buffer **dump_buffers_array; - struct mutex dump_lock; /* lock parallel read/write from dump buffers */ - struct devlink_health_buffer **diagnose_buffers_array; - struct mutex diagnose_lock; /* lock parallel read/write from diagnose buffers */ void *priv; const struct devlink_health_reporter_ops *ops; struct devlink *devlink; + struct devlink_fmsg *dump_fmsg; + struct mutex dump_lock; /* lock parallel read/write from dump buffers */ u64 graceful_period; bool auto_recover; u8 health_state; - u8 dump_avail; u64 dump_ts; u64 error_count; u64 recovery_count; @@ -4164,9 +4463,7 @@ devlink_health_reporter_create(struct devlink *devlink, goto unlock; } - if (WARN_ON(ops->dump && !ops->dump_size) || - WARN_ON(ops->diagnose && !ops->diagnose_size) || - WARN_ON(auto_recover && !ops->recover) || + if (WARN_ON(auto_recover && !ops->recover) || WARN_ON(graceful_period && !ops->recover)) { reporter = ERR_PTR(-EINVAL); goto unlock; @@ -4178,37 +4475,13 @@ devlink_health_reporter_create(struct devlink *devlink, goto unlock; } - if (ops->dump) { - reporter->dump_buffers_array = - devlink_health_buffers_create(ops->dump_size); - if (!reporter->dump_buffers_array) { - kfree(reporter); - reporter = ERR_PTR(-ENOMEM); - goto unlock; - } - } - - if (ops->diagnose) { - reporter->diagnose_buffers_array = - devlink_health_buffers_create(ops->diagnose_size); - if (!reporter->diagnose_buffers_array) { - devlink_health_buffers_destroy(reporter->dump_buffers_array, - DEVLINK_HEALTH_SIZE_TO_BUFFERS(ops->dump_size)); - kfree(reporter); - reporter = ERR_PTR(-ENOMEM); - goto unlock; - } - } - - list_add_tail(&reporter->list, &devlink->reporter_list); - mutex_init(&reporter->dump_lock); - mutex_init(&reporter->diagnose_lock); - reporter->priv = priv; reporter->ops = ops; reporter->devlink = devlink; reporter->graceful_period = graceful_period; reporter->auto_recover = auto_recover; + mutex_init(&reporter->dump_lock); + list_add_tail(&reporter->list, &devlink->reporter_list); unlock: mutex_unlock(&devlink->lock); return reporter; @@ -4225,12 +4498,10 @@ devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) { mutex_lock(&reporter->devlink->lock); list_del(&reporter->list); - devlink_health_buffers_destroy(reporter->dump_buffers_array, - DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size)); - devlink_health_buffers_destroy(reporter->diagnose_buffers_array, - DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->diagnose_size)); - kfree(reporter); mutex_unlock(&reporter->devlink->lock); + if (reporter->dump_fmsg) + devlink_fmsg_free(reporter->dump_fmsg); + kfree(reporter); } EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy); @@ -4254,6 +4525,15 @@ devlink_health_reporter_recover(struct devlink_health_reporter *reporter, return 0; } +static void +devlink_health_dump_clear(struct devlink_health_reporter *reporter) +{ + if (!reporter->dump_fmsg) + return; + devlink_fmsg_free(reporter->dump_fmsg); + reporter->dump_fmsg = NULL; +} + static int devlink_health_do_dump(struct devlink_health_reporter *reporter, void *priv_ctx) { @@ -4262,20 +4542,34 @@ static int devlink_health_do_dump(struct devlink_health_reporter *reporter, if (!reporter->ops->dump) return 0; - if (reporter->dump_avail) + if (reporter->dump_fmsg) return 0; - devlink_health_buffers_reset(reporter->dump_buffers_array, - DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size)); - err = reporter->ops->dump(reporter, reporter->dump_buffers_array, - DEVLINK_HEALTH_BUFFER_SIZE, - DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size), - priv_ctx); - if (!err) { - reporter->dump_avail = true; - reporter->dump_ts = jiffies; + reporter->dump_fmsg = devlink_fmsg_alloc(); + if (!reporter->dump_fmsg) { + err = -ENOMEM; + return err; } + err = devlink_fmsg_obj_nest_start(reporter->dump_fmsg); + if (err) + goto dump_err; + + err = reporter->ops->dump(reporter, reporter->dump_fmsg, + priv_ctx); + if (err) + goto dump_err; + + err = devlink_fmsg_obj_nest_end(reporter->dump_fmsg); + if (err) + goto dump_err; + + reporter->dump_ts = jiffies; + + return 0; + +dump_err: + devlink_health_dump_clear(reporter); return err; } @@ -4283,7 +4577,6 @@ int devlink_health_report(struct devlink_health_reporter *reporter, const char *msg, void *priv_ctx) { struct devlink *devlink = reporter->devlink; - int err = 0; /* write a log message of the current error */ WARN_ON(!msg); @@ -4311,9 +4604,9 @@ int devlink_health_report(struct devlink_health_reporter *reporter, mutex_unlock(&reporter->dump_lock); if (reporter->auto_recover) - err = devlink_health_reporter_recover(reporter, priv_ctx); + return devlink_health_reporter_recover(reporter, priv_ctx); - return err; + return 0; } EXPORT_SYMBOL_GPL(devlink_health_report); @@ -4357,23 +4650,22 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg, if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE, reporter->health_state)) goto reporter_nest_cancel; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR, + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR_COUNT, reporter->error_count, DEVLINK_ATTR_PAD)) goto reporter_nest_cancel; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER, + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT, reporter->recovery_count, DEVLINK_ATTR_PAD)) goto reporter_nest_cancel; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, + if (reporter->ops->recover && + nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, reporter->graceful_period, DEVLINK_ATTR_PAD)) goto reporter_nest_cancel; - if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, + if (reporter->ops->recover && + nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, reporter->auto_recover)) goto reporter_nest_cancel; - if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_AVAIL, - reporter->dump_avail)) - goto reporter_nest_cancel; - if (reporter->dump_avail && + if (reporter->dump_fmsg && nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS, jiffies_to_msecs(reporter->dump_ts), DEVLINK_ATTR_PAD)) @@ -4474,7 +4766,7 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, if (!reporter->ops->recover && (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] || info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])) - return -EINVAL; + return -EOPNOTSUPP; if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) reporter->graceful_period = @@ -4505,7 +4797,7 @@ static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb, { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; - u64 num_of_buffers; + struct devlink_fmsg *fmsg; int err; reporter = devlink_health_reporter_get_from_info(devlink, info); @@ -4515,50 +4807,35 @@ static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb, if (!reporter->ops->diagnose) return -EOPNOTSUPP; - num_of_buffers = - DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->diagnose_size); + fmsg = devlink_fmsg_alloc(); + if (!fmsg) + return -ENOMEM; - mutex_lock(&reporter->diagnose_lock); - devlink_health_buffers_reset(reporter->diagnose_buffers_array, - num_of_buffers); + err = devlink_fmsg_obj_nest_start(fmsg); + if (err) + goto out; - err = reporter->ops->diagnose(reporter, - reporter->diagnose_buffers_array, - DEVLINK_HEALTH_BUFFER_SIZE, - num_of_buffers); + err = reporter->ops->diagnose(reporter, fmsg); if (err) goto out; - err = devlink_health_buffer_snd(info, - DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, - 0, reporter->diagnose_buffers_array, - num_of_buffers); + err = devlink_fmsg_obj_nest_end(fmsg); if (err) goto out; - mutex_unlock(&reporter->diagnose_lock); - return 0; + err = devlink_fmsg_snd(fmsg, info, + DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, 0); out: - mutex_unlock(&reporter->diagnose_lock); + devlink_fmsg_free(fmsg); return err; } -static void -devlink_health_dump_clear(struct devlink_health_reporter *reporter) -{ - reporter->dump_avail = false; - reporter->dump_ts = 0; - devlink_health_buffers_reset(reporter->dump_buffers_array, - DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size)); -} - static int devlink_nl_cmd_health_reporter_dump_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; - u64 num_of_buffers; int err; reporter = devlink_health_reporter_get_from_info(devlink, info); @@ -4568,18 +4845,13 @@ static int devlink_nl_cmd_health_reporter_dump_get_doit(struct sk_buff *skb, if (!reporter->ops->dump) return -EOPNOTSUPP; - num_of_buffers = - DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size); - mutex_lock(&reporter->dump_lock); err = devlink_health_do_dump(reporter, NULL); if (err) goto out; - err = devlink_health_buffer_snd(info, - DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, - 0, reporter->dump_buffers_array, - num_of_buffers); + err = devlink_fmsg_snd(reporter->dump_fmsg, info, + DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, 0); out: mutex_unlock(&reporter->dump_lock); @@ -4597,6 +4869,9 @@ devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb, if (!reporter) return -EINVAL; + if (!reporter->ops->dump) + return -EOPNOTSUPP; + mutex_lock(&reporter->dump_lock); devlink_health_dump_clear(reporter); mutex_unlock(&reporter->dump_lock); @@ -4631,6 +4906,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 }, [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 }, + [DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = { .type = NLA_NUL_STRING }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -4830,6 +5107,21 @@ static const struct genl_ops devlink_nl_ops[] = { .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { + .cmd = DEVLINK_CMD_PORT_PARAM_GET, + .doit = devlink_nl_cmd_port_param_get_doit, + .dumpit = devlink_nl_cmd_port_param_get_dumpit, + .policy = devlink_nl_policy, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_PORT_PARAM_SET, + .doit = devlink_nl_cmd_port_param_set_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, + }, + { .cmd = DEVLINK_CMD_REGION_GET, .doit = devlink_nl_cmd_region_get_doit, .dumpit = devlink_nl_cmd_region_get_dumpit, @@ -4852,6 +5144,14 @@ static const struct genl_ops devlink_nl_ops[] = { .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { + .cmd = DEVLINK_CMD_INFO_GET, + .doit = devlink_nl_cmd_info_get_doit, + .dumpit = devlink_nl_cmd_info_get_dumpit, + .policy = devlink_nl_policy, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + /* can be retrieved by unprivileged users */ + }, + { .cmd = DEVLINK_CMD_HEALTH_REPORTER_GET, .doit = devlink_nl_cmd_health_reporter_get_doit, .dumpit = devlink_nl_cmd_health_reporter_get_dumpit, @@ -4896,6 +5196,13 @@ static const struct genl_ops devlink_nl_ops[] = { .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | DEVLINK_NL_FLAG_NO_LOCK, }, + { + .cmd = DEVLINK_CMD_FLASH_UPDATE, + .doit = devlink_nl_cmd_flash_update, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { @@ -4979,6 +5286,14 @@ EXPORT_SYMBOL_GPL(devlink_unregister); */ void devlink_free(struct devlink *devlink) { + WARN_ON(!list_empty(&devlink->reporter_list)); + WARN_ON(!list_empty(&devlink->region_list)); + WARN_ON(!list_empty(&devlink->param_list)); + WARN_ON(!list_empty(&devlink->resource_list)); + WARN_ON(!list_empty(&devlink->dpipe_table_list)); + WARN_ON(!list_empty(&devlink->sb_list)); + WARN_ON(!list_empty(&devlink->port_list)); + kfree(devlink); } EXPORT_SYMBOL_GPL(devlink_free); @@ -5009,6 +5324,7 @@ int devlink_port_register(struct devlink *devlink, devlink_port->index = port_index; devlink_port->registered = true; list_add_tail(&devlink_port->list, &devlink->port_list); + INIT_LIST_HEAD(&devlink_port->param_list); mutex_unlock(&devlink->lock); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); return 0; @@ -5526,18 +5842,23 @@ out: } EXPORT_SYMBOL_GPL(devlink_resource_occ_get_unregister); -/** - * devlink_params_register - register configuration parameters - * - * @devlink: devlink - * @params: configuration parameters array - * @params_count: number of parameters provided - * - * Register the configuration parameters supported by the driver. - */ -int devlink_params_register(struct devlink *devlink, - const struct devlink_param *params, - size_t params_count) +static int devlink_param_verify(const struct devlink_param *param) +{ + if (!param || !param->name || !param->supported_cmodes) + return -EINVAL; + if (param->generic) + return devlink_param_generic_verify(param); + else + return devlink_param_driver_verify(param); +} + +static int __devlink_params_register(struct devlink *devlink, + unsigned int port_index, + struct list_head *param_list, + const struct devlink_param *params, + size_t params_count, + enum devlink_command reg_cmd, + enum devlink_command unreg_cmd) { const struct devlink_param *param = params; int i; @@ -5545,20 +5866,12 @@ int devlink_params_register(struct devlink *devlink, mutex_lock(&devlink->lock); for (i = 0; i < params_count; i++, param++) { - if (!param || !param->name || !param->supported_cmodes) { - err = -EINVAL; + err = devlink_param_verify(param); + if (err) goto rollback; - } - if (param->generic) { - err = devlink_param_generic_verify(param); - if (err) - goto rollback; - } else { - err = devlink_param_driver_verify(param); - if (err) - goto rollback; - } - err = devlink_param_register_one(devlink, param); + + err = devlink_param_register_one(devlink, port_index, + param_list, param, reg_cmd); if (err) goto rollback; } @@ -5570,11 +5883,48 @@ rollback: if (!i) goto unlock; for (param--; i > 0; i--, param--) - devlink_param_unregister_one(devlink, param); + devlink_param_unregister_one(devlink, port_index, param_list, + param, unreg_cmd); unlock: mutex_unlock(&devlink->lock); return err; } + +static void __devlink_params_unregister(struct devlink *devlink, + unsigned int port_index, + struct list_head *param_list, + const struct devlink_param *params, + size_t params_count, + enum devlink_command cmd) +{ + const struct devlink_param *param = params; + int i; + + mutex_lock(&devlink->lock); + for (i = 0; i < params_count; i++, param++) + devlink_param_unregister_one(devlink, 0, param_list, param, + cmd); + mutex_unlock(&devlink->lock); +} + +/** + * devlink_params_register - register configuration parameters + * + * @devlink: devlink + * @params: configuration parameters array + * @params_count: number of parameters provided + * + * Register the configuration parameters supported by the driver. + */ +int devlink_params_register(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count) +{ + return __devlink_params_register(devlink, 0, &devlink->param_list, + params, params_count, + DEVLINK_CMD_PARAM_NEW, + DEVLINK_CMD_PARAM_DEL); +} EXPORT_SYMBOL_GPL(devlink_params_register); /** @@ -5587,36 +5937,103 @@ void devlink_params_unregister(struct devlink *devlink, const struct devlink_param *params, size_t params_count) { - const struct devlink_param *param = params; - int i; - - mutex_lock(&devlink->lock); - for (i = 0; i < params_count; i++, param++) - devlink_param_unregister_one(devlink, param); - mutex_unlock(&devlink->lock); + return __devlink_params_unregister(devlink, 0, &devlink->param_list, + params, params_count, + DEVLINK_CMD_PARAM_DEL); } EXPORT_SYMBOL_GPL(devlink_params_unregister); /** - * devlink_param_driverinit_value_get - get configuration parameter - * value for driver initializing + * devlink_params_publish - publish configuration parameters * * @devlink: devlink - * @param_id: parameter ID - * @init_val: value of parameter in driverinit configuration mode * - * This function should be used by the driver to get driverinit - * configuration for initialization after reload command. + * Publish previously registered configuration parameters. */ -int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, - union devlink_param_value *init_val) +void devlink_params_publish(struct devlink *devlink) { struct devlink_param_item *param_item; - if (!devlink->ops || !devlink->ops->reload) - return -EOPNOTSUPP; + list_for_each_entry(param_item, &devlink->param_list, list) { + if (param_item->published) + continue; + param_item->published = true; + devlink_param_notify(devlink, 0, param_item, + DEVLINK_CMD_PARAM_NEW); + } +} +EXPORT_SYMBOL_GPL(devlink_params_publish); - param_item = devlink_param_find_by_id(&devlink->param_list, param_id); +/** + * devlink_params_unpublish - unpublish configuration parameters + * + * @devlink: devlink + * + * Unpublish previously registered configuration parameters. + */ +void devlink_params_unpublish(struct devlink *devlink) +{ + struct devlink_param_item *param_item; + + list_for_each_entry(param_item, &devlink->param_list, list) { + if (!param_item->published) + continue; + param_item->published = false; + devlink_param_notify(devlink, 0, param_item, + DEVLINK_CMD_PARAM_DEL); + } +} +EXPORT_SYMBOL_GPL(devlink_params_unpublish); + +/** + * devlink_port_params_register - register port configuration parameters + * + * @devlink_port: devlink port + * @params: configuration parameters array + * @params_count: number of parameters provided + * + * Register the configuration parameters supported by the port. + */ +int devlink_port_params_register(struct devlink_port *devlink_port, + const struct devlink_param *params, + size_t params_count) +{ + return __devlink_params_register(devlink_port->devlink, + devlink_port->index, + &devlink_port->param_list, params, + params_count, + DEVLINK_CMD_PORT_PARAM_NEW, + DEVLINK_CMD_PORT_PARAM_DEL); +} +EXPORT_SYMBOL_GPL(devlink_port_params_register); + +/** + * devlink_port_params_unregister - unregister port configuration + * parameters + * + * @devlink_port: devlink port + * @params: configuration parameters array + * @params_count: number of parameters provided + */ +void devlink_port_params_unregister(struct devlink_port *devlink_port, + const struct devlink_param *params, + size_t params_count) +{ + return __devlink_params_unregister(devlink_port->devlink, + devlink_port->index, + &devlink_port->param_list, + params, params_count, + DEVLINK_CMD_PORT_PARAM_DEL); +} +EXPORT_SYMBOL_GPL(devlink_port_params_unregister); + +static int +__devlink_param_driverinit_value_get(struct list_head *param_list, u32 param_id, + union devlink_param_value *init_val) +{ + struct devlink_param_item *param_item; + + param_item = devlink_param_find_by_id(param_list, param_id); if (!param_item) return -EINVAL; @@ -5632,6 +6049,54 @@ int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, return 0; } + +static int +__devlink_param_driverinit_value_set(struct devlink *devlink, + unsigned int port_index, + struct list_head *param_list, u32 param_id, + union devlink_param_value init_val, + enum devlink_command cmd) +{ + struct devlink_param_item *param_item; + + param_item = devlink_param_find_by_id(param_list, param_id); + if (!param_item) + return -EINVAL; + + if (!devlink_param_cmode_is_supported(param_item->param, + DEVLINK_PARAM_CMODE_DRIVERINIT)) + return -EOPNOTSUPP; + + if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING) + strcpy(param_item->driverinit_value.vstr, init_val.vstr); + else + param_item->driverinit_value = init_val; + param_item->driverinit_value_valid = true; + + devlink_param_notify(devlink, port_index, param_item, cmd); + return 0; +} + +/** + * devlink_param_driverinit_value_get - get configuration parameter + * value for driver initializing + * + * @devlink: devlink + * @param_id: parameter ID + * @init_val: value of parameter in driverinit configuration mode + * + * This function should be used by the driver to get driverinit + * configuration for initialization after reload command. + */ +int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, + union devlink_param_value *init_val) +{ + if (!devlink->ops || !devlink->ops->reload) + return -EOPNOTSUPP; + + return __devlink_param_driverinit_value_get(&devlink->param_list, + param_id, init_val); +} EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_get); /** @@ -5649,26 +6114,61 @@ EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_get); int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, union devlink_param_value init_val) { - struct devlink_param_item *param_item; + return __devlink_param_driverinit_value_set(devlink, 0, + &devlink->param_list, + param_id, init_val, + DEVLINK_CMD_PARAM_NEW); +} +EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set); - param_item = devlink_param_find_by_id(&devlink->param_list, param_id); - if (!param_item) - return -EINVAL; +/** + * devlink_port_param_driverinit_value_get - get configuration parameter + * value for driver initializing + * + * @devlink_port: devlink_port + * @param_id: parameter ID + * @init_val: value of parameter in driverinit configuration mode + * + * This function should be used by the driver to get driverinit + * configuration for initialization after reload command. + */ +int devlink_port_param_driverinit_value_get(struct devlink_port *devlink_port, + u32 param_id, + union devlink_param_value *init_val) +{ + struct devlink *devlink = devlink_port->devlink; - if (!devlink_param_cmode_is_supported(param_item->param, - DEVLINK_PARAM_CMODE_DRIVERINIT)) + if (!devlink->ops || !devlink->ops->reload) return -EOPNOTSUPP; - if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING) - strcpy(param_item->driverinit_value.vstr, init_val.vstr); - else - param_item->driverinit_value = init_val; - param_item->driverinit_value_valid = true; + return __devlink_param_driverinit_value_get(&devlink_port->param_list, + param_id, init_val); +} +EXPORT_SYMBOL_GPL(devlink_port_param_driverinit_value_get); - devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); - return 0; +/** + * devlink_port_param_driverinit_value_set - set value of configuration + * parameter for driverinit + * configuration mode + * + * @devlink_port: devlink_port + * @param_id: parameter ID + * @init_val: value of parameter to set for driverinit configuration mode + * + * This function should be used by the driver to set driverinit + * configuration mode default value. + */ +int devlink_port_param_driverinit_value_set(struct devlink_port *devlink_port, + u32 param_id, + union devlink_param_value init_val) +{ + return __devlink_param_driverinit_value_set(devlink_port->devlink, + devlink_port->index, + &devlink_port->param_list, + param_id, init_val, + DEVLINK_CMD_PORT_PARAM_NEW); } -EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set); +EXPORT_SYMBOL_GPL(devlink_port_param_driverinit_value_set); /** * devlink_param_value_changed - notify devlink on a parameter's value @@ -5681,7 +6181,6 @@ EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set); * This function should be used by the driver to notify devlink on value * change, excluding driverinit configuration mode. * For driverinit configuration mode driver should use the function - * devlink_param_driverinit_value_set() instead. */ void devlink_param_value_changed(struct devlink *devlink, u32 param_id) { @@ -5690,11 +6189,38 @@ void devlink_param_value_changed(struct devlink *devlink, u32 param_id) param_item = devlink_param_find_by_id(&devlink->param_list, param_id); WARN_ON(!param_item); - devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); + devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); } EXPORT_SYMBOL_GPL(devlink_param_value_changed); /** + * devlink_port_param_value_changed - notify devlink on a parameter's value + * change. Should be called by the driver + * right after the change. + * + * @devlink_port: devlink_port + * @param_id: parameter ID + * + * This function should be used by the driver to notify devlink on value + * change, excluding driverinit configuration mode. + * For driverinit configuration mode driver should use the function + * devlink_port_param_driverinit_value_set() instead. + */ +void devlink_port_param_value_changed(struct devlink_port *devlink_port, + u32 param_id) +{ + struct devlink_param_item *param_item; + + param_item = devlink_param_find_by_id(&devlink_port->param_list, + param_id); + WARN_ON(!param_item); + + devlink_param_notify(devlink_port->devlink, devlink_port->index, + param_item, DEVLINK_CMD_PORT_PARAM_NEW); +} +EXPORT_SYMBOL_GPL(devlink_port_param_value_changed); + +/** * devlink_param_value_str_fill - Safely fill-up the string preventing * from overflow of the preallocated buffer * @@ -5863,6 +6389,99 @@ unlock: } EXPORT_SYMBOL_GPL(devlink_region_snapshot_create); +static void __devlink_compat_running_version(struct devlink *devlink, + char *buf, size_t len) +{ + const struct nlattr *nlattr; + struct devlink_info_req req; + struct sk_buff *msg; + int rem, err; + + if (!devlink->ops->info_get) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + req.msg = msg; + err = devlink->ops->info_get(devlink, &req, NULL); + if (err) + goto free_msg; + + nla_for_each_attr(nlattr, (void *)msg->data, msg->len, rem) { + const struct nlattr *kv; + int rem_kv; + + if (nla_type(nlattr) != DEVLINK_ATTR_INFO_VERSION_RUNNING) + continue; + + nla_for_each_nested(kv, nlattr, rem_kv) { + if (nla_type(kv) != DEVLINK_ATTR_INFO_VERSION_VALUE) + continue; + + strlcat(buf, nla_data(kv), len); + strlcat(buf, " ", len); + } + } +free_msg: + nlmsg_free(msg); +} + +void devlink_compat_running_version(struct net_device *dev, + char *buf, size_t len) +{ + struct devlink_port *devlink_port; + struct devlink *devlink; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + mutex_lock(&devlink->lock); + list_for_each_entry(devlink_port, &devlink->port_list, list) { + if (devlink_port->type == DEVLINK_PORT_TYPE_ETH && + devlink_port->type_dev == dev) { + __devlink_compat_running_version(devlink, + buf, len); + mutex_unlock(&devlink->lock); + goto out; + } + } + mutex_unlock(&devlink->lock); + } +out: + mutex_unlock(&devlink_mutex); +} + +int devlink_compat_flash_update(struct net_device *dev, const char *file_name) +{ + struct devlink_port *devlink_port; + struct devlink *devlink; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + mutex_lock(&devlink->lock); + list_for_each_entry(devlink_port, &devlink->port_list, list) { + int ret = -EOPNOTSUPP; + + if (devlink_port->type != DEVLINK_PORT_TYPE_ETH || + devlink_port->type_dev != dev) + continue; + + mutex_unlock(&devlink_mutex); + if (devlink->ops->flash_update) + ret = devlink->ops->flash_update(devlink, + file_name, + NULL, NULL); + mutex_unlock(&devlink->lock); + return ret; + } + mutex_unlock(&devlink->lock); + } + mutex_unlock(&devlink_mutex); + + return -EOPNOTSUPP; +} + static int __init devlink_module_init(void) { return genl_register_family(&devlink_nl_family); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 158264f7cfaf..1320e8dce559 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -27,7 +27,9 @@ #include <linux/rtnetlink.h> #include <linux/sched/signal.h> #include <linux/net.h> +#include <net/devlink.h> #include <net/xdp_sock.h> +#include <net/flow_offload.h> /* * Some useful ethtool_ops methods that're device independent. @@ -803,6 +805,12 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, if (ops->get_eeprom_len) info.eedump_len = ops->get_eeprom_len(dev); + rtnl_unlock(); + if (!info.fw_version[0]) + devlink_compat_running_version(dev, info.fw_version, + sizeof(info.fw_version)); + rtnl_lock(); + if (copy_to_user(useraddr, &info, sizeof(info))) return -EFAULT; return 0; @@ -1348,12 +1356,9 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) if (regs.len > reglen) regs.len = reglen; - regbuf = NULL; - if (reglen) { - regbuf = vzalloc(reglen); - if (!regbuf) - return -ENOMEM; - } + regbuf = vzalloc(reglen); + if (!regbuf) + return -ENOMEM; ops->get_regs(dev, ®s, regbuf); @@ -2033,11 +2038,17 @@ static noinline_for_stack int ethtool_flash_device(struct net_device *dev, if (copy_from_user(&efl, useraddr, sizeof(efl))) return -EFAULT; + efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0; - if (!dev->ethtool_ops->flash_device) - return -EOPNOTSUPP; + if (!dev->ethtool_ops->flash_device) { + int ret; - efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0; + rtnl_unlock(); + ret = devlink_compat_flash_update(dev, efl.data); + rtnl_lock(); + + return ret; + } return dev->ethtool_ops->flash_device(dev, &efl); } @@ -2816,3 +2827,241 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) return rc; } + +struct ethtool_rx_flow_key { + struct flow_dissector_key_basic basic; + union { + struct flow_dissector_key_ipv4_addrs ipv4; + struct flow_dissector_key_ipv6_addrs ipv6; + }; + struct flow_dissector_key_ports tp; + struct flow_dissector_key_ip ip; + struct flow_dissector_key_vlan vlan; + struct flow_dissector_key_eth_addrs eth_addrs; +} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ + +struct ethtool_rx_flow_match { + struct flow_dissector dissector; + struct ethtool_rx_flow_key key; + struct ethtool_rx_flow_key mask; +}; + +struct ethtool_rx_flow_rule * +ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) +{ + const struct ethtool_rx_flow_spec *fs = input->fs; + static struct in6_addr zero_addr = {}; + struct ethtool_rx_flow_match *match; + struct ethtool_rx_flow_rule *flow; + struct flow_action_entry *act; + + flow = kzalloc(sizeof(struct ethtool_rx_flow_rule) + + sizeof(struct ethtool_rx_flow_match), GFP_KERNEL); + if (!flow) + return ERR_PTR(-ENOMEM); + + /* ethtool_rx supports only one single action per rule. */ + flow->rule = flow_rule_alloc(1); + if (!flow->rule) { + kfree(flow); + return ERR_PTR(-ENOMEM); + } + + match = (struct ethtool_rx_flow_match *)flow->priv; + flow->rule->match.dissector = &match->dissector; + flow->rule->match.mask = &match->mask; + flow->rule->match.key = &match->key; + + match->mask.basic.n_proto = htons(0xffff); + + switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS)) { + case TCP_V4_FLOW: + case UDP_V4_FLOW: { + const struct ethtool_tcpip4_spec *v4_spec, *v4_m_spec; + + match->key.basic.n_proto = htons(ETH_P_IP); + + v4_spec = &fs->h_u.tcp_ip4_spec; + v4_m_spec = &fs->m_u.tcp_ip4_spec; + + if (v4_m_spec->ip4src) { + match->key.ipv4.src = v4_spec->ip4src; + match->mask.ipv4.src = v4_m_spec->ip4src; + } + if (v4_m_spec->ip4dst) { + match->key.ipv4.dst = v4_spec->ip4dst; + match->mask.ipv4.dst = v4_m_spec->ip4dst; + } + if (v4_m_spec->ip4src || + v4_m_spec->ip4dst) { + match->dissector.used_keys |= + BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS); + match->dissector.offset[FLOW_DISSECTOR_KEY_IPV4_ADDRS] = + offsetof(struct ethtool_rx_flow_key, ipv4); + } + if (v4_m_spec->psrc) { + match->key.tp.src = v4_spec->psrc; + match->mask.tp.src = v4_m_spec->psrc; + } + if (v4_m_spec->pdst) { + match->key.tp.dst = v4_spec->pdst; + match->mask.tp.dst = v4_m_spec->pdst; + } + if (v4_m_spec->psrc || + v4_m_spec->pdst) { + match->dissector.used_keys |= + BIT(FLOW_DISSECTOR_KEY_PORTS); + match->dissector.offset[FLOW_DISSECTOR_KEY_PORTS] = + offsetof(struct ethtool_rx_flow_key, tp); + } + if (v4_m_spec->tos) { + match->key.ip.tos = v4_spec->tos; + match->mask.ip.tos = v4_m_spec->tos; + match->dissector.used_keys |= + BIT(FLOW_DISSECTOR_KEY_IP); + match->dissector.offset[FLOW_DISSECTOR_KEY_IP] = + offsetof(struct ethtool_rx_flow_key, ip); + } + } + break; + case TCP_V6_FLOW: + case UDP_V6_FLOW: { + const struct ethtool_tcpip6_spec *v6_spec, *v6_m_spec; + + match->key.basic.n_proto = htons(ETH_P_IPV6); + + v6_spec = &fs->h_u.tcp_ip6_spec; + v6_m_spec = &fs->m_u.tcp_ip6_spec; + if (memcmp(v6_m_spec->ip6src, &zero_addr, sizeof(zero_addr))) { + memcpy(&match->key.ipv6.src, v6_spec->ip6src, + sizeof(match->key.ipv6.src)); + memcpy(&match->mask.ipv6.src, v6_m_spec->ip6src, + sizeof(match->mask.ipv6.src)); + } + if (memcmp(v6_m_spec->ip6dst, &zero_addr, sizeof(zero_addr))) { + memcpy(&match->key.ipv6.dst, v6_spec->ip6dst, + sizeof(match->key.ipv6.dst)); + memcpy(&match->mask.ipv6.dst, v6_m_spec->ip6dst, + sizeof(match->mask.ipv6.dst)); + } + if (memcmp(v6_m_spec->ip6src, &zero_addr, sizeof(zero_addr)) || + memcmp(v6_m_spec->ip6src, &zero_addr, sizeof(zero_addr))) { + match->dissector.used_keys |= + BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS); + match->dissector.offset[FLOW_DISSECTOR_KEY_IPV6_ADDRS] = + offsetof(struct ethtool_rx_flow_key, ipv6); + } + if (v6_m_spec->psrc) { + match->key.tp.src = v6_spec->psrc; + match->mask.tp.src = v6_m_spec->psrc; + } + if (v6_m_spec->pdst) { + match->key.tp.dst = v6_spec->pdst; + match->mask.tp.dst = v6_m_spec->pdst; + } + if (v6_m_spec->psrc || + v6_m_spec->pdst) { + match->dissector.used_keys |= + BIT(FLOW_DISSECTOR_KEY_PORTS); + match->dissector.offset[FLOW_DISSECTOR_KEY_PORTS] = + offsetof(struct ethtool_rx_flow_key, tp); + } + if (v6_m_spec->tclass) { + match->key.ip.tos = v6_spec->tclass; + match->mask.ip.tos = v6_m_spec->tclass; + match->dissector.used_keys |= + BIT(FLOW_DISSECTOR_KEY_IP); + match->dissector.offset[FLOW_DISSECTOR_KEY_IP] = + offsetof(struct ethtool_rx_flow_key, ip); + } + } + break; + default: + ethtool_rx_flow_rule_destroy(flow); + return ERR_PTR(-EINVAL); + } + + switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS)) { + case TCP_V4_FLOW: + case TCP_V6_FLOW: + match->key.basic.ip_proto = IPPROTO_TCP; + break; + case UDP_V4_FLOW: + case UDP_V6_FLOW: + match->key.basic.ip_proto = IPPROTO_UDP; + break; + } + match->mask.basic.ip_proto = 0xff; + + match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_BASIC); + match->dissector.offset[FLOW_DISSECTOR_KEY_BASIC] = + offsetof(struct ethtool_rx_flow_key, basic); + + if (fs->flow_type & FLOW_EXT) { + const struct ethtool_flow_ext *ext_h_spec = &fs->h_ext; + const struct ethtool_flow_ext *ext_m_spec = &fs->m_ext; + + if (ext_m_spec->vlan_etype && + ext_m_spec->vlan_tci) { + match->key.vlan.vlan_tpid = ext_h_spec->vlan_etype; + match->mask.vlan.vlan_tpid = ext_m_spec->vlan_etype; + + match->key.vlan.vlan_id = + ntohs(ext_h_spec->vlan_tci) & 0x0fff; + match->mask.vlan.vlan_id = + ntohs(ext_m_spec->vlan_tci) & 0x0fff; + + match->key.vlan.vlan_priority = + (ntohs(ext_h_spec->vlan_tci) & 0xe000) >> 13; + match->mask.vlan.vlan_priority = + (ntohs(ext_m_spec->vlan_tci) & 0xe000) >> 13; + + match->dissector.used_keys |= + BIT(FLOW_DISSECTOR_KEY_VLAN); + match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] = + offsetof(struct ethtool_rx_flow_key, vlan); + } + } + if (fs->flow_type & FLOW_MAC_EXT) { + const struct ethtool_flow_ext *ext_h_spec = &fs->h_ext; + const struct ethtool_flow_ext *ext_m_spec = &fs->m_ext; + + memcpy(match->key.eth_addrs.dst, ext_h_spec->h_dest, + ETH_ALEN); + memcpy(match->mask.eth_addrs.dst, ext_m_spec->h_dest, + ETH_ALEN); + + match->dissector.used_keys |= + BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS); + match->dissector.offset[FLOW_DISSECTOR_KEY_ETH_ADDRS] = + offsetof(struct ethtool_rx_flow_key, eth_addrs); + } + + act = &flow->rule->action.entries[0]; + switch (fs->ring_cookie) { + case RX_CLS_FLOW_DISC: + act->id = FLOW_ACTION_DROP; + break; + case RX_CLS_FLOW_WAKE: + act->id = FLOW_ACTION_WAKE; + break; + default: + act->id = FLOW_ACTION_QUEUE; + if (fs->flow_type & FLOW_RSS) + act->queue.ctx = input->rss_ctx; + + act->queue.vf = ethtool_get_flow_spec_ring_vf(fs->ring_cookie); + act->queue.index = ethtool_get_flow_spec_ring(fs->ring_cookie); + break; + } + + return flow; +} +EXPORT_SYMBOL(ethtool_rx_flow_rule_create); + +void ethtool_rx_flow_rule_destroy(struct ethtool_rx_flow_rule *flow) +{ + kfree(flow->rule); + kfree(flow); +} +EXPORT_SYMBOL(ethtool_rx_flow_rule_destroy); diff --git a/net/core/filter.c b/net/core/filter.c index 7559d6835ecb..5132c054c981 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -73,6 +73,7 @@ #include <linux/seg6_local.h> #include <net/seg6.h> #include <net/seg6_local.h> +#include <net/lwtunnel.h> /** * sk_filter_trim_cap - run a packet through a socket filter @@ -1793,6 +1794,20 @@ static const struct bpf_func_proto bpf_skb_pull_data_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk) +{ + sk = sk_to_full_sk(sk); + + return sk_fullsock(sk) ? (unsigned long)sk : (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_sk_fullsock_proto = { + .func = bpf_sk_fullsock, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, +}; + static inline int sk_skb_try_make_writable(struct sk_buff *skb, unsigned int write_len) { @@ -2789,8 +2804,7 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) u32 off = skb_mac_header_len(skb); int ret; - /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ - if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + if (!skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_cow(skb, len_diff); @@ -2831,8 +2845,7 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) u32 off = skb_mac_header_len(skb); int ret; - /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ - if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + if (!skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_unclone(skb, GFP_ATOMIC); @@ -2957,8 +2970,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; - /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ - if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + if (!skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_cow(skb, len_diff); @@ -2987,8 +2999,7 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff) u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; - /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ - if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + if (!skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_unclone(skb, GFP_ATOMIC); @@ -4112,10 +4123,12 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, /* Only some socketops are supported */ switch (optname) { case SO_RCVBUF: + val = min_t(u32, val, sysctl_rmem_max); sk->sk_userlocks |= SOCK_RCVBUF_LOCK; sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); break; case SO_SNDBUF: + val = min_t(u32, val, sysctl_wmem_max); sk->sk_userlocks |= SOCK_SNDBUF_LOCK; sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); break; @@ -4801,7 +4814,15 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len } #endif /* CONFIG_IPV6_SEG6_BPF */ -BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, +#if IS_ENABLED(CONFIG_LWTUNNEL_BPF) +static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, + bool ingress) +{ + return bpf_lwt_push_ip_encap(skb, hdr, len, ingress); +} +#endif + +BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, u32, len) { switch (type) { @@ -4810,13 +4831,40 @@ BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, case BPF_LWT_ENCAP_SEG6_INLINE: return bpf_push_seg6_encap(skb, type, hdr, len); #endif +#if IS_ENABLED(CONFIG_LWTUNNEL_BPF) + case BPF_LWT_ENCAP_IP: + return bpf_push_ip_encap(skb, hdr, len, true /* ingress */); +#endif default: return -EINVAL; } } -static const struct bpf_func_proto bpf_lwt_push_encap_proto = { - .func = bpf_lwt_push_encap, +BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type, + void *, hdr, u32, len) +{ + switch (type) { +#if IS_ENABLED(CONFIG_LWTUNNEL_BPF) + case BPF_LWT_ENCAP_IP: + return bpf_push_ip_encap(skb, hdr, len, false /* egress */); +#endif + default: + return -EINVAL; + } +} + +static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = { + .func = bpf_lwt_in_push_encap, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE +}; + +static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = { + .func = bpf_lwt_xmit_push_encap, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, @@ -5016,6 +5064,54 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { }; #endif /* CONFIG_IPV6_SEG6_BPF */ +#define CONVERT_COMMON_TCP_SOCK_FIELDS(md_type, CONVERT) \ +do { \ + switch (si->off) { \ + case offsetof(md_type, snd_cwnd): \ + CONVERT(snd_cwnd); break; \ + case offsetof(md_type, srtt_us): \ + CONVERT(srtt_us); break; \ + case offsetof(md_type, snd_ssthresh): \ + CONVERT(snd_ssthresh); break; \ + case offsetof(md_type, rcv_nxt): \ + CONVERT(rcv_nxt); break; \ + case offsetof(md_type, snd_nxt): \ + CONVERT(snd_nxt); break; \ + case offsetof(md_type, snd_una): \ + CONVERT(snd_una); break; \ + case offsetof(md_type, mss_cache): \ + CONVERT(mss_cache); break; \ + case offsetof(md_type, ecn_flags): \ + CONVERT(ecn_flags); break; \ + case offsetof(md_type, rate_delivered): \ + CONVERT(rate_delivered); break; \ + case offsetof(md_type, rate_interval_us): \ + CONVERT(rate_interval_us); break; \ + case offsetof(md_type, packets_out): \ + CONVERT(packets_out); break; \ + case offsetof(md_type, retrans_out): \ + CONVERT(retrans_out); break; \ + case offsetof(md_type, total_retrans): \ + CONVERT(total_retrans); break; \ + case offsetof(md_type, segs_in): \ + CONVERT(segs_in); break; \ + case offsetof(md_type, data_segs_in): \ + CONVERT(data_segs_in); break; \ + case offsetof(md_type, segs_out): \ + CONVERT(segs_out); break; \ + case offsetof(md_type, data_segs_out): \ + CONVERT(data_segs_out); break; \ + case offsetof(md_type, lost_out): \ + CONVERT(lost_out); break; \ + case offsetof(md_type, sacked_out): \ + CONVERT(sacked_out); break; \ + case offsetof(md_type, bytes_received): \ + CONVERT(bytes_received); break; \ + case offsetof(md_type, bytes_acked): \ + CONVERT(bytes_acked); break; \ + } \ +} while (0) + #ifdef CONFIG_INET static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, int dif, int sdif, u8 family, u8 proto) @@ -5253,6 +5349,79 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { .arg5_type = ARG_ANYTHING, }; +bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, bytes_acked)) + return false; + + if (off % size != 0) + return false; + + switch (off) { + case offsetof(struct bpf_tcp_sock, bytes_received): + case offsetof(struct bpf_tcp_sock, bytes_acked): + return size == sizeof(__u64); + default: + return size == sizeof(__u32); + } +} + +u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + +#define BPF_TCP_SOCK_GET_COMMON(FIELD) \ + do { \ + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD) > \ + FIELD_SIZEOF(struct bpf_tcp_sock, FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\ + si->dst_reg, si->src_reg, \ + offsetof(struct tcp_sock, FIELD)); \ + } while (0) + + CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_tcp_sock, + BPF_TCP_SOCK_GET_COMMON); + + if (insn > insn_buf) + return insn - insn_buf; + + switch (si->off) { + case offsetof(struct bpf_tcp_sock, rtt_min): + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) != + sizeof(struct minmax)); + BUILD_BUG_ON(sizeof(struct minmax) < + sizeof(struct minmax_sample)); + + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct tcp_sock, rtt_min) + + offsetof(struct minmax_sample, v)); + break; + } + + return insn - insn_buf; +} + +BPF_CALL_1(bpf_tcp_sock, struct sock *, sk) +{ + sk = sk_to_full_sk(sk); + + if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_tcp_sock_proto = { + .func = bpf_tcp_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -5282,7 +5451,8 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_lwt_seg6_adjust_srh || func == bpf_lwt_seg6_action || #endif - func == bpf_lwt_push_encap) + func == bpf_lwt_in_push_encap || + func == bpf_lwt_xmit_push_encap) return true; return false; @@ -5314,10 +5484,20 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_tail_call_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + default: + break; + } + + if (!capable(CAP_SYS_ADMIN)) + return NULL; + + switch (func_id) { + case BPF_FUNC_spin_lock: + return &bpf_spin_lock_proto; + case BPF_FUNC_spin_unlock: + return &bpf_spin_unlock_proto; case BPF_FUNC_trace_printk: - if (capable(CAP_SYS_ADMIN)) - return bpf_get_trace_printk_proto(); - /* else, fall through */ + return bpf_get_trace_printk_proto(); default: return NULL; } @@ -5396,6 +5576,12 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; + case BPF_FUNC_sk_fullsock: + return &bpf_sk_fullsock_proto; +#ifdef CONFIG_INET + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; +#endif default: return sk_filter_func_proto(func_id, prog); } @@ -5467,6 +5653,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_uid_proto; case BPF_FUNC_fib_lookup: return &bpf_skb_fib_lookup_proto; + case BPF_FUNC_sk_fullsock: + return &bpf_sk_fullsock_proto; #ifdef CONFIG_XFRM case BPF_FUNC_skb_get_xfrm_state: return &bpf_skb_get_xfrm_state_proto; @@ -5484,6 +5672,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; #endif default: return bpf_base_func_proto(func_id); @@ -5660,7 +5850,7 @@ lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_lwt_push_encap: - return &bpf_lwt_push_encap_proto; + return &bpf_lwt_in_push_encap_proto; default: return lwt_out_func_proto(func_id, prog); } @@ -5696,6 +5886,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_l4_csum_replace_proto; case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; + case BPF_FUNC_lwt_push_encap: + return &bpf_lwt_xmit_push_encap_proto; default: return lwt_out_func_proto(func_id, prog); } @@ -5754,6 +5946,11 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (size != sizeof(__u64)) return false; break; + case offsetof(struct __sk_buff, sk): + if (type == BPF_WRITE || size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; + break; default: /* Only narrow read access allowed for now. */ if (type == BPF_WRITE) { @@ -5925,31 +6122,44 @@ full_access: return true; } -static bool __sock_filter_check_size(int off, int size, +bool bpf_sock_common_is_valid_access(int off, int size, + enum bpf_access_type type, struct bpf_insn_access_aux *info) { - const int size_default = sizeof(__u32); - switch (off) { - case bpf_ctx_range(struct bpf_sock, src_ip4): - case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): - bpf_ctx_record_field_size(info, size_default); - return bpf_ctx_narrow_access_ok(off, size, size_default); + case bpf_ctx_range_till(struct bpf_sock, type, priority): + return false; + default: + return bpf_sock_is_valid_access(off, size, type, info); } - - return size == size_default; } bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { + const int size_default = sizeof(__u32); + if (off < 0 || off >= sizeof(struct bpf_sock)) return false; if (off % size != 0) return false; - if (!__sock_filter_check_size(off, size, info)) - return false; - return true; + + switch (off) { + case offsetof(struct bpf_sock, state): + case offsetof(struct bpf_sock, family): + case offsetof(struct bpf_sock, type): + case offsetof(struct bpf_sock, protocol): + case offsetof(struct bpf_sock, dst_port): + case offsetof(struct bpf_sock, src_port): + case bpf_ctx_range(struct bpf_sock, src_ip4): + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): + case bpf_ctx_range(struct bpf_sock, dst_ip4): + case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + } + + return size == size_default; } static bool sock_filter_is_valid_access(int off, int size, @@ -6708,6 +6918,27 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, target_size)); break; + case offsetof(struct __sk_buff, gso_segs): + /* si->dst_reg = skb_shinfo(SKB); */ +#ifdef NET_SKBUFF_DATA_USES_OFFSET + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, head)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, end)); + *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); +#else + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, end)); +#endif + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs), + si->dst_reg, si->dst_reg, + bpf_target_off(struct skb_shared_info, + gso_segs, 2, + target_size)); + break; case offsetof(struct __sk_buff, wire_len): BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, pkt_len) != 4); @@ -6717,6 +6948,13 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, off += offsetof(struct qdisc_skb_cb, pkt_len); *target_size = 4; *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); + break; + + case offsetof(struct __sk_buff, sk): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + break; } return insn - insn_buf; @@ -6765,24 +7003,32 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock, family): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2); - - *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, - offsetof(struct sock, sk_family)); + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_family), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, + skc_family, + FIELD_SIZEOF(struct sock_common, + skc_family), + target_size)); break; case offsetof(struct bpf_sock, type): + BUILD_BUG_ON(HWEIGHT32(SK_FL_TYPE_MASK) != BITS_PER_BYTE * 2); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, __sk_flags_offset)); *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); + *target_size = 2; break; case offsetof(struct bpf_sock, protocol): + BUILD_BUG_ON(HWEIGHT32(SK_FL_PROTO_MASK) != BITS_PER_BYTE); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, __sk_flags_offset)); *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); + *target_size = 1; break; case offsetof(struct bpf_sock, src_ip4): @@ -6794,6 +7040,15 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, target_size)); break; + case offsetof(struct bpf_sock, dst_ip4): + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_daddr, + FIELD_SIZEOF(struct sock_common, + skc_daddr), + target_size)); + break; + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) off = si->off; @@ -6812,6 +7067,23 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, #endif break; + case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + off = si->off; + off -= offsetof(struct bpf_sock, dst_ip6[0]); + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, + skc_v6_daddr.s6_addr32[0], + FIELD_SIZEOF(struct sock_common, + skc_v6_daddr.s6_addr32[0]), + target_size) + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); + *target_size = 4; +#endif + break; + case offsetof(struct bpf_sock, src_port): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock_common, skc_num), @@ -6821,6 +7093,26 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, skc_num), target_size)); break; + + case offsetof(struct bpf_sock, dst_port): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_dport), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_dport, + FIELD_SIZEOF(struct sock_common, + skc_dport), + target_size)); + break; + + case offsetof(struct bpf_sock, state): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_state), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_state, + FIELD_SIZEOF(struct sock_common, + skc_state), + target_size)); + break; } return insn - insn_buf; @@ -7068,6 +7360,85 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, struct bpf_insn *insn = insn_buf; int off; +/* Helper macro for adding read access to tcp_sock or sock fields. */ +#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ + do { \ + BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ + FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \ + OBJ_FIELD), \ + si->dst_reg, si->dst_reg, \ + offsetof(OBJ, OBJ_FIELD)); \ + } while (0) + +#define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \ + SOCK_OPS_GET_FIELD(FIELD, FIELD, struct tcp_sock) + +/* Helper macro for adding write access to tcp_sock or sock fields. + * The macro is called with two registers, dst_reg which contains a pointer + * to ctx (context) and src_reg which contains the value that should be + * stored. However, we need an additional register since we cannot overwrite + * dst_reg because it may be used later in the program. + * Instead we "borrow" one of the other register. We first save its value + * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore + * it at the end of the macro. + */ +#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ + do { \ + int reg = BPF_REG_9; \ + BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ + FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD), \ + reg, si->src_reg, \ + offsetof(OBJ, OBJ_FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + } while (0) + +#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \ + do { \ + if (TYPE == BPF_WRITE) \ + SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ + else \ + SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ + } while (0) + + CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_sock_ops, + SOCK_OPS_GET_TCP_SOCK_FIELD); + + if (insn > insn_buf) + return insn - insn_buf; + switch (si->off) { case offsetof(struct bpf_sock_ops, op) ... offsetof(struct bpf_sock_ops, replylong[3]): @@ -7225,175 +7596,15 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, FIELD_SIZEOF(struct minmax_sample, t)); break; -/* Helper macro for adding read access to tcp_sock or sock fields. */ -#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ - do { \ - BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ - FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ - struct bpf_sock_ops_kern, \ - is_fullsock), \ - si->dst_reg, si->src_reg, \ - offsetof(struct bpf_sock_ops_kern, \ - is_fullsock)); \ - *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2); \ - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ - struct bpf_sock_ops_kern, sk),\ - si->dst_reg, si->src_reg, \ - offsetof(struct bpf_sock_ops_kern, sk));\ - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \ - OBJ_FIELD), \ - si->dst_reg, si->dst_reg, \ - offsetof(OBJ, OBJ_FIELD)); \ - } while (0) - -/* Helper macro for adding write access to tcp_sock or sock fields. - * The macro is called with two registers, dst_reg which contains a pointer - * to ctx (context) and src_reg which contains the value that should be - * stored. However, we need an additional register since we cannot overwrite - * dst_reg because it may be used later in the program. - * Instead we "borrow" one of the other register. We first save its value - * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore - * it at the end of the macro. - */ -#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ - do { \ - int reg = BPF_REG_9; \ - BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ - FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ - if (si->dst_reg == reg || si->src_reg == reg) \ - reg--; \ - if (si->dst_reg == reg || si->src_reg == reg) \ - reg--; \ - *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \ - offsetof(struct bpf_sock_ops_kern, \ - temp)); \ - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ - struct bpf_sock_ops_kern, \ - is_fullsock), \ - reg, si->dst_reg, \ - offsetof(struct bpf_sock_ops_kern, \ - is_fullsock)); \ - *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ - struct bpf_sock_ops_kern, sk),\ - reg, si->dst_reg, \ - offsetof(struct bpf_sock_ops_kern, sk));\ - *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD), \ - reg, si->src_reg, \ - offsetof(OBJ, OBJ_FIELD)); \ - *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ - offsetof(struct bpf_sock_ops_kern, \ - temp)); \ - } while (0) - -#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \ - do { \ - if (TYPE == BPF_WRITE) \ - SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ - else \ - SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ - } while (0) - - case offsetof(struct bpf_sock_ops, snd_cwnd): - SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, srtt_us): - SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock); - break; - case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags): SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, struct tcp_sock); break; - case offsetof(struct bpf_sock_ops, snd_ssthresh): - SOCK_OPS_GET_FIELD(snd_ssthresh, snd_ssthresh, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, rcv_nxt): - SOCK_OPS_GET_FIELD(rcv_nxt, rcv_nxt, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, snd_nxt): - SOCK_OPS_GET_FIELD(snd_nxt, snd_nxt, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, snd_una): - SOCK_OPS_GET_FIELD(snd_una, snd_una, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, mss_cache): - SOCK_OPS_GET_FIELD(mss_cache, mss_cache, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, ecn_flags): - SOCK_OPS_GET_FIELD(ecn_flags, ecn_flags, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, rate_delivered): - SOCK_OPS_GET_FIELD(rate_delivered, rate_delivered, - struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, rate_interval_us): - SOCK_OPS_GET_FIELD(rate_interval_us, rate_interval_us, - struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, packets_out): - SOCK_OPS_GET_FIELD(packets_out, packets_out, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, retrans_out): - SOCK_OPS_GET_FIELD(retrans_out, retrans_out, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, total_retrans): - SOCK_OPS_GET_FIELD(total_retrans, total_retrans, - struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, segs_in): - SOCK_OPS_GET_FIELD(segs_in, segs_in, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, data_segs_in): - SOCK_OPS_GET_FIELD(data_segs_in, data_segs_in, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, segs_out): - SOCK_OPS_GET_FIELD(segs_out, segs_out, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, data_segs_out): - SOCK_OPS_GET_FIELD(data_segs_out, data_segs_out, - struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, lost_out): - SOCK_OPS_GET_FIELD(lost_out, lost_out, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, sacked_out): - SOCK_OPS_GET_FIELD(sacked_out, sacked_out, struct tcp_sock); - break; - case offsetof(struct bpf_sock_ops, sk_txhash): SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, struct sock, type); break; - - case offsetof(struct bpf_sock_ops, bytes_received): - SOCK_OPS_GET_FIELD(bytes_received, bytes_received, - struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, bytes_acked): - SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock); - break; - } return insn - insn_buf; } @@ -7698,6 +7909,7 @@ const struct bpf_verifier_ops flow_dissector_verifier_ops = { }; const struct bpf_prog_ops flow_dissector_prog_ops = { + .test_run = bpf_prog_test_run_flow_dissector, }; int sk_detach_filter(struct sock *sk) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 9f2840510e63..bb1a54747d64 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -683,6 +683,46 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, } } +bool __skb_flow_bpf_dissect(struct bpf_prog *prog, + const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + struct bpf_flow_keys *flow_keys) +{ + struct bpf_skb_data_end cb_saved; + struct bpf_skb_data_end *cb; + u32 result; + + /* Note that even though the const qualifier is discarded + * throughout the execution of the BPF program, all changes(the + * control block) are reverted after the BPF program returns. + * Therefore, __skb_flow_dissect does not alter the skb. + */ + + cb = (struct bpf_skb_data_end *)skb->cb; + + /* Save Control Block */ + memcpy(&cb_saved, cb, sizeof(cb_saved)); + memset(cb, 0, sizeof(*cb)); + + /* Pass parameters to the BPF program */ + memset(flow_keys, 0, sizeof(*flow_keys)); + cb->qdisc_cb.flow_keys = flow_keys; + flow_keys->nhoff = skb_network_offset(skb); + flow_keys->thoff = flow_keys->nhoff; + + bpf_compute_data_pointers((struct sk_buff *)skb); + result = BPF_PROG_RUN(prog, skb); + + /* Restore state */ + memcpy(cb, &cb_saved, sizeof(cb_saved)); + + flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, 0, skb->len); + flow_keys->thoff = clamp_t(u16, flow_keys->thoff, + flow_keys->nhoff, skb->len); + + return result == BPF_OK; +} + /** * __skb_flow_dissect - extract the flow_keys struct and return it * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified @@ -714,7 +754,6 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_vlan *key_vlan; enum flow_dissect_ret fdret; enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; - struct bpf_prog *attached = NULL; int num_hdrs = 0; u8 ip_proto = 0; bool ret; @@ -754,53 +793,30 @@ bool __skb_flow_dissect(const struct sk_buff *skb, FLOW_DISSECTOR_KEY_BASIC, target_container); - rcu_read_lock(); if (skb) { + struct bpf_flow_keys flow_keys; + struct bpf_prog *attached = NULL; + + rcu_read_lock(); + if (skb->dev) attached = rcu_dereference(dev_net(skb->dev)->flow_dissector_prog); else if (skb->sk) attached = rcu_dereference(sock_net(skb->sk)->flow_dissector_prog); else WARN_ON_ONCE(1); - } - if (attached) { - /* Note that even though the const qualifier is discarded - * throughout the execution of the BPF program, all changes(the - * control block) are reverted after the BPF program returns. - * Therefore, __skb_flow_dissect does not alter the skb. - */ - struct bpf_flow_keys flow_keys = {}; - struct bpf_skb_data_end cb_saved; - struct bpf_skb_data_end *cb; - u32 result; - - cb = (struct bpf_skb_data_end *)skb->cb; - - /* Save Control Block */ - memcpy(&cb_saved, cb, sizeof(cb_saved)); - memset(cb, 0, sizeof(cb_saved)); - /* Pass parameters to the BPF program */ - cb->qdisc_cb.flow_keys = &flow_keys; - flow_keys.nhoff = nhoff; - flow_keys.thoff = nhoff; - - bpf_compute_data_pointers((struct sk_buff *)skb); - result = BPF_PROG_RUN(attached, skb); - - /* Restore state */ - memcpy(cb, &cb_saved, sizeof(cb_saved)); - - flow_keys.nhoff = clamp_t(u16, flow_keys.nhoff, 0, skb->len); - flow_keys.thoff = clamp_t(u16, flow_keys.thoff, - flow_keys.nhoff, skb->len); - - __skb_flow_bpf_to_target(&flow_keys, flow_dissector, - target_container); + if (attached) { + ret = __skb_flow_bpf_dissect(attached, skb, + flow_dissector, + &flow_keys); + __skb_flow_bpf_to_target(&flow_keys, flow_dissector, + target_container); + rcu_read_unlock(); + return ret; + } rcu_read_unlock(); - return result == BPF_OK; } - rcu_read_unlock(); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c new file mode 100644 index 000000000000..c3a00eac4804 --- /dev/null +++ b/net/core/flow_offload.c @@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/kernel.h> +#include <linux/slab.h> +#include <net/flow_offload.h> + +struct flow_rule *flow_rule_alloc(unsigned int num_actions) +{ + struct flow_rule *rule; + + rule = kzalloc(sizeof(struct flow_rule) + + sizeof(struct flow_action_entry) * num_actions, + GFP_KERNEL); + if (!rule) + return NULL; + + rule->action.num_entries = num_actions; + + return rule; +} +EXPORT_SYMBOL(flow_rule_alloc); + +#define FLOW_DISSECTOR_MATCH(__rule, __type, __out) \ + const struct flow_match *__m = &(__rule)->match; \ + struct flow_dissector *__d = (__m)->dissector; \ + \ + (__out)->key = skb_flow_dissector_target(__d, __type, (__m)->key); \ + (__out)->mask = skb_flow_dissector_target(__d, __type, (__m)->mask); \ + +void flow_rule_match_basic(const struct flow_rule *rule, + struct flow_match_basic *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_BASIC, out); +} +EXPORT_SYMBOL(flow_rule_match_basic); + +void flow_rule_match_control(const struct flow_rule *rule, + struct flow_match_control *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CONTROL, out); +} +EXPORT_SYMBOL(flow_rule_match_control); + +void flow_rule_match_eth_addrs(const struct flow_rule *rule, + struct flow_match_eth_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ETH_ADDRS, out); +} +EXPORT_SYMBOL(flow_rule_match_eth_addrs); + +void flow_rule_match_vlan(const struct flow_rule *rule, + struct flow_match_vlan *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_VLAN, out); +} +EXPORT_SYMBOL(flow_rule_match_vlan); + +void flow_rule_match_ipv4_addrs(const struct flow_rule *rule, + struct flow_match_ipv4_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPV4_ADDRS, out); +} +EXPORT_SYMBOL(flow_rule_match_ipv4_addrs); + +void flow_rule_match_ipv6_addrs(const struct flow_rule *rule, + struct flow_match_ipv6_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPV6_ADDRS, out); +} +EXPORT_SYMBOL(flow_rule_match_ipv6_addrs); + +void flow_rule_match_ip(const struct flow_rule *rule, + struct flow_match_ip *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IP, out); +} +EXPORT_SYMBOL(flow_rule_match_ip); + +void flow_rule_match_ports(const struct flow_rule *rule, + struct flow_match_ports *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_PORTS, out); +} +EXPORT_SYMBOL(flow_rule_match_ports); + +void flow_rule_match_tcp(const struct flow_rule *rule, + struct flow_match_tcp *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_TCP, out); +} +EXPORT_SYMBOL(flow_rule_match_tcp); + +void flow_rule_match_icmp(const struct flow_rule *rule, + struct flow_match_icmp *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ICMP, out); +} +EXPORT_SYMBOL(flow_rule_match_icmp); + +void flow_rule_match_mpls(const struct flow_rule *rule, + struct flow_match_mpls *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_MPLS, out); +} +EXPORT_SYMBOL(flow_rule_match_mpls); + +void flow_rule_match_enc_control(const struct flow_rule *rule, + struct flow_match_control *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_CONTROL, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_control); + +void flow_rule_match_enc_ipv4_addrs(const struct flow_rule *rule, + struct flow_match_ipv4_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_ipv4_addrs); + +void flow_rule_match_enc_ipv6_addrs(const struct flow_rule *rule, + struct flow_match_ipv6_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_ipv6_addrs); + +void flow_rule_match_enc_ip(const struct flow_rule *rule, + struct flow_match_ip *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IP, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_ip); + +void flow_rule_match_enc_ports(const struct flow_rule *rule, + struct flow_match_ports *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_PORTS, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_ports); + +void flow_rule_match_enc_keyid(const struct flow_rule *rule, + struct flow_match_enc_keyid *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_KEYID, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_keyid); + +void flow_rule_match_enc_opts(const struct flow_rule *rule, + struct flow_match_enc_opts *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_OPTS, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_opts); diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index a648568c5e8f..cf2f8897ca19 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -16,6 +16,8 @@ #include <linux/types.h> #include <linux/bpf.h> #include <net/lwtunnel.h> +#include <net/gre.h> +#include <net/ip6_route.h> struct bpf_lwt_prog { struct bpf_prog *prog; @@ -55,6 +57,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, switch (ret) { case BPF_OK: + case BPF_LWT_REROUTE: break; case BPF_REDIRECT: @@ -87,6 +90,30 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, return ret; } +static int bpf_lwt_input_reroute(struct sk_buff *skb) +{ + int err = -EINVAL; + + if (skb->protocol == htons(ETH_P_IP)) { + struct iphdr *iph = ip_hdr(skb); + + err = ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, skb_dst(skb)->dev); + } else if (skb->protocol == htons(ETH_P_IPV6)) { + err = ipv6_stub->ipv6_route_input(skb); + } else { + err = -EAFNOSUPPORT; + } + + if (err) + goto err; + return dst_input(skb); + +err: + kfree_skb(skb); + return err; +} + static int bpf_input(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); @@ -98,11 +125,11 @@ static int bpf_input(struct sk_buff *skb) ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT); if (ret < 0) return ret; + if (ret == BPF_LWT_REROUTE) + return bpf_lwt_input_reroute(skb); } if (unlikely(!dst->lwtstate->orig_input)) { - pr_warn_once("orig_input not set on dst for prog %s\n", - bpf->out.name); kfree_skb(skb); return -EINVAL; } @@ -147,6 +174,102 @@ static int xmit_check_hhlen(struct sk_buff *skb) return 0; } +static int bpf_lwt_xmit_reroute(struct sk_buff *skb) +{ + struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev); + int oif = l3mdev ? l3mdev->ifindex : 0; + struct dst_entry *dst = NULL; + int err = -EAFNOSUPPORT; + struct sock *sk; + struct net *net; + bool ipv4; + + if (skb->protocol == htons(ETH_P_IP)) + ipv4 = true; + else if (skb->protocol == htons(ETH_P_IPV6)) + ipv4 = false; + else + goto err; + + sk = sk_to_full_sk(skb->sk); + if (sk) { + if (sk->sk_bound_dev_if) + oif = sk->sk_bound_dev_if; + net = sock_net(sk); + } else { + net = dev_net(skb_dst(skb)->dev); + } + + if (ipv4) { + struct iphdr *iph = ip_hdr(skb); + struct flowi4 fl4 = {}; + struct rtable *rt; + + fl4.flowi4_oif = oif; + fl4.flowi4_mark = skb->mark; + fl4.flowi4_uid = sock_net_uid(net, sk); + fl4.flowi4_tos = RT_TOS(iph->tos); + fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; + fl4.flowi4_proto = iph->protocol; + fl4.daddr = iph->daddr; + fl4.saddr = iph->saddr; + + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto err; + } + dst = &rt->dst; + } else { + struct ipv6hdr *iph6 = ipv6_hdr(skb); + struct flowi6 fl6 = {}; + + fl6.flowi6_oif = oif; + fl6.flowi6_mark = skb->mark; + fl6.flowi6_uid = sock_net_uid(net, sk); + fl6.flowlabel = ip6_flowinfo(iph6); + fl6.flowi6_proto = iph6->nexthdr; + fl6.daddr = iph6->daddr; + fl6.saddr = iph6->saddr; + + err = ipv6_stub->ipv6_dst_lookup(net, skb->sk, &dst, &fl6); + if (unlikely(err)) + goto err; + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + goto err; + } + } + if (unlikely(dst->error)) { + err = dst->error; + dst_release(dst); + goto err; + } + + /* Although skb header was reserved in bpf_lwt_push_ip_encap(), it + * was done for the previous dst, so we are doing it here again, in + * case the new dst needs much more space. The call below is a noop + * if there is enough header space in skb. + */ + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) + goto err; + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + + err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb); + if (unlikely(err)) + return err; + + /* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */ + return LWTUNNEL_XMIT_DONE; + +err: + kfree_skb(skb); + return err; +} + static int bpf_xmit(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); @@ -154,11 +277,20 @@ static int bpf_xmit(struct sk_buff *skb) bpf = bpf_lwt_lwtunnel(dst->lwtstate); if (bpf->xmit.prog) { + __be16 proto = skb->protocol; int ret; ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT); switch (ret) { case BPF_OK: + /* If the header changed, e.g. via bpf_lwt_push_encap, + * BPF_LWT_REROUTE below should have been used if the + * protocol was also changed. + */ + if (skb->protocol != proto) { + kfree_skb(skb); + return -EINVAL; + } /* If the header was expanded, headroom might be too * small for L2 header to come, expand as needed. */ @@ -169,6 +301,8 @@ static int bpf_xmit(struct sk_buff *skb) return LWTUNNEL_XMIT_CONTINUE; case BPF_REDIRECT: return LWTUNNEL_XMIT_DONE; + case BPF_LWT_REROUTE: + return bpf_lwt_xmit_reroute(skb); default: return ret; } @@ -390,6 +524,133 @@ static const struct lwtunnel_encap_ops bpf_encap_ops = { .owner = THIS_MODULE, }; +static int handle_gso_type(struct sk_buff *skb, unsigned int gso_type, + int encap_len) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + + gso_type |= SKB_GSO_DODGY; + shinfo->gso_type |= gso_type; + skb_decrease_gso_size(shinfo, encap_len); + shinfo->gso_segs = 0; + return 0; +} + +static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len) +{ + int next_hdr_offset; + void *next_hdr; + __u8 protocol; + + /* SCTP and UDP_L4 gso need more nuanced handling than what + * handle_gso_type() does above: skb_decrease_gso_size() is not enough. + * So at the moment only TCP GSO packets are let through. + */ + if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) + return -ENOTSUPP; + + if (ipv4) { + protocol = ip_hdr(skb)->protocol; + next_hdr_offset = sizeof(struct iphdr); + next_hdr = skb_network_header(skb) + next_hdr_offset; + } else { + protocol = ipv6_hdr(skb)->nexthdr; + next_hdr_offset = sizeof(struct ipv6hdr); + next_hdr = skb_network_header(skb) + next_hdr_offset; + } + + switch (protocol) { + case IPPROTO_GRE: + next_hdr_offset += sizeof(struct gre_base_hdr); + if (next_hdr_offset > encap_len) + return -EINVAL; + + if (((struct gre_base_hdr *)next_hdr)->flags & GRE_CSUM) + return handle_gso_type(skb, SKB_GSO_GRE_CSUM, + encap_len); + return handle_gso_type(skb, SKB_GSO_GRE, encap_len); + + case IPPROTO_UDP: + next_hdr_offset += sizeof(struct udphdr); + if (next_hdr_offset > encap_len) + return -EINVAL; + + if (((struct udphdr *)next_hdr)->check) + return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL_CSUM, + encap_len); + return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL, encap_len); + + case IPPROTO_IP: + case IPPROTO_IPV6: + if (ipv4) + return handle_gso_type(skb, SKB_GSO_IPXIP4, encap_len); + else + return handle_gso_type(skb, SKB_GSO_IPXIP6, encap_len); + + default: + return -EPROTONOSUPPORT; + } +} + +int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress) +{ + struct iphdr *iph; + bool ipv4; + int err; + + if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM)) + return -EINVAL; + + /* validate protocol and length */ + iph = (struct iphdr *)hdr; + if (iph->version == 4) { + ipv4 = true; + if (unlikely(len < iph->ihl * 4)) + return -EINVAL; + } else if (iph->version == 6) { + ipv4 = false; + if (unlikely(len < sizeof(struct ipv6hdr))) + return -EINVAL; + } else { + return -EINVAL; + } + + if (ingress) + err = skb_cow_head(skb, len + skb->mac_len); + else + err = skb_cow_head(skb, + len + LL_RESERVED_SPACE(skb_dst(skb)->dev)); + if (unlikely(err)) + return err; + + /* push the encap headers and fix pointers */ + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + skb_push(skb, len); + if (ingress) + skb_postpush_rcsum(skb, iph, len); + skb_reset_network_header(skb); + memcpy(skb_network_header(skb), hdr, len); + bpf_compute_data_pointers(skb); + skb_clear_hash(skb); + + if (ipv4) { + skb->protocol = htons(ETH_P_IP); + iph = ip_hdr(skb); + + if (!iph->check) + iph->check = ip_fast_csum((unsigned char *)iph, + iph->ihl); + } else { + skb->protocol = htons(ETH_P_IPV6); + } + + if (skb_is_gso(skb)) + return handle_gso_encap(skb, ipv4, len); + + return 0; +} + static int __init bpf_lwt_init(void) { return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF); diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 0b171756453c..19b557bd294b 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -122,18 +122,18 @@ int lwtunnel_build_state(u16 encap_type, ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); - if (likely(ops && ops->build_state && try_module_get(ops->owner))) { + if (likely(ops && ops->build_state && try_module_get(ops->owner))) found = true; + rcu_read_unlock(); + + if (found) { ret = ops->build_state(encap, family, cfg, lws, extack); if (ret) module_put(ops->owner); - } - rcu_read_unlock(); - - /* don't rely on -EOPNOTSUPP to detect match as build_state - * handlers could return it - */ - if (!found) { + } else { + /* don't rely on -EOPNOTSUPP to detect match as build_state + * handlers could return it + */ NL_SET_ERR_MSG_ATTR(extack, encap, "LWT encapsulation type not supported"); } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 4230400b9a30..30f6fd8f68e0 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -42,6 +42,8 @@ #include <linux/inetdevice.h> #include <net/addrconf.h> +#include <trace/events/neigh.h> + #define DEBUG #define NEIGH_DEBUG 1 #define neigh_dbg(level, fmt, ...) \ @@ -102,6 +104,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh) if (neigh->parms->neigh_cleanup) neigh->parms->neigh_cleanup(neigh); + trace_neigh_cleanup_and_release(neigh, 0); __neigh_notify(neigh, RTM_DELNEIGH, 0, 0); call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); neigh_release(neigh); @@ -1095,6 +1098,8 @@ out: if (notify) neigh_update_notify(neigh, 0); + trace_neigh_timer_handler(neigh, 0); + neigh_release(neigh); } @@ -1165,6 +1170,7 @@ out_unlock_bh: else write_unlock(&neigh->lock); local_bh_enable(); + trace_neigh_event_send_done(neigh, rc); return rc; out_dead: @@ -1172,6 +1178,7 @@ out_dead: goto out_unlock_bh; write_unlock_bh(&neigh->lock); kfree_skb(skb); + trace_neigh_event_send_dead(neigh, 1); return 1; } EXPORT_SYMBOL(__neigh_event_send); @@ -1227,6 +1234,8 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, struct net_device *dev; int update_isrouter = 0; + trace_neigh_update(neigh, lladdr, new, flags, nlmsg_pid); + write_lock_bh(&neigh->lock); dev = neigh->dev; @@ -1393,6 +1402,8 @@ out: if (notify) neigh_update_notify(neigh, nlmsg_pid); + trace_neigh_update_done(neigh, err); + return err; } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index ff9fd2bb4ce4..7c5061123ead 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -12,7 +12,6 @@ #include <linux/capability.h> #include <linux/kernel.h> #include <linux/netdevice.h> -#include <net/switchdev.h> #include <linux/if_arp.h> #include <linux/slab.h> #include <linux/sched/signal.h> @@ -501,16 +500,11 @@ static ssize_t phys_switch_id_show(struct device *dev, return restart_syscall(); if (dev_isalive(netdev)) { - struct switchdev_attr attr = { - .orig_dev = netdev, - .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, - .flags = SWITCHDEV_F_NO_RECURSE, - }; + struct netdev_phys_item_id ppid = { }; - ret = switchdev_port_attr_get(netdev, &attr); + ret = dev_get_port_parent_id(netdev, &ppid, false); if (!ret) - ret = sprintf(buf, "%*phN\n", attr.u.ppid.id_len, - attr.u.ppid.id); + ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id); } rtnl_unlock(); diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 419af6dfe29f..470b179d599e 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -43,6 +43,14 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete); EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update); #endif +#include <trace/events/neigh.h> +EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update); +EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update_done); +EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_timer_handler); +EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_event_send_done); +EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_event_send_dead); +EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_cleanup_and_release); + EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 43a932cb609b..5b2252c6d49b 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -136,17 +136,19 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, if (!(pool->p.flags & PP_FLAG_DMA_MAP)) goto skip_dma_map; - /* Setup DMA mapping: use page->private for DMA-addr + /* Setup DMA mapping: use 'struct page' area for storing DMA-addr + * since dma_addr_t can be either 32 or 64 bits and does not always fit + * into page private data (i.e 32bit cpu with 64bit DMA caps) * This mapping is kept for lifetime of page, until leaving pool. */ - dma = dma_map_page(pool->p.dev, page, 0, - (PAGE_SIZE << pool->p.order), - pool->p.dma_dir); + dma = dma_map_page_attrs(pool->p.dev, page, 0, + (PAGE_SIZE << pool->p.order), + pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC); if (dma_mapping_error(pool->p.dev, dma)) { put_page(page); return NULL; } - set_page_private(page, dma); /* page->private = dma; */ + page->dma_addr = dma; skip_dma_map: /* When page just alloc'ed is should/must have refcnt 1. */ @@ -175,13 +177,17 @@ EXPORT_SYMBOL(page_pool_alloc_pages); static void __page_pool_clean_page(struct page_pool *pool, struct page *page) { + dma_addr_t dma; + if (!(pool->p.flags & PP_FLAG_DMA_MAP)) return; + dma = page->dma_addr; /* DMA unmap */ - dma_unmap_page(pool->p.dev, page_private(page), - PAGE_SIZE << pool->p.order, pool->p.dma_dir); - set_page_private(page, 0); + dma_unmap_page_attrs(pool->p.dev, dma, + PAGE_SIZE << pool->p.order, pool->p.dma_dir, + DMA_ATTR_SKIP_CPU_SYNC); + page->dma_addr = 0; } /* Return a page to the page allocator, cleaning up our state */ diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f5a98082ac7a..a51cab95ba64 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -46,7 +46,6 @@ #include <linux/inet.h> #include <linux/netdevice.h> -#include <net/switchdev.h> #include <net/ip.h> #include <net/protocol.h> #include <net/arp.h> @@ -1146,22 +1145,17 @@ static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev) static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) { + struct netdev_phys_item_id ppid = { }; int err; - struct switchdev_attr attr = { - .orig_dev = dev, - .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, - .flags = SWITCHDEV_F_NO_RECURSE, - }; - err = switchdev_port_attr_get(dev, &attr); + err = dev_get_port_parent_id(dev, &ppid, false); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } - if (nla_put(skb, IFLA_PHYS_SWITCH_ID, attr.u.ppid.id_len, - attr.u.ppid.id)) + if (nla_put(skb, IFLA_PHYS_SWITCH_ID, ppid.id_len, ppid.id)) return -EMSGSIZE; return 0; diff --git a/net/core/scm.c b/net/core/scm.c index b1ff8a441748..52ef219cf6df 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -29,6 +29,7 @@ #include <linux/pid.h> #include <linux/nsproxy.h> #include <linux/slab.h> +#include <linux/errqueue.h> #include <linux/uaccess.h> @@ -252,6 +253,32 @@ out: } EXPORT_SYMBOL(put_cmsg); +void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss_internal) +{ + struct scm_timestamping64 tss; + int i; + + for (i = 0; i < ARRAY_SIZE(tss.ts); i++) { + tss.ts[i].tv_sec = tss_internal->ts[i].tv_sec; + tss.ts[i].tv_nsec = tss_internal->ts[i].tv_nsec; + } + + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_NEW, sizeof(tss), &tss); +} +EXPORT_SYMBOL(put_cmsg_scm_timestamping64); + +void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_internal *tss_internal) +{ + struct scm_timestamping tss; + int i; + + for (i = 0; i < ARRAY_SIZE(tss.ts); i++) + tss.ts[i] = timespec64_to_timespec(tss_internal->ts[i]); + + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_OLD, sizeof(tss), &tss); +} +EXPORT_SYMBOL(put_cmsg_scm_timestamping); + void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) { struct cmsghdr __user *cm diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 26d848484912..2415d9cb9b89 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -356,6 +356,8 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) */ void *netdev_alloc_frag(unsigned int fragsz) { + fragsz = SKB_DATA_ALIGN(fragsz); + return __netdev_alloc_frag(fragsz, GFP_ATOMIC); } EXPORT_SYMBOL(netdev_alloc_frag); @@ -369,6 +371,8 @@ static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) void *napi_alloc_frag(unsigned int fragsz) { + fragsz = SKB_DATA_ALIGN(fragsz); + return __napi_alloc_frag(fragsz, GFP_ATOMIC); } EXPORT_SYMBOL(napi_alloc_frag); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index e76ed8df9f13..ae6f06e45737 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -554,8 +554,7 @@ static void sk_psock_destroy_deferred(struct work_struct *gc) struct sk_psock *psock = container_of(gc, struct sk_psock, gc); /* No sk_callback_lock since already detached. */ - if (psock->parser.enabled) - strp_done(&psock->parser.strp); + strp_done(&psock->parser.strp); cancel_work_sync(&psock->work); diff --git a/net/core/sock.c b/net/core/sock.c index 900e8a9435f5..f5d82f3fa474 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -335,14 +335,68 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(__sk_backlog_rcv); -static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) +static int sock_get_timeout(long timeo, void *optval, bool old_timeval) { - struct timeval tv; + struct __kernel_sock_timeval tv; + int size; - if (optlen < sizeof(tv)) - return -EINVAL; - if (copy_from_user(&tv, optval, sizeof(tv))) - return -EFAULT; + if (timeo == MAX_SCHEDULE_TIMEOUT) { + tv.tv_sec = 0; + tv.tv_usec = 0; + } else { + tv.tv_sec = timeo / HZ; + tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; + } + + if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { + struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec }; + *(struct old_timeval32 *)optval = tv32; + return sizeof(tv32); + } + + if (old_timeval) { + struct __kernel_old_timeval old_tv; + old_tv.tv_sec = tv.tv_sec; + old_tv.tv_usec = tv.tv_usec; + *(struct __kernel_old_timeval *)optval = old_tv; + size = sizeof(old_tv); + } else { + *(struct __kernel_sock_timeval *)optval = tv; + size = sizeof(tv); + } + + return size; +} + +static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval) +{ + struct __kernel_sock_timeval tv; + + if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { + struct old_timeval32 tv32; + + if (optlen < sizeof(tv32)) + return -EINVAL; + + if (copy_from_user(&tv32, optval, sizeof(tv32))) + return -EFAULT; + tv.tv_sec = tv32.tv_sec; + tv.tv_usec = tv32.tv_usec; + } else if (old_timeval) { + struct __kernel_old_timeval old_tv; + + if (optlen < sizeof(old_tv)) + return -EINVAL; + if (copy_from_user(&old_tv, optval, sizeof(old_tv))) + return -EFAULT; + tv.tv_sec = old_tv.tv_sec; + tv.tv_usec = old_tv.tv_usec; + } else { + if (optlen < sizeof(tv)) + return -EINVAL; + if (copy_from_user(&tv, optval, sizeof(tv))) + return -EFAULT; + } if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) return -EDOM; @@ -360,8 +414,8 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) *timeo_p = MAX_SCHEDULE_TIMEOUT; if (tv.tv_sec == 0 && tv.tv_usec == 0) return 0; - if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1)) - *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ); + if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) + *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ); return 0; } @@ -731,6 +785,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname, */ val = min_t(u32, val, sysctl_wmem_max); set_sndbuf: + /* Ensure val * 2 fits into an int, to prevent max_t() + * from treating it as a negative value. + */ + val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_SNDBUF_LOCK; sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); /* Wake up sending tasks if we upped the value. */ @@ -742,6 +800,12 @@ set_sndbuf: ret = -EPERM; break; } + + /* No negative values (to prevent underflow, as val will be + * multiplied by 2). + */ + if (val < 0) + val = 0; goto set_sndbuf; case SO_RCVBUF: @@ -752,6 +816,10 @@ set_sndbuf: */ val = min_t(u32, val, sysctl_rmem_max); set_rcvbuf: + /* Ensure val * 2 fits into an int, to prevent max_t() + * from treating it as a negative value. + */ + val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_RCVBUF_LOCK; /* * We double it on the way in to account for @@ -776,6 +844,12 @@ set_rcvbuf: ret = -EPERM; break; } + + /* No negative values (to prevent underflow, as val will be + * multiplied by 2). + */ + if (val < 0) + val = 0; goto set_rcvbuf; case SO_KEEPALIVE: @@ -833,10 +907,17 @@ set_rcvbuf: clear_bit(SOCK_PASSCRED, &sock->flags); break; - case SO_TIMESTAMP: - case SO_TIMESTAMPNS: + case SO_TIMESTAMP_OLD: + case SO_TIMESTAMP_NEW: + case SO_TIMESTAMPNS_OLD: + case SO_TIMESTAMPNS_NEW: if (valbool) { - if (optname == SO_TIMESTAMP) + if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW) + sock_set_flag(sk, SOCK_TSTAMP_NEW); + else + sock_reset_flag(sk, SOCK_TSTAMP_NEW); + + if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW) sock_reset_flag(sk, SOCK_RCVTSTAMPNS); else sock_set_flag(sk, SOCK_RCVTSTAMPNS); @@ -845,10 +926,14 @@ set_rcvbuf: } else { sock_reset_flag(sk, SOCK_RCVTSTAMP); sock_reset_flag(sk, SOCK_RCVTSTAMPNS); + sock_reset_flag(sk, SOCK_TSTAMP_NEW); } break; - case SO_TIMESTAMPING: + case SO_TIMESTAMPING_NEW: + sock_set_flag(sk, SOCK_TSTAMP_NEW); + /* fall through */ + case SO_TIMESTAMPING_OLD: if (val & ~SOF_TIMESTAMPING_MASK) { ret = -EINVAL; break; @@ -879,9 +964,13 @@ set_rcvbuf: if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); - else + else { + if (optname == SO_TIMESTAMPING_NEW) + sock_reset_flag(sk, SOCK_TSTAMP_NEW); + sock_disable_timestamp(sk, (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); + } break; case SO_RCVLOWAT: @@ -893,12 +982,14 @@ set_rcvbuf: sk->sk_rcvlowat = val ? : 1; break; - case SO_RCVTIMEO: - ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen); + case SO_RCVTIMEO_OLD: + case SO_RCVTIMEO_NEW: + ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD); break; - case SO_SNDTIMEO: - ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen); + case SO_SNDTIMEO_OLD: + case SO_SNDTIMEO_NEW: + ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD); break; case SO_ATTACH_FILTER: @@ -1121,7 +1212,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname, int val; u64 val64; struct linger ling; - struct timeval tm; + struct old_timeval32 tm32; + struct __kernel_old_timeval tm; + struct __kernel_sock_timeval stm; struct sock_txtime txtime; } v; @@ -1208,39 +1301,36 @@ int sock_getsockopt(struct socket *sock, int level, int optname, sock_warn_obsolete_bsdism("getsockopt"); break; - case SO_TIMESTAMP: + case SO_TIMESTAMP_OLD: v.val = sock_flag(sk, SOCK_RCVTSTAMP) && + !sock_flag(sk, SOCK_TSTAMP_NEW) && !sock_flag(sk, SOCK_RCVTSTAMPNS); break; - case SO_TIMESTAMPNS: - v.val = sock_flag(sk, SOCK_RCVTSTAMPNS); + case SO_TIMESTAMPNS_OLD: + v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); + break; + + case SO_TIMESTAMP_NEW: + v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); break; - case SO_TIMESTAMPING: + case SO_TIMESTAMPNS_NEW: + v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); + break; + + case SO_TIMESTAMPING_OLD: v.val = sk->sk_tsflags; break; - case SO_RCVTIMEO: - lv = sizeof(struct timeval); - if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) { - v.tm.tv_sec = 0; - v.tm.tv_usec = 0; - } else { - v.tm.tv_sec = sk->sk_rcvtimeo / HZ; - v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ; - } + case SO_RCVTIMEO_OLD: + case SO_RCVTIMEO_NEW: + lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname); break; - case SO_SNDTIMEO: - lv = sizeof(struct timeval); - if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) { - v.tm.tv_sec = 0; - v.tm.tv_usec = 0; - } else { - v.tm.tv_sec = sk->sk_sndtimeo / HZ; - v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ; - } + case SO_SNDTIMEO_OLD: + case SO_SNDTIMEO_NEW: + lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname); break; case SO_RCVLOWAT: @@ -1775,7 +1865,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) */ sk_refcnt_debug_inc(newsk); sk_set_socket(newsk, NULL); - newsk->sk_wq = NULL; + RCU_INIT_POINTER(newsk->sk_wq, NULL); if (newsk->sk_prot->sockets_allocated) sk_sockets_allocated_inc(newsk); @@ -2147,7 +2237,7 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, return -EINVAL; sockc->mark = *(u32 *)CMSG_DATA(cmsg); break; - case SO_TIMESTAMPING: + case SO_TIMESTAMPING_OLD: if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; @@ -2405,7 +2495,7 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) } if (sk_has_memory_pressure(sk)) { - int alloc; + u64 alloc; if (!sk_under_memory_pressure(sk)) return 1; @@ -2738,11 +2828,11 @@ void sock_init_data(struct socket *sock, struct sock *sk) if (sock) { sk->sk_type = sock->type; - sk->sk_wq = sock->wq; + RCU_INIT_POINTER(sk->sk_wq, sock->wq); sock->sk = sk; sk->sk_uid = SOCK_INODE(sock)->i_uid; } else { - sk->sk_wq = NULL; + RCU_INIT_POINTER(sk->sk_wq, NULL); sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0); } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index d67ec17f2cc8..84bf2861f45f 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -36,6 +36,15 @@ static int net_msg_warn; /* Unused, but still a sysctl */ int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0; EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); +/* 0 - Keep current behavior: + * IPv4: inherit all current settings from init_net + * IPv6: reset all settings to default + * 1 - Both inherit all current settings from init_net + * 2 - Both reset all settings to default + */ +int sysctl_devconf_inherit_init_net __read_mostly; +EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); + #ifdef CONFIG_RPS static int rps_sock_flow_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -544,6 +553,15 @@ static struct ctl_table net_core_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "devconf_inherit_init_net", + .data = &sysctl_devconf_inherit_init_net, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, + }, { } }; |