Diffstat (limited to 'kernel')
56 files changed, 1489 insertions, 650 deletions
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig new file mode 100644 index 000000000000..bd04f4a44c01 --- /dev/null +++ b/kernel/bpf/Kconfig @@ -0,0 +1,89 @@ +# SPDX-License-Identifier: GPL-2.0-only + +# BPF interpreter that, for example, classic socket filters depend on. +config BPF + bool + +# Used by archs to tell that they support BPF JIT compiler plus which +# flavour. Only one of the two can be selected for a specific arch since +# eBPF JIT supersedes the cBPF JIT. + +# Classic BPF JIT (cBPF) +config HAVE_CBPF_JIT + bool + +# Extended BPF JIT (eBPF) +config HAVE_EBPF_JIT + bool + +# Used by archs to tell that they want the BPF JIT compiler enabled by +# default for kernels that were compiled with BPF JIT support. +config ARCH_WANT_DEFAULT_BPF_JIT + bool + +menu "BPF subsystem" + +config BPF_SYSCALL + bool "Enable bpf() system call" + select BPF + select IRQ_WORK + select TASKS_TRACE_RCU + select BINARY_PRINTF + select NET_SOCK_MSG if INET + default n + help + Enable the bpf() system call that allows to manipulate BPF programs + and maps via file descriptors. + +config BPF_JIT + bool "Enable BPF Just In Time compiler" + depends on BPF + depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT + depends on MODULES + help + BPF programs are normally handled by a BPF interpreter. This option + allows the kernel to generate native code when a program is loaded + into the kernel. This will significantly speed-up processing of BPF + programs. + + Note, an admin should enable this feature changing: + /proc/sys/net/core/bpf_jit_enable + /proc/sys/net/core/bpf_jit_harden (optional) + /proc/sys/net/core/bpf_jit_kallsyms (optional) + +config BPF_JIT_ALWAYS_ON + bool "Permanently enable BPF JIT and remove BPF interpreter" + depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT + help + Enables BPF JIT and removes BPF interpreter to avoid speculative + execution of BPF instructions by the interpreter. + +config BPF_JIT_DEFAULT_ON + def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON + depends on HAVE_EBPF_JIT && BPF_JIT + +config BPF_UNPRIV_DEFAULT_OFF + bool "Disable unprivileged BPF by default" + depends on BPF_SYSCALL + help + Disables unprivileged BPF by default by setting the corresponding + /proc/sys/kernel/unprivileged_bpf_disabled knob to 2. An admin can + still reenable it by setting it to 0 later on, or permanently + disable it by setting it to 1 (from which no other transition to + 0 is possible anymore). + +source "kernel/bpf/preload/Kconfig" + +config BPF_LSM + bool "Enable BPF LSM Instrumentation" + depends on BPF_EVENTS + depends on BPF_SYSCALL + depends on SECURITY + depends on BPF_JIT + help + Enables instrumentation of the security hooks with BPF programs for + implementing dynamic MAC and Audit Policies. + + If you are unsure how to answer this question, answer N. + +endmenu # "BPF subsystem" diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 2921ca39a93e..96ceed0e0fb5 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -72,7 +72,7 @@ void bpf_inode_storage_free(struct inode *inode) return; } - /* Netiher the bpf_prog nor the bpf-map's syscall + /* Neither the bpf_prog nor the bpf-map's syscall * could be modifying the local_storage->list now. * Thus, no elem can be added-to or deleted-from the * local_storage->list by the bpf_prog or by the bpf-map's syscall. 
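A minimal user-space sketch of what the new CONFIG_BPF_SYSCALL option gates: the bpf() system call is the single multiplexed entry point for creating and manipulating maps and programs via file descriptors. The snippet below is an illustration assuming only the standard uapi <linux/bpf.h> header (it is not code from this series); it creates a small array map through a raw syscall(__NR_bpf, ...) call, and whether it works unprivileged depends on the kernel.unprivileged_bpf_disabled sysctl described under BPF_UNPRIV_DEFAULT_OFF above.

#include <linux/bpf.h>      /* union bpf_attr, BPF_MAP_CREATE, BPF_MAP_TYPE_ARRAY */
#include <sys/syscall.h>    /* __NR_bpf */
#include <unistd.h>         /* syscall(), close() */
#include <string.h>
#include <stdio.h>

int main(void)
{
	union bpf_attr attr;
	int map_fd;

	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_ARRAY;
	attr.key_size    = sizeof(__u32);   /* array maps require 4-byte keys */
	attr.value_size  = sizeof(__u64);
	attr.max_entries = 16;

	/* Needs CONFIG_BPF_SYSCALL=y; may also require CAP_BPF/CAP_SYS_ADMIN
	 * when unprivileged BPF is disabled (see BPF_UNPRIV_DEFAULT_OFF).
	 */
	map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
	if (map_fd < 0) {
		perror("bpf(BPF_MAP_CREATE)");
		return 1;
	}
	printf("created array map, fd=%d\n", map_fd);
	close(map_fd);
	return 0;
}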
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 931870f9cf56..2d4fbdbb194e 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -473,15 +473,16 @@ bool bpf_link_is_iter(struct bpf_link *link) return link->ops == &bpf_iter_link_lops; } -int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, + struct bpf_prog *prog) { - union bpf_iter_link_info __user *ulinfo; struct bpf_link_primer link_primer; struct bpf_iter_target_info *tinfo; union bpf_iter_link_info linfo; struct bpf_iter_link *link; u32 prog_btf_id, linfo_len; bool existed = false; + bpfptr_t ulinfo; int err; if (attr->link_create.target_fd || attr->link_create.flags) @@ -489,18 +490,18 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) memset(&linfo, 0, sizeof(union bpf_iter_link_info)); - ulinfo = u64_to_user_ptr(attr->link_create.iter_info); + ulinfo = make_bpfptr(attr->link_create.iter_info, uattr.is_kernel); linfo_len = attr->link_create.iter_info_len; - if (!ulinfo ^ !linfo_len) + if (bpfptr_is_null(ulinfo) ^ !linfo_len) return -EINVAL; - if (ulinfo) { + if (!bpfptr_is_null(ulinfo)) { err = bpf_check_uarg_tail_zero(ulinfo, sizeof(linfo), linfo_len); if (err) return err; linfo_len = min_t(u32, linfo_len, sizeof(linfo)); - if (copy_from_user(&linfo, ulinfo, linfo_len)) + if (copy_from_bpfptr(&linfo, ulinfo, linfo_len)) return -EFAULT; } diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 5efb2b24012c..06062370c3b8 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -107,10 +107,12 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_inode_storage_get_proto; case BPF_FUNC_inode_storage_delete: return &bpf_inode_storage_delete_proto; +#ifdef CONFIG_NET case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; +#endif /* CONFIG_NET */ case BPF_FUNC_spin_lock: return &bpf_spin_lock_proto; case BPF_FUNC_spin_unlock: @@ -125,7 +127,7 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } /* The set of hooks which are called without pagefaults disabled and are allowed - * to "sleep" and thus can be used for sleeable BPF programs. + * to "sleep" and thus can be used for sleepable BPF programs. */ BTF_SET_START(sleepable_lsm_hooks) BTF_ID(func, bpf_lsm_bpf) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 0600ed325fa0..cb4b72997d9b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -51,7 +51,7 @@ * The BTF type section contains a list of 'struct btf_type' objects. * Each one describes a C type. Recall from the above section * that a 'struct btf_type' object could be immediately followed by extra - * data in order to desribe some particular C types. + * data in order to describe some particular C types. * * type_id: * ~~~~~~~ @@ -1143,7 +1143,7 @@ static void *btf_show_obj_safe(struct btf_show *show, /* * We need a new copy to our safe object, either because we haven't - * yet copied and are intializing safe data, or because the data + * yet copied and are initializing safe data, or because the data * we want falls outside the boundaries of the safe object. */ if (!safe) { @@ -3417,7 +3417,7 @@ static struct btf_kind_operations func_proto_ops = { * BTF_KIND_FUNC_PROTO cannot be directly referred by * a struct's member. * - * It should be a funciton pointer instead. + * It should be a function pointer instead. * (i.e. 
struct's member -> BTF_KIND_PTR -> BTF_KIND_FUNC_PROTO) * * Hence, there is no btf_func_check_member(). @@ -4257,7 +4257,7 @@ static int btf_parse_hdr(struct btf_verifier_env *env) return 0; } -static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, +static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, u32 log_level, char __user *log_ubuf, u32 log_size) { struct btf_verifier_env *env = NULL; @@ -4306,7 +4306,7 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, btf->data = data; btf->data_size = btf_data_size; - if (copy_from_user(data, btf_data, btf_data_size)) { + if (copy_from_bpfptr(data, btf_data, btf_data_size)) { err = -EFAULT; goto errout; } @@ -5206,6 +5206,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, m->ret_size = ret; for (i = 0; i < nargs; i++) { + if (i == nargs - 1 && args[i].type == 0) { + bpf_log(log, + "The function %s with variable args is unsupported.\n", + tname); + return -EINVAL; + } ret = __get_type_size(btf, args[i].type, &t); if (ret < 0) { bpf_log(log, @@ -5213,6 +5219,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]); return -EINVAL; } + if (ret == 0) { + bpf_log(log, + "The function %s has malformed void argument.\n", + tname); + return -EINVAL; + } m->arg_size[i] = ret; } m->nr_args = nargs; @@ -5780,12 +5792,12 @@ static int __btf_new_fd(struct btf *btf) return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC); } -int btf_new_fd(const union bpf_attr *attr) +int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr) { struct btf *btf; int ret; - btf = btf_parse(u64_to_user_ptr(attr->btf), + btf = btf_parse(make_bpfptr(attr->btf, uattr.is_kernel), attr->btf_size, attr->btf_log_level, u64_to_user_ptr(attr->btf_log_buf), attr->btf_log_size); @@ -6085,3 +6097,65 @@ struct module *btf_try_get_module(const struct btf *btf) return res; } + +BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags) +{ + struct btf *btf; + long ret; + + if (flags) + return -EINVAL; + + if (name_sz <= 1 || name[name_sz - 1]) + return -EINVAL; + + btf = bpf_get_btf_vmlinux(); + if (IS_ERR(btf)) + return PTR_ERR(btf); + + ret = btf_find_by_name_kind(btf, name, kind); + /* ret is never zero, since btf_find_by_name_kind returns + * positive btf_id or negative error. 
+ */ + if (ret < 0) { + struct btf *mod_btf; + int id; + + /* If name is not found in vmlinux's BTF then search in module's BTFs */ + spin_lock_bh(&btf_idr_lock); + idr_for_each_entry(&btf_idr, mod_btf, id) { + if (!btf_is_module(mod_btf)) + continue; + /* linear search could be slow hence unlock/lock + * the IDR to avoiding holding it for too long + */ + btf_get(mod_btf); + spin_unlock_bh(&btf_idr_lock); + ret = btf_find_by_name_kind(mod_btf, name, kind); + if (ret > 0) { + int btf_obj_fd; + + btf_obj_fd = __btf_new_fd(mod_btf); + if (btf_obj_fd < 0) { + btf_put(mod_btf); + return btf_obj_fd; + } + return ret | (((u64)btf_obj_fd) << 32); + } + spin_lock_bh(&btf_idr_lock); + btf_put(mod_btf); + } + spin_unlock_bh(&btf_idr_lock); + } + return ret; +} + +const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = { + .func = bpf_btf_find_by_name_kind, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5e31ee9f7512..034ad93a1ad7 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1392,29 +1392,54 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn) select_insn: goto *jumptable[insn->code]; - /* ALU */ -#define ALU(OPCODE, OP) \ - ALU64_##OPCODE##_X: \ - DST = DST OP SRC; \ - CONT; \ - ALU_##OPCODE##_X: \ - DST = (u32) DST OP (u32) SRC; \ - CONT; \ - ALU64_##OPCODE##_K: \ - DST = DST OP IMM; \ - CONT; \ - ALU_##OPCODE##_K: \ - DST = (u32) DST OP (u32) IMM; \ + /* Explicitly mask the register-based shift amounts with 63 or 31 + * to avoid undefined behavior. Normally this won't affect the + * generated code, for example, in case of native 64 bit archs such + * as x86-64 or arm64, the compiler is optimizing the AND away for + * the interpreter. In case of JITs, each of the JIT backends compiles + * the BPF shift operations to machine instructions which produce + * implementation-defined results in such a case; the resulting + * contents of the register may be arbitrary, but program behaviour + * as a whole remains defined. In other words, in case of JIT backends, + * the AND must /not/ be added to the emitted LSH/RSH/ARSH translation. 
+ */ + /* ALU (shifts) */ +#define SHT(OPCODE, OP) \ + ALU64_##OPCODE##_X: \ + DST = DST OP (SRC & 63); \ + CONT; \ + ALU_##OPCODE##_X: \ + DST = (u32) DST OP ((u32) SRC & 31); \ + CONT; \ + ALU64_##OPCODE##_K: \ + DST = DST OP IMM; \ + CONT; \ + ALU_##OPCODE##_K: \ + DST = (u32) DST OP (u32) IMM; \ + CONT; + /* ALU (rest) */ +#define ALU(OPCODE, OP) \ + ALU64_##OPCODE##_X: \ + DST = DST OP SRC; \ + CONT; \ + ALU_##OPCODE##_X: \ + DST = (u32) DST OP (u32) SRC; \ + CONT; \ + ALU64_##OPCODE##_K: \ + DST = DST OP IMM; \ + CONT; \ + ALU_##OPCODE##_K: \ + DST = (u32) DST OP (u32) IMM; \ CONT; - ALU(ADD, +) ALU(SUB, -) ALU(AND, &) ALU(OR, |) - ALU(LSH, <<) - ALU(RSH, >>) ALU(XOR, ^) ALU(MUL, *) + SHT(LSH, <<) + SHT(RSH, >>) +#undef SHT #undef ALU ALU_NEG: DST = (u32) -DST; @@ -1439,13 +1464,13 @@ select_insn: insn++; CONT; ALU_ARSH_X: - DST = (u64) (u32) (((s32) DST) >> SRC); + DST = (u64) (u32) (((s32) DST) >> (SRC & 31)); CONT; ALU_ARSH_K: DST = (u64) (u32) (((s32) DST) >> IMM); CONT; ALU64_ARSH_X: - (*(s64 *) &DST) >>= SRC; + (*(s64 *) &DST) >>= (SRC & 63); CONT; ALU64_ARSH_K: (*(s64 *) &DST) >>= IMM; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 5dd3e866599a..a1a0c4e791c6 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -601,7 +601,8 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) { - return __bpf_xdp_redirect_map(map, ifindex, flags, __cpu_map_lookup_elem); + return __bpf_xdp_redirect_map(map, ifindex, flags, 0, + __cpu_map_lookup_elem); } static int cpu_map_btf_id; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index aa516472ce46..2a75e6c2d27d 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -57,6 +57,7 @@ struct xdp_dev_bulk_queue { struct list_head flush_node; struct net_device *dev; struct net_device *dev_rx; + struct bpf_prog *xdp_prog; unsigned int count; }; @@ -197,6 +198,7 @@ static void dev_map_free(struct bpf_map *map) list_del_rcu(&dtab->list); spin_unlock(&dev_map_lock); + bpf_clear_redirect_map(map); synchronize_rcu(); /* Make sure prior __dev_map_entry_free() have completed. 
*/ @@ -326,22 +328,69 @@ bool dev_map_can_have_prog(struct bpf_map *map) return false; } +static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, + struct xdp_frame **frames, int n, + struct net_device *dev) +{ + struct xdp_txq_info txq = { .dev = dev }; + struct xdp_buff xdp; + int i, nframes = 0; + + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; + u32 act; + int err; + + xdp_convert_frame_to_buff(xdpf, &xdp); + xdp.txq = &txq; + + act = bpf_prog_run_xdp(xdp_prog, &xdp); + switch (act) { + case XDP_PASS: + err = xdp_update_frame_from_buff(&xdp, xdpf); + if (unlikely(err < 0)) + xdp_return_frame_rx_napi(xdpf); + else + frames[nframes++] = xdpf; + break; + default: + bpf_warn_invalid_xdp_action(act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception(dev, xdp_prog, act); + fallthrough; + case XDP_DROP: + xdp_return_frame_rx_napi(xdpf); + break; + } + } + return nframes; /* sent frames count */ +} + static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { struct net_device *dev = bq->dev; + unsigned int cnt = bq->count; int sent = 0, err = 0; + int to_send = cnt; int i; - if (unlikely(!bq->count)) + if (unlikely(!cnt)) return; - for (i = 0; i < bq->count; i++) { + for (i = 0; i < cnt; i++) { struct xdp_frame *xdpf = bq->q[i]; prefetch(xdpf); } - sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags); + if (bq->xdp_prog) { + to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev); + if (!to_send) + goto out; + } + + sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags); if (sent < 0) { /* If ndo_xdp_xmit fails with an errno, no frames have * been xmit'ed. @@ -353,13 +402,12 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) /* If not all frames have been transmitted, it is our * responsibility to free them */ - for (i = sent; unlikely(i < bq->count); i++) + for (i = sent; unlikely(i < to_send); i++) xdp_return_frame_rx_napi(bq->q[i]); - trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, bq->count - sent, err); - bq->dev_rx = NULL; +out: bq->count = 0; - __list_del_clearprev(&bq->flush_node); + trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err); } /* __dev_flush is called from xdp_do_flush() which _must_ be signaled @@ -377,13 +425,17 @@ void __dev_flush(void) struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq, *tmp; - list_for_each_entry_safe(bq, tmp, flush_list, flush_node) + list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { bq_xmit_all(bq, XDP_XMIT_FLUSH); + bq->dev_rx = NULL; + bq->xdp_prog = NULL; + __list_del_clearprev(&bq->flush_node); + } } /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or - * update happens in parallel here a dev_put wont happen until after reading the - * ifindex. + * update happens in parallel here a dev_put won't happen until after reading + * the ifindex. */ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { @@ -401,7 +453,7 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) * Thus, safe percpu variable access. 
*/ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, - struct net_device *dev_rx) + struct net_device *dev_rx, struct bpf_prog *xdp_prog) { struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); @@ -412,18 +464,22 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed * from net_device drivers NAPI func end. + * + * Do the same with xdp_prog and flush_list since these fields + * are only ever modified together. */ - if (!bq->dev_rx) + if (!bq->dev_rx) { bq->dev_rx = dev_rx; + bq->xdp_prog = xdp_prog; + list_add(&bq->flush_node, flush_list); + } bq->q[bq->count++] = xdpf; - - if (!bq->flush_node.prev) - list_add(&bq->flush_node, flush_list); } static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, - struct net_device *dev_rx) + struct net_device *dev_rx, + struct bpf_prog *xdp_prog) { struct xdp_frame *xdpf; int err; @@ -439,55 +495,115 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, if (unlikely(!xdpf)) return -EOVERFLOW; - bq_enqueue(dev, xdpf, dev_rx); + bq_enqueue(dev, xdpf, dev_rx, xdp_prog); return 0; } -static struct xdp_buff *dev_map_run_prog(struct net_device *dev, - struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) +int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, + struct net_device *dev_rx) { - struct xdp_txq_info txq = { .dev = dev }; - u32 act; + return __xdp_enqueue(dev, xdp, dev_rx, NULL); +} - xdp_set_data_meta_invalid(xdp); - xdp->txq = &txq; +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + struct net_device *dev = dst->dev; - act = bpf_prog_run_xdp(xdp_prog, xdp); - switch (act) { - case XDP_PASS: - return xdp; - case XDP_DROP: - break; - default: - bpf_warn_invalid_xdp_action(act); - fallthrough; - case XDP_ABORTED: - trace_xdp_exception(dev, xdp_prog, act); - break; - } + return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog); +} - xdp_return_buff(xdp); - return NULL; +static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp, + int exclude_ifindex) +{ + if (!obj || obj->dev->ifindex == exclude_ifindex || + !obj->dev->netdev_ops->ndo_xdp_xmit) + return false; + + if (xdp_ok_fwd_dev(obj->dev, xdp->data_end - xdp->data)) + return false; + + return true; } -int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, - struct net_device *dev_rx) +static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj, + struct net_device *dev_rx, + struct xdp_frame *xdpf) { - return __xdp_enqueue(dev, xdp, dev_rx); + struct xdp_frame *nxdpf; + + nxdpf = xdpf_clone(xdpf); + if (!nxdpf) + return -ENOMEM; + + bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog); + + return 0; } -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, - struct net_device *dev_rx) +int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, + struct bpf_map *map, bool exclude_ingress) { - struct net_device *dev = dst->dev; + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + int exclude_ifindex = exclude_ingress ? 
dev_rx->ifindex : 0; + struct bpf_dtab_netdev *dst, *last_dst = NULL; + struct hlist_head *head; + struct xdp_frame *xdpf; + unsigned int i; + int err; - if (dst->xdp_prog) { - xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog); - if (!xdp) - return 0; + xdpf = xdp_convert_buff_to_frame(xdp); + if (unlikely(!xdpf)) + return -EOVERFLOW; + + if (map->map_type == BPF_MAP_TYPE_DEVMAP) { + for (i = 0; i < map->max_entries; i++) { + dst = READ_ONCE(dtab->netdev_map[i]); + if (!is_valid_dst(dst, xdp, exclude_ifindex)) + continue; + + /* we only need n-1 clones; last_dst enqueued below */ + if (!last_dst) { + last_dst = dst; + continue; + } + + err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf); + if (err) + return err; + + last_dst = dst; + } + } else { /* BPF_MAP_TYPE_DEVMAP_HASH */ + for (i = 0; i < dtab->n_buckets; i++) { + head = dev_map_index_hash(dtab, i); + hlist_for_each_entry_rcu(dst, head, index_hlist, + lockdep_is_held(&dtab->index_lock)) { + if (!is_valid_dst(dst, xdp, exclude_ifindex)) + continue; + + /* we only need n-1 clones; last_dst enqueued below */ + if (!last_dst) { + last_dst = dst; + continue; + } + + err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf); + if (err) + return err; + + last_dst = dst; + } + } } - return __xdp_enqueue(dev, xdp, dev_rx); + + /* consume the last copy of the frame */ + if (last_dst) + bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog); + else + xdp_return_frame_rx_napi(xdpf); /* dtab is empty */ + + return 0; } int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, @@ -504,6 +620,87 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, return 0; } +static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst, + struct sk_buff *skb, + struct bpf_prog *xdp_prog) +{ + struct sk_buff *nskb; + int err; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return -ENOMEM; + + err = dev_map_generic_redirect(dst, nskb, xdp_prog); + if (unlikely(err)) { + consume_skb(nskb); + return err; + } + + return 0; +} + +int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, + struct bpf_prog *xdp_prog, struct bpf_map *map, + bool exclude_ingress) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + int exclude_ifindex = exclude_ingress ? 
dev->ifindex : 0; + struct bpf_dtab_netdev *dst, *last_dst = NULL; + struct hlist_head *head; + struct hlist_node *next; + unsigned int i; + int err; + + if (map->map_type == BPF_MAP_TYPE_DEVMAP) { + for (i = 0; i < map->max_entries; i++) { + dst = READ_ONCE(dtab->netdev_map[i]); + if (!dst || dst->dev->ifindex == exclude_ifindex) + continue; + + /* we only need n-1 clones; last_dst enqueued below */ + if (!last_dst) { + last_dst = dst; + continue; + } + + err = dev_map_redirect_clone(last_dst, skb, xdp_prog); + if (err) + return err; + + last_dst = dst; + } + } else { /* BPF_MAP_TYPE_DEVMAP_HASH */ + for (i = 0; i < dtab->n_buckets; i++) { + head = dev_map_index_hash(dtab, i); + hlist_for_each_entry_safe(dst, next, head, index_hlist) { + if (!dst || dst->dev->ifindex == exclude_ifindex) + continue; + + /* we only need n-1 clones; last_dst enqueued below */ + if (!last_dst) { + last_dst = dst; + continue; + } + + err = dev_map_redirect_clone(last_dst, skb, xdp_prog); + if (err) + return err; + + last_dst = dst; + } + } + } + + /* consume the first skb and return */ + if (last_dst) + return dev_map_generic_redirect(last_dst, skb, xdp_prog); + + /* dtab is empty */ + consume_skb(skb); + return 0; +} + static void *dev_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); @@ -730,12 +927,16 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) { - return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_lookup_elem); + return __bpf_xdp_redirect_map(map, ifindex, flags, + BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, + __dev_map_lookup_elem); } static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) { - return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_hash_lookup_elem); + return __bpf_xdp_redirect_map(map, ifindex, flags, + BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, + __dev_map_hash_lookup_elem); } static int dev_map_btf_id; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d7ebb12ffffc..6f6681b07364 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -46,12 +46,12 @@ * events, kprobes and tracing to be invoked before the prior invocation * from one of these contexts completed. sys_bpf() uses the same mechanism * by pinning the task to the current CPU and incrementing the recursion - * protection accross the map operation. + * protection across the map operation. * * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain * operations like memory allocations (even with GFP_ATOMIC) from atomic * contexts. This is required because even with GFP_ATOMIC the memory - * allocator calls into code pathes which acquire locks with long held lock + * allocator calls into code paths which acquire locks with long held lock * sections. To ensure the deterministic behaviour these locks are regular * spinlocks, which are converted to 'sleepable' spinlocks on RT. 
The only * true atomic contexts on an RT kernel are the low level hardware @@ -1401,6 +1401,100 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, + void *value, bool is_lru_map, + bool is_percpu, u64 flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_nulls_head *head; + unsigned long bflags; + struct htab_elem *l; + u32 hash, key_size; + struct bucket *b; + int ret; + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size, htab->hashrnd); + b = __select_bucket(htab, hash); + head = &b->head; + + ret = htab_lock_bucket(htab, b, hash, &bflags); + if (ret) + return ret; + + l = lookup_elem_raw(head, hash, key, key_size); + if (!l) { + ret = -ENOENT; + } else { + if (is_percpu) { + u32 roundup_value_size = round_up(map->value_size, 8); + void __percpu *pptr; + int off = 0, cpu; + + pptr = htab_elem_get_ptr(l, key_size); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(value + off, + per_cpu_ptr(pptr, cpu), + roundup_value_size); + off += roundup_value_size; + } + } else { + u32 roundup_key_size = round_up(map->key_size, 8); + + if (flags & BPF_F_LOCK) + copy_map_value_locked(map, value, l->key + + roundup_key_size, + true); + else + copy_map_value(map, value, l->key + + roundup_key_size); + check_and_init_map_lock(map, value); + } + + hlist_nulls_del_rcu(&l->hash_node); + if (!is_lru_map) + free_htab_elem(htab, l); + } + + htab_unlock_bucket(htab, b, hash, bflags); + + if (is_lru_map && l) + bpf_lru_push_free(&htab->lru, &l->lru_node); + + return ret; +} + +static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + return __htab_map_lookup_and_delete_elem(map, key, value, false, false, + flags); +} + +static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map, + void *key, void *value, + u64 flags) +{ + return __htab_map_lookup_and_delete_elem(map, key, value, false, true, + flags); +} + +static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + return __htab_map_lookup_and_delete_elem(map, key, value, true, false, + flags); +} + +static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map, + void *key, void *value, + u64 flags) +{ + return __htab_map_lookup_and_delete_elem(map, key, value, true, true, + flags); +} + static int __htab_map_lookup_and_delete_batch(struct bpf_map *map, const union bpf_attr *attr, @@ -1934,6 +2028,7 @@ const struct bpf_map_ops htab_map_ops = { .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_map_lookup_elem, + .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, .map_update_elem = htab_map_update_elem, .map_delete_elem = htab_map_delete_elem, .map_gen_lookup = htab_map_gen_lookup, @@ -1954,6 +2049,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_lru_map_lookup_elem, + .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, .map_update_elem = htab_lru_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, @@ -2077,6 +2173,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_percpu_map_lookup_elem, + .map_lookup_and_delete_elem = 
htab_percpu_map_lookup_and_delete_elem, .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, @@ -2096,6 +2193,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_lru_percpu_map_lookup_elem, + .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem, .map_update_elem = htab_lru_percpu_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 544773970dbc..a2f1f15ce432 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -14,6 +14,7 @@ #include <linux/jiffies.h> #include <linux/pid_namespace.h> #include <linux/proc_ns.h> +#include <linux/security.h> #include "../../lib/kstrtox.h" @@ -692,38 +693,41 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, return -EINVAL; } -/* Per-cpu temp buffers which can be used by printf-like helpers for %s or %p +/* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary + * arguments representation. */ -#define MAX_PRINTF_BUF_LEN 512 +#define MAX_BPRINTF_BUF_LEN 512 -struct bpf_printf_buf { - char tmp_buf[MAX_PRINTF_BUF_LEN]; +/* Support executing three nested bprintf helper calls on a given CPU */ +#define MAX_BPRINTF_NEST_LEVEL 3 +struct bpf_bprintf_buffers { + char tmp_bufs[MAX_BPRINTF_NEST_LEVEL][MAX_BPRINTF_BUF_LEN]; }; -static DEFINE_PER_CPU(struct bpf_printf_buf, bpf_printf_buf); -static DEFINE_PER_CPU(int, bpf_printf_buf_used); +static DEFINE_PER_CPU(struct bpf_bprintf_buffers, bpf_bprintf_bufs); +static DEFINE_PER_CPU(int, bpf_bprintf_nest_level); static int try_get_fmt_tmp_buf(char **tmp_buf) { - struct bpf_printf_buf *bufs; - int used; + struct bpf_bprintf_buffers *bufs; + int nest_level; preempt_disable(); - used = this_cpu_inc_return(bpf_printf_buf_used); - if (WARN_ON_ONCE(used > 1)) { - this_cpu_dec(bpf_printf_buf_used); + nest_level = this_cpu_inc_return(bpf_bprintf_nest_level); + if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) { + this_cpu_dec(bpf_bprintf_nest_level); preempt_enable(); return -EBUSY; } - bufs = this_cpu_ptr(&bpf_printf_buf); - *tmp_buf = bufs->tmp_buf; + bufs = this_cpu_ptr(&bpf_bprintf_bufs); + *tmp_buf = bufs->tmp_bufs[nest_level - 1]; return 0; } void bpf_bprintf_cleanup(void) { - if (this_cpu_read(bpf_printf_buf_used)) { - this_cpu_dec(bpf_printf_buf_used); + if (this_cpu_read(bpf_bprintf_nest_level)) { + this_cpu_dec(bpf_bprintf_nest_level); preempt_enable(); } } @@ -760,7 +764,7 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, if (num_args && try_get_fmt_tmp_buf(&tmp_buf)) return -EBUSY; - tmp_buf_end = tmp_buf + MAX_PRINTF_BUF_LEN; + tmp_buf_end = tmp_buf + MAX_BPRINTF_BUF_LEN; *bin_args = (u32 *)tmp_buf; } @@ -1066,11 +1070,13 @@ bpf_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: - return &bpf_probe_read_kernel_proto; + return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + NULL : &bpf_probe_read_kernel_proto; case BPF_FUNC_probe_read_user_str: return &bpf_probe_read_user_str_proto; case BPF_FUNC_probe_read_kernel_str: - return &bpf_probe_read_kernel_str_proto; + return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? 
+ NULL : &bpf_probe_read_kernel_str_proto; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; case BPF_FUNC_snprintf: diff --git a/kernel/bpf/preload/iterators/iterators.bpf.c b/kernel/bpf/preload/iterators/iterators.bpf.c index 52aa7b38e8b8..03af863314ea 100644 --- a/kernel/bpf/preload/iterators/iterators.bpf.c +++ b/kernel/bpf/preload/iterators/iterators.bpf.c @@ -2,7 +2,6 @@ /* Copyright (c) 2020 Facebook */ #include <linux/bpf.h> #include <bpf/bpf_helpers.h> -#include <bpf/bpf_tracing.h> #include <bpf/bpf_core_read.h> #pragma clang attribute push (__attribute__((preserve_access_index)), apply_to = record) diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 4838922f723d..93a55391791a 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -102,7 +102,7 @@ static void reuseport_array_free(struct bpf_map *map) /* * ops->map_*_elem() will not be able to access this * array now. Hence, this function only races with - * bpf_sk_reuseport_detach() which was triggerred by + * bpf_sk_reuseport_detach() which was triggered by * close() or disconnect(). * * This function and bpf_sk_reuseport_detach() are diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index f25b719ac786..84b3b35fc0d0 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -221,25 +221,20 @@ static int ringbuf_map_get_next_key(struct bpf_map *map, void *key, return -ENOTSUPP; } -static size_t bpf_ringbuf_mmap_page_cnt(const struct bpf_ringbuf *rb) -{ - size_t data_pages = (rb->mask + 1) >> PAGE_SHIFT; - - /* consumer page + producer page + 2 x data pages */ - return RINGBUF_POS_PAGES + 2 * data_pages; -} - static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) { struct bpf_ringbuf_map *rb_map; - size_t mmap_sz; rb_map = container_of(map, struct bpf_ringbuf_map, map); - mmap_sz = bpf_ringbuf_mmap_page_cnt(rb_map->rb) << PAGE_SHIFT; - - if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > mmap_sz) - return -EINVAL; + if (vma->vm_flags & VM_WRITE) { + /* allow writable mapping for the consumer_pos only */ + if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EPERM; + } else { + vma->vm_flags &= ~VM_MAYWRITE; + } + /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); } @@ -315,6 +310,9 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) return NULL; len = round_up(size + BPF_RINGBUF_HDR_SZ, 8); + if (len > rb->mask + 1) + return NULL; + cons_pos = smp_load_acquire(&rb->consumer_pos); if (in_nmi()) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 941ca06d9dfa..e343f158e556 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -50,7 +50,8 @@ static DEFINE_SPINLOCK(map_idr_lock); static DEFINE_IDR(link_idr); static DEFINE_SPINLOCK(link_idr_lock); -int sysctl_unprivileged_bpf_disabled __read_mostly; +int sysctl_unprivileged_bpf_disabled __read_mostly = + IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0; static const struct bpf_map_ops * const bpf_map_types[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) @@ -72,11 +73,10 @@ static const struct bpf_map_ops * const bpf_map_types[] = { * copy_from_user() call. However, this is not a concern since this function is * meant to be a future-proofing of bits. 
*/ -int bpf_check_uarg_tail_zero(void __user *uaddr, +int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size, size_t actual_size) { - unsigned char __user *addr = uaddr + expected_size; int res; if (unlikely(actual_size > PAGE_SIZE)) /* silly large */ @@ -85,7 +85,12 @@ int bpf_check_uarg_tail_zero(void __user *uaddr, if (actual_size <= expected_size) return 0; - res = check_zeroed_user(addr, actual_size - expected_size); + if (uaddr.is_kernel) + res = memchr_inv(uaddr.kernel + expected_size, 0, + actual_size - expected_size) == NULL; + else + res = check_zeroed_user(uaddr.user + expected_size, + actual_size - expected_size); if (res < 0) return res; return res ? 0 : -E2BIG; @@ -1004,6 +1009,17 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size) return NULL; } +static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size) +{ + if (key_size) + return memdup_bpfptr(ukey, key_size); + + if (!bpfptr_is_null(ukey)) + return ERR_PTR(-EINVAL); + + return NULL; +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags @@ -1074,10 +1090,10 @@ err_put: #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags -static int map_update_elem(union bpf_attr *attr) +static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) { - void __user *ukey = u64_to_user_ptr(attr->key); - void __user *uvalue = u64_to_user_ptr(attr->value); + bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); + bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel); int ufd = attr->map_fd; struct bpf_map *map; void *key, *value; @@ -1103,7 +1119,7 @@ static int map_update_elem(union bpf_attr *attr) goto err_put; } - key = __bpf_copy_key(ukey, map->key_size); + key = ___bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; @@ -1123,7 +1139,7 @@ static int map_update_elem(union bpf_attr *attr) goto free_key; err = -EFAULT; - if (copy_from_user(value, uvalue, value_size) != 0) + if (copy_from_bpfptr(value, uvalue, value_size) != 0) goto free_value; err = bpf_map_update_value(map, f, key, value, attr->flags); @@ -1468,7 +1484,7 @@ free_buf: return err; } -#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value +#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags static int map_lookup_and_delete_elem(union bpf_attr *attr) { @@ -1484,6 +1500,9 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM)) return -EINVAL; + if (attr->flags & ~BPF_F_LOCK) + return -EINVAL; + f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) @@ -1494,24 +1513,47 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) goto err_put; } + if (attr->flags && + (map->map_type == BPF_MAP_TYPE_QUEUE || + map->map_type == BPF_MAP_TYPE_STACK)) { + err = -EINVAL; + goto err_put; + } + + if ((attr->flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map)) { + err = -EINVAL; + goto err_put; + } + key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; } - value_size = map->value_size; + value_size = bpf_map_value_size(map); err = -ENOMEM; value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; + err = -ENOTSUPP; if (map->map_type == BPF_MAP_TYPE_QUEUE || map->map_type == BPF_MAP_TYPE_STACK) { err = map->ops->map_pop_elem(map, value); - } else { - err = -ENOTSUPP; + } else if (map->map_type == BPF_MAP_TYPE_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_HASH || + map->map_type == 
BPF_MAP_TYPE_LRU_PERCPU_HASH) { + if (!bpf_map_is_dev_bound(map)) { + bpf_disable_instrumentation(); + rcu_read_lock(); + err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags); + rcu_read_unlock(); + bpf_enable_instrumentation(); + } } if (err) @@ -1931,6 +1973,11 @@ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) attr->expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE; break; + case BPF_PROG_TYPE_SK_REUSEPORT: + if (!attr->expected_attach_type) + attr->expected_attach_type = + BPF_SK_REUSEPORT_SELECT; + break; } } @@ -2014,6 +2061,15 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, if (expected_attach_type == BPF_SK_LOOKUP) return 0; return -EINVAL; + case BPF_PROG_TYPE_SK_REUSEPORT: + switch (expected_attach_type) { + case BPF_SK_REUSEPORT_SELECT: + case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: + return 0; + default: + return -EINVAL; + } + case BPF_PROG_TYPE_SYSCALL: case BPF_PROG_TYPE_EXT: if (expected_attach_type) return -EINVAL; @@ -2073,9 +2129,9 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD attach_prog_fd +#define BPF_PROG_LOAD_LAST_FIELD fd_array -static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) +static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) { enum bpf_prog_type type = attr->prog_type; struct bpf_prog *prog, *dst_prog = NULL; @@ -2100,8 +2156,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) return -EPERM; /* copy eBPF program license from user space */ - if (strncpy_from_user(license, u64_to_user_ptr(attr->license), - sizeof(license) - 1) < 0) + if (strncpy_from_bpfptr(license, + make_bpfptr(attr->license, uattr.is_kernel), + sizeof(license) - 1) < 0) return -EFAULT; license[sizeof(license) - 1] = 0; @@ -2185,8 +2242,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) prog->len = attr->insn_cnt; err = -EFAULT; - if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns), - bpf_prog_insn_size(prog)) != 0) + if (copy_from_bpfptr(prog->insns, + make_bpfptr(attr->insns, uattr.is_kernel), + bpf_prog_insn_size(prog)) != 0) goto free_prog_sec; prog->orig_prog = NULL; @@ -3422,7 +3480,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, u32 ulen; int err; - err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); + err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); @@ -3701,7 +3759,7 @@ static int bpf_map_get_info_by_fd(struct file *file, u32 info_len = attr->info.info_len; int err; - err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); + err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); @@ -3744,7 +3802,7 @@ static int bpf_btf_get_info_by_fd(struct file *file, u32 info_len = attr->info.info_len; int err; - err = bpf_check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len); + err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len); if (err) return err; @@ -3761,7 +3819,7 @@ static int bpf_link_get_info_by_fd(struct file *file, u32 info_len = attr->info.info_len; int err; - err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); + err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); @@ -3824,7 +3882,7 @@ 
static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, #define BPF_BTF_LOAD_LAST_FIELD btf_log_level -static int bpf_btf_load(const union bpf_attr *attr) +static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr) { if (CHECK_ATTR(BPF_BTF_LOAD)) return -EINVAL; @@ -3832,7 +3890,7 @@ static int bpf_btf_load(const union bpf_attr *attr) if (!bpf_capable()) return -EPERM; - return btf_new_fd(attr); + return btf_new_fd(attr, uattr); } #define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id @@ -4022,13 +4080,14 @@ err_put: return err; } -static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr, + struct bpf_prog *prog) { if (attr->link_create.attach_type != prog->expected_attach_type) return -EINVAL; if (prog->expected_attach_type == BPF_TRACE_ITER) - return bpf_iter_link_attach(attr, prog); + return bpf_iter_link_attach(attr, uattr, prog); else if (prog->type == BPF_PROG_TYPE_EXT) return bpf_tracing_prog_attach(prog, attr->link_create.target_fd, @@ -4037,7 +4096,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog * } #define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len -static int link_create(union bpf_attr *attr) +static int link_create(union bpf_attr *attr, bpfptr_t uattr) { enum bpf_prog_type ptype; struct bpf_prog *prog; @@ -4056,7 +4115,7 @@ static int link_create(union bpf_attr *attr) goto out; if (prog->type == BPF_PROG_TYPE_EXT) { - ret = tracing_bpf_link_attach(attr, prog); + ret = tracing_bpf_link_attach(attr, uattr, prog); goto out; } @@ -4077,7 +4136,7 @@ static int link_create(union bpf_attr *attr) ret = cgroup_bpf_link_attach(attr, prog); break; case BPF_PROG_TYPE_TRACING: - ret = tracing_bpf_link_attach(attr, prog); + ret = tracing_bpf_link_attach(attr, uattr, prog); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_SK_LOOKUP: @@ -4365,7 +4424,7 @@ out_prog_put: return ret; } -SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) +static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; int err; @@ -4380,7 +4439,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz /* copy attributes from user space, may be less than sizeof(bpf_attr) */ memset(&attr, 0, sizeof(attr)); - if (copy_from_user(&attr, uattr, size) != 0) + if (copy_from_bpfptr(&attr, uattr, size) != 0) return -EFAULT; err = security_bpf(cmd, &attr, size); @@ -4395,7 +4454,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = map_lookup_elem(&attr); break; case BPF_MAP_UPDATE_ELEM: - err = map_update_elem(&attr); + err = map_update_elem(&attr, uattr); break; case BPF_MAP_DELETE_ELEM: err = map_delete_elem(&attr); @@ -4422,21 +4481,21 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_prog_detach(&attr); break; case BPF_PROG_QUERY: - err = bpf_prog_query(&attr, uattr); + err = bpf_prog_query(&attr, uattr.user); break; case BPF_PROG_TEST_RUN: - err = bpf_prog_test_run(&attr, uattr); + err = bpf_prog_test_run(&attr, uattr.user); break; case BPF_PROG_GET_NEXT_ID: - err = bpf_obj_get_next_id(&attr, uattr, + err = bpf_obj_get_next_id(&attr, uattr.user, &prog_idr, &prog_idr_lock); break; case BPF_MAP_GET_NEXT_ID: - err = bpf_obj_get_next_id(&attr, uattr, + err = bpf_obj_get_next_id(&attr, uattr.user, &map_idr, &map_idr_lock); break; case BPF_BTF_GET_NEXT_ID: - err = bpf_obj_get_next_id(&attr, 
uattr, + err = bpf_obj_get_next_id(&attr, uattr.user, &btf_idr, &btf_idr_lock); break; case BPF_PROG_GET_FD_BY_ID: @@ -4446,38 +4505,38 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_map_get_fd_by_id(&attr); break; case BPF_OBJ_GET_INFO_BY_FD: - err = bpf_obj_get_info_by_fd(&attr, uattr); + err = bpf_obj_get_info_by_fd(&attr, uattr.user); break; case BPF_RAW_TRACEPOINT_OPEN: err = bpf_raw_tracepoint_open(&attr); break; case BPF_BTF_LOAD: - err = bpf_btf_load(&attr); + err = bpf_btf_load(&attr, uattr); break; case BPF_BTF_GET_FD_BY_ID: err = bpf_btf_get_fd_by_id(&attr); break; case BPF_TASK_FD_QUERY: - err = bpf_task_fd_query(&attr, uattr); + err = bpf_task_fd_query(&attr, uattr.user); break; case BPF_MAP_LOOKUP_AND_DELETE_ELEM: err = map_lookup_and_delete_elem(&attr); break; case BPF_MAP_LOOKUP_BATCH: - err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH); + err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH); break; case BPF_MAP_LOOKUP_AND_DELETE_BATCH: - err = bpf_map_do_batch(&attr, uattr, + err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_AND_DELETE_BATCH); break; case BPF_MAP_UPDATE_BATCH: - err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH); + err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH); break; case BPF_MAP_DELETE_BATCH: - err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH); + err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH); break; case BPF_LINK_CREATE: - err = link_create(&attr); + err = link_create(&attr, uattr); break; case BPF_LINK_UPDATE: err = link_update(&attr); @@ -4486,7 +4545,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_link_get_fd_by_id(&attr); break; case BPF_LINK_GET_NEXT_ID: - err = bpf_obj_get_next_id(&attr, uattr, + err = bpf_obj_get_next_id(&attr, uattr.user, &link_idr, &link_idr_lock); break; case BPF_ENABLE_STATS: @@ -4508,3 +4567,94 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz return err; } + +SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) +{ + return __sys_bpf(cmd, USER_BPFPTR(uattr), size); +} + +static bool syscall_prog_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= U16_MAX) + return false; + if (off % size != 0) + return false; + return true; +} + +BPF_CALL_3(bpf_sys_bpf, int, cmd, void *, attr, u32, attr_size) +{ + switch (cmd) { + case BPF_MAP_CREATE: + case BPF_MAP_UPDATE_ELEM: + case BPF_MAP_FREEZE: + case BPF_PROG_LOAD: + case BPF_BTF_LOAD: + break; + /* case BPF_PROG_TEST_RUN: + * is not part of this list to prevent recursive test_run + */ + default: + return -EINVAL; + } + return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size); +} + +static const struct bpf_func_proto bpf_sys_bpf_proto = { + .func = bpf_sys_bpf, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +const struct bpf_func_proto * __weak +tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return bpf_base_func_proto(func_id); +} + +BPF_CALL_1(bpf_sys_close, u32, fd) +{ + /* When bpf program calls this helper there should not be + * an fdget() without matching completed fdput(). 
+ * This helper is allowed in the following callchain only: + * sys_bpf->prog_test_run->bpf_prog->bpf_sys_close + */ + return close_fd(fd); +} + +static const struct bpf_func_proto bpf_sys_close_proto = { + .func = bpf_sys_close, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_sys_bpf: + return &bpf_sys_bpf_proto; + case BPF_FUNC_btf_find_by_name_kind: + return &bpf_btf_find_by_name_kind_proto; + case BPF_FUNC_sys_close: + return &bpf_sys_close_proto; + default: + return tracing_prog_func_proto(func_id, prog); + } +} + +const struct bpf_verifier_ops bpf_syscall_verifier_ops = { + .get_func_proto = syscall_prog_func_proto, + .is_valid_access = syscall_prog_is_valid_access, +}; + +const struct bpf_prog_ops bpf_syscall_prog_ops = { + .test_run = bpf_prog_test_run_syscall, +}; diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index ceac5281bd31..3d7127f439a1 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -111,28 +111,31 @@ struct tnum tnum_xor(struct tnum a, struct tnum b) return TNUM(v & ~mu, mu); } -/* half-multiply add: acc += (unknown * mask * value). - * An intermediate step in the multiply algorithm. +/* Generate partial products by multiplying each bit in the multiplier (tnum a) + * with the multiplicand (tnum b), and add the partial products after + * appropriately bit-shifting them. Instead of directly performing tnum addition + * on the generated partial products, equivalenty, decompose each partial + * product into two tnums, consisting of the value-sum (acc_v) and the + * mask-sum (acc_m) and then perform tnum addition on them. The following paper + * explains the algorithm in more detail: https://arxiv.org/abs/2105.05398. */ -static struct tnum hma(struct tnum acc, u64 value, u64 mask) -{ - while (mask) { - if (mask & 1) - acc = tnum_add(acc, TNUM(0, value)); - mask >>= 1; - value <<= 1; - } - return acc; -} - struct tnum tnum_mul(struct tnum a, struct tnum b) { - struct tnum acc; - u64 pi; - - pi = a.value * b.value; - acc = hma(TNUM(pi, 0), a.mask, b.mask | b.value); - return hma(acc, b.mask, a.value); + u64 acc_v = a.value * b.value; + struct tnum acc_m = TNUM(0, 0); + + while (a.value || a.mask) { + /* LSB of tnum a is a certain 1 */ + if (a.value & 1) + acc_m = tnum_add(acc_m, TNUM(0, b.mask)); + /* LSB of tnum a is uncertain */ + else if (a.mask & 1) + acc_m = tnum_add(acc_m, TNUM(0, b.value | b.mask)); + /* Note: no case for LSB is certain 0 */ + a = tnum_rshift(a, 1); + b = tnum_lshift(b, 1); + } + return tnum_add(TNUM(acc_v, 0), acc_m); } /* Note that if a and b disagree - i.e. one has a 'known 1' where the other has diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 2d44b5aa0057..28a3630c48ee 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -552,7 +552,7 @@ static void notrace inc_misses_counter(struct bpf_prog *prog) * __bpf_prog_enter returns: * 0 - skip execution of the bpf prog * 1 - execute bpf prog - * [2..MAX_U64] - excute bpf prog and record execution time. + * [2..MAX_U64] - execute bpf prog and record execution time. * This is start time. 
*/ u64 notrace __bpf_prog_enter(struct bpf_prog *prog) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 757476c91c98..e04e33893cff 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -47,7 +47,7 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { * - unreachable insns exist (shouldn't be a forest. program = one function) * - out of bounds or malformed jumps * The second pass is all possible path descent from the 1st insn. - * Since it's analyzing all pathes through the program, the length of the + * Since it's analyzing all paths through the program, the length of the * analysis is limited to 64k insn, which may be hit even if total number of * insn is less then 4K, but there are too many branches that change stack/regs. * Number of 'branches to be analyzed' is limited to 1k @@ -132,7 +132,7 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { * If it's ok, then verifier allows this BPF_CALL insn and looks at * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function - * returns ether pointer to map value or NULL. + * returns either pointer to map value or NULL. * * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off' * insn, the register holding that pointer in the true branch changes state to @@ -737,81 +737,104 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, "\n"); } -#define COPY_STATE_FN(NAME, COUNT, FIELD, SIZE) \ -static int copy_##NAME##_state(struct bpf_func_state *dst, \ - const struct bpf_func_state *src) \ -{ \ - if (!src->FIELD) \ - return 0; \ - if (WARN_ON_ONCE(dst->COUNT < src->COUNT)) { \ - /* internal bug, make state invalid to reject the program */ \ - memset(dst, 0, sizeof(*dst)); \ - return -EFAULT; \ - } \ - memcpy(dst->FIELD, src->FIELD, \ - sizeof(*src->FIELD) * (src->COUNT / SIZE)); \ - return 0; \ -} -/* copy_reference_state() */ -COPY_STATE_FN(reference, acquired_refs, refs, 1) -/* copy_stack_state() */ -COPY_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE) -#undef COPY_STATE_FN - -#define REALLOC_STATE_FN(NAME, COUNT, FIELD, SIZE) \ -static int realloc_##NAME##_state(struct bpf_func_state *state, int size, \ - bool copy_old) \ -{ \ - u32 old_size = state->COUNT; \ - struct bpf_##NAME##_state *new_##FIELD; \ - int slot = size / SIZE; \ - \ - if (size <= old_size || !size) { \ - if (copy_old) \ - return 0; \ - state->COUNT = slot * SIZE; \ - if (!size && old_size) { \ - kfree(state->FIELD); \ - state->FIELD = NULL; \ - } \ - return 0; \ - } \ - new_##FIELD = kmalloc_array(slot, sizeof(struct bpf_##NAME##_state), \ - GFP_KERNEL); \ - if (!new_##FIELD) \ - return -ENOMEM; \ - if (copy_old) { \ - if (state->FIELD) \ - memcpy(new_##FIELD, state->FIELD, \ - sizeof(*new_##FIELD) * (old_size / SIZE)); \ - memset(new_##FIELD + old_size / SIZE, 0, \ - sizeof(*new_##FIELD) * (size - old_size) / SIZE); \ - } \ - state->COUNT = slot * SIZE; \ - kfree(state->FIELD); \ - state->FIELD = new_##FIELD; \ - return 0; \ -} -/* realloc_reference_state() */ -REALLOC_STATE_FN(reference, acquired_refs, refs, 1) -/* realloc_stack_state() */ -REALLOC_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE) -#undef REALLOC_STATE_FN - -/* do_check() starts with zero-sized stack in struct bpf_verifier_state to - * make it consume minimal amount of memory. check_stack_write() access from - * the program calls into realloc_func_state() to grow the stack size. 
- * Note there is a non-zero 'parent' pointer inside bpf_verifier_state - * which realloc_stack_state() copies over. It points to previous - * bpf_verifier_state which is never reallocated. +/* copy array src of length n * size bytes to dst. dst is reallocated if it's too + * small to hold src. This is different from krealloc since we don't want to preserve + * the contents of dst. + * + * Leaves dst untouched if src is NULL or length is zero. Returns NULL if memory could + * not be allocated. */ -static int realloc_func_state(struct bpf_func_state *state, int stack_size, - int refs_size, bool copy_old) +static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t flags) { - int err = realloc_reference_state(state, refs_size, copy_old); - if (err) - return err; - return realloc_stack_state(state, stack_size, copy_old); + size_t bytes; + + if (ZERO_OR_NULL_PTR(src)) + goto out; + + if (unlikely(check_mul_overflow(n, size, &bytes))) + return NULL; + + if (ksize(dst) < bytes) { + kfree(dst); + dst = kmalloc_track_caller(bytes, flags); + if (!dst) + return NULL; + } + + memcpy(dst, src, bytes); +out: + return dst ? dst : ZERO_SIZE_PTR; +} + +/* resize an array from old_n items to new_n items. the array is reallocated if it's too + * small to hold new_n items. new items are zeroed out if the array grows. + * + * Contrary to krealloc_array, does not free arr if new_n is zero. + */ +static void *realloc_array(void *arr, size_t old_n, size_t new_n, size_t size) +{ + if (!new_n || old_n == new_n) + goto out; + + arr = krealloc_array(arr, new_n, size, GFP_KERNEL); + if (!arr) + return NULL; + + if (new_n > old_n) + memset(arr + old_n * size, 0, (new_n - old_n) * size); + +out: + return arr ? arr : ZERO_SIZE_PTR; +} + +static int copy_reference_state(struct bpf_func_state *dst, const struct bpf_func_state *src) +{ + dst->refs = copy_array(dst->refs, src->refs, src->acquired_refs, + sizeof(struct bpf_reference_state), GFP_KERNEL); + if (!dst->refs) + return -ENOMEM; + + dst->acquired_refs = src->acquired_refs; + return 0; +} + +static int copy_stack_state(struct bpf_func_state *dst, const struct bpf_func_state *src) +{ + size_t n = src->allocated_stack / BPF_REG_SIZE; + + dst->stack = copy_array(dst->stack, src->stack, n, sizeof(struct bpf_stack_state), + GFP_KERNEL); + if (!dst->stack) + return -ENOMEM; + + dst->allocated_stack = src->allocated_stack; + return 0; +} + +static int resize_reference_state(struct bpf_func_state *state, size_t n) +{ + state->refs = realloc_array(state->refs, state->acquired_refs, n, + sizeof(struct bpf_reference_state)); + if (!state->refs) + return -ENOMEM; + + state->acquired_refs = n; + return 0; +} + +static int grow_stack_state(struct bpf_func_state *state, int size) +{ + size_t old_n = state->allocated_stack / BPF_REG_SIZE, n = size / BPF_REG_SIZE; + + if (old_n >= n) + return 0; + + state->stack = realloc_array(state->stack, old_n, n, sizeof(struct bpf_stack_state)); + if (!state->stack) + return -ENOMEM; + + state->allocated_stack = size; + return 0; } /* Acquire a pointer id from the env and update the state->refs to include @@ -825,7 +848,7 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx) int new_ofs = state->acquired_refs; int id, err; - err = realloc_reference_state(state, state->acquired_refs + 1, true); + err = resize_reference_state(state, state->acquired_refs + 1); if (err) return err; id = ++env->id_gen; @@ -854,18 +877,6 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id) 
return -EINVAL; } -static int transfer_reference_state(struct bpf_func_state *dst, - struct bpf_func_state *src) -{ - int err = realloc_reference_state(dst, src->acquired_refs, false); - if (err) - return err; - err = copy_reference_state(dst, src); - if (err) - return err; - return 0; -} - static void free_func_state(struct bpf_func_state *state) { if (!state) @@ -904,10 +915,6 @@ static int copy_func_state(struct bpf_func_state *dst, { int err; - err = realloc_func_state(dst, src->allocated_stack, src->acquired_refs, - false); - if (err) - return err; memcpy(dst, src, offsetof(struct bpf_func_state, acquired_refs)); err = copy_reference_state(dst, src); if (err) @@ -919,16 +926,13 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, const struct bpf_verifier_state *src) { struct bpf_func_state *dst; - u32 jmp_sz = sizeof(struct bpf_idx_pair) * src->jmp_history_cnt; int i, err; - if (dst_state->jmp_history_cnt < src->jmp_history_cnt) { - kfree(dst_state->jmp_history); - dst_state->jmp_history = kmalloc(jmp_sz, GFP_USER); - if (!dst_state->jmp_history) - return -ENOMEM; - } - memcpy(dst_state->jmp_history, src->jmp_history, jmp_sz); + dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history, + src->jmp_history_cnt, sizeof(struct bpf_idx_pair), + GFP_USER); + if (!dst_state->jmp_history) + return -ENOMEM; dst_state->jmp_history_cnt = src->jmp_history_cnt; /* if dst has more stack frames then src frame, free them */ @@ -2590,8 +2594,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg; struct bpf_reg_state *reg = NULL; - err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), - state->acquired_refs, true); + err = grow_stack_state(state, round_up(slot + 1, BPF_REG_SIZE)); if (err) return err; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, @@ -2613,7 +2616,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, if (dst_reg != BPF_REG_FP) { /* The backtracking logic can only recognize explicit * stack slot address like [fp - 8]. Other spill of - * scalar via different register has to be conervative. + * scalar via different register has to be conservative. * Backtrack from here and mark all registers as precise * that contributed into 'reg' being a constant. 
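As an aside on the copy_array()/realloc_array() helpers introduced above: the same grow-and-zero semantics can be sketched in plain userspace C. The sketch below uses ordinary realloc() and skips the kernel-only details (ksize()-based reuse, the multiplication-overflow check, ZERO_SIZE_PTR), so it is only an approximation of the in-tree helpers, not a copy of them.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Sketch of realloc_array(): grow an array of old_n items of 'size' bytes
 * so it can hold new_n items, zeroing the added tail.  Like the kernel
 * helper it leaves the array alone when new_n is zero; unlike it, there is
 * no overflow check and no ZERO_SIZE_PTR handling here.
 */
static void *realloc_array_sketch(void *arr, size_t old_n, size_t new_n, size_t size)
{
	if (!new_n || old_n == new_n)
		return arr;

	arr = realloc(arr, new_n * size);
	if (!arr)
		return NULL;

	if (new_n > old_n)
		memset((char *)arr + old_n * size, 0, (new_n - old_n) * size);

	return arr;
}

int main(void)
{
	size_t i, n = 2;
	int *stack = calloc(n, sizeof(*stack));

	if (!stack)
		return 1;
	stack[0] = 1;
	stack[1] = 2;

	/* grow from 2 to 5 slots; slots 2..4 come back zeroed */
	stack = realloc_array_sketch(stack, n, 5, sizeof(*stack));
	if (!stack)
		return 1;
	n = 5;

	for (i = 0; i < n; i++)
		printf("slot %zu = %d\n", i, stack[i]);

	free(stack);
	return 0;
}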
*/ @@ -2753,8 +2756,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, if (value_reg && register_is_null(value_reg)) writing_zero = true; - err = realloc_func_state(state, round_up(-min_off, BPF_REG_SIZE), - state->acquired_refs, true); + err = grow_stack_state(state, round_up(-min_off, BPF_REG_SIZE)); if (err) return err; @@ -5629,7 +5631,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn subprog /* subprog number within this prog */); /* Transfer references to the callee */ - err = transfer_reference_state(callee, caller); + err = copy_reference_state(callee, caller); if (err) return err; @@ -5780,7 +5782,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) } /* Transfer references to the caller */ - err = transfer_reference_state(caller, callee); + err = copy_reference_state(caller, callee); if (err) return err; @@ -6409,18 +6411,10 @@ enum { }; static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, - const struct bpf_reg_state *off_reg, - u32 *alu_limit, u8 opcode) + u32 *alu_limit, bool mask_to_left) { - bool off_is_neg = off_reg->smin_value < 0; - bool mask_to_left = (opcode == BPF_ADD && off_is_neg) || - (opcode == BPF_SUB && !off_is_neg); u32 max = 0, ptr_limit = 0; - if (!tnum_is_const(off_reg->var_off) && - (off_reg->smin_value < 0) != (off_reg->smax_value < 0)) - return REASON_BOUNDS; - switch (ptr_reg->type) { case PTR_TO_STACK: /* Offset 0 is out-of-bounds, but acceptable start for the @@ -6486,15 +6480,41 @@ static bool sanitize_needed(u8 opcode) return opcode == BPF_ADD || opcode == BPF_SUB; } +struct bpf_sanitize_info { + struct bpf_insn_aux_data aux; + bool mask_to_left; +}; + +static struct bpf_verifier_state * +sanitize_speculative_path(struct bpf_verifier_env *env, + const struct bpf_insn *insn, + u32 next_idx, u32 curr_idx) +{ + struct bpf_verifier_state *branch; + struct bpf_reg_state *regs; + + branch = push_stack(env, next_idx, curr_idx, true); + if (branch && insn) { + regs = branch->frame[branch->curframe]->regs; + if (BPF_SRC(insn->code) == BPF_K) { + mark_reg_unknown(env, regs, insn->dst_reg); + } else if (BPF_SRC(insn->code) == BPF_X) { + mark_reg_unknown(env, regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->src_reg); + } + } + return branch; +} + static int sanitize_ptr_alu(struct bpf_verifier_env *env, struct bpf_insn *insn, const struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg, struct bpf_reg_state *dst_reg, - struct bpf_insn_aux_data *tmp_aux, + struct bpf_sanitize_info *info, const bool commit_window) { - struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : tmp_aux; + struct bpf_insn_aux_data *aux = commit_window ? 
cur_aux(env) : &info->aux; struct bpf_verifier_state *vstate = env->cur_state; bool off_is_imm = tnum_is_const(off_reg->var_off); bool off_is_neg = off_reg->smin_value < 0; @@ -6515,7 +6535,16 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, if (vstate->speculative) goto do_sim; - err = retrieve_ptr_limit(ptr_reg, off_reg, &alu_limit, opcode); + if (!commit_window) { + if (!tnum_is_const(off_reg->var_off) && + (off_reg->smin_value < 0) != (off_reg->smax_value < 0)) + return REASON_BOUNDS; + + info->mask_to_left = (opcode == BPF_ADD && off_is_neg) || + (opcode == BPF_SUB && !off_is_neg); + } + + err = retrieve_ptr_limit(ptr_reg, &alu_limit, info->mask_to_left); if (err < 0) return err; @@ -6523,8 +6552,8 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, /* In commit phase we narrow the masking window based on * the observed pointer move after the simulated operation. */ - alu_state = tmp_aux->alu_state; - alu_limit = abs(tmp_aux->alu_limit - alu_limit); + alu_state = info->aux.alu_state; + alu_limit = abs(info->aux.alu_limit - alu_limit); } else { alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0; alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0; @@ -6539,8 +6568,12 @@ do_sim: /* If we're in commit phase, we're done here given we already * pushed the truncated dst_reg into the speculative verification * stack. + * + * Also, when register is a known constant, we rewrite register-based + * operation to immediate-based, and thus do not need masking (and as + * a consequence, do not need to simulate the zero-truncation either). */ - if (commit_window) + if (commit_window || off_is_imm) return 0; /* Simulate and find potential out-of-bounds access under @@ -6556,12 +6589,26 @@ do_sim: tmp = *dst_reg; *dst_reg = *ptr_reg; } - ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true); + ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1, + env->insn_idx); if (!ptr_is_dst_reg && ret) *dst_reg = tmp; return !ret ? REASON_STACK : 0; } +static void sanitize_mark_insn_seen(struct bpf_verifier_env *env) +{ + struct bpf_verifier_state *vstate = env->cur_state; + + /* If we simulate paths under speculation, we don't update the + * insn as 'seen' such that when we verify unreachable paths in + * the non-speculative domain, sanitize_dead_code() can still + * rewrite/sanitize them. 
+ */ + if (!vstate->speculative) + env->insn_aux_data[env->insn_idx].seen = env->pass_cnt; +} + static int sanitize_err(struct bpf_verifier_env *env, const struct bpf_insn *insn, int reason, const struct bpf_reg_state *off_reg, @@ -6685,7 +6732,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value, umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; - struct bpf_insn_aux_data tmp_aux = {}; + struct bpf_sanitize_info info = {}; u8 opcode = BPF_OP(insn->code); u32 dst = insn->dst_reg; int ret; @@ -6754,7 +6801,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (sanitize_needed(opcode)) { ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg, - &tmp_aux, false); + &info, false); if (ret < 0) return sanitize_err(env, insn, ret, off_reg, dst_reg); } @@ -6895,7 +6942,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; if (sanitize_needed(opcode)) { ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg, - &tmp_aux, true); + &info, true); if (ret < 0) return sanitize_err(env, insn, ret, off_reg, dst_reg); } @@ -7084,11 +7131,10 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, s32 smin_val = src_reg->s32_min_value; u32 umax_val = src_reg->u32_max_value; - /* Assuming scalar64_min_max_and will be called so its safe - * to skip updating register for known 32-bit case. - */ - if (src_known && dst_known) + if (src_known && dst_known) { + __mark_reg32_known(dst_reg, var32_off.value); return; + } /* We get our minimum from the var_off, since that's inherently * bitwise. Our maximum is the minimum of the operands' maxima. @@ -7108,7 +7154,6 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, dst_reg->s32_min_value = dst_reg->u32_min_value; dst_reg->s32_max_value = dst_reg->u32_max_value; } - } static void scalar_min_max_and(struct bpf_reg_state *dst_reg, @@ -7155,11 +7200,10 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, s32 smin_val = src_reg->s32_min_value; u32 umin_val = src_reg->u32_min_value; - /* Assuming scalar64_min_max_or will be called so it is safe - * to skip updating register for known case. - */ - if (src_known && dst_known) + if (src_known && dst_known) { + __mark_reg32_known(dst_reg, var32_off.value); return; + } /* We get our maximum from the var_off, and our minimum is the * maximum of the operands' minima @@ -7224,11 +7268,10 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg, struct tnum var32_off = tnum_subreg(dst_reg->var_off); s32 smin_val = src_reg->s32_min_value; - /* Assuming scalar64_min_max_xor will be called so it is safe - * to skip updating register for known case. - */ - if (src_known && dst_known) + if (src_known && dst_known) { + __mark_reg32_known(dst_reg, var32_off.value); return; + } /* We get both minimum and maximum from the var32_off. */ dst_reg->u32_min_value = var32_off.value; @@ -8744,14 +8787,28 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (err) return err; } + if (pred == 1) { - /* only follow the goto, ignore fall-through */ + /* Only follow the goto, ignore fall-through. If needed, push + * the fall-through branch for simulation under speculative + * execution. 
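The pred == 1 / pred == 0 handling above is easier to follow as a standalone model: a statically decided branch still queues the architecturally dead path, flagged as speculative, unless Spectre v1 checks are bypassed. The toy below is not kernel code; queue_state() and branch() are invented stand-ins for push_stack()/sanitize_speculative_path() and check_cond_jmp_op().

#include <stdbool.h>
#include <stdio.h>

struct toy_state {
	int insn_idx;
	bool speculative;
};

static struct toy_state queue[8];
static int queued;

/* stand-in for push_stack()/sanitize_speculative_path(); the real code
 * also marks the branch registers unknown in the pushed state */
static void queue_state(int insn_idx, bool speculative)
{
	queue[queued++] = (struct toy_state){ insn_idx, speculative };
}

/*
 * pred > 0: branch known taken, pred == 0: known not taken, otherwise
 * unknown.  Returns the next instruction on the architectural path; the
 * dead path is still queued, flagged speculative, unless bypassed.
 */
static int branch(int insn_idx, int off, int pred, bool bypass_spec_v1)
{
	if (pred > 0) {
		if (!bypass_spec_v1)
			queue_state(insn_idx + 1, true);
		return insn_idx + off + 1;
	}
	if (pred == 0) {
		if (!bypass_spec_v1)
			queue_state(insn_idx + off + 1, true);
		return insn_idx + 1;
	}
	/* unknown outcome: both paths are real, one continues, one is queued */
	queue_state(insn_idx + off + 1, false);
	return insn_idx + 1;
}

int main(void)
{
	int next = branch(10, 5, 1, false);

	printf("continue at %d, queued idx %d (speculative=%d)\n",
	       next, queue[0].insn_idx, queue[0].speculative);
	return 0;
}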
+ */ + if (!env->bypass_spec_v1 && + !sanitize_speculative_path(env, insn, *insn_idx + 1, + *insn_idx)) + return -EFAULT; *insn_idx += insn->off; return 0; } else if (pred == 0) { - /* only follow fall-through branch, since - * that's where the program will go + /* Only follow the fall-through branch, since that's where the + * program will go. If needed, push the goto branch for + * simulation under speculative execution. */ + if (!env->bypass_spec_v1 && + !sanitize_speculative_path(env, insn, + *insn_idx + insn->off + 1, + *insn_idx)) + return -EFAULT; return 0; } @@ -8913,12 +8970,14 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) mark_reg_known_zero(env, regs, insn->dst_reg); dst_reg->map_ptr = map; - if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) { + if (insn->src_reg == BPF_PSEUDO_MAP_VALUE || + insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) { dst_reg->type = PTR_TO_MAP_VALUE; dst_reg->off = aux->map_off; if (map_value_has_spin_lock(map)) dst_reg->id = ++env->id_gen; - } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) { + } else if (insn->src_reg == BPF_PSEUDO_MAP_FD || + insn->src_reg == BPF_PSEUDO_MAP_IDX) { dst_reg->type = CONST_PTR_TO_MAP; } else { verbose(env, "bpf verifier is misconfigured\n"); @@ -9049,7 +9108,7 @@ static int check_return_code(struct bpf_verifier_env *env) !prog->aux->attach_func_proto->type) return 0; - /* eBPF calling convetion is such that R0 is used + /* eBPF calling convention is such that R0 is used * to return the value from eBPF program. * Make sure that it's readable at this time * of bpf_exit, which means that program wrote @@ -9434,7 +9493,7 @@ static int check_abnormal_return(struct bpf_verifier_env *env) static int check_btf_func(struct bpf_verifier_env *env, const union bpf_attr *attr, - union bpf_attr __user *uattr) + bpfptr_t uattr) { const struct btf_type *type, *func_proto, *ret_type; u32 i, nfuncs, urec_size, min_size; @@ -9443,7 +9502,7 @@ static int check_btf_func(struct bpf_verifier_env *env, struct bpf_func_info_aux *info_aux = NULL; struct bpf_prog *prog; const struct btf *btf; - void __user *urecord; + bpfptr_t urecord; u32 prev_offset = 0; bool scalar_return; int ret = -ENOMEM; @@ -9471,7 +9530,7 @@ static int check_btf_func(struct bpf_verifier_env *env, prog = env->prog; btf = prog->aux->btf; - urecord = u64_to_user_ptr(attr->func_info); + urecord = make_bpfptr(attr->func_info, uattr.is_kernel); min_size = min_t(u32, krec_size, urec_size); krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN); @@ -9489,13 +9548,15 @@ static int check_btf_func(struct bpf_verifier_env *env, /* set the size kernel expects so loader can zero * out the rest of the record. 
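check_btf_func() (and, further down, check_btf_line()) now copies records through a bpfptr_t, which can refer to either user or kernel memory depending on who invoked the load. A rough userspace model of that tagged-pointer dispatch is sketched below; copy_from_user() is faked with memcpy(), and all names carry a _sketch suffix to make clear this is not the kernel's bpfptr implementation.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* sketch of a pointer that may refer to user or kernel memory */
typedef struct {
	void *ptr;
	bool  is_kernel;
} bpfptr_sketch;

/* stand-in for copy_from_user(); a real kernel would fault-check here */
static int fake_copy_from_user(void *dst, const void *src, size_t n)
{
	memcpy(dst, src, n);
	return 0;
}

static int copy_from_bpfptr_sketch(void *dst, bpfptr_sketch src, size_t n)
{
	if (src.is_kernel) {
		memcpy(dst, src.ptr, n);
		return 0;
	}
	return fake_copy_from_user(dst, src.ptr, n);
}

/* advance the pointer by 'n' bytes, like bpfptr_add() in the hunks above */
static void bpfptr_add_sketch(bpfptr_sketch *p, size_t n)
{
	p->ptr = (char *)p->ptr + n;
}

int main(void)
{
	uint32_t recs[2] = { 7, 9 }, out;
	bpfptr_sketch cur = { recs, true };

	copy_from_bpfptr_sketch(&out, cur, sizeof(out));
	printf("first record: %u\n", out);

	bpfptr_add_sketch(&cur, sizeof(uint32_t));
	copy_from_bpfptr_sketch(&out, cur, sizeof(out));
	printf("second record: %u\n", out);
	return 0;
}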
*/ - if (put_user(min_size, &uattr->func_info_rec_size)) + if (copy_to_bpfptr_offset(uattr, + offsetof(union bpf_attr, func_info_rec_size), + &min_size, sizeof(min_size))) ret = -EFAULT; } goto err_free; } - if (copy_from_user(&krecord[i], urecord, min_size)) { + if (copy_from_bpfptr(&krecord[i], urecord, min_size)) { ret = -EFAULT; goto err_free; } @@ -9547,7 +9608,7 @@ static int check_btf_func(struct bpf_verifier_env *env, } prev_offset = krecord[i].insn_off; - urecord += urec_size; + bpfptr_add(&urecord, urec_size); } prog->aux->func_info = krecord; @@ -9579,14 +9640,14 @@ static void adjust_btf_func(struct bpf_verifier_env *env) static int check_btf_line(struct bpf_verifier_env *env, const union bpf_attr *attr, - union bpf_attr __user *uattr) + bpfptr_t uattr) { u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0; struct bpf_subprog_info *sub; struct bpf_line_info *linfo; struct bpf_prog *prog; const struct btf *btf; - void __user *ulinfo; + bpfptr_t ulinfo; int err; nr_linfo = attr->line_info_cnt; @@ -9612,7 +9673,7 @@ static int check_btf_line(struct bpf_verifier_env *env, s = 0; sub = env->subprog_info; - ulinfo = u64_to_user_ptr(attr->line_info); + ulinfo = make_bpfptr(attr->line_info, uattr.is_kernel); expected_size = sizeof(struct bpf_line_info); ncopy = min_t(u32, expected_size, rec_size); for (i = 0; i < nr_linfo; i++) { @@ -9620,14 +9681,15 @@ static int check_btf_line(struct bpf_verifier_env *env, if (err) { if (err == -E2BIG) { verbose(env, "nonzero tailing record in line_info"); - if (put_user(expected_size, - &uattr->line_info_rec_size)) + if (copy_to_bpfptr_offset(uattr, + offsetof(union bpf_attr, line_info_rec_size), + &expected_size, sizeof(expected_size))) err = -EFAULT; } goto err_free; } - if (copy_from_user(&linfo[i], ulinfo, ncopy)) { + if (copy_from_bpfptr(&linfo[i], ulinfo, ncopy)) { err = -EFAULT; goto err_free; } @@ -9679,7 +9741,7 @@ static int check_btf_line(struct bpf_verifier_env *env, } prev_offset = linfo[i].insn_off; - ulinfo += rec_size; + bpfptr_add(&ulinfo, rec_size); } if (s != env->subprog_cnt) { @@ -9701,7 +9763,7 @@ err_free: static int check_btf_info(struct bpf_verifier_env *env, const union bpf_attr *attr, - union bpf_attr __user *uattr) + bpfptr_t uattr) { struct btf *btf; int err; @@ -9746,13 +9808,6 @@ static bool range_within(struct bpf_reg_state *old, old->s32_max_value >= cur->s32_max_value; } -/* Maximum number of register states that can exist at once */ -#define ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) -struct idpair { - u32 old; - u32 cur; -}; - /* If in the old state two registers had the same id, then they need to have * the same id in the new state as well. But that id could be different from * the old state, so we need to track the mapping from old to new ids. @@ -9763,11 +9818,11 @@ struct idpair { * So we look through our idmap to see if this old id has been seen before. If * so, we require the new id to match; otherwise, we add the id pair to the map. 
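The comment above describes the old-id to new-id mapping that regsafe() depends on; the patch only moves the scratch table into the verifier env (idmap_scratch) instead of allocating an idpair array per comparison. The matching rule itself can be shown standalone, with an arbitrary table size chosen for the sketch:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ID_MAP_SLOTS 8 /* arbitrary size for the sketch */

struct id_pair {
	uint32_t old;
	uint32_t cur;
};

/*
 * Old-state id 'old_id' must consistently map to the same current-state
 * id: record the pair on first sight, require equality afterwards.
 */
static bool check_ids_sketch(uint32_t old_id, uint32_t cur_id, struct id_pair *idmap)
{
	unsigned int i;

	for (i = 0; i < ID_MAP_SLOTS; i++) {
		if (!idmap[i].old) {
			idmap[i].old = old_id;
			idmap[i].cur = cur_id;
			return true;
		}
		if (idmap[i].old == old_id)
			return idmap[i].cur == cur_id;
	}
	return false; /* table full: be conservative and report a mismatch */
}

int main(void)
{
	struct id_pair idmap[ID_MAP_SLOTS];

	memset(idmap, 0, sizeof(idmap));
	printf("%d\n", check_ids_sketch(3, 7, idmap)); /* 1: first sighting */
	printf("%d\n", check_ids_sketch(3, 7, idmap)); /* 1: consistent    */
	printf("%d\n", check_ids_sketch(3, 8, idmap)); /* 0: mismatch      */
	return 0;
}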
*/ -static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap) +static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap) { unsigned int i; - for (i = 0; i < ID_MAP_SIZE; i++) { + for (i = 0; i < BPF_ID_MAP_SIZE; i++) { if (!idmap[i].old) { /* Reached an empty slot; haven't seen this id before */ idmap[i].old = old_id; @@ -9844,7 +9899,7 @@ static void clean_verifier_state(struct bpf_verifier_env *env, * Since the verifier pushes the branch states as it sees them while exploring * the program the condition of walking the branch instruction for the second * time means that all states below this branch were already explored and - * their final liveness markes are already propagated. + * their final liveness marks are already propagated. * Hence when the verifier completes the search of state list in is_state_visited() * we can call this clean_live_states() function to mark all liveness states * as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state' @@ -9880,7 +9935,7 @@ next: /* Returns true if (rold safe implies rcur safe) */ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, - struct idpair *idmap) + struct bpf_id_pair *idmap) { bool equal; @@ -9998,7 +10053,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, static bool stacksafe(struct bpf_func_state *old, struct bpf_func_state *cur, - struct idpair *idmap) + struct bpf_id_pair *idmap) { int i, spi; @@ -10095,32 +10150,23 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur) * whereas register type in current state is meaningful, it means that * the current state will reach 'bpf_exit' instruction safely */ -static bool func_states_equal(struct bpf_func_state *old, +static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old, struct bpf_func_state *cur) { - struct idpair *idmap; - bool ret = false; int i; - idmap = kcalloc(ID_MAP_SIZE, sizeof(struct idpair), GFP_KERNEL); - /* If we failed to allocate the idmap, just say it's not safe */ - if (!idmap) - return false; - - for (i = 0; i < MAX_BPF_REG; i++) { - if (!regsafe(&old->regs[i], &cur->regs[i], idmap)) - goto out_free; - } + memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch)); + for (i = 0; i < MAX_BPF_REG; i++) + if (!regsafe(&old->regs[i], &cur->regs[i], env->idmap_scratch)) + return false; - if (!stacksafe(old, cur, idmap)) - goto out_free; + if (!stacksafe(old, cur, env->idmap_scratch)) + return false; if (!refsafe(old, cur)) - goto out_free; - ret = true; -out_free: - kfree(idmap); - return ret; + return false; + + return true; } static bool states_equal(struct bpf_verifier_env *env, @@ -10147,7 +10193,7 @@ static bool states_equal(struct bpf_verifier_env *env, for (i = 0; i <= old->curframe; i++) { if (old->frame[i]->callsite != cur->frame[i]->callsite) return false; - if (!func_states_equal(old->frame[i], cur->frame[i])) + if (!func_states_equal(env, old->frame[i], cur->frame[i])) return false; } return true; @@ -10624,7 +10670,7 @@ static int do_check(struct bpf_verifier_env *env) } regs = cur_regs(env); - env->insn_aux_data[env->insn_idx].seen = env->pass_cnt; + sanitize_mark_insn_seen(env); prev_insn_idx = env->insn_idx; if (class == BPF_ALU || class == BPF_ALU64) { @@ -10851,7 +10897,7 @@ process_bpf_exit: return err; env->insn_idx++; - env->insn_aux_data[env->insn_idx].seen = env->pass_cnt; + sanitize_mark_insn_seen(env); } else { verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; @@ -11184,6 +11230,7 @@ static 
int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) struct bpf_map *map; struct fd f; u64 addr; + u32 fd; if (i == insn_cnt - 1 || insn[1].code != 0 || insn[1].dst_reg != 0 || insn[1].src_reg != 0 || @@ -11213,16 +11260,38 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) /* In final convert_pseudo_ld_imm64() step, this is * converted into regular 64-bit imm load insn. */ - if ((insn[0].src_reg != BPF_PSEUDO_MAP_FD && - insn[0].src_reg != BPF_PSEUDO_MAP_VALUE) || - (insn[0].src_reg == BPF_PSEUDO_MAP_FD && - insn[1].imm != 0)) { - verbose(env, - "unrecognized bpf_ld_imm64 insn\n"); + switch (insn[0].src_reg) { + case BPF_PSEUDO_MAP_VALUE: + case BPF_PSEUDO_MAP_IDX_VALUE: + break; + case BPF_PSEUDO_MAP_FD: + case BPF_PSEUDO_MAP_IDX: + if (insn[1].imm == 0) + break; + fallthrough; + default: + verbose(env, "unrecognized bpf_ld_imm64 insn\n"); return -EINVAL; } - f = fdget(insn[0].imm); + switch (insn[0].src_reg) { + case BPF_PSEUDO_MAP_IDX_VALUE: + case BPF_PSEUDO_MAP_IDX: + if (bpfptr_is_null(env->fd_array)) { + verbose(env, "fd_idx without fd_array is invalid\n"); + return -EPROTO; + } + if (copy_from_bpfptr_offset(&fd, env->fd_array, + insn[0].imm * sizeof(fd), + sizeof(fd))) + return -EFAULT; + break; + default: + fd = insn[0].imm; + break; + } + + f = fdget(fd); map = __bpf_map_get(f); if (IS_ERR(map)) { verbose(env, "fd %d is not pointing to valid bpf_map\n", @@ -11237,7 +11306,8 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) } aux = &env->insn_aux_data[i]; - if (insn->src_reg == BPF_PSEUDO_MAP_FD) { + if (insn[0].src_reg == BPF_PSEUDO_MAP_FD || + insn[0].src_reg == BPF_PSEUDO_MAP_IDX) { addr = (unsigned long)map; } else { u32 off = insn[1].imm; @@ -11360,6 +11430,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, { struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; struct bpf_insn *insn = new_prog->insnsi; + u32 old_seen = old_data[off].seen; u32 prog_len; int i; @@ -11380,7 +11451,8 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, memcpy(new_data + off + cnt - 1, old_data + off, sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); for (i = off; i < off + cnt - 1; i++) { - new_data[i].seen = env->pass_cnt; + /* Expand insni[off]'s seen count to the patched range. */ + new_data[i].seen = old_seen; new_data[i].zext_dst = insn_has_def32(env, insn + i); } env->insn_aux_data = new_data; @@ -12449,7 +12521,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) prog->aux->max_pkt_offset = MAX_PACKET_OFF; /* mark bpf_tail_call as different opcode to avoid - * conditional branch in the interpeter for every normal + * conditional branch in the interpreter for every normal * call and to prevent accidental JITing by JIT compiler * that doesn't support bpf_tail_call yet */ @@ -12704,6 +12776,9 @@ static void free_states(struct bpf_verifier_env *env) * insn_aux_data was touched. These variables are compared to clear temporary * data from failed pass. For testing and experiments do_check_common() can be * run multiple times even when prior attempt to verify is unsuccessful. + * + * Note that special handling is needed on !env->bypass_spec_v1 if this is + * ever called outside of error path with subsequent program rejection. 
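In the resolve_pseudo_ldimm64() hunk above, the map reference in a ld_imm64 is now either a file descriptor (imm is the fd) or an index into a caller-supplied fd_array. The helper below is a simplified, userspace-only sketch of that decision; the error values mirror the hunk, but the types and names are invented and the kernel's copy_from_bpfptr_offset() step is reduced to a bounds-checked array read.

#include <errno.h>
#include <stddef.h>
#include <stdio.h>

/* which flavour of pseudo map load we are resolving; the enum is a sketch */
enum pseudo_src {
	PSEUDO_MAP_FD,
	PSEUDO_MAP_IDX,
};

/*
 * Resolve the fd for a pseudo map load: either take 'imm' as the fd, or
 * treat it as an index into fd_array (which must then be present).
 */
static int resolve_map_fd_sketch(enum pseudo_src src, int imm,
				 const int *fd_array, size_t fd_array_len,
				 int *fd_out)
{
	if (src == PSEUDO_MAP_FD) {
		*fd_out = imm;
		return 0;
	}
	if (!fd_array)
		return -EPROTO;        /* fd_idx without fd_array is invalid */
	if (imm < 0 || (size_t)imm >= fd_array_len)
		return -EFAULT;        /* out-of-range index */
	*fd_out = fd_array[imm];
	return 0;
}

int main(void)
{
	int fds[] = { 3, 4, 5 };
	int fd;

	if (!resolve_map_fd_sketch(PSEUDO_MAP_IDX, 1, fds, 3, &fd))
		printf("resolved fd %d\n", fd);
	if (resolve_map_fd_sketch(PSEUDO_MAP_IDX, 0, NULL, 0, &fd) == -EPROTO)
		printf("rejected: no fd_array\n");
	return 0;
}

In the kernel, the index path additionally copies the fd out of a bpfptr_t fd_array with copy_from_bpfptr_offset(), so a bad user pointer surfaces as -EFAULT rather than a bounds failure.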
*/ static void sanitize_insn_aux_data(struct bpf_verifier_env *env) { @@ -13200,6 +13275,17 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, return 0; } +BTF_SET_START(btf_id_deny) +BTF_ID_UNUSED +#ifdef CONFIG_SMP +BTF_ID(func, migrate_disable) +BTF_ID(func, migrate_enable) +#endif +#if !defined CONFIG_PREEMPT_RCU && !defined CONFIG_TINY_RCU +BTF_ID(func, rcu_read_unlock_strict) +#endif +BTF_SET_END(btf_id_deny) + static int check_attach_btf_id(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; @@ -13210,6 +13296,14 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) int ret; u64 key; + if (prog->type == BPF_PROG_TYPE_SYSCALL) { + if (prog->aux->sleepable) + /* attach_btf_id checked to be zero already */ + return 0; + verbose(env, "Syscall programs can only be sleepable\n"); + return -EINVAL; + } + if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING && prog->type != BPF_PROG_TYPE_LSM) { verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n"); @@ -13259,6 +13353,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) ret = bpf_lsm_verify_prog(&env->log, prog); if (ret < 0) return ret; + } else if (prog->type == BPF_PROG_TYPE_TRACING && + btf_id_set_contains(&btf_id_deny, btf_id)) { + return -EINVAL; } key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id); @@ -13281,8 +13378,7 @@ struct btf *bpf_get_btf_vmlinux(void) return btf_vmlinux; } -int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, - union bpf_attr __user *uattr) +int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) { u64 start_time = ktime_get_ns(); struct bpf_verifier_env *env; @@ -13312,6 +13408,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->insn_aux_data[i].orig_idx = i; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; + env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel); is_priv = bpf_capable(); bpf_get_btf_vmlinux(); @@ -13358,12 +13455,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; - if (bpf_prog_is_dev_bound(env->prog->aux)) { - ret = bpf_prog_offload_verifier_prep(env->prog); - if (ret) - goto skip_full_check; - } - env->explored_states = kvcalloc(state_htab_size(env), sizeof(struct bpf_verifier_state_list *), GFP_USER); @@ -13391,6 +13482,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (ret < 0) goto skip_full_check; + if (bpf_prog_is_dev_bound(env->prog->aux)) { + ret = bpf_prog_offload_verifier_prep(env->prog); + if (ret) + goto skip_full_check; + } + ret = check_cfg(env); if (ret < 0) goto skip_full_check; diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 391aa570369b..1f274d7fc934 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -820,6 +820,10 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent struct cgroup *cgrp = kn->priv; int ret; + /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ + if (strchr(new_name_str, '\n')) + return -EINVAL; + if (kernfs_type(kn) != KERNFS_DIR) return -ENOTDIR; if (kn->parent != new_parent) @@ -1001,7 +1005,7 @@ static int check_cgroupfs_options(struct fs_context *fc) ctx->subsys_mask &= enabled; /* - * In absense of 'none', 'name=' or subsystem name options, + * In absence of 'none', 'name=' and subsystem name options, * let's default to 'all'. 
*/ if (!ctx->subsys_mask && !ctx->none && !ctx->name) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e049edd66776..21ecc6ee6a6d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -468,7 +468,7 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, * @cgrp: the cgroup of interest * @ss: the subsystem of interest * - * Find and get @cgrp's css assocaited with @ss. If the css doesn't exist + * Find and get @cgrp's css associated with @ss. If the css doesn't exist * or is offline, %NULL is returned. */ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, @@ -1633,7 +1633,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) /** * css_clear_dir - remove subsys files in a cgroup directory - * @css: taget css + * @css: target css */ static void css_clear_dir(struct cgroup_subsys_state *css) { @@ -5350,7 +5350,7 @@ out_unlock: /* * This is called when the refcnt of a css is confirmed to be killed. * css_tryget_online() is now guaranteed to fail. Tell the subsystem to - * initate destruction and put the css ref from kill_css(). + * initiate destruction and put the css ref from kill_css(). */ static void css_killed_work_fn(struct work_struct *work) { @@ -5634,8 +5634,6 @@ int __init cgroup_init_early(void) return 0; } -static u16 cgroup_disable_mask __initdata; - /** * cgroup_init - cgroup initialization * @@ -5694,12 +5692,8 @@ int __init cgroup_init(void) * disabled flag and cftype registration needs kmalloc, * both of which aren't available during early_init. */ - if (cgroup_disable_mask & (1 << ssid)) { - static_branch_disable(cgroup_subsys_enabled_key[ssid]); - printk(KERN_INFO "Disabling %s control group subsystem\n", - ss->name); + if (!cgroup_ssid_enabled(ssid)) continue; - } if (cgroup1_ssid_disabled(ssid)) printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n", @@ -6058,7 +6052,7 @@ out_revert: * @kargs: the arguments passed to create the child process * * This calls the cancel_fork() callbacks if a fork failed *after* - * cgroup_can_fork() succeded and cleans up references we took to + * cgroup_can_fork() succeeded and cleans up references we took to * prepare a new css_set for the child process in cgroup_can_fork(). */ void cgroup_cancel_fork(struct task_struct *child, @@ -6214,7 +6208,10 @@ static int __init cgroup_disable(char *str) if (strcmp(token, ss->name) && strcmp(token, ss->legacy_name)) continue; - cgroup_disable_mask |= 1 << i; + + static_branch_disable(cgroup_subsys_enabled_key[i]); + pr_info("Disabling %s control group subsystem\n", + ss->name); } } return 1; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index a945504c0ae7..adb5190c4429 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3376,7 +3376,7 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) } /** - * cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed + * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed * @nodemask: the nodemask to be checked * * Are any of the nodes in the nodemask allowed in current->mems_allowed? diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index ae042c347c64..3135406608c7 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -244,7 +244,7 @@ EXPORT_SYMBOL(rdmacg_uncharge); * This function follows charging resource in hierarchical way. * It will fail if the charge would cause the new value to exceed the * hierarchical limit. 
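The rdmacg charging comment above (which continues just below) describes the standard hierarchical pattern: charge every ancestor, and unwind the levels already charged if any limit would be exceeded. A compact, self-contained model of that pattern, with invented struct and function names:

#include <stdio.h>

struct group {
	struct group *parent;
	int usage;
	int limit;
};

/* charge 'amount' on cg and all ancestors, or roll back and fail */
static int try_charge_sketch(struct group *cg, int amount)
{
	struct group *p, *q;

	for (p = cg; p; p = p->parent) {
		if (p->usage + amount > p->limit)
			goto revert;
		p->usage += amount;
	}
	return 0;

revert:
	/* undo the levels that were already charged (cg up to, not including, p) */
	for (q = cg; q != p; q = q->parent)
		q->usage -= amount;
	return -1; /* the kernel code reports -EAGAIN here */
}

int main(void)
{
	struct group root  = { NULL,  0, 10 };
	struct group child = { &root, 0,  5 };
	int ret;

	ret = try_charge_sketch(&child, 4);
	printf("charge 4 -> %d (child=%d root=%d)\n", ret, child.usage, root.usage);

	ret = try_charge_sketch(&child, 4);
	printf("charge 4 -> %d (child=%d root=%d)\n", ret, child.usage, root.usage);
	return 0;
}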
- * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL. + * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL. * Returns pointer to rdmacg for this resource when charging is successful. * * Charger needs to account resources on two criteria. diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 3a3fd2993a65..cee265cb535c 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -75,7 +75,7 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) * @root: root of the tree to traversal * @cpu: target cpu * - * Walks the udpated rstat_cpu tree on @cpu from @root. %NULL @pos starts + * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts * the traversal and %NULL return indicates the end. During traversal, * each returned cgroup is unlinked from the tree. Must be called with the * matching cgroup_rstat_cpu_lock held. diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 825284baaf46..684a6061a13a 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -464,6 +464,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); VMCOREINFO_STRUCT_SIZE(mem_section); VMCOREINFO_OFFSET(mem_section, section_mem_map); + VMCOREINFO_NUMBER(SECTION_SIZE_BITS); VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS); #endif VMCOREINFO_STRUCT_SIZE(page); diff --git a/kernel/entry/common.c b/kernel/entry/common.c index a0b3b04fb596..bf16395b9e13 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -5,6 +5,7 @@ #include <linux/highmem.h> #include <linux/livepatch.h> #include <linux/audit.h> +#include <linux/tick.h> #include "common.h" @@ -186,7 +187,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, local_irq_disable_exit_to_user(); /* Check if any of the above work has queued a deferred wakeup */ - rcu_nocb_flush_deferred_wakeup(); + tick_nohz_user_enter_prepare(); ti_work = READ_ONCE(current_thread_info()->flags); } @@ -202,7 +203,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) lockdep_assert_irqs_disabled(); /* Flush pending rcuog wakeup before the last need_resched() check */ - rcu_nocb_flush_deferred_wakeup(); + tick_nohz_user_enter_prepare(); if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) ti_work = exit_to_user_mode_loop(regs, ti_work); diff --git a/kernel/events/core.c b/kernel/events/core.c index 2e947a485898..fe88d6eea3c2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4609,7 +4609,9 @@ find_get_context(struct pmu *pmu, struct task_struct *task, cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); ctx = &cpuctx->ctx; get_ctx(ctx); + raw_spin_lock_irqsave(&ctx->lock, flags); ++ctx->pin_count; + raw_spin_unlock_irqrestore(&ctx->lock, flags); return ctx; } @@ -6389,8 +6391,6 @@ void perf_event_wakeup(struct perf_event *event) static void perf_sigtrap(struct perf_event *event) { - struct kernel_siginfo info; - /* * We'd expect this to only occur if the irq_work is delayed and either * ctx->task or current has changed in the meantime. 
This can be the @@ -6405,13 +6405,8 @@ static void perf_sigtrap(struct perf_event *event) if (current->flags & PF_EXITING) return; - clear_siginfo(&info); - info.si_signo = SIGTRAP; - info.si_code = TRAP_PERF; - info.si_errno = event->attr.type; - info.si_perf = event->attr.sig_data; - info.si_addr = (void __user *)event->pending_addr; - force_sig_info(&info); + force_sig_perf((void __user *)event->pending_addr, + event->attr.type, event->attr.sig_data); } static void perf_pending_event_disable(struct perf_event *event) diff --git a/kernel/futex.c b/kernel/futex.c index c98b825da9cf..4938a00bc785 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3710,8 +3710,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, if (op & FUTEX_CLOCK_REALTIME) { flags |= FLAGS_CLOCKRT; - if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET && \ - cmd != FUTEX_WAIT_REQUEUE_PI) + if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) return -ENOSYS; } @@ -3758,42 +3757,52 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, return -ENOSYS; } +static __always_inline bool futex_cmd_has_timeout(u32 cmd) +{ + switch (cmd) { + case FUTEX_WAIT: + case FUTEX_LOCK_PI: + case FUTEX_WAIT_BITSET: + case FUTEX_WAIT_REQUEUE_PI: + return true; + } + return false; +} + +static __always_inline int +futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t) +{ + if (!timespec64_valid(ts)) + return -EINVAL; + + *t = timespec64_to_ktime(*ts); + if (cmd == FUTEX_WAIT) + *t = ktime_add_safe(ktime_get(), *t); + else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) + *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t); + return 0; +} SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, const struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, u32, val3) { - struct timespec64 ts; + int ret, cmd = op & FUTEX_CMD_MASK; ktime_t t, *tp = NULL; - u32 val2 = 0; - int cmd = op & FUTEX_CMD_MASK; + struct timespec64 ts; - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET || - cmd == FUTEX_WAIT_REQUEUE_PI)) { + if (utime && futex_cmd_has_timeout(cmd)) { if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) return -EFAULT; if (get_timespec64(&ts, utime)) return -EFAULT; - if (!timespec64_valid(&ts)) - return -EINVAL; - - t = timespec64_to_ktime(ts); - if (cmd == FUTEX_WAIT) - t = ktime_add_safe(ktime_get(), t); - else if (!(op & FUTEX_CLOCK_REALTIME)) - t = timens_ktime_to_host(CLOCK_MONOTONIC, t); + ret = futex_init_timeout(cmd, op, &ts, &t); + if (ret) + return ret; tp = &t; } - /* - * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*. - * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. 
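The futex hunks above factor the duplicated timeout handling of the native and compat syscalls into futex_cmd_has_timeout() and futex_init_timeout(). The sketch below approximates that classification and validation in userspace; the FUTEX_* constants come from the UAPI header, but the time handling (ktime_add_safe(), time-namespace conversion, clock selection) is reduced to a relative/absolute flag, so treat it as a control-flow illustration only.

#include <errno.h>
#include <linux/futex.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

/* same classification as futex_cmd_has_timeout() above */
static bool cmd_has_timeout(unsigned int cmd)
{
	switch (cmd) {
	case FUTEX_WAIT:
	case FUTEX_LOCK_PI:
	case FUTEX_WAIT_BITSET:
	case FUTEX_WAIT_REQUEUE_PI:
		return true;
	}
	return false;
}

/*
 * Validate the user-supplied timespec and note whether it is relative
 * (FUTEX_WAIT) or absolute; the kernel additionally converts relative
 * timeouts with ktime_add_safe() and handles time namespaces.
 */
static int init_timeout_sketch(unsigned int cmd, unsigned int op,
			       const struct timespec *ts, bool *is_relative)
{
	if (ts->tv_sec < 0 || ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000L)
		return -EINVAL;

	*is_relative = (cmd == FUTEX_WAIT);
	(void)op; /* op's FUTEX_CLOCK_REALTIME bit selects the clock in the kernel */
	return 0;
}

int main(void)
{
	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
	unsigned int op = FUTEX_WAIT_BITSET | FUTEX_CLOCK_REALTIME;
	unsigned int cmd = op & FUTEX_CMD_MASK;
	bool relative;

	if (cmd_has_timeout(cmd) && !init_timeout_sketch(cmd, op, &ts, &relative))
		printf("cmd %u takes a timeout (%s)\n", cmd,
		       relative ? "relative" : "absolute");
	return 0;
}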
- */ - if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || - cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) - val2 = (u32) (unsigned long) utime; - return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); + return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } #ifdef CONFIG_COMPAT @@ -3959,31 +3968,20 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, const struct old_timespec32 __user *, utime, u32 __user *, uaddr2, u32, val3) { - struct timespec64 ts; + int ret, cmd = op & FUTEX_CMD_MASK; ktime_t t, *tp = NULL; - int val2 = 0; - int cmd = op & FUTEX_CMD_MASK; + struct timespec64 ts; - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET || - cmd == FUTEX_WAIT_REQUEUE_PI)) { + if (utime && futex_cmd_has_timeout(cmd)) { if (get_old_timespec32(&ts, utime)) return -EFAULT; - if (!timespec64_valid(&ts)) - return -EINVAL; - - t = timespec64_to_ktime(ts); - if (cmd == FUTEX_WAIT) - t = ktime_add_safe(ktime_get(), t); - else if (!(op & FUTEX_CLOCK_REALTIME)) - t = timens_ktime_to_host(CLOCK_MONOTONIC, t); + ret = futex_init_timeout(cmd, op, &ts, &t); + if (ret) + return ret; tp = &t; } - if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || - cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) - val2 = (int) (unsigned long) utime; - return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); + return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } #endif /* CONFIG_COMPAT_32BIT_TIME */ diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 23a7a0ba1388..db8c248ebc8c 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -70,9 +70,6 @@ bool irq_work_queue(struct irq_work *work) if (!irq_work_claim(work)) return false; - /*record irq_work call stack in order to print it in KASAN reports*/ - kasan_record_aux_stack(work); - /* Queue the entry and raise the IPI if needed. 
*/ preempt_disable(); __irq_work_queue_local(work); diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index c1dd02f3be8b..e65de172ccf7 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -266,9 +266,10 @@ static const struct file_operations debugfs_ops = .release = single_release }; -static void __init kcsan_debugfs_init(void) +static int __init kcsan_debugfs_init(void) { debugfs_create_file("kcsan", 0644, NULL, NULL, &debugfs_ops); + return 0; } late_initcall(kcsan_debugfs_init); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 48d736aa03b2..7641bd407239 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -5736,7 +5736,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - trace_lock_acquired(lock, ip); + trace_lock_contended(lock, ip); if (unlikely(!lock_stat || !lockdep_enabled())) return; @@ -5754,7 +5754,7 @@ void lock_acquired(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - trace_lock_contended(lock, ip); + trace_lock_acquired(lock, ip); if (unlikely(!lock_stat || !lockdep_enabled())) return; diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index a7276aaf2abc..db9301591e3f 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -57,7 +57,7 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, task->blocked_on = waiter; } -void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, +void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task) { DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); @@ -65,7 +65,7 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter); task->blocked_on = NULL; - list_del_init(&waiter->list); + INIT_LIST_HEAD(&waiter->list); waiter->task = NULL; } diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index 1edd3f45a4ec..53e631e1d76d 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -22,7 +22,7 @@ extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); extern void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task); -extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, +extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task); extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name, diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index cb6b112ce155..013e1b08a1bf 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -194,7 +194,7 @@ static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_wait * Add @waiter to a given location in the lock wait_list and set the * FLAG_WAITERS flag if it's the first waiter. 
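__mutex_add_waiter() and the new __mutex_remove_waiter() pair the wait-list manipulation with raising and clearing the waiters flag. The toy below models that pairing with a minimal list_head-style list and a plain bool in place of MUTEX_FLAG_WAITERS; it is an illustration of the bookkeeping, not the mutex code.

#include <stdbool.h>
#include <stdio.h>

/* minimal circular doubly-linked list, in the style of the kernel's list_head */
struct list_node {
	struct list_node *prev, *next;
};

struct toy_lock {
	struct list_node wait_list;
	bool has_waiters;       /* stands in for MUTEX_FLAG_WAITERS */
};

static void list_init(struct list_node *n) { n->prev = n->next = n; }
static bool list_empty(const struct list_node *h) { return h->next == h; }

static void list_add_tail(struct list_node *n, struct list_node *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

static void list_del(struct list_node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

/* models __mutex_add_waiter(): the first waiter raises the flag */
static void add_waiter(struct toy_lock *lock, struct list_node *waiter)
{
	if (list_empty(&lock->wait_list))
		lock->has_waiters = true;
	list_add_tail(waiter, &lock->wait_list);
}

/* models __mutex_remove_waiter(): the last waiter clears the flag */
static void remove_waiter(struct toy_lock *lock, struct list_node *waiter)
{
	list_del(waiter);
	if (list_empty(&lock->wait_list))
		lock->has_waiters = false;
}

int main(void)
{
	struct toy_lock lock;
	struct list_node w;

	list_init(&lock.wait_list);
	lock.has_waiters = false;

	add_waiter(&lock, &w);
	printf("after add: has_waiters=%d\n", lock.has_waiters);
	remove_waiter(&lock, &w);
	printf("after remove: has_waiters=%d\n", lock.has_waiters);
	return 0;
}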
*/ -static void __sched +static void __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct list_head *list) { @@ -205,6 +205,16 @@ __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); } +static void +__mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter) +{ + list_del(&waiter->list); + if (likely(list_empty(&lock->wait_list))) + __mutex_clear_flag(lock, MUTEX_FLAGS); + + debug_mutex_remove_waiter(lock, waiter, current); +} + /* * Give up ownership to a specific task, when @task = NULL, this is equivalent * to a regular unlock. Sets PICKUP on a handoff, clears HANDOFF, preserves @@ -1061,9 +1071,7 @@ acquired: __ww_mutex_check_waiters(lock, ww_ctx); } - mutex_remove_waiter(lock, &waiter, current); - if (likely(list_empty(&lock->wait_list))) - __mutex_clear_flag(lock, MUTEX_FLAGS); + __mutex_remove_waiter(lock, &waiter); debug_mutex_free_waiter(&waiter); @@ -1080,7 +1088,7 @@ skip_wait: err: __set_current_state(TASK_RUNNING); - mutex_remove_waiter(lock, &waiter, current); + __mutex_remove_waiter(lock, &waiter); err_early_kill: spin_unlock(&lock->wait_lock); debug_mutex_free_waiter(&waiter); diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 1c2287d3fa71..f0c710b1d192 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -10,12 +10,10 @@ * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: */ -#define mutex_remove_waiter(lock, waiter, task) \ - __list_del((waiter)->list.prev, (waiter)->list.next) - #define debug_mutex_wake_waiter(lock, waiter) do { } while (0) #define debug_mutex_free_waiter(waiter) do { } while (0) #define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) +#define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0) #define debug_mutex_unlock(lock) do { } while (0) #define debug_mutex_init(lock, name, key) do { } while (0) diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index b94f3831e963..ec36b73f4733 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -66,12 +66,12 @@ void queued_write_lock_slowpath(struct qrwlock *lock) arch_spin_lock(&lock->wait_lock); /* Try to acquire the lock directly if no reader is present */ - if (!atomic_read(&lock->cnts) && - (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0)) + if (!(cnts = atomic_read(&lock->cnts)) && + atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED)) goto unlock; /* Set the waiting flag to notify readers that a writer is pending */ - atomic_add(_QW_WAITING, &lock->cnts); + atomic_or(_QW_WAITING, &lock->cnts); /* When no more readers or writers, set the locked flag */ do { diff --git a/kernel/module.c b/kernel/module.c index b5dd92e35b02..7e78dfabca97 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2401,6 +2401,15 @@ static long get_offset(struct module *mod, unsigned int *size, return ret; } +static bool module_init_layout_section(const char *sname) +{ +#ifndef CONFIG_MODULE_UNLOAD + if (module_exit_section(sname)) + return true; +#endif + return module_init_section(sname); +} + /* * Lay out the SHF_ALLOC sections in a way not dissimilar to how ld * might -- code, read-only data, read-write data, small data. 
Tally @@ -2435,7 +2444,7 @@ static void layout_sections(struct module *mod, struct load_info *info) if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || module_init_section(sname)) + || module_init_layout_section(sname)) continue; s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i); pr_debug("\t%s\n", sname); @@ -2468,7 +2477,7 @@ static void layout_sections(struct module *mod, struct load_info *info) if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || !module_init_section(sname)) + || !module_init_layout_section(sname)) continue; s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i) | INIT_OFFSET_MASK); @@ -2807,11 +2816,7 @@ void * __weak module_alloc(unsigned long size) bool __weak module_init_section(const char *name) { -#ifndef CONFIG_MODULE_UNLOAD - return strstarts(name, ".init") || module_exit_section(name); -#else return strstarts(name, ".init"); -#endif } bool __weak module_exit_section(const char *name) diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 7a1414622051..94232186fccb 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -391,6 +391,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) /* No obstacles. */ return vprintk_default(fmt, args); } +EXPORT_SYMBOL(vprintk); void __init printk_safe_init(void) { @@ -411,4 +412,3 @@ void __init printk_safe_init(void) /* Flush pending messages that did not have scheduled IRQ works. */ printk_safe_flush(); } -EXPORT_SYMBOL(vprintk); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 76f09456ec4b..2997ca600d18 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -170,6 +170,21 @@ void __ptrace_unlink(struct task_struct *child) spin_unlock(&child->sighand->siglock); } +static bool looks_like_a_spurious_pid(struct task_struct *task) +{ + if (task->exit_code != ((PTRACE_EVENT_EXEC << 8) | SIGTRAP)) + return false; + + if (task_pid_vnr(task) == task->ptrace_message) + return false; + /* + * The tracee changed its pid but the PTRACE_EVENT_EXEC event + * was not wait()'ed, most probably debugger targets the old + * leader which was destroyed in de_thread(). 
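looks_like_a_spurious_pid() above keys off two conditions: the tracee reported PTRACE_EVENT_EXEC, and the pid recorded in ptrace_message no longer matches the pid the tracer sees. A pure-function sketch of that predicate, using a made-up toy_task in place of task_struct, is:

#include <linux/ptrace.h>
#include <signal.h>
#include <stdbool.h>
#include <stdio.h>

/* invented, simplified stand-in for the few task_struct fields involved */
struct toy_task {
	int exit_code;                /* (event << 8) | signal reported to the tracer */
	long pid;                     /* pid as the tracer currently sees it          */
	unsigned long ptrace_message; /* pid recorded when PTRACE_EVENT_EXEC fired    */
};

static bool looks_like_a_spurious_pid_sketch(const struct toy_task *t)
{
	if (t->exit_code != ((PTRACE_EVENT_EXEC << 8) | SIGTRAP))
		return false;
	if ((unsigned long)t->pid == t->ptrace_message)
		return false;
	/* the tracee changed its pid but the exec event was never wait()'ed;
	 * most likely the debugger targets the old leader from de_thread() */
	return true;
}

int main(void)
{
	struct toy_task t = {
		.exit_code = (PTRACE_EVENT_EXEC << 8) | SIGTRAP,
		.pid = 1234,
		.ptrace_message = 1200, /* old pid recorded at exec time */
	};

	printf("spurious: %d\n", looks_like_a_spurious_pid_sketch(&t));
	return 0;
}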
+ */ + return true; +} + /* Ensure that nothing can wake it up, even SIGKILL */ static bool ptrace_freeze_traced(struct task_struct *task) { @@ -180,7 +195,8 @@ static bool ptrace_freeze_traced(struct task_struct *task) return ret; spin_lock_irq(&task->sighand->siglock); - if (task_is_traced(task) && !__fatal_signal_pending(task)) { + if (task_is_traced(task) && !looks_like_a_spurious_pid(task) && + !__fatal_signal_pending(task)) { task->state = __TASK_TRACED; ret = true; } diff --git a/kernel/resource.c b/kernel/resource.c index 028a5ab18818..ca9f5198a01f 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1805,7 +1805,7 @@ static struct resource *__request_free_mem_region(struct device *dev, REGION_DISJOINT) continue; - if (!__request_region_locked(res, &iomem_resource, addr, size, + if (__request_region_locked(res, &iomem_resource, addr, size, name, 0)) break; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9143163fa678..4ca80df205ce 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -938,7 +938,7 @@ DEFINE_STATIC_KEY_FALSE(sched_uclamp_used); static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) { - return clamp_value / UCLAMP_BUCKET_DELTA; + return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1); } static inline unsigned int uclamp_none(enum uclamp_id clamp_id) @@ -6389,7 +6389,6 @@ int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) { return __sched_setscheduler(p, attr, false, true); } -EXPORT_SYMBOL_GPL(sched_setattr_nocheck); /** * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 9c882f20803e..c5aacbd492a1 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -885,6 +885,7 @@ static const struct seq_operations sched_debug_sops = { #define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F)) #define __P(F) __PS(#F, F) #define P(F) __PS(#F, p->F) +#define PM(F, M) __PS(#F, p->F & (M)) #define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F))) #define __PN(F) __PSN(#F, F) #define PN(F) __PSN(#F, p->F) @@ -1011,7 +1012,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(se.avg.util_avg); P(se.avg.last_update_time); P(se.avg.util_est.ewma); - P(se.avg.util_est.enqueued); + PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED); #endif #ifdef CONFIG_UCLAMP_TASK __PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1d75af1ecfb4..2c8a9352590d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3499,10 +3499,9 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf static inline void update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; + long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; unsigned long load_avg; u64 load_sum = 0; - s64 delta_sum; u32 divider; if (!runnable_sum) @@ -3549,13 +3548,13 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq load_sum = (s64)se_weight(se) * runnable_sum; load_avg = div_s64(load_sum, divider); - delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; - delta_avg = load_avg - se->avg.load_avg; + delta = load_avg - se->avg.load_avg; se->avg.load_sum = runnable_sum; se->avg.load_avg = load_avg; - 
add_positive(&cfs_rq->avg.load_avg, delta_avg); - add_positive(&cfs_rq->avg.load_sum, delta_sum); + + add_positive(&cfs_rq->avg.load_avg, delta); + cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider; } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) @@ -3766,11 +3765,17 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + u32 divider = get_pelt_divider(&cfs_rq->avg); + dequeue_load_avg(cfs_rq, se); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); - sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); - sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum); + cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); @@ -3902,7 +3907,7 @@ static inline unsigned long _task_util_est(struct task_struct *p) { struct util_est ue = READ_ONCE(p->se.avg.util_est); - return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED); + return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED)); } static inline unsigned long task_util_est(struct task_struct *p) @@ -4002,7 +4007,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq, * Reset EWMA on utilization increases, the moving average is used only * to smooth utilization decreases. */ - ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED); + ue.enqueued = task_util(p); if (sched_feat(UTIL_EST_FASTUP)) { if (ue.ewma < ue.enqueued) { ue.ewma = ue.enqueued; @@ -4051,6 +4056,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq, ue.ewma += last_ewma_diff; ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; done: + ue.enqueued |= UTIL_AVG_UNCHANGED; WRITE_ONCE(p->se.avg.util_est, ue); trace_sched_util_est_se_tp(&p->se); @@ -6217,7 +6223,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool } if (has_idle_core) - set_idle_cores(this, false); + set_idle_cores(target, false); if (sched_feat(SIS_PROP) && !has_idle_core) { time = cpu_clock(this) - time; @@ -8030,7 +8036,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) /* Propagate pending load changes to the parent, if any: */ se = cfs_rq->tg->se[cpu]; if (se && !skip_blocked_update(se)) - update_load_avg(cfs_rq_of(se), se, 0); + update_load_avg(cfs_rq_of(se), se, UPDATE_TG); /* * There can be a lot of idle CPU cgroups. 
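Several of the util_est hunks above revolve around UTIL_AVG_UNCHANGED living in the LSB of util_est.enqueued, which readers must now mask off so the flag does not leak into reported values. The generic keep-a-flag-in-the-LSB pattern, with invented names, looks like this:

#include <stdio.h>

#define FLAG_UNCHANGED 0x1u  /* lowest bit carries the flag, like UTIL_AVG_UNCHANGED */

/* store a utilization value together with the 'unchanged' flag */
static unsigned int pack(unsigned int util, int unchanged)
{
	return (util & ~FLAG_UNCHANGED) | (unchanged ? FLAG_UNCHANGED : 0);
}

/* readers must strip the flag, otherwise it leaks into the reported value */
static unsigned int value_of(unsigned int packed)
{
	return packed & ~FLAG_UNCHANGED;
}

static int is_unchanged(unsigned int packed)
{
	return packed & FLAG_UNCHANGED;
}

int main(void)
{
	unsigned int enqueued = pack(436, 1);

	printf("raw=%u value=%u unchanged=%d\n",
	       enqueued, value_of(enqueued), is_unchanged(enqueued));
	return 0;
}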
Don't let fully @@ -10878,16 +10884,22 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq; + list_add_leaf_cfs_rq(cfs_rq_of(se)); + /* Start to propagate at parent */ se = se->parent; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - if (cfs_rq_throttled(cfs_rq)) - break; + if (!cfs_rq_throttled(cfs_rq)){ + update_load_avg(cfs_rq, se, UPDATE_TG); + list_add_leaf_cfs_rq(cfs_rq); + continue; + } - update_load_avg(cfs_rq, se, UPDATE_TG); + if (list_add_leaf_cfs_rq(cfs_rq)) + break; } } #else diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 1462846d244e..cfe94ffd2b38 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -42,15 +42,6 @@ static inline u32 get_pelt_divider(struct sched_avg *avg) return LOAD_AVG_MAX - 1024 + avg->period_contrib; } -/* - * When a task is dequeued, its estimated utilization should not be update if - * its util_avg has not been updated at least once. - * This flag is used to synchronize util_avg updates with util_est updates. - * We map this information into the LSB bit of the utilization saved at - * dequeue time (i.e. util_est.dequeued). - */ -#define UTIL_AVG_UNCHANGED 0x1 - static inline void cfs_se_util_change(struct sched_avg *avg) { unsigned int enqueued; @@ -58,7 +49,7 @@ static inline void cfs_se_util_change(struct sched_avg *avg) if (!sched_feat(UTIL_EST)) return; - /* Avoid store if the flag has been already set */ + /* Avoid store if the flag has been already reset */ enqueued = avg->util_est.enqueued; if (!(enqueued & UTIL_AVG_UNCHANGED)) return; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index db27b69fa92a..cc25a3cff41f 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -972,7 +972,7 @@ void psi_cgroup_free(struct cgroup *cgroup) */ void cgroup_move_task(struct task_struct *task, struct css_set *to) { - unsigned int task_flags = 0; + unsigned int task_flags; struct rq_flags rf; struct rq *rq; @@ -987,15 +987,31 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) rq = task_rq_lock(task, &rf); - if (task_on_rq_queued(task)) { - task_flags = TSK_RUNNING; - if (task_current(rq, task)) - task_flags |= TSK_ONCPU; - } else if (task->in_iowait) - task_flags = TSK_IOWAIT; - - if (task->in_memstall) - task_flags |= TSK_MEMSTALL; + /* + * We may race with schedule() dropping the rq lock between + * deactivating prev and switching to next. Because the psi + * updates from the deactivation are deferred to the switch + * callback to save cgroup tree updates, the task's scheduling + * state here is not coherent with its psi state: + * + * schedule() cgroup_move_task() + * rq_lock() + * deactivate_task() + * p->on_rq = 0 + * psi_dequeue() // defers TSK_RUNNING & TSK_IOWAIT updates + * pick_next_task() + * rq_unlock() + * rq_lock() + * psi_task_change() // old cgroup + * task->cgroups = to + * psi_task_change() // new cgroup + * rq_unlock() + * rq_lock() + * psi_sched_switch() // does deferred updates in new cgroup + * + * Don't rely on the scheduling state. Use psi_flags instead. + */ + task_flags = task->psi_flags; if (task_flags) psi_task_change(task, task_flags, 0); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 6ecd3f3a52b5..9f58049ac16d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1105,28 +1105,30 @@ static int seccomp_do_user_notification(int this_syscall, up(&match->notif->request); wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM); - mutex_unlock(&match->notify_lock); /* * This is where we wait for a reply from userspace. 
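The seccomp change just below turns the wait/goto sequence into a do/while loop: wait, service any pending addfd request, and loop until userspace has actually replied. The scripted model below shows only that loop shape; fake_wait() stands in for wait_for_completion_interruptible() and the notify_lock handling is omitted.

#include <stdio.h>

enum notify_state { NOTIFY_SENT, NOTIFY_REPLIED };

/* scripted stand-in for wait_for_completion_interruptible(): the first
 * wakeup delivers an addfd request, the second delivers the final reply */
static int fake_wait(enum notify_state *state, int *have_addfd)
{
	static int wakeups;

	if (++wakeups == 1) {
		*have_addfd = 1;
		return 0;
	}
	*state = NOTIFY_REPLIED;
	return 0;
}

int main(void)
{
	enum notify_state state = NOTIFY_SENT;
	int have_addfd = 0;

	/* loop until userspace replies, servicing addfd requests in between */
	do {
		if (fake_wait(&state, &have_addfd) != 0)
			break; /* interrupted */
		if (have_addfd) {
			printf("handled addfd request\n");
			have_addfd = 0;
		}
	} while (state != NOTIFY_REPLIED);

	printf("got reply, state=%d\n", state);
	return 0;
}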
*/ -wait: - err = wait_for_completion_interruptible(&n.ready); - mutex_lock(&match->notify_lock); - if (err == 0) { - /* Check if we were woken up by a addfd message */ + do { + mutex_unlock(&match->notify_lock); + err = wait_for_completion_interruptible(&n.ready); + mutex_lock(&match->notify_lock); + if (err != 0) + goto interrupted; + addfd = list_first_entry_or_null(&n.addfd, struct seccomp_kaddfd, list); - if (addfd && n.state != SECCOMP_NOTIFY_REPLIED) { + /* Check if we were woken up by a addfd message */ + if (addfd) seccomp_handle_addfd(addfd); - mutex_unlock(&match->notify_lock); - goto wait; - } - ret = n.val; - err = n.error; - flags = n.flags; - } + } while (n.state != SECCOMP_NOTIFY_REPLIED); + + ret = n.val; + err = n.error; + flags = n.flags; + +interrupted: /* If there were any pending addfd calls, clear them out */ list_for_each_entry_safe(addfd, tmp, &n.addfd, list) { /* The process went away before we got a chance to handle it */ diff --git a/kernel/signal.c b/kernel/signal.c index 66e88649cf74..f7c6ffcbd044 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1236,6 +1236,7 @@ static inline bool has_si_pid_and_uid(struct kernel_siginfo *info) case SIL_TIMER: case SIL_POLL: case SIL_FAULT: + case SIL_FAULT_TRAPNO: case SIL_FAULT_MCEERR: case SIL_FAULT_BNDERR: case SIL_FAULT_PKUERR: @@ -1804,6 +1805,21 @@ int force_sig_pkuerr(void __user *addr, u32 pkey) } #endif +int force_sig_perf(void __user *addr, u32 type, u64 sig_data) +{ + struct kernel_siginfo info; + + clear_siginfo(&info); + info.si_signo = SIGTRAP; + info.si_errno = 0; + info.si_code = TRAP_PERF; + info.si_addr = addr; + info.si_perf_data = sig_data; + info.si_perf_type = type; + + return force_sig_info(&info); +} + /* For the crazy architectures that include trap information in * the errno field, instead of an actual errno value. 
*/ @@ -2564,6 +2580,7 @@ static void hide_si_addr_tag_bits(struct ksignal *ksig) { switch (siginfo_layout(ksig->sig, ksig->info.si_code)) { case SIL_FAULT: + case SIL_FAULT_TRAPNO: case SIL_FAULT_MCEERR: case SIL_FAULT_BNDERR: case SIL_FAULT_PKUERR: @@ -3251,6 +3268,10 @@ enum siginfo_layout siginfo_layout(unsigned sig, int si_code) #endif else if ((sig == SIGTRAP) && (si_code == TRAP_PERF)) layout = SIL_PERF_EVENT; +#ifdef __ARCH_SI_TRAPNO + else if (layout == SIL_FAULT) + layout = SIL_FAULT_TRAPNO; +#endif } else if (si_code <= NSIGPOLL) layout = SIL_POLL; @@ -3354,35 +3375,28 @@ void copy_siginfo_to_external32(struct compat_siginfo *to, break; case SIL_FAULT: to->si_addr = ptr_to_compat(from->si_addr); -#ifdef __ARCH_SI_TRAPNO + break; + case SIL_FAULT_TRAPNO: + to->si_addr = ptr_to_compat(from->si_addr); to->si_trapno = from->si_trapno; -#endif break; case SIL_FAULT_MCEERR: to->si_addr = ptr_to_compat(from->si_addr); -#ifdef __ARCH_SI_TRAPNO - to->si_trapno = from->si_trapno; -#endif to->si_addr_lsb = from->si_addr_lsb; break; case SIL_FAULT_BNDERR: to->si_addr = ptr_to_compat(from->si_addr); -#ifdef __ARCH_SI_TRAPNO - to->si_trapno = from->si_trapno; -#endif to->si_lower = ptr_to_compat(from->si_lower); to->si_upper = ptr_to_compat(from->si_upper); break; case SIL_FAULT_PKUERR: to->si_addr = ptr_to_compat(from->si_addr); -#ifdef __ARCH_SI_TRAPNO - to->si_trapno = from->si_trapno; -#endif to->si_pkey = from->si_pkey; break; case SIL_PERF_EVENT: to->si_addr = ptr_to_compat(from->si_addr); - to->si_perf = from->si_perf; + to->si_perf_data = from->si_perf_data; + to->si_perf_type = from->si_perf_type; break; case SIL_CHLD: to->si_pid = from->si_pid; @@ -3438,35 +3452,28 @@ static int post_copy_siginfo_from_user32(kernel_siginfo_t *to, break; case SIL_FAULT: to->si_addr = compat_ptr(from->si_addr); -#ifdef __ARCH_SI_TRAPNO + break; + case SIL_FAULT_TRAPNO: + to->si_addr = compat_ptr(from->si_addr); to->si_trapno = from->si_trapno; -#endif break; case SIL_FAULT_MCEERR: to->si_addr = compat_ptr(from->si_addr); -#ifdef __ARCH_SI_TRAPNO - to->si_trapno = from->si_trapno; -#endif to->si_addr_lsb = from->si_addr_lsb; break; case SIL_FAULT_BNDERR: to->si_addr = compat_ptr(from->si_addr); -#ifdef __ARCH_SI_TRAPNO - to->si_trapno = from->si_trapno; -#endif to->si_lower = compat_ptr(from->si_lower); to->si_upper = compat_ptr(from->si_upper); break; case SIL_FAULT_PKUERR: to->si_addr = compat_ptr(from->si_addr); -#ifdef __ARCH_SI_TRAPNO - to->si_trapno = from->si_trapno; -#endif to->si_pkey = from->si_pkey; break; case SIL_PERF_EVENT: to->si_addr = compat_ptr(from->si_addr); - to->si_perf = from->si_perf; + to->si_perf_data = from->si_perf_data; + to->si_perf_type = from->si_perf_type; break; case SIL_CHLD: to->si_pid = from->si_pid; @@ -4644,11 +4651,13 @@ static inline void siginfo_buildtime_checks(void) /* sigfault */ CHECK_OFFSET(si_addr); + CHECK_OFFSET(si_trapno); CHECK_OFFSET(si_addr_lsb); CHECK_OFFSET(si_lower); CHECK_OFFSET(si_upper); CHECK_OFFSET(si_pkey); - CHECK_OFFSET(si_perf); + CHECK_OFFSET(si_perf_data); + CHECK_OFFSET(si_perf_type); /* sigpoll */ CHECK_OFFSET(si_band); diff --git a/kernel/smp.c b/kernel/smp.c index e21074900006..52bf159ec400 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -211,7 +211,7 @@ static u64 cfd_seq_inc(unsigned int src, unsigned int dst, unsigned int type) } while (0) /* Record current CSD work for current CPU, NULL to erase. 
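The kernel/smp.c hunks that follow switch the csd helpers from the aligned typedef call_single_data_t to the plain struct __call_single_data in their prototypes. The diff does not state the motivation, but the underlying pattern - an alignment-only typedef over an ordinary struct, with functions declared against the struct so callers may pass a pointer to either spelling - is easy to model. A small stand-alone sketch with purely hypothetical names (csd_example, csd_example_t), not the kernel types:

/* Hypothetical model of an alignment-only typedef over a plain struct.
 * Definitions made through the typedef get the extra alignment; functions
 * take the struct form and accept pointers to either. */
#include <stdio.h>

struct csd_example {
	void (*func)(void *info);
	void *info;
	unsigned int flags;
};

/* Alignment-only alias, analogous to call_single_data_t. */
typedef struct csd_example csd_example_t __attribute__((aligned(32)));

static void csd_example_fire(struct csd_example *csd)
{
	csd->func(csd->info);
}

static void say(void *info)
{
	printf("hello from %s\n", (const char *)info);
}

int main(void)
{
	static csd_example_t csd = { .func = say, .info = "an aligned csd" };

	printf("struct align %zu, typedef align %zu\n",
	       _Alignof(struct csd_example), _Alignof(csd_example_t));
	csd_example_fire(&csd);		/* &csd converts without a cast */
	return 0;
}

Only definitions pick up the typedef's added alignment; the pointer types stay interchangeable, which is presumably why the prototypes below can spell out the struct without touching any caller.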
*/ -static void __csd_lock_record(call_single_data_t *csd) +static void __csd_lock_record(struct __call_single_data *csd) { if (!csd) { smp_mb(); /* NULL cur_csd after unlock. */ @@ -226,13 +226,13 @@ static void __csd_lock_record(call_single_data_t *csd) /* Or before unlock, as the case may be. */ } -static __always_inline void csd_lock_record(call_single_data_t *csd) +static __always_inline void csd_lock_record(struct __call_single_data *csd) { if (static_branch_unlikely(&csdlock_debug_enabled)) __csd_lock_record(csd); } -static int csd_lock_wait_getcpu(call_single_data_t *csd) +static int csd_lock_wait_getcpu(struct __call_single_data *csd) { unsigned int csd_type; @@ -282,7 +282,7 @@ static const char *csd_lock_get_type(unsigned int type) return (type >= ARRAY_SIZE(seq_type)) ? "?" : seq_type[type]; } -static void csd_lock_print_extended(call_single_data_t *csd, int cpu) +static void csd_lock_print_extended(struct __call_single_data *csd, int cpu) { struct cfd_seq_local *seq = &per_cpu(cfd_seq_local, cpu); unsigned int srccpu = csd->node.src; @@ -321,7 +321,7 @@ static void csd_lock_print_extended(call_single_data_t *csd, int cpu) * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, * so waiting on other types gets much less information. */ -static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id) +static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *ts1, int *bug_id) { int cpu = -1; int cpux; @@ -387,7 +387,7 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. */ -static void __csd_lock_wait(call_single_data_t *csd) +static void __csd_lock_wait(struct __call_single_data *csd) { int bug_id = 0; u64 ts0, ts1; @@ -401,7 +401,7 @@ static void __csd_lock_wait(call_single_data_t *csd) smp_acquire__after_ctrl_dep(); } -static __always_inline void csd_lock_wait(call_single_data_t *csd) +static __always_inline void csd_lock_wait(struct __call_single_data *csd) { if (static_branch_unlikely(&csdlock_debug_enabled)) { __csd_lock_wait(csd); @@ -431,17 +431,17 @@ static void __smp_call_single_queue_debug(int cpu, struct llist_node *node) #else #define cfd_seq_store(var, src, dst, type) -static void csd_lock_record(call_single_data_t *csd) +static void csd_lock_record(struct __call_single_data *csd) { } -static __always_inline void csd_lock_wait(call_single_data_t *csd) +static __always_inline void csd_lock_wait(struct __call_single_data *csd) { smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); } #endif -static __always_inline void csd_lock(call_single_data_t *csd) +static __always_inline void csd_lock(struct __call_single_data *csd) { csd_lock_wait(csd); csd->node.u_flags |= CSD_FLAG_LOCK; @@ -454,7 +454,7 @@ static __always_inline void csd_lock(call_single_data_t *csd) smp_wmb(); } -static __always_inline void csd_unlock(call_single_data_t *csd) +static __always_inline void csd_unlock(struct __call_single_data *csd) { WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK)); @@ -501,7 +501,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node) * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. 
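csd_lock(), csd_unlock() and csd_lock_wait() above form a small hand-off protocol on the CSD_FLAG_LOCK bit: a requester spins (with acquire semantics) until the previous use of the descriptor has been released, sets the flag, and the target CPU clears it with a release once the callback has run. A userspace model using C11 atomics is sketched below; all names are hypothetical stand-ins and the fences only approximate the kernel's smp_cond_load_acquire()/smp_wmb() usage.

/* Userspace model of the CSD_FLAG_LOCK handshake.  Like the kernel helper,
 * it assumes a single requester per descriptor at a time; the acquire on
 * the waiting load pairs with the release on the clearing store. */
#include <stdatomic.h>
#include <stdio.h>

#define CSD_FLAG_LOCK	0x01u

struct csd_model {
	_Atomic unsigned int flags;
	void (*func)(void *info);
	void *info;
};

static void csd_model_lock(struct csd_model *csd)
{
	/* csd_lock_wait(): spin until the previous user dropped the flag. */
	while (atomic_load_explicit(&csd->flags, memory_order_acquire) &
	       CSD_FLAG_LOCK)
		;
	atomic_fetch_or_explicit(&csd->flags, CSD_FLAG_LOCK,
				 memory_order_relaxed);
	/* The kernel follows this with smp_wmb() so that ->func/->info
	 * written next are visible before the descriptor is queued. */
	atomic_thread_fence(memory_order_release);
}

static void csd_model_unlock(struct csd_model *csd)
{
	/* csd_unlock(): release store so the next locker sees the results. */
	atomic_fetch_and_explicit(&csd->flags, ~CSD_FLAG_LOCK,
				  memory_order_release);
}

static void ping(void *info)
{
	printf("ran: %s\n", (const char *)info);
}

int main(void)
{
	struct csd_model csd = { .flags = 0, .func = ping, .info = "csd model" };

	csd_model_lock(&csd);
	csd.func(csd.info);		/* what the target CPU would do */
	csd_model_unlock(&csd);
	return 0;
}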
*/ -static int generic_exec_single(int cpu, call_single_data_t *csd) +static int generic_exec_single(int cpu, struct __call_single_data *csd) { if (cpu == smp_processor_id()) { smp_call_func_t func = csd->func; @@ -784,7 +784,7 @@ EXPORT_SYMBOL(smp_call_function_single); * NOTE: Be careful, there is unfortunately no current debugging facility to * validate the correctness of this serialization. */ -int smp_call_function_single_async(int cpu, call_single_data_t *csd) +int smp_call_function_single_async(int cpu, struct __call_single_data *csd) { int err = 0; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 14edf84cc571..d4a78e08f6d8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -225,7 +225,27 @@ static int bpf_stats_handler(struct ctl_table *table, int write, mutex_unlock(&bpf_stats_enabled_mutex); return ret; } -#endif + +static int bpf_unpriv_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret, unpriv_enable = *(int *)table->data; + bool locked_state = unpriv_enable == 1; + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + tmp.data = &unpriv_enable; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + if (locked_state && unpriv_enable != 1) + return -EPERM; + *(int *)table->data = unpriv_enable; + } + return ret; +} +#endif /* CONFIG_BPF_SYSCALL && CONFIG_SYSCTL */ /* * /proc/sys support @@ -2600,10 +2620,9 @@ static struct ctl_table kern_table[] = { .data = &sysctl_unprivileged_bpf_disabled, .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), .mode = 0644, - /* only handle a transition from default "0" to "1" */ - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_ONE, + .proc_handler = bpf_unpriv_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, }, { .procname = "bpf_stats_enabled", diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index bea9d08b1698..5897828b9d7e 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -92,7 +92,7 @@ static int alarmtimer_rtc_add_device(struct device *dev, if (rtcdev) return -EBUSY; - if (!rtc->ops->set_alarm) + if (!test_bit(RTC_FEATURE_ALARM, rtc->features)) return -1; if (!device_may_wakeup(rtc->dev.parent)) return -1; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 828b091501ca..6784f27a3099 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -230,6 +230,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; +EXPORT_SYMBOL_GPL(tick_nohz_full_mask); bool tick_nohz_full_running; EXPORT_SYMBOL_GPL(tick_nohz_full_running); static atomic_t tick_dep_mask; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d2d7cf6cfe83..7a52bc172841 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -215,16 +215,11 @@ const struct bpf_func_proto bpf_probe_read_user_str_proto = { static __always_inline int bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr) { - int ret = security_locked_down(LOCKDOWN_BPF_READ); + int ret; - if (unlikely(ret < 0)) - goto fail; ret = copy_from_kernel_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) - goto fail; - return ret; -fail: - memset(dst, 0, size); + memset(dst, 0, size); return ret; } @@ -246,10 +241,7 @@ const struct bpf_func_proto bpf_probe_read_kernel_proto = { static __always_inline int bpf_probe_read_kernel_str_common(void *dst, u32 size, 
const void *unsafe_ptr) { - int ret = security_locked_down(LOCKDOWN_BPF_READ); - - if (unlikely(ret < 0)) - goto fail; + int ret; /* * The strncpy_from_kernel_nofault() call will likely not fill the @@ -262,11 +254,7 @@ bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr) */ ret = strncpy_from_kernel_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) - goto fail; - - return ret; -fail: - memset(dst, 0, size); + memset(dst, 0, size); return ret; } @@ -1011,16 +999,20 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: - return &bpf_probe_read_kernel_proto; + return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + NULL : &bpf_probe_read_kernel_proto; case BPF_FUNC_probe_read_user_str: return &bpf_probe_read_user_str_proto; case BPF_FUNC_probe_read_kernel_str: - return &bpf_probe_read_kernel_str_proto; + return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + NULL : &bpf_probe_read_kernel_str_proto; #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE case BPF_FUNC_probe_read: - return &bpf_probe_read_compat_proto; + return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + NULL : &bpf_probe_read_compat_proto; case BPF_FUNC_probe_read_str: - return &bpf_probe_read_compat_str_proto; + return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + NULL : &bpf_probe_read_compat_str_proto; #endif #ifdef CONFIG_CGROUPS case BPF_FUNC_get_current_cgroup_id: diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2e8a3fde7104..72ef4dccbcc4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1967,12 +1967,18 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, static void print_ip_ins(const char *fmt, const unsigned char *p) { + char ins[MCOUNT_INSN_SIZE]; int i; + if (copy_from_kernel_nofault(ins, p, MCOUNT_INSN_SIZE)) { + printk(KERN_CONT "%s[FAULT] %px\n", fmt, p); + return; + } + printk(KERN_CONT "%s", fmt); for (i = 0; i < MCOUNT_INSN_SIZE; i++) - printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); + printk(KERN_CONT "%s%02x", i ? 
":" : "", ins[i]); } enum ftrace_bug_type ftrace_bug_type; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 560e4c8d3825..d23a09d3eb37 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2198,9 +2198,6 @@ struct saved_cmdlines_buffer { }; static struct saved_cmdlines_buffer *savedcmd; -/* temporary disable recording */ -static atomic_t trace_record_taskinfo_disabled __read_mostly; - static inline char *get_saved_cmdlines(int idx) { return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; @@ -2486,8 +2483,6 @@ static bool tracing_record_taskinfo_skip(int flags) { if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID)))) return true; - if (atomic_read(&trace_record_taskinfo_disabled) || !tracing_is_on()) - return true; if (!__this_cpu_read(trace_taskinfo_save)) return true; return false; @@ -2736,7 +2731,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, (entry = this_cpu_read(trace_buffered_event))) { /* Try to use the per cpu buffer first */ val = this_cpu_inc_return(trace_buffered_event_cnt); - if ((len < (PAGE_SIZE - sizeof(*entry))) && val == 1) { + if ((len < (PAGE_SIZE - sizeof(*entry) - sizeof(entry->array[0]))) && val == 1) { trace_event_setup(entry, type, trace_ctx); entry->array[0] = len; return entry; @@ -3704,6 +3699,9 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, goto print; while (*p) { + bool star = false; + int len = 0; + j = 0; /* We only care about %s and variants */ @@ -3725,13 +3723,17 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, /* Need to test cases like %08.*s */ for (j = 1; p[i+j]; j++) { if (isdigit(p[i+j]) || - p[i+j] == '*' || p[i+j] == '.') continue; + if (p[i+j] == '*') { + star = true; + continue; + } break; } if (p[i+j] == 's') break; + star = false; } j = 0; } @@ -3744,6 +3746,9 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, iter->fmt[i] = '\0'; trace_seq_vprintf(&iter->seq, iter->fmt, ap); + if (star) + len = va_arg(ap, int); + /* The ap now points to the string data of the %s */ str = va_arg(ap, const char *); @@ -3762,8 +3767,18 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, int ret; /* Try to safely read the string */ - ret = strncpy_from_kernel_nofault(iter->fmt, str, - iter->fmt_size); + if (star) { + if (len + 1 > iter->fmt_size) + len = iter->fmt_size - 1; + if (len < 0) + len = 0; + ret = copy_from_kernel_nofault(iter->fmt, str, len); + iter->fmt[len] = 0; + star = false; + } else { + ret = strncpy_from_kernel_nofault(iter->fmt, str, + iter->fmt_size); + } if (ret < 0) trace_seq_printf(&iter->seq, "(0x%px)", str); else @@ -3775,7 +3790,10 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, strncpy(iter->fmt, p + i, j + 1); iter->fmt[j+1] = '\0'; } - trace_seq_printf(&iter->seq, iter->fmt, str); + if (star) + trace_seq_printf(&iter->seq, iter->fmt, len, str); + else + trace_seq_printf(&iter->seq, iter->fmt, str); p += i + j + 1; } @@ -3975,9 +3993,6 @@ static void *s_start(struct seq_file *m, loff_t *pos) return ERR_PTR(-EBUSY); #endif - if (!iter->snapshot) - atomic_inc(&trace_record_taskinfo_disabled); - if (*pos != iter->pos) { iter->ent = NULL; iter->cpu = 0; @@ -4020,9 +4035,6 @@ static void s_stop(struct seq_file *m, void *p) return; #endif - if (!iter->snapshot) - atomic_dec(&trace_record_taskinfo_disabled); - trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); } diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 
c1637f90c8a3..4702efb00ff2 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -115,9 +115,9 @@ u64 notrace trace_clock_global(void) prev_time = READ_ONCE(trace_clock_struct.prev_time); now = sched_clock_cpu(this_cpu); - /* Make sure that now is always greater than prev_time */ + /* Make sure that now is always greater than or equal to prev_time */ if ((s64)(now - prev_time) < 0) - now = prev_time + 1; + now = prev_time; /* * If in an NMI context then dont risk lockups and simply return @@ -131,7 +131,7 @@ u64 notrace trace_clock_global(void) /* Reread prev_time in case it was already updated */ prev_time = READ_ONCE(trace_clock_struct.prev_time); if ((s64)(now - prev_time) < 0) - now = prev_time + 1; + now = prev_time; trace_clock_struct.prev_time = now; diff --git a/kernel/up.c b/kernel/up.c index df50828cc2f0..a38b8b095251 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -25,7 +25,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); -int smp_call_function_single_async(int cpu, call_single_data_t *csd) +int smp_call_function_single_async(int cpu, struct __call_single_data *csd) { unsigned long flags; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 7c397907d0e9..92d3bcc5a5e0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -302,10 +302,10 @@ void touch_softlockup_watchdog_sync(void) __this_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT); } -static int is_softlockup(unsigned long touch_ts, unsigned long period_ts) +static int is_softlockup(unsigned long touch_ts, + unsigned long period_ts, + unsigned long now) { - unsigned long now = get_timestamp(); - if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh){ /* Warn about unreasonable delays. */ if (time_after(now, period_ts + get_softlockup_thresh())) @@ -353,8 +353,7 @@ static int softlockup_fn(void *data) /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { - unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); - unsigned long period_ts = __this_cpu_read(watchdog_report_ts); + unsigned long touch_ts, period_ts, now; struct pt_regs *regs = get_irq_regs(); int duration; int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; @@ -377,11 +376,22 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); /* + * Read the current timestamp first. It might become invalid anytime + * when a virtual machine is stopped by the host or when the watchog + * is touched from NMI. + */ + now = get_timestamp(); + /* * If a virtual machine is stopped by the host it can look to - * the watchdog like a soft lockup. Check to see if the host - * stopped the vm before we process the timestamps. + * the watchdog like a soft lockup. This function touches the watchdog. */ kvm_check_and_clear_guest_paused(); + /* + * The stored timestamp is comparable with @now only when not touched. + * It might get touched anytime from NMI. Make sure that is_softlockup() + * uses the same (valid) value. + */ + period_ts = READ_ONCE(*this_cpu_ptr(&watchdog_report_ts)); /* Reset the interval when touched by known problematic code. */ if (period_ts == SOFTLOCKUP_DELAY_REPORT) { @@ -398,13 +408,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) return HRTIMER_RESTART; } - /* check for a softlockup - * This is done by making sure a high priority task is - * being scheduled. 
The task touches the watchdog to - * indicate it is getting cpu time. If it hasn't then - * this is a good indication some task is hogging the cpu - */ - duration = is_softlockup(touch_ts, period_ts); + /* Check for a softlockup. */ + touch_ts = __this_cpu_read(watchdog_touch_ts); + duration = is_softlockup(touch_ts, period_ts, now); if (unlikely(duration)) { /* * Prevent multiple soft-lockup reports if one cpu is already diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b19d759e55a5..50142fc08902 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -50,6 +50,7 @@ #include <linux/uaccess.h> #include <linux/sched/isolation.h> #include <linux/nmi.h> +#include <linux/kvm_para.h> #include "workqueue_internal.h" @@ -5772,6 +5773,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) { unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; bool lockup_detected = false; + unsigned long now = jiffies; struct worker_pool *pool; int pi; @@ -5786,6 +5788,12 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) if (list_empty(&pool->worklist)) continue; + /* + * If a virtual machine is stopped by the host it can look to + * the watchdog like a stall. + */ + kvm_check_and_clear_guest_paused(); + /* get the latest of pool and touched timestamps */ if (pool->cpu >= 0) touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu)); @@ -5799,12 +5807,12 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) ts = touched; /* did we stall? */ - if (time_after(jiffies, ts + thresh)) { + if (time_after(now, ts + thresh)) { lockup_detected = true; pr_emerg("BUG: workqueue lockup - pool"); pr_cont_pool_info(pool); pr_cont(" stuck for %us!\n", - jiffies_to_msecs(jiffies - pool_ts) / 1000); + jiffies_to_msecs(now - pool_ts) / 1000); } } |
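Both watchdog hunks above apply the same two ideas: sample the reference time once (now = get_timestamp() in the softlockup path, now = jiffies in the workqueue path) so every later comparison uses a single coherent value, and treat a guest paused by the host as a touch rather than a stall. The stall test itself leans on the kernel's wraparound-safe time_after(); a tiny stand-alone model of that comparison is below, where fake_time_after() is a simplified stand-in rather than the <linux/jiffies.h> macro.

/* Wraparound-safe "did we pass the deadline?" check, modelled on
 * time_after(): the difference is evaluated as a signed value, so the
 * comparison stays correct when the counter wraps past zero. */
#include <stdio.h>

typedef unsigned long fake_jiffies_t;

#define fake_time_after(a, b)	((long)((b) - (a)) < 0)

int main(void)
{
	fake_jiffies_t touched = (fake_jiffies_t)-10;	/* ULONG_MAX - 9 */
	fake_jiffies_t thresh = 30;

	/* Two samples taken after the counter wrapped around zero. */
	fake_jiffies_t now_ok = touched + 20;		/* within the threshold */
	fake_jiffies_t now_stall = touched + 40;	/* past the threshold   */

	printf("now_ok stalled?    %d\n", fake_time_after(now_ok, touched + thresh));
	printf("now_stall stalled? %d\n", fake_time_after(now_stall, touched + thresh));
	return 0;
}

Snapshotting jiffies into now, as the workqueue watchdog now does, additionally keeps the per-pool checks consistent even when the timer callback itself is delayed.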