diff options
Diffstat (limited to 'kernel/bpf')
33 files changed, 4752 insertions, 1362 deletions
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index 2dfe1079f772..6a906ff93006 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -31,6 +31,7 @@ config BPF_SYSCALL select TASKS_TRACE_RCU select BINARY_PRINTF select NET_SOCK_MSG if NET + select NET_XGRESS if NET select PAGE_POOL if NET default n help diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 1d3892168d32..f526b7573e97 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -12,7 +12,7 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o -obj-$(CONFIG_BPF_SYSCALL) += disasm.o +obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o obj-$(CONFIG_BPF_JIT) += dispatcher.o @@ -21,6 +21,7 @@ obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o obj-$(CONFIG_BPF_SYSCALL) += offload.o obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o +obj-$(CONFIG_BPF_SYSCALL) += tcx.o endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 2058e89b5ddd..c85ff9162a5c 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -1012,11 +1012,16 @@ static void prog_array_map_poke_untrack(struct bpf_map *map, mutex_unlock(&aux->poke_mutex); } +void __weak bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, + struct bpf_prog *new, struct bpf_prog *old) +{ + WARN_ON_ONCE(1); +} + static void prog_array_map_poke_run(struct bpf_map *map, u32 key, struct bpf_prog *old, struct bpf_prog *new) { - u8 *old_addr, *new_addr, *old_bypass_addr; struct prog_poke_elem *elem; struct bpf_array_aux *aux; @@ -1025,7 +1030,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, list_for_each_entry(elem, &aux->poke_progs, list) { struct bpf_jit_poke_descriptor *poke; - int i, ret; + int i; for (i = 0; i < elem->aux->size_poke_tab; i++) { poke = &elem->aux->poke_tab[i]; @@ -1044,21 +1049,10 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, * activated, so tail call updates can arrive from here * while JIT is still finishing its final fixup for * non-activated poke entries. - * 3) On program teardown, the program's kallsym entry gets - * removed out of RCU callback, but we can only untrack - * from sleepable context, therefore bpf_arch_text_poke() - * might not see that this is in BPF text section and - * bails out with -EINVAL. As these are unreachable since - * RCU grace period already passed, we simply skip them. - * 4) Also programs reaching refcount of zero while patching + * 3) Also programs reaching refcount of zero while patching * is in progress is okay since we're protected under * poke_mutex and untrack the programs before the JIT - * buffer is freed. When we're still in the middle of - * patching and suddenly kallsyms entry of the program - * gets evicted, we just skip the rest which is fine due - * to point 3). - * 5) Any other error happening below from bpf_arch_text_poke() - * is a unexpected bug. + * buffer is freed. */ if (!READ_ONCE(poke->tailcall_target_stable)) continue; @@ -1068,39 +1062,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, poke->tail_call.key != key) continue; - old_bypass_addr = old ? NULL : poke->bypass_addr; - old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL; - new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL; - - if (new) { - ret = bpf_arch_text_poke(poke->tailcall_target, - BPF_MOD_JUMP, - old_addr, new_addr); - BUG_ON(ret < 0 && ret != -EINVAL); - if (!old) { - ret = bpf_arch_text_poke(poke->tailcall_bypass, - BPF_MOD_JUMP, - poke->bypass_addr, - NULL); - BUG_ON(ret < 0 && ret != -EINVAL); - } - } else { - ret = bpf_arch_text_poke(poke->tailcall_bypass, - BPF_MOD_JUMP, - old_bypass_addr, - poke->bypass_addr); - BUG_ON(ret < 0 && ret != -EINVAL); - /* let other CPUs finish the execution of program - * so that it will not possible to expose them - * to invalid nop, stack unwind, nop state - */ - if (!ret) - synchronize_rcu(); - ret = bpf_arch_text_poke(poke->tailcall_target, - BPF_MOD_JUMP, - old_addr, NULL); - BUG_ON(ret < 0 && ret != -EINVAL); - } + bpf_arch_poke_desc_update(poke, new, old); } } } diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 96856f130cbf..0fae79164187 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -782,9 +782,7 @@ struct bpf_iter_num_kern { int end; /* final value, exclusive */ } __aligned(8); -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in vmlinux BTF"); +__bpf_kfunc_start_defs(); __bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) { @@ -793,8 +791,6 @@ __bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter_num)); BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter_num)); - BTF_TYPE_EMIT(struct btf_iter_num); - /* start == end is legit, it's an empty range and we'll just get NULL * on first (and any subsequent) bpf_iter_num_next() call */ @@ -845,4 +841,4 @@ __bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it) s->cur = s->end = 0; } -__diag_pop(); +__bpf_kfunc_end_defs(); diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index b5149cfce7d4..146824cc9689 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -553,7 +553,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, void *value, u64 map_flags, gfp_t gfp_flags) { struct bpf_local_storage_data *old_sdata = NULL; - struct bpf_local_storage_elem *selem = NULL; + struct bpf_local_storage_elem *alloc_selem, *selem = NULL; struct bpf_local_storage *local_storage; unsigned long flags; int err; @@ -607,11 +607,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, } } - if (gfp_flags == GFP_KERNEL) { - selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); - if (!selem) - return ERR_PTR(-ENOMEM); - } + /* A lookup has just been done before and concluded a new selem is + * needed. The chance of an unnecessary alloc is unlikely. + */ + alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); + if (!alloc_selem) + return ERR_PTR(-ENOMEM); raw_spin_lock_irqsave(&local_storage->lock, flags); @@ -623,13 +624,13 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, * simple. */ err = -EAGAIN; - goto unlock_err; + goto unlock; } old_sdata = bpf_local_storage_lookup(local_storage, smap, false); err = check_flags(old_sdata, map_flags); if (err) - goto unlock_err; + goto unlock; if (old_sdata && (map_flags & BPF_F_LOCK)) { copy_map_value_locked(&smap->map, old_sdata->data, value, @@ -638,23 +639,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, goto unlock; } - if (gfp_flags != GFP_KERNEL) { - /* local_storage->lock is held. Hence, we are sure - * we can unlink and uncharge the old_sdata successfully - * later. Hence, instead of charging the new selem now - * and then uncharge the old selem later (which may cause - * a potential but unnecessary charge failure), avoid taking - * a charge at all here (the "!old_sdata" check) and the - * old_sdata will not be uncharged later during - * bpf_selem_unlink_storage_nolock(). - */ - selem = bpf_selem_alloc(smap, owner, value, !old_sdata, gfp_flags); - if (!selem) { - err = -ENOMEM; - goto unlock_err; - } - } - + alloc_selem = NULL; /* First, link the new selem to the map */ bpf_selem_link_map(smap, selem); @@ -665,20 +650,16 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (old_sdata) { bpf_selem_unlink_map(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), - false, false); + true, false); } unlock: raw_spin_unlock_irqrestore(&local_storage->lock, flags); - return SDATA(selem); - -unlock_err: - raw_spin_unlock_irqrestore(&local_storage->lock, flags); - if (selem) { + if (alloc_selem) { mem_uncharge(smap, owner, smap->elem_size); - bpf_selem_free(selem, smap, true); + bpf_selem_free(alloc_selem, smap, true); } - return ERR_PTR(err); + return err ? ERR_PTR(err) : SDATA(selem); } static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) @@ -779,7 +760,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) * of the loop will set the free_cgroup_storage to true. */ free_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, false, true); + local_storage, selem, true, true); } raw_spin_unlock_irqrestore(&local_storage->lock, flags); diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index 8f3c8b2b4490..cbd8d3720c2b 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -75,6 +75,5 @@ void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, void bpf_lru_destroy(struct bpf_lru *lru); struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash); void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node); -void bpf_lru_promote(struct bpf_lru *lru, struct bpf_lru_node *node); #endif diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 116a0ce378ec..db6176fb64dc 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -374,9 +374,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, struct bpf_struct_ops_value *uvalue, *kvalue; const struct btf_member *member; const struct btf_type *t = st_ops->type; - struct bpf_tramp_links *tlinks = NULL; + struct bpf_tramp_links *tlinks; void *udata, *kdata; - int prog_fd, err = 0; + int prog_fd, err; void *image, *image_end; u32 i; @@ -509,9 +509,12 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, } if (st_map->map.map_flags & BPF_F_LINK) { - err = st_ops->validate(kdata); - if (err) - goto reset_unlock; + err = 0; + if (st_ops->validate) { + err = st_ops->validate(kdata); + if (err) + goto reset_unlock; + } set_memory_rox((long)st_map->image, 1); /* Let bpf_link handle registration & unregistration. * @@ -612,7 +615,10 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map) if (st_map->links) bpf_struct_ops_map_put_progs(st_map); bpf_map_area_free(st_map->links); - bpf_jit_free_exec(st_map->image); + if (st_map->image) { + bpf_jit_free_exec(st_map->image); + bpf_jit_uncharge_modmem(PAGE_SIZE); + } bpf_map_area_free(st_map->uvalue); bpf_map_area_free(st_map); } @@ -654,6 +660,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) struct bpf_struct_ops_map *st_map; const struct btf_type *t, *vt; struct bpf_map *map; + int ret; st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id); if (!st_ops) @@ -663,9 +670,6 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) if (attr->value_size != vt->size) return ERR_PTR(-EINVAL); - if (attr->map_flags & BPF_F_LINK && (!st_ops->validate || !st_ops->update)) - return ERR_PTR(-EOPNOTSUPP); - t = st_ops->type; st_map_size = sizeof(*st_map) + @@ -681,12 +685,27 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) st_map->st_ops = st_ops; map = &st_map->map; + ret = bpf_jit_charge_modmem(PAGE_SIZE); + if (ret) { + __bpf_struct_ops_map_free(map); + return ERR_PTR(ret); + } + + st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); + if (!st_map->image) { + /* __bpf_struct_ops_map_free() uses st_map->image as flag + * for "charged or not". In this case, we need to unchange + * here. + */ + bpf_jit_uncharge_modmem(PAGE_SIZE); + __bpf_struct_ops_map_free(map); + return ERR_PTR(-ENOMEM); + } st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); st_map->links = bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_links *), NUMA_NO_NODE); - st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); - if (!st_map->uvalue || !st_map->links || !st_map->image) { + if (!st_map->uvalue || !st_map->links) { __bpf_struct_ops_map_free(map); return ERR_PTR(-ENOMEM); } @@ -815,7 +834,7 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map struct bpf_struct_ops_map *st_map, *old_st_map; struct bpf_map *old_map; struct bpf_struct_ops_link *st_link; - int err = 0; + int err; st_link = container_of(link, struct bpf_struct_ops_link, link); st_map = container_of(new_map, struct bpf_struct_ops_map, map); @@ -823,6 +842,9 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map if (!bpf_struct_ops_valid_to_reg(new_map)) return -EINVAL; + if (!st_map->st_ops->update) + return -EOPNOTSUPP; + mutex_lock(&update_mutex); old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex)); @@ -904,4 +926,3 @@ err_out: kfree(link); return err; } - diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 817204d53372..15d71d2986d3 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -29,6 +29,7 @@ #include <net/netfilter/nf_bpf_link.h> #include <net/sock.h> +#include <net/xdp.h> #include "../tools/lib/bpf/relo_core.h" /* BTF (BPF Type Format) is the meta data format which describes @@ -552,7 +553,7 @@ s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) return -ENOENT; } -static s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p) +s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p) { struct btf *btf; s32 ret; @@ -3292,6 +3293,8 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, type = BPF_KPTR_UNREF; else if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off))) type = BPF_KPTR_REF; + else if (!strcmp("percpu_kptr", __btf_name_by_offset(btf, t->name_off))) + type = BPF_KPTR_PERCPU; else return -EINVAL; @@ -3307,10 +3310,10 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, return BTF_FIELD_FOUND; } -static const char *btf_find_decl_tag_value(const struct btf *btf, - const struct btf_type *pt, - int comp_idx, const char *tag_key) +const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt, + int comp_idx, const char *tag_key) { + const char *value = NULL; int i; for (i = 1; i < btf_nr_types(btf); i++) { @@ -3324,9 +3327,14 @@ static const char *btf_find_decl_tag_value(const struct btf *btf, continue; if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len)) continue; - return __btf_name_by_offset(btf, t->name_off) + len; + /* Prevent duplicate entries for same type */ + if (value) + return ERR_PTR(-EEXIST); + value = __btf_name_by_offset(btf, t->name_off) + len; } - return NULL; + if (!value) + return ERR_PTR(-ENOENT); + return value; } static int @@ -3344,7 +3352,7 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt, if (t->size != sz) return BTF_FIELD_IGNORE; value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:"); - if (!value_type) + if (IS_ERR(value_type)) return -EINVAL; node_field_name = strstr(value_type, ":"); if (!node_field_name) @@ -3456,6 +3464,7 @@ static int btf_find_struct_field(const struct btf *btf, break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: ret = btf_find_kptr(btf, member_type, off, sz, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) @@ -3522,6 +3531,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: ret = btf_find_kptr(btf, var_type, off, sz, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) @@ -3782,6 +3792,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]); if (ret < 0) goto end; @@ -6133,8 +6144,9 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, const char *tname, *mname, *tag_value; u32 vlen, elem_id, mid; - *flag = 0; again: + if (btf_type_is_modifier(t)) + t = btf_type_skip_modifiers(btf, t->type, NULL); tname = __btf_name_by_offset(btf, t->name_off); if (!btf_type_is_struct(t)) { bpf_log(log, "Type '%s' is not a struct\n", tname); @@ -6142,6 +6154,14 @@ again: } vlen = btf_type_vlen(t); + if (BTF_INFO_KIND(t->info) == BTF_KIND_UNION && vlen != 1 && !(*flag & PTR_UNTRUSTED)) + /* + * walking unions yields untrusted pointers + * with exception of __bpf_md_ptr and other + * unions with a single member + */ + *flag |= PTR_UNTRUSTED; + if (off + size > t->size) { /* If the last element is a variable size array, we may * need to relax the rule. @@ -6302,15 +6322,6 @@ error: * of this field or inside of this struct */ if (btf_type_is_struct(mtype)) { - if (BTF_INFO_KIND(mtype->info) == BTF_KIND_UNION && - btf_type_vlen(mtype) != 1) - /* - * walking unions yields untrusted pointers - * with exception of __bpf_md_ptr and other - * unions with a single member - */ - *flag |= PTR_UNTRUSTED; - /* our field must be inside that union or struct */ t = mtype; @@ -6368,7 +6379,7 @@ error: * that also allows using an array of int as a scratch * space. e.g. skb->cb[]. */ - if (off + size > mtrue_end) { + if (off + size > mtrue_end && !(*flag & PTR_UNTRUSTED)) { bpf_log(log, "access beyond the end of member %s (mend:%u) in struct %s with off %u size %u\n", mname, mtrue_end, tname, off, size); @@ -6476,7 +6487,7 @@ bool btf_struct_ids_match(struct bpf_verifier_log *log, bool strict) { const struct btf_type *type; - enum bpf_type_flag flag; + enum bpf_type_flag flag = 0; int err; /* Are we already done? */ @@ -6948,7 +6959,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, * (either PTR_TO_CTX or SCALAR_VALUE). */ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *regs) + struct bpf_reg_state *regs, bool is_ex_cb) { struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; @@ -7005,7 +7016,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, tname, nargs, MAX_BPF_FUNC_REG_ARGS); return -EINVAL; } - /* check that function returns int */ + /* check that function returns int, exception cb also requires this */ t = btf_type_by_id(btf, t->type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); @@ -7054,6 +7065,14 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, i, btf_type_str(t), tname); return -EINVAL; } + /* We have already ensured that the callback returns an integer, just + * like all global subprogs. We need to determine it only has a single + * scalar argument. + */ + if (is_ex_cb && (nargs != 1 || regs[BPF_REG_1].type != SCALAR_VALUE)) { + bpf_log(log, "exception cb only supports single integer argument\n"); + return -EINVAL; + } return 0; } @@ -7831,6 +7850,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) case BPF_PROG_TYPE_SYSCALL: return BTF_KFUNC_HOOK_SYSCALL; case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: return BTF_KFUNC_HOOK_CGROUP_SKB; case BPF_PROG_TYPE_SCHED_ACT: return BTF_KFUNC_HOOK_SCHED_ACT; @@ -8500,7 +8520,7 @@ bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, tname = btf_name_by_offset(btf, walk_type->name_off); ret = snprintf(safe_tname, sizeof(safe_tname), "%s%s", tname, suffix); - if (ret < 0) + if (ret >= sizeof(safe_tname)) return false; safe_id = btf_find_by_name_kind(btf, safe_tname, BTF_INFO_KIND(walk_type->info)); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 5b2741aa0d9b..491d20038cbe 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -785,7 +785,8 @@ found: * to descendants * @cgrp: The cgroup which descendants to traverse * @link: A link for which to replace BPF program - * @type: Type of attach operation + * @new_prog: &struct bpf_prog for the target BPF program with its refcnt + * incremented * * Must be called with cgroup_mutex held. */ @@ -1334,7 +1335,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * @sk: The socket sending or receiving traffic * @skb: The skb that is being sent or received - * @type: The type of program to be executed + * @atype: The type of program to be executed * * If no socket is passed, or the socket is not of type INET or INET6, * this function does nothing and returns 0. @@ -1424,7 +1425,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); /** * __cgroup_bpf_run_filter_sk() - Run a program on a sock * @sk: sock structure to manipulate - * @type: The type of program to be executed + * @atype: The type of program to be executed * * socket is passed is expected to be of type INET or INET6. * @@ -1449,18 +1450,22 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); * provided by user sockaddr * @sk: sock struct that will use sockaddr * @uaddr: sockaddr struct provided by user - * @type: The type of program to be executed + * @uaddrlen: Pointer to the size of the sockaddr struct provided by user. It is + * read-only for AF_INET[6] uaddr but can be modified for AF_UNIX + * uaddr. + * @atype: The type of program to be executed * @t_ctx: Pointer to attach type specific context * @flags: Pointer to u32 which contains higher bits of BPF program * return value (OR'ed together). * - * socket is expected to be of type INET or INET6. + * socket is expected to be of type INET, INET6 or UNIX. * * This function will return %-EPERM if an attached program is found and * returned value != 1 during execution. In all other cases, 0 is returned. */ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, + int *uaddrlen, enum cgroup_bpf_attach_type atype, void *t_ctx, u32 *flags) @@ -1472,21 +1477,31 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, }; struct sockaddr_storage unspec; struct cgroup *cgrp; + int ret; /* Check socket family since not all sockets represent network * endpoint (e.g. AF_UNIX). */ - if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6 && + sk->sk_family != AF_UNIX) return 0; if (!ctx.uaddr) { memset(&unspec, 0, sizeof(unspec)); ctx.uaddr = (struct sockaddr *)&unspec; + ctx.uaddrlen = 0; + } else { + ctx.uaddrlen = *uaddrlen; } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, - 0, flags); + ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, + 0, flags); + + if (!ret && uaddr) + *uaddrlen = ctx.uaddrlen; + + return ret; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); @@ -1496,7 +1511,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains * sk with connection information (IP addresses, etc.) May not contain * cgroup info if it is a req sock. - * @type: The type of program to be executed + * @atype: The type of program to be executed * * socket passed is expected to be of type INET or INET6. * @@ -1670,7 +1685,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { * @ppos: value-result argument: value is position at which read from or write * to sysctl is happening, result is new position if program overrode it, * initial value otherwise - * @type: type of program to be executed + * @atype: type of program to be executed * * Program is run when sysctl is being accessed, either read or written, and * can allow or deny such access. @@ -1785,7 +1800,7 @@ static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx, } int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, - int *optname, char __user *optval, + int *optname, sockptr_t optval, int *optlen, char **kernel_optval) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); @@ -1808,7 +1823,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, ctx.optlen = *optlen; - if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) { + if (copy_from_sockptr(ctx.optval, optval, + min(*optlen, max_optlen))) { ret = -EFAULT; goto out; } @@ -1875,8 +1891,8 @@ out: } int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, - int optname, char __user *optval, - int __user *optlen, int max_optlen, + int optname, sockptr_t optval, + sockptr_t optlen, int max_optlen, int retval) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); @@ -1903,8 +1919,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, * one that kernel returned as well to let * BPF programs inspect the value. */ - - if (get_user(ctx.optlen, optlen)) { + if (copy_from_sockptr(&ctx.optlen, optlen, + sizeof(ctx.optlen))) { ret = -EFAULT; goto out; } @@ -1915,8 +1931,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, } orig_optlen = ctx.optlen; - if (copy_from_user(ctx.optval, optval, - min(ctx.optlen, max_optlen)) != 0) { + if (copy_from_sockptr(ctx.optval, optval, + min(ctx.optlen, max_optlen))) { ret = -EFAULT; goto out; } @@ -1930,7 +1946,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, if (ret < 0) goto out; - if (optval && (ctx.optlen > max_optlen || ctx.optlen < 0)) { + if (!sockptr_is_null(optval) && + (ctx.optlen > max_optlen || ctx.optlen < 0)) { if (orig_optlen > PAGE_SIZE && ctx.optlen >= 0) { pr_info_once("bpf getsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n", ctx.optlen, max_optlen); @@ -1942,11 +1959,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, } if (ctx.optlen != 0) { - if (optval && copy_to_user(optval, ctx.optval, ctx.optlen)) { + if (!sockptr_is_null(optval) && + copy_to_sockptr(optval, ctx.optval, ctx.optlen)) { ret = -EFAULT; goto out; } - if (put_user(ctx.optlen, optlen)) { + if (copy_to_sockptr(optlen, &ctx.optlen, sizeof(ctx.optlen))) { ret = -EFAULT; goto out; } @@ -2519,10 +2537,13 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: return NULL; default: return &bpf_get_retval_proto; @@ -2534,10 +2555,13 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: return NULL; default: return &bpf_set_retval_proto; diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c index 810378f04fbc..f04a468cf6a7 100644 --- a/kernel/bpf/cgroup_iter.c +++ b/kernel/bpf/cgroup_iter.c @@ -282,7 +282,7 @@ static struct bpf_iter_reg bpf_cgroup_reg_info = { .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__cgroup, cgroup), - PTR_TO_BTF_ID_OR_NULL }, + PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, }, .seq_info = &cgroup_iter_seq_info, }; @@ -294,3 +294,66 @@ static int __init bpf_cgroup_iter_init(void) } late_initcall(bpf_cgroup_iter_init); + +struct bpf_iter_css { + __u64 __opaque[3]; +} __attribute__((aligned(8))); + +struct bpf_iter_css_kern { + struct cgroup_subsys_state *start; + struct cgroup_subsys_state *pos; + unsigned int flags; +} __attribute__((aligned(8))); + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it, + struct cgroup_subsys_state *start, unsigned int flags) +{ + struct bpf_iter_css_kern *kit = (void *)it; + + BUILD_BUG_ON(sizeof(struct bpf_iter_css_kern) > sizeof(struct bpf_iter_css)); + BUILD_BUG_ON(__alignof__(struct bpf_iter_css_kern) != __alignof__(struct bpf_iter_css)); + + kit->start = NULL; + switch (flags) { + case BPF_CGROUP_ITER_DESCENDANTS_PRE: + case BPF_CGROUP_ITER_DESCENDANTS_POST: + case BPF_CGROUP_ITER_ANCESTORS_UP: + break; + default: + return -EINVAL; + } + + kit->start = start; + kit->pos = NULL; + kit->flags = flags; + return 0; +} + +__bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it) +{ + struct bpf_iter_css_kern *kit = (void *)it; + + if (!kit->start) + return NULL; + + switch (kit->flags) { + case BPF_CGROUP_ITER_DESCENDANTS_PRE: + kit->pos = css_next_descendant_pre(kit->pos, kit->start); + break; + case BPF_CGROUP_ITER_DESCENDANTS_POST: + kit->pos = css_next_descendant_post(kit->pos, kit->start); + break; + case BPF_CGROUP_ITER_ANCESTORS_UP: + kit->pos = kit->pos ? kit->pos->parent : kit->start; + } + + return kit->pos; +} + +__bpf_kfunc void bpf_iter_css_destroy(struct bpf_iter_css *it) +{ +} + +__bpf_kfunc_end_defs(); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index dc85240a0134..fe254ae035fe 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -61,6 +61,7 @@ #define AX regs[BPF_REG_AX] #define ARG1 regs[BPF_REG_ARG1] #define CTX regs[BPF_REG_CTX] +#define OFF insn->off #define IMM insn->imm struct bpf_mem_alloc bpf_global_ma; @@ -211,7 +212,7 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, const struct bpf_line_info *linfo; void **jited_linfo; - if (!prog->aux->jited_linfo) + if (!prog->aux->jited_linfo || prog->aux->func_idx > prog->aux->func_cnt) /* Userspace did not provide linfo */ return; @@ -370,9 +371,18 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, s32 end_new, s32 curr, const bool probe_pass) { - const s32 off_min = S16_MIN, off_max = S16_MAX; + s64 off_min, off_max, off; s32 delta = end_new - end_old; - s32 off = insn->off; + + if (insn->code == (BPF_JMP32 | BPF_JA)) { + off = insn->imm; + off_min = S32_MIN; + off_max = S32_MAX; + } else { + off = insn->off; + off_min = S16_MIN; + off_max = S16_MAX; + } if (curr < pos && curr + off + 1 >= end_old) off += delta; @@ -380,8 +390,12 @@ static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, off -= delta; if (off < off_min || off > off_max) return -ERANGE; - if (!probe_pass) - insn->off = off; + if (!probe_pass) { + if (insn->code == (BPF_JMP32 | BPF_JA)) + insn->imm = off; + else + insn->off = off; + } return 0; } @@ -529,7 +543,7 @@ static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) { int i; - for (i = 0; i < fp->aux->func_cnt; i++) + for (i = 0; i < fp->aux->real_func_cnt; i++) bpf_prog_kallsyms_del(fp->aux->func[i]); } @@ -579,7 +593,7 @@ bpf_prog_ksym_set_name(struct bpf_prog *prog) sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); /* prog->aux->name will be ignored if full btf name is available */ - if (prog->aux->func_info_cnt) { + if (prog->aux->func_info_cnt && prog->aux->func_idx < prog->aux->func_info_cnt) { type = btf_type_by_id(prog->aux->btf, prog->aux->func_info[prog->aux->func_idx].type_id); func_name = btf_name_by_offset(prog->aux->btf, type->name_off); @@ -613,7 +627,11 @@ static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n) if (val < ksym->start) return -1; - if (val >= ksym->end) + /* Ensure that we detect return addresses as part of the program, when + * the final instruction is a call for a program part of the stack + * trace. Therefore, do val > ksym->end instead of val >= ksym->end. + */ + if (val > ksym->end) return 1; return 0; @@ -723,7 +741,7 @@ bool is_bpf_text_address(unsigned long addr) return ret; } -static struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) +struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) { struct bpf_ksym *ksym = bpf_ksym_find(addr); @@ -860,7 +878,7 @@ static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_ins GFP_KERNEL); if (!pack) return NULL; - pack->ptr = module_alloc(BPF_PROG_PACK_SIZE); + pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE); if (!pack->ptr) { kfree(pack); return NULL; @@ -884,7 +902,7 @@ void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) mutex_lock(&pack_mutex); if (size > BPF_PROG_PACK_SIZE) { size = round_up(size, PAGE_SIZE); - ptr = module_alloc(size); + ptr = bpf_jit_alloc_exec(size); if (ptr) { bpf_fill_ill_insns(ptr, size); set_vm_flush_reset_perms(ptr); @@ -922,7 +940,7 @@ void bpf_prog_pack_free(struct bpf_binary_header *hdr) mutex_lock(&pack_mutex); if (hdr->size > BPF_PROG_PACK_SIZE) { - module_memfree(hdr); + bpf_jit_free_exec(hdr); goto out; } @@ -946,7 +964,7 @@ void bpf_prog_pack_free(struct bpf_binary_header *hdr) if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, BPF_PROG_CHUNK_COUNT, 0) == 0) { list_del(&pack->list); - module_memfree(pack->ptr); + bpf_jit_free_exec(pack->ptr); kfree(pack); } out: @@ -1198,7 +1216,7 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog, if (!extra_pass) addr = NULL; else if (prog->aux->func && - off >= 0 && off < prog->aux->func_cnt) + off >= 0 && off < prog->aux->real_func_cnt) addr = (u8 *)prog->aux->func[off]->bpf_func; else return -EINVAL; @@ -1271,7 +1289,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, case BPF_ALU | BPF_MOD | BPF_K: *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); - *to++ = BPF_ALU32_REG(from->code, from->dst_reg, BPF_REG_AX); + *to++ = BPF_ALU32_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off); break; case BPF_ALU64 | BPF_ADD | BPF_K: @@ -1285,7 +1303,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, case BPF_ALU64 | BPF_MOD | BPF_K: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); - *to++ = BPF_ALU64_REG(from->code, from->dst_reg, BPF_REG_AX); + *to++ = BPF_ALU64_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off); break; case BPF_JMP | BPF_JEQ | BPF_K: @@ -1523,6 +1541,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_3(ALU64, DIV, X), \ INSN_3(ALU64, MOD, X), \ INSN_2(ALU64, NEG), \ + INSN_3(ALU64, END, TO_LE), \ /* Immediate based. */ \ INSN_3(ALU64, ADD, K), \ INSN_3(ALU64, SUB, K), \ @@ -1591,6 +1610,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_3(JMP, JSLE, K), \ INSN_3(JMP, JSET, K), \ INSN_2(JMP, JA), \ + INSN_2(JMP32, JA), \ /* Store instructions. */ \ /* Register based. */ \ INSN_3(STX, MEM, B), \ @@ -1610,6 +1630,9 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_3(LDX, MEM, H), \ INSN_3(LDX, MEM, W), \ INSN_3(LDX, MEM, DW), \ + INSN_3(LDX, MEMSX, B), \ + INSN_3(LDX, MEMSX, H), \ + INSN_3(LDX, MEMSX, W), \ /* Immediate based. */ \ INSN_3(LD, IMM, DW) @@ -1635,12 +1658,6 @@ bool bpf_opcode_in_insntable(u8 code) } #ifndef CONFIG_BPF_JIT_ALWAYS_ON -u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) -{ - memset(dst, 0, size); - return -EFAULT; -} - /** * ___bpf_prog_run - run eBPF program on a given context * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers @@ -1666,6 +1683,9 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn) [BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H, [BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W, [BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW, + [BPF_LDX | BPF_PROBE_MEMSX | BPF_B] = &&LDX_PROBE_MEMSX_B, + [BPF_LDX | BPF_PROBE_MEMSX | BPF_H] = &&LDX_PROBE_MEMSX_H, + [BPF_LDX | BPF_PROBE_MEMSX | BPF_W] = &&LDX_PROBE_MEMSX_W, }; #undef BPF_INSN_3_LBL #undef BPF_INSN_2_LBL @@ -1733,13 +1753,36 @@ select_insn: DST = -DST; CONT; ALU_MOV_X: - DST = (u32) SRC; + switch (OFF) { + case 0: + DST = (u32) SRC; + break; + case 8: + DST = (u32)(s8) SRC; + break; + case 16: + DST = (u32)(s16) SRC; + break; + } CONT; ALU_MOV_K: DST = (u32) IMM; CONT; ALU64_MOV_X: - DST = SRC; + switch (OFF) { + case 0: + DST = SRC; + break; + case 8: + DST = (s8) SRC; + break; + case 16: + DST = (s16) SRC; + break; + case 32: + DST = (s32) SRC; + break; + } CONT; ALU64_MOV_K: DST = IMM; @@ -1761,36 +1804,114 @@ select_insn: (*(s64 *) &DST) >>= IMM; CONT; ALU64_MOD_X: - div64_u64_rem(DST, SRC, &AX); - DST = AX; + switch (OFF) { + case 0: + div64_u64_rem(DST, SRC, &AX); + DST = AX; + break; + case 1: + AX = div64_s64(DST, SRC); + DST = DST - AX * SRC; + break; + } CONT; ALU_MOD_X: - AX = (u32) DST; - DST = do_div(AX, (u32) SRC); + switch (OFF) { + case 0: + AX = (u32) DST; + DST = do_div(AX, (u32) SRC); + break; + case 1: + AX = abs((s32)DST); + AX = do_div(AX, abs((s32)SRC)); + if ((s32)DST < 0) + DST = (u32)-AX; + else + DST = (u32)AX; + break; + } CONT; ALU64_MOD_K: - div64_u64_rem(DST, IMM, &AX); - DST = AX; + switch (OFF) { + case 0: + div64_u64_rem(DST, IMM, &AX); + DST = AX; + break; + case 1: + AX = div64_s64(DST, IMM); + DST = DST - AX * IMM; + break; + } CONT; ALU_MOD_K: - AX = (u32) DST; - DST = do_div(AX, (u32) IMM); + switch (OFF) { + case 0: + AX = (u32) DST; + DST = do_div(AX, (u32) IMM); + break; + case 1: + AX = abs((s32)DST); + AX = do_div(AX, abs((s32)IMM)); + if ((s32)DST < 0) + DST = (u32)-AX; + else + DST = (u32)AX; + break; + } CONT; ALU64_DIV_X: - DST = div64_u64(DST, SRC); + switch (OFF) { + case 0: + DST = div64_u64(DST, SRC); + break; + case 1: + DST = div64_s64(DST, SRC); + break; + } CONT; ALU_DIV_X: - AX = (u32) DST; - do_div(AX, (u32) SRC); - DST = (u32) AX; + switch (OFF) { + case 0: + AX = (u32) DST; + do_div(AX, (u32) SRC); + DST = (u32) AX; + break; + case 1: + AX = abs((s32)DST); + do_div(AX, abs((s32)SRC)); + if (((s32)DST < 0) == ((s32)SRC < 0)) + DST = (u32)AX; + else + DST = (u32)-AX; + break; + } CONT; ALU64_DIV_K: - DST = div64_u64(DST, IMM); + switch (OFF) { + case 0: + DST = div64_u64(DST, IMM); + break; + case 1: + DST = div64_s64(DST, IMM); + break; + } CONT; ALU_DIV_K: - AX = (u32) DST; - do_div(AX, (u32) IMM); - DST = (u32) AX; + switch (OFF) { + case 0: + AX = (u32) DST; + do_div(AX, (u32) IMM); + DST = (u32) AX; + break; + case 1: + AX = abs((s32)DST); + do_div(AX, abs((s32)IMM)); + if (((s32)DST < 0) == ((s32)IMM < 0)) + DST = (u32)AX; + else + DST = (u32)-AX; + break; + } CONT; ALU_END_TO_BE: switch (IMM) { @@ -1818,6 +1939,19 @@ select_insn: break; } CONT; + ALU64_END_TO_LE: + switch (IMM) { + case 16: + DST = (__force u16) __swab16(DST); + break; + case 32: + DST = (__force u32) __swab32(DST); + break; + case 64: + DST = (__force u64) __swab64(DST); + break; + } + CONT; /* CALL */ JMP_CALL: @@ -1867,6 +2001,9 @@ out: JMP_JA: insn += insn->off; CONT; + JMP32_JA: + insn += insn->imm; + CONT; JMP_EXIT: return BPF_R0; /* JMP */ @@ -1931,8 +2068,8 @@ out: DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ CONT; \ LDX_PROBE_MEM_##SIZEOP: \ - bpf_probe_read_kernel(&DST, sizeof(SIZE), \ - (const void *)(long) (SRC + insn->off)); \ + bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \ + (const void *)(long) (SRC + insn->off)); \ DST = *((SIZE *)&DST); \ CONT; @@ -1942,6 +2079,21 @@ out: LDST(DW, u64) #undef LDST +#define LDSX(SIZEOP, SIZE) \ + LDX_MEMSX_##SIZEOP: \ + DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ + CONT; \ + LDX_PROBE_MEMSX_##SIZEOP: \ + bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \ + (const void *)(long) (SRC + insn->off)); \ + DST = *((SIZE *)&DST); \ + CONT; + + LDSX(B, s8) + LDSX(H, s16) + LDSX(W, s32) +#undef LDSX + #define ATOMIC_ALU_OP(BOP, KOP) \ case BOP: \ if (BPF_SIZE(insn->code) == BPF_W) \ @@ -2577,7 +2729,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) #endif if (aux->dst_trampoline) bpf_trampoline_put(aux->dst_trampoline); - for (i = 0; i < aux->func_cnt; i++) { + for (i = 0; i < aux->real_func_cnt; i++) { /* We can just unlink the subprog poke descriptor table as * it was originally linked to the main program and is also * released along with it. @@ -2585,7 +2737,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) aux->func[i]->aux->poke_tab = NULL; bpf_jit_free(aux->func[i]); } - if (aux->func_cnt) { + if (aux->real_func_cnt) { kfree(aux->func); bpf_prog_unlock_free(aux->prog); } else { @@ -2770,6 +2922,15 @@ int __weak bpf_arch_text_invalidate(void *dst, size_t len) return -ENOTSUPP; } +bool __weak bpf_jit_supports_exceptions(void) +{ + return false; +} + +void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie) +{ +} + #ifdef CONFIG_BPF_SYSCALL static int __init bpf_global_ma_init(void) { diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 286ab3db0fde..8a0bb80fe48a 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -61,8 +61,6 @@ struct bpf_cpu_map_entry { /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ struct xdp_bulk_queue __percpu *bulkq; - struct bpf_cpu_map *cmap; - /* Queue with potential multi-producers, and single-consumer kthread */ struct ptr_ring *queue; struct task_struct *kthread; @@ -70,11 +68,8 @@ struct bpf_cpu_map_entry { struct bpf_cpumap_val value; struct bpf_prog *prog; - atomic_t refcnt; /* Control when this struct can be free'ed */ - struct rcu_head rcu; - - struct work_struct kthread_stop_wq; struct completion kthread_running; + struct rcu_work free_work; }; struct bpf_cpu_map { @@ -119,11 +114,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) return &cmap->map; } -static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) -{ - atomic_inc(&rcpu->refcnt); -} - static void __cpu_map_ring_cleanup(struct ptr_ring *ring) { /* The tear-down procedure should have made sure that queue is @@ -144,35 +134,6 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring) } } -static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) -{ - if (atomic_dec_and_test(&rcpu->refcnt)) { - if (rcpu->prog) - bpf_prog_put(rcpu->prog); - /* The queue should be empty at this point */ - __cpu_map_ring_cleanup(rcpu->queue); - ptr_ring_cleanup(rcpu->queue, NULL); - kfree(rcpu->queue); - kfree(rcpu); - } -} - -/* called from workqueue, to workaround syscall using preempt_disable */ -static void cpu_map_kthread_stop(struct work_struct *work) -{ - struct bpf_cpu_map_entry *rcpu; - - rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq); - - /* Wait for flush in __cpu_map_entry_free(), via full RCU barrier, - * as it waits until all in-flight call_rcu() callbacks complete. - */ - rcu_barrier(); - - /* kthread_stop will wake_up_process and wait for it to complete */ - kthread_stop(rcpu->kthread); -} - static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, struct list_head *listp, struct xdp_cpumap_stats *stats) @@ -397,7 +358,6 @@ static int cpu_map_kthread_run(void *data) } __set_current_state(TASK_RUNNING); - put_cpu_map_entry(rcpu); return 0; } @@ -474,9 +434,6 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, if (IS_ERR(rcpu->kthread)) goto free_prog; - get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */ - get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */ - /* Make sure kthread runs on a single CPU */ kthread_bind(rcpu->kthread, cpu); wake_up_process(rcpu->kthread); @@ -503,40 +460,40 @@ free_rcu: return NULL; } -static void __cpu_map_entry_free(struct rcu_head *rcu) +static void __cpu_map_entry_free(struct work_struct *work) { struct bpf_cpu_map_entry *rcpu; /* This cpu_map_entry have been disconnected from map and one - * RCU grace-period have elapsed. Thus, XDP cannot queue any + * RCU grace-period have elapsed. Thus, XDP cannot queue any * new packets and cannot change/set flush_needed that can * find this entry. */ - rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu); + rcpu = container_of(to_rcu_work(work), struct bpf_cpu_map_entry, free_work); + /* kthread_stop will wake_up_process and wait for it to complete. + * cpu_map_kthread_run() makes sure the pointer ring is empty + * before exiting. + */ + kthread_stop(rcpu->kthread); + + if (rcpu->prog) + bpf_prog_put(rcpu->prog); + /* The queue should be empty at this point */ + __cpu_map_ring_cleanup(rcpu->queue); + ptr_ring_cleanup(rcpu->queue, NULL); + kfree(rcpu->queue); free_percpu(rcpu->bulkq); - /* Cannot kthread_stop() here, last put free rcpu resources */ - put_cpu_map_entry(rcpu); + kfree(rcpu); } -/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to - * ensure any driver rcu critical sections have completed, but this - * does not guarantee a flush has happened yet. Because driver side - * rcu_read_lock/unlock only protects the running XDP program. The - * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a - * pending flush op doesn't fail. - * - * The bpf_cpu_map_entry is still used by the kthread, and there can - * still be pending packets (in queue and percpu bulkq). A refcnt - * makes sure to last user (kthread_stop vs. call_rcu) free memory - * resources. - * - * The rcu callback __cpu_map_entry_free flush remaining packets in - * percpu bulkq to queue. Due to caller map_delete_elem() disable - * preemption, cannot call kthread_stop() to make sure queue is empty. - * Instead a work_queue is started for stopping kthread, - * cpu_map_kthread_stop, which waits for an RCU grace period before - * stopping kthread, emptying the queue. +/* After the xchg of the bpf_cpu_map_entry pointer, we need to make sure the old + * entry is no longer in use before freeing. We use queue_rcu_work() to call + * __cpu_map_entry_free() in a separate workqueue after waiting for an RCU grace + * period. This means that (a) all pending enqueue and flush operations have + * completed (because of the RCU callback), and (b) we are in a workqueue + * context where we can stop the kthread and wait for it to exit before freeing + * everything. */ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, u32 key_cpu, struct bpf_cpu_map_entry *rcpu) @@ -545,9 +502,8 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu))); if (old_rcpu) { - call_rcu(&old_rcpu->rcu, __cpu_map_entry_free); - INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop); - schedule_work(&old_rcpu->kthread_stop_wq); + INIT_RCU_WORK(&old_rcpu->free_work, __cpu_map_entry_free); + queue_rcu_work(system_wq, &old_rcpu->free_work); } } @@ -559,7 +515,7 @@ static long cpu_map_delete_elem(struct bpf_map *map, void *key) if (key_cpu >= map->max_entries) return -EINVAL; - /* notice caller map_delete_elem() use preempt_disable() */ + /* notice caller map_delete_elem() uses rcu_read_lock() */ __cpu_map_entry_replace(cmap, key_cpu, NULL); return 0; } @@ -595,7 +551,6 @@ static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value, rcpu = __cpu_map_entry_alloc(map, &cpumap_value, key_cpu); if (!rcpu) return -ENOMEM; - rcpu->cmap = cmap; } rcu_read_lock(); __cpu_map_entry_replace(cmap, key_cpu, rcpu); @@ -611,16 +566,15 @@ static void cpu_map_free(struct bpf_map *map) /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the bpf programs (can be more than one that used this map) were * disconnected from events. Wait for outstanding critical sections in - * these programs to complete. The rcu critical section only guarantees - * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map. - * It does __not__ ensure pending flush operations (if any) are - * complete. + * these programs to complete. synchronize_rcu() below not only + * guarantees no further "XDP/bpf-side" reads against + * bpf_cpu_map->cpu_map, but also ensure pending flush operations + * (if any) are completed. */ - synchronize_rcu(); - /* For cpu_map the remote CPUs can still be using the entries - * (struct bpf_cpu_map_entry). + /* The only possible user of bpf_cpu_map_entry is + * cpu_map_kthread_run(). */ for (i = 0; i < cmap->map.max_entries; i++) { struct bpf_cpu_map_entry *rcpu; @@ -629,8 +583,8 @@ static void cpu_map_free(struct bpf_map *map) if (!rcpu) continue; - /* bq flush and cleanup happens after RCU grace-period */ - __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ + /* Stop kthread and cleanup entry directly */ + __cpu_map_entry_free(&rcpu->free_work.work); } bpf_map_area_free(cmap->cpu_map); bpf_map_area_free(cmap); @@ -810,6 +764,16 @@ void __cpu_map_flush(void) } } +#ifdef CONFIG_DEBUG_NET +bool cpu_map_check_flush(void) +{ + if (list_empty(this_cpu_ptr(&cpu_map_flush_list))) + return false; + __cpu_map_flush(); + return true; +} +#endif + static int __init cpu_map_init(void) { int cpu; diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index 938a60ff4295..e01c741e54e7 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -9,7 +9,6 @@ /** * struct bpf_cpumask - refcounted BPF cpumask wrapper structure * @cpumask: The actual cpumask embedded in the struct. - * @rcu: The RCU head used to free the cpumask with RCU safety. * @usage: Object reference counter. When the refcount goes to 0, the * memory is released back to the BPF allocator, which provides * RCU safety. @@ -25,7 +24,6 @@ */ struct bpf_cpumask { cpumask_t cpumask; - struct rcu_head rcu; refcount_t usage; }; @@ -36,9 +34,7 @@ static bool cpu_valid(u32 cpu) return cpu < nr_cpu_ids; } -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global kfuncs as their definitions will be in BTF"); +__bpf_kfunc_start_defs(); /** * bpf_cpumask_create() - Create a mutable BPF cpumask. @@ -82,16 +78,6 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) return cpumask; } -static void cpumask_free_cb(struct rcu_head *head) -{ - struct bpf_cpumask *cpumask; - - cpumask = container_of(head, struct bpf_cpumask, rcu); - migrate_disable(); - bpf_mem_cache_free(&bpf_cpumask_ma, cpumask); - migrate_enable(); -} - /** * bpf_cpumask_release() - Release a previously acquired BPF cpumask. * @cpumask: The cpumask being released. @@ -102,8 +88,12 @@ static void cpumask_free_cb(struct rcu_head *head) */ __bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask) { - if (refcount_dec_and_test(&cpumask->usage)) - call_rcu(&cpumask->rcu, cpumask_free_cb); + if (!refcount_dec_and_test(&cpumask->usage)) + return; + + migrate_disable(); + bpf_mem_cache_free_rcu(&bpf_cpumask_ma, cpumask); + migrate_enable(); } /** @@ -415,7 +405,7 @@ __bpf_kfunc u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, return cpumask_any_and_distribute(src1, src2); } -__diag_pop(); +__bpf_kfunc_end_defs(); BTF_SET8_START(cpumask_kfunc_btf_ids) BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 49cc0b5671c6..a936c704d4e7 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -65,7 +65,6 @@ struct xdp_dev_bulk_queue { struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; - struct bpf_dtab *dtab; struct bpf_prog *xdp_prog; struct rcu_head rcu; unsigned int idx; @@ -419,6 +418,16 @@ void __dev_flush(void) } } +#ifdef CONFIG_DEBUG_NET +bool dev_check_flush(void) +{ + if (list_empty(this_cpu_ptr(&dev_flush_list))) + return false; + __dev_flush(); + return true; +} +#endif + /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or * by local_bh_disable() (from XDP calls inside NAPI). The * rcu_read_lock_bh_held() below makes lockdep accept both. @@ -874,7 +883,6 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, } dev->idx = idx; - dev->dtab = dtab; if (prog) { dev->xdp_prog = prog; dev->val.bpf_prog.id = prog->aux->id; diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 7b4afb7d96db..49940c26a227 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -87,6 +87,17 @@ const char *const bpf_alu_string[16] = { [BPF_END >> 4] = "endian", }; +static const char *const bpf_alu_sign_string[16] = { + [BPF_DIV >> 4] = "s/=", + [BPF_MOD >> 4] = "s%=", +}; + +static const char *const bpf_movsx_string[4] = { + [0] = "(s8)", + [1] = "(s16)", + [3] = "(s32)", +}; + static const char *const bpf_atomic_alu_string[16] = { [BPF_ADD >> 4] = "add", [BPF_AND >> 4] = "and", @@ -101,6 +112,12 @@ static const char *const bpf_ldst_string[] = { [BPF_DW >> 3] = "u64", }; +static const char *const bpf_ldsx_string[] = { + [BPF_W >> 3] = "s32", + [BPF_H >> 3] = "s16", + [BPF_B >> 3] = "s8", +}; + static const char *const bpf_jmp_string[16] = { [BPF_JA >> 4] = "jmp", [BPF_JEQ >> 4] = "==", @@ -128,6 +145,27 @@ static void print_bpf_end_insn(bpf_insn_print_t verbose, insn->imm, insn->dst_reg); } +static void print_bpf_bswap_insn(bpf_insn_print_t verbose, + void *private_data, + const struct bpf_insn *insn) +{ + verbose(private_data, "(%02x) r%d = bswap%d r%d\n", + insn->code, insn->dst_reg, + insn->imm, insn->dst_reg); +} + +static bool is_sdiv_smod(const struct bpf_insn *insn) +{ + return (BPF_OP(insn->code) == BPF_DIV || BPF_OP(insn->code) == BPF_MOD) && + insn->off == 1; +} + +static bool is_movsx(const struct bpf_insn *insn) +{ + return BPF_OP(insn->code) == BPF_MOV && + (insn->off == 8 || insn->off == 16 || insn->off == 32); +} + void print_bpf_insn(const struct bpf_insn_cbs *cbs, const struct bpf_insn *insn, bool allow_ptr_leaks) @@ -138,7 +176,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, if (class == BPF_ALU || class == BPF_ALU64) { if (BPF_OP(insn->code) == BPF_END) { if (class == BPF_ALU64) - verbose(cbs->private_data, "BUG_alu64_%02x\n", insn->code); + print_bpf_bswap_insn(verbose, cbs->private_data, insn); else print_bpf_end_insn(verbose, cbs->private_data, insn); } else if (BPF_OP(insn->code) == BPF_NEG) { @@ -147,17 +185,20 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, insn->dst_reg, class == BPF_ALU ? 'w' : 'r', insn->dst_reg); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(cbs->private_data, "(%02x) %c%d %s %c%d\n", + verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n", insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, - bpf_alu_string[BPF_OP(insn->code) >> 4], + is_sdiv_smod(insn) ? bpf_alu_sign_string[BPF_OP(insn->code) >> 4] + : bpf_alu_string[BPF_OP(insn->code) >> 4], + is_movsx(insn) ? bpf_movsx_string[(insn->off >> 3) - 1] : "", class == BPF_ALU ? 'w' : 'r', insn->src_reg); } else { verbose(cbs->private_data, "(%02x) %c%d %s %d\n", insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, - bpf_alu_string[BPF_OP(insn->code) >> 4], + is_sdiv_smod(insn) ? bpf_alu_sign_string[BPF_OP(insn->code) >> 4] + : bpf_alu_string[BPF_OP(insn->code) >> 4], insn->imm); } } else if (class == BPF_STX) { @@ -218,13 +259,15 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, verbose(cbs->private_data, "BUG_st_%02x\n", insn->code); } } else if (class == BPF_LDX) { - if (BPF_MODE(insn->code) != BPF_MEM) { + if (BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) { verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code); return; } verbose(cbs->private_data, "(%02x) r%d = *(%s *)(r%d %+d)\n", insn->code, insn->dst_reg, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + BPF_MODE(insn->code) == BPF_MEM ? + bpf_ldst_string[BPF_SIZE(insn->code) >> 3] : + bpf_ldsx_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->off); } else if (class == BPF_LD) { if (BPF_MODE(insn->code) == BPF_ABS) { @@ -279,6 +322,9 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, } else if (insn->code == (BPF_JMP | BPF_JA)) { verbose(cbs->private_data, "(%02x) goto pc%+d\n", insn->code, insn->off); + } else if (insn->code == (BPF_JMP32 | BPF_JA)) { + verbose(cbs->private_data, "(%02x) gotol pc%+d\n", + insn->code, insn->imm); } else if (insn->code == (BPF_JMP | BPF_EXIT)) { verbose(cbs->private_data, "(%02x) exit\n", insn->code); } else if (BPF_SRC(insn->code) == BPF_X) { diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 56d3da7d0bc6..fd8d4b0addfc 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -155,13 +155,15 @@ static inline int htab_lock_bucket(const struct bpf_htab *htab, hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); preempt_disable(); + local_irq_save(flags); if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { __this_cpu_dec(*(htab->map_locked[hash])); + local_irq_restore(flags); preempt_enable(); return -EBUSY; } - raw_spin_lock_irqsave(&b->raw_lock, flags); + raw_spin_lock(&b->raw_lock); *pflags = flags; return 0; @@ -172,8 +174,9 @@ static inline void htab_unlock_bucket(const struct bpf_htab *htab, unsigned long flags) { hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); - raw_spin_unlock_irqrestore(&b->raw_lock, flags); + raw_spin_unlock(&b->raw_lock); __this_cpu_dec(*(htab->map_locked[hash])); + local_irq_restore(flags); preempt_enable(); } @@ -302,6 +305,7 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, struct htab_elem *l; if (node) { + bpf_map_inc_elem_count(&htab->map); l = container_of(node, struct htab_elem, lru_node); memcpy(l->key, key, htab->map.key_size); return l; @@ -510,12 +514,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->n_buckets > U32_MAX / sizeof(struct bucket)) goto free_htab; + err = bpf_map_init_elem_count(&htab->map); + if (err) + goto free_htab; + err = -ENOMEM; htab->buckets = bpf_map_area_alloc(htab->n_buckets * sizeof(struct bucket), htab->map.numa_node); if (!htab->buckets) - goto free_htab; + goto free_elem_count; for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) { htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map, @@ -593,6 +601,8 @@ free_map_locked: bpf_map_area_free(htab->buckets); bpf_mem_alloc_destroy(&htab->pcpu_ma); bpf_mem_alloc_destroy(&htab->ma); +free_elem_count: + bpf_map_free_elem_count(&htab->map); free_htab: lockdep_unregister_key(&htab->lockdep_key); bpf_map_area_free(htab); @@ -804,6 +814,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) if (l == tgt_l) { hlist_nulls_del_rcu(&l->hash_node); check_and_free_fields(htab, l); + bpf_map_dec_elem_count(&htab->map); break; } @@ -900,6 +911,8 @@ static bool is_map_full(struct bpf_htab *htab) static void inc_elem_count(struct bpf_htab *htab) { + bpf_map_inc_elem_count(&htab->map); + if (htab->use_percpu_counter) percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH); else @@ -908,6 +921,8 @@ static void inc_elem_count(struct bpf_htab *htab) static void dec_elem_count(struct bpf_htab *htab) { + bpf_map_dec_elem_count(&htab->map); + if (htab->use_percpu_counter) percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH); else @@ -920,6 +935,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) htab_put_fd_value(htab, l); if (htab_is_prealloc(htab)) { + bpf_map_dec_elem_count(&htab->map); check_and_free_fields(htab, l); __pcpu_freelist_push(&htab->freelist, &l->fnode); } else { @@ -1000,6 +1016,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, if (!l) return ERR_PTR(-E2BIG); l_new = container_of(l, struct htab_elem, fnode); + bpf_map_inc_elem_count(&htab->map); } } else { if (is_map_full(htab)) @@ -1168,6 +1185,7 @@ err: static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem) { check_and_free_fields(htab, elem); + bpf_map_dec_elem_count(&htab->map); bpf_lru_push_free(&htab->lru, &elem->lru_node); } @@ -1357,8 +1375,10 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, err: htab_unlock_bucket(htab, b, hash, flags); err_lock_bucket: - if (l_new) + if (l_new) { + bpf_map_dec_elem_count(&htab->map); bpf_lru_push_free(&htab->lru, &l_new->lru_node); + } return ret; } @@ -1523,6 +1543,7 @@ static void htab_map_free(struct bpf_map *map) prealloc_destroy(htab); } + bpf_map_free_elem_count(map); free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); bpf_mem_alloc_destroy(&htab->pcpu_ma); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9e80efa59a5d..56b0c1f678ee 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -22,6 +22,7 @@ #include <linux/security.h> #include <linux/btf_ids.h> #include <linux/bpf_mem_alloc.h> +#include <linux/kasan.h> #include "../../lib/kstrtox.h" @@ -286,6 +287,7 @@ static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0"); BUILD_BUG_ON(sizeof(*l) != sizeof(__u32)); BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32)); + preempt_disable(); arch_spin_lock(l); } @@ -294,6 +296,7 @@ static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) arch_spinlock_t *l = (void *)lock; arch_spin_unlock(l); + preempt_enable(); } #else @@ -1174,13 +1177,6 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map ret = -EBUSY; goto out; } - if (!atomic64_read(&map->usercnt)) { - /* maps with timers must be either held by user space - * or pinned in bpffs. - */ - ret = -EPERM; - goto out; - } /* allocate hrtimer via map_kmalloc to use memcg accounting */ t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node); if (!t) { @@ -1193,7 +1189,21 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map rcu_assign_pointer(t->callback_fn, NULL); hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); t->timer.function = bpf_timer_cb; - timer->timer = t; + WRITE_ONCE(timer->timer, t); + /* Guarantee the order between timer->timer and map->usercnt. So + * when there are concurrent uref release and bpf timer init, either + * bpf_timer_cancel_and_free() called by uref release reads a no-NULL + * timer or atomic64_read() below returns a zero usercnt. + */ + smp_mb(); + if (!atomic64_read(&map->usercnt)) { + /* maps with timers must be either held by user space + * or pinned in bpffs. + */ + WRITE_ONCE(timer->timer, NULL); + kfree(t); + ret = -EPERM; + } out: __bpf_spin_unlock_irqrestore(&timer->lock); return ret; @@ -1269,7 +1279,7 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla if (in_nmi()) return -EOPNOTSUPP; - if (flags > BPF_F_TIMER_ABS) + if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN)) return -EINVAL; __bpf_spin_lock_irqsave(&timer->lock); t = timer->timer; @@ -1283,6 +1293,9 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla else mode = HRTIMER_MODE_REL_SOFT; + if (flags & BPF_F_TIMER_CPU_PIN) + mode |= HRTIMER_MODE_PINNED; + hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode); out: __bpf_spin_unlock_irqrestore(&timer->lock); @@ -1368,7 +1381,7 @@ void bpf_timer_cancel_and_free(void *val) /* The subsequent bpf_timer_start/cancel() helpers won't be able to use * this timer, since it won't be initialized. */ - timer->timer = NULL; + WRITE_ONCE(timer->timer, NULL); out: __bpf_spin_unlock_irqrestore(&timer->lock); if (!t) @@ -1805,8 +1818,6 @@ bpf_base_func_proto(enum bpf_func_id func_id) } } -void __bpf_obj_drop_impl(void *p, const struct btf_record *rec); - void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock) { @@ -1838,7 +1849,7 @@ unlock: * bpf_list_head which needs to be freed. */ migrate_disable(); - __bpf_obj_drop_impl(obj, field->graph_root.value_rec); + __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); migrate_enable(); } } @@ -1877,14 +1888,12 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root, migrate_disable(); - __bpf_obj_drop_impl(obj, field->graph_root.value_rec); + __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); migrate_enable(); } } -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in vmlinux BTF"); +__bpf_kfunc_start_defs(); __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) { @@ -1900,9 +1909,19 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) return p; } +__bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign) +{ + u64 size = local_type_id__k; + + /* The verifier has ensured that meta__ign must be NULL */ + return bpf_mem_alloc(&bpf_global_percpu_ma, size); +} + /* Must be called under migrate_disable(), as required by bpf_mem_free */ -void __bpf_obj_drop_impl(void *p, const struct btf_record *rec) +void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu) { + struct bpf_mem_alloc *ma; + if (rec && rec->refcount_off >= 0 && !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) { /* Object is refcounted and refcount_dec didn't result in 0 @@ -1913,7 +1932,15 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec) if (rec) bpf_obj_free_fields(rec, p); - bpf_mem_free(&bpf_global_ma, p); + + if (percpu) + ma = &bpf_global_percpu_ma; + else + ma = &bpf_global_ma; + if (rec && rec->refcount_off >= 0) + bpf_mem_free_rcu(ma, p); + else + bpf_mem_free(ma, p); } __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) @@ -1921,7 +1948,13 @@ __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) struct btf_struct_meta *meta = meta__ign; void *p = p__alloc; - __bpf_obj_drop_impl(p, meta ? meta->record : NULL); + __bpf_obj_drop_impl(p, meta ? meta->record : NULL, false); +} + +__bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign) +{ + /* The verifier has ensured that meta__ign must be NULL */ + bpf_mem_free_rcu(&bpf_global_percpu_ma, p__alloc); } __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign) @@ -1942,23 +1975,29 @@ __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta return (void *)p__refcounted_kptr; } -static int __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head, +static int __bpf_list_add(struct bpf_list_node_kern *node, + struct bpf_list_head *head, bool tail, struct btf_record *rec, u64 off) { - struct list_head *n = (void *)node, *h = (void *)head; + struct list_head *n = &node->list_head, *h = (void *)head; /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't * called on its fields, so init here */ if (unlikely(!h->next)) INIT_LIST_HEAD(h); - if (!list_empty(n)) { + + /* node->owner != NULL implies !list_empty(n), no need to separately + * check the latter + */ + if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) { /* Only called from BPF prog, no need to migrate_disable */ - __bpf_obj_drop_impl((void *)n - off, rec); + __bpf_obj_drop_impl((void *)n - off, rec, false); return -EINVAL; } tail ? list_add_tail(n, h) : list_add(n, h); + WRITE_ONCE(node->owner, head); return 0; } @@ -1967,25 +2006,26 @@ __bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head, struct bpf_list_node *node, void *meta__ign, u64 off) { + struct bpf_list_node_kern *n = (void *)node; struct btf_struct_meta *meta = meta__ign; - return __bpf_list_add(node, head, false, - meta ? meta->record : NULL, off); + return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off); } __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head, struct bpf_list_node *node, void *meta__ign, u64 off) { + struct bpf_list_node_kern *n = (void *)node; struct btf_struct_meta *meta = meta__ign; - return __bpf_list_add(node, head, true, - meta ? meta->record : NULL, off); + return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off); } static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail) { struct list_head *n, *h = (void *)head; + struct bpf_list_node_kern *node; /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't * called on its fields, so init here @@ -1994,8 +2034,14 @@ static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tai INIT_LIST_HEAD(h); if (list_empty(h)) return NULL; + n = tail ? h->prev : h->next; + node = container_of(n, struct bpf_list_node_kern, list_head); + if (WARN_ON_ONCE(READ_ONCE(node->owner) != head)) + return NULL; + list_del_init(n); + WRITE_ONCE(node->owner, NULL); return (struct bpf_list_node *)n; } @@ -2012,31 +2058,40 @@ __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, struct bpf_rb_node *node) { + struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node; struct rb_root_cached *r = (struct rb_root_cached *)root; - struct rb_node *n = (struct rb_node *)node; + struct rb_node *n = &node_internal->rb_node; - if (RB_EMPTY_NODE(n)) + /* node_internal->owner != root implies either RB_EMPTY_NODE(n) or + * n is owned by some other tree. No need to check RB_EMPTY_NODE(n) + */ + if (READ_ONCE(node_internal->owner) != root) return NULL; rb_erase_cached(n, r); RB_CLEAR_NODE(n); + WRITE_ONCE(node_internal->owner, NULL); return (struct bpf_rb_node *)n; } /* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF * program */ -static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, +static int __bpf_rbtree_add(struct bpf_rb_root *root, + struct bpf_rb_node_kern *node, void *less, struct btf_record *rec, u64 off) { struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node; - struct rb_node *parent = NULL, *n = (struct rb_node *)node; + struct rb_node *parent = NULL, *n = &node->rb_node; bpf_callback_t cb = (bpf_callback_t)less; bool leftmost = true; - if (!RB_EMPTY_NODE(n)) { + /* node->owner != NULL implies !RB_EMPTY_NODE(n), no need to separately + * check the latter + */ + if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) { /* Only called from BPF prog, no need to migrate_disable */ - __bpf_obj_drop_impl((void *)n - off, rec); + __bpf_obj_drop_impl((void *)n - off, rec, false); return -EINVAL; } @@ -2052,6 +2107,7 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, rb_link_node(n, parent, link); rb_insert_color_cached(n, (struct rb_root_cached *)root, leftmost); + WRITE_ONCE(node->owner, root); return 0; } @@ -2060,8 +2116,9 @@ __bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node void *meta__ign, u64 off) { struct btf_struct_meta *meta = meta__ign; + struct bpf_rb_node_kern *n = (void *)node; - return __bpf_rbtree_add(root, node, (void *)less, meta ? meta->record : NULL, off); + return __bpf_rbtree_add(root, n, (void *)less, meta ? meta->record : NULL, off); } __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) @@ -2167,7 +2224,12 @@ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid) __bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task, struct cgroup *ancestor) { - return task_under_cgroup_hierarchy(task, ancestor); + long ret; + + rcu_read_lock(); + ret = task_under_cgroup_hierarchy(task, ancestor); + rcu_read_unlock(); + return ret; } #endif /* CONFIG_CGROUPS */ @@ -2239,11 +2301,14 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset case BPF_DYNPTR_TYPE_RINGBUF: return ptr->data + ptr->offset + offset; case BPF_DYNPTR_TYPE_SKB: - return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__opt); + if (buffer__opt) + return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__opt); + else + return skb_pointer_if_linear(ptr->data, ptr->offset + offset, len); case BPF_DYNPTR_TYPE_XDP: { void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len); - if (xdp_ptr) + if (!IS_ERR_OR_NULL(xdp_ptr)) return xdp_ptr; if (!buffer__opt) @@ -2402,14 +2467,59 @@ __bpf_kfunc void bpf_rcu_read_unlock(void) rcu_read_unlock(); } -__diag_pop(); +struct bpf_throw_ctx { + struct bpf_prog_aux *aux; + u64 sp; + u64 bp; + int cnt; +}; + +static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp) +{ + struct bpf_throw_ctx *ctx = cookie; + struct bpf_prog *prog; + + if (!is_bpf_text_address(ip)) + return !ctx->cnt; + prog = bpf_prog_ksym_find(ip); + ctx->cnt++; + if (bpf_is_subprog(prog)) + return true; + ctx->aux = prog->aux; + ctx->sp = sp; + ctx->bp = bp; + return false; +} + +__bpf_kfunc void bpf_throw(u64 cookie) +{ + struct bpf_throw_ctx ctx = {}; + + arch_bpf_stack_walk(bpf_stack_walker, &ctx); + WARN_ON_ONCE(!ctx.aux); + if (ctx.aux) + WARN_ON_ONCE(!ctx.aux->exception_boundary); + WARN_ON_ONCE(!ctx.bp); + WARN_ON_ONCE(!ctx.cnt); + /* Prevent KASAN false positives for CONFIG_KASAN_STACK by unpoisoning + * deeper stack depths than ctx.sp as we do not return from bpf_throw, + * which skips compiler generated instrumentation to do the same. + */ + kasan_unpoison_task_stack_below((void *)(long)ctx.sp); + ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp); + WARN(1, "A call to BPF exception callback should never return\n"); +} + +__bpf_kfunc_end_defs(); BTF_SET8_START(generic_btf_ids) #ifdef CONFIG_KEXEC_CORE BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) #endif BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE) BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_push_front_impl) BTF_ID_FLAGS(func, bpf_list_push_back_impl) @@ -2429,6 +2539,7 @@ BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU) #endif BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_throw) BTF_SET8_END(generic_btf_ids) static const struct btf_kfunc_id_set generic_kfunc_set = { @@ -2455,6 +2566,20 @@ BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY) +BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU) +BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY) +#ifdef CONFIG_CGROUPS +BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY) +BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY) +#endif +BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_dynptr_adjust) BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 4174f76133df..1aafb2ff2e95 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -118,9 +118,7 @@ static struct inode *bpf_get_inode(struct super_block *sb, return ERR_PTR(-ENOSPC); inode->i_ino = get_next_ino(); - inode->i_atime = current_time(inode); - inode->i_mtime = inode->i_atime; - inode->i_ctime = inode->i_atime; + simple_inode_init_ts(inode); inode_init_owner(&nop_mnt_idmap, inode, dir, mode); @@ -148,8 +146,7 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, d_instantiate(dentry, inode); dget(dentry); - dir->i_mtime = current_time(dir); - dir->i_ctime = dir->i_mtime; + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index b0fa190b0979..6abd7c5df4b3 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -78,8 +78,7 @@ static const struct seq_operations bpf_map_seq_ops = { .show = bpf_map_seq_show, }; -BTF_ID_LIST(btf_bpf_map_id) -BTF_ID(struct, bpf_map) +BTF_ID_LIST_GLOBAL_SINGLE(btf_bpf_map_id, struct, bpf_map) static const struct bpf_iter_seq_info bpf_map_seq_info = { .seq_ops = &bpf_map_seq_ops, @@ -93,7 +92,7 @@ static struct bpf_iter_reg bpf_map_reg_info = { .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_map, map), - PTR_TO_BTF_ID_OR_NULL }, + PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, }, .seq_info = &bpf_map_seq_info, }; @@ -193,3 +192,38 @@ static int __init bpf_map_iter_init(void) } late_initcall(bpf_map_iter_init); + +__bpf_kfunc_start_defs(); + +__bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map) +{ + s64 *pcount; + s64 ret = 0; + int cpu; + + if (!map || !map->elem_count) + return 0; + + for_each_possible_cpu(cpu) { + pcount = per_cpu_ptr(map->elem_count, cpu); + ret += READ_ONCE(*pcount); + } + return ret; +} + +__bpf_kfunc_end_defs(); + +BTF_SET8_START(bpf_map_iter_kfunc_ids) +BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS) +BTF_SET8_END(bpf_map_iter_kfunc_ids) + +static const struct btf_kfunc_id_set bpf_map_iter_kfunc_set = { + .owner = THIS_MODULE, + .set = &bpf_map_iter_kfunc_ids, +}; + +static int init_subsystem(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &bpf_map_iter_kfunc_set); +} +late_initcall(init_subsystem); diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 0668bcd7c926..6a51cfe4c2d6 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -98,11 +98,23 @@ struct bpf_mem_cache { int free_cnt; int low_watermark, high_watermark, batch; int percpu_size; + bool draining; + struct bpf_mem_cache *tgt; - struct rcu_head rcu; + /* list of objects to be freed after RCU GP */ struct llist_head free_by_rcu; + struct llist_node *free_by_rcu_tail; struct llist_head waiting_for_gp; + struct llist_node *waiting_for_gp_tail; + struct rcu_head rcu; atomic_t call_rcu_in_progress; + struct llist_head free_llist_extra_rcu; + + /* list of objects to be freed after RCU tasks trace GP */ + struct llist_head free_by_rcu_ttrace; + struct llist_head waiting_for_gp_ttrace; + struct rcu_head rcu_ttrace; + atomic_t call_rcu_ttrace_in_progress; }; struct bpf_mem_caches { @@ -153,59 +165,87 @@ static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c) #endif } +static void inc_active(struct bpf_mem_cache *c, unsigned long *flags) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + /* In RT irq_work runs in per-cpu kthread, so disable + * interrupts to avoid preemption and interrupts and + * reduce the chance of bpf prog executing on this cpu + * when active counter is busy. + */ + local_irq_save(*flags); + /* alloc_bulk runs from irq_work which will not preempt a bpf + * program that does unit_alloc/unit_free since IRQs are + * disabled there. There is no race to increment 'active' + * counter. It protects free_llist from corruption in case NMI + * bpf prog preempted this loop. + */ + WARN_ON_ONCE(local_inc_return(&c->active) != 1); +} + +static void dec_active(struct bpf_mem_cache *c, unsigned long *flags) +{ + local_dec(&c->active); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_restore(*flags); +} + +static void add_obj_to_free_list(struct bpf_mem_cache *c, void *obj) +{ + unsigned long flags; + + inc_active(c, &flags); + __llist_add(obj, &c->free_llist); + c->free_cnt++; + dec_active(c, &flags); +} + /* Mostly runs from irq_work except __init phase. */ -static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node) +static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic) { struct mem_cgroup *memcg = NULL, *old_memcg; - unsigned long flags; + gfp_t gfp; void *obj; int i; - memcg = get_memcg(c); - old_memcg = set_active_memcg(memcg); + gfp = __GFP_NOWARN | __GFP_ACCOUNT; + gfp |= atomic ? GFP_NOWAIT : GFP_KERNEL; + for (i = 0; i < cnt; i++) { /* - * free_by_rcu is only manipulated by irq work refill_work(). - * IRQ works on the same CPU are called sequentially, so it is - * safe to use __llist_del_first() here. If alloc_bulk() is - * invoked by the initial prefill, there will be no running - * refill_work(), so __llist_del_first() is fine as well. - * - * In most cases, objects on free_by_rcu are from the same CPU. - * If some objects come from other CPUs, it doesn't incur any - * harm because NUMA_NO_NODE means the preference for current - * numa node and it is not a guarantee. + * For every 'c' llist_del_first(&c->free_by_rcu_ttrace); is + * done only by one CPU == current CPU. Other CPUs might + * llist_add() and llist_del_all() in parallel. */ - obj = __llist_del_first(&c->free_by_rcu); - if (!obj) { - /* Allocate, but don't deplete atomic reserves that typical - * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc - * will allocate from the current numa node which is what we - * want here. - */ - obj = __alloc(c, node, GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT); - if (!obj) - break; - } - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - /* In RT irq_work runs in per-cpu kthread, so disable - * interrupts to avoid preemption and interrupts and - * reduce the chance of bpf prog executing on this cpu - * when active counter is busy. - */ - local_irq_save(flags); - /* alloc_bulk runs from irq_work which will not preempt a bpf - * program that does unit_alloc/unit_free since IRQs are - * disabled there. There is no race to increment 'active' - * counter. It protects free_llist from corruption in case NMI - * bpf prog preempted this loop. + obj = llist_del_first(&c->free_by_rcu_ttrace); + if (!obj) + break; + add_obj_to_free_list(c, obj); + } + if (i >= cnt) + return; + + for (; i < cnt; i++) { + obj = llist_del_first(&c->waiting_for_gp_ttrace); + if (!obj) + break; + add_obj_to_free_list(c, obj); + } + if (i >= cnt) + return; + + memcg = get_memcg(c); + old_memcg = set_active_memcg(memcg); + for (; i < cnt; i++) { + /* Allocate, but don't deplete atomic reserves that typical + * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc + * will allocate from the current numa node which is what we + * want here. */ - WARN_ON_ONCE(local_inc_return(&c->active) != 1); - __llist_add(obj, &c->free_llist); - c->free_cnt++; - local_dec(&c->active); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - local_irq_restore(flags); + obj = __alloc(c, node, gfp); + if (!obj) + break; + add_obj_to_free_list(c, obj); } set_active_memcg(old_memcg); mem_cgroup_put(memcg); @@ -222,20 +262,24 @@ static void free_one(void *obj, bool percpu) kfree(obj); } -static void free_all(struct llist_node *llnode, bool percpu) +static int free_all(struct llist_node *llnode, bool percpu) { struct llist_node *pos, *t; + int cnt = 0; - llist_for_each_safe(pos, t, llnode) + llist_for_each_safe(pos, t, llnode) { free_one(pos, percpu); + cnt++; + } + return cnt; } static void __free_rcu(struct rcu_head *head) { - struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); + struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace); - free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size); - atomic_set(&c->call_rcu_in_progress, 0); + free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size); + atomic_set(&c->call_rcu_ttrace_in_progress, 0); } static void __free_rcu_tasks_trace(struct rcu_head *head) @@ -254,60 +298,132 @@ static void enque_to_free(struct bpf_mem_cache *c, void *obj) struct llist_node *llnode = obj; /* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work. - * Nothing races to add to free_by_rcu list. + * Nothing races to add to free_by_rcu_ttrace list. */ - __llist_add(llnode, &c->free_by_rcu); + llist_add(llnode, &c->free_by_rcu_ttrace); } -static void do_call_rcu(struct bpf_mem_cache *c) +static void do_call_rcu_ttrace(struct bpf_mem_cache *c) { struct llist_node *llnode, *t; - if (atomic_xchg(&c->call_rcu_in_progress, 1)) + if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) { + if (unlikely(READ_ONCE(c->draining))) { + llnode = llist_del_all(&c->free_by_rcu_ttrace); + free_all(llnode, !!c->percpu_size); + } return; + } + + WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace)); + llist_for_each_safe(llnode, t, llist_del_all(&c->free_by_rcu_ttrace)) + llist_add(llnode, &c->waiting_for_gp_ttrace); + + if (unlikely(READ_ONCE(c->draining))) { + __free_rcu(&c->rcu_ttrace); + return; + } - WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp)); - llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu)) - /* There is no concurrent __llist_add(waiting_for_gp) access. - * It doesn't race with llist_del_all either. - * But there could be two concurrent llist_del_all(waiting_for_gp): - * from __free_rcu() and from drain_mem_cache(). - */ - __llist_add(llnode, &c->waiting_for_gp); /* Use call_rcu_tasks_trace() to wait for sleepable progs to finish. * If RCU Tasks Trace grace period implies RCU grace period, free * these elements directly, else use call_rcu() to wait for normal * progs to finish and finally do free_one() on each element. */ - call_rcu_tasks_trace(&c->rcu, __free_rcu_tasks_trace); + call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu_tasks_trace); } static void free_bulk(struct bpf_mem_cache *c) { + struct bpf_mem_cache *tgt = c->tgt; struct llist_node *llnode, *t; unsigned long flags; int cnt; + WARN_ON_ONCE(tgt->unit_size != c->unit_size); + WARN_ON_ONCE(tgt->percpu_size != c->percpu_size); + do { - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - local_irq_save(flags); - WARN_ON_ONCE(local_inc_return(&c->active) != 1); + inc_active(c, &flags); llnode = __llist_del_first(&c->free_llist); if (llnode) cnt = --c->free_cnt; else cnt = 0; - local_dec(&c->active); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - local_irq_restore(flags); + dec_active(c, &flags); if (llnode) - enque_to_free(c, llnode); + enque_to_free(tgt, llnode); } while (cnt > (c->high_watermark + c->low_watermark) / 2); /* and drain free_llist_extra */ llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra)) - enque_to_free(c, llnode); - do_call_rcu(c); + enque_to_free(tgt, llnode); + do_call_rcu_ttrace(tgt); +} + +static void __free_by_rcu(struct rcu_head *head) +{ + struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); + struct bpf_mem_cache *tgt = c->tgt; + struct llist_node *llnode; + + WARN_ON_ONCE(tgt->unit_size != c->unit_size); + WARN_ON_ONCE(tgt->percpu_size != c->percpu_size); + + llnode = llist_del_all(&c->waiting_for_gp); + if (!llnode) + goto out; + + llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace); + + /* Objects went through regular RCU GP. Send them to RCU tasks trace */ + do_call_rcu_ttrace(tgt); +out: + atomic_set(&c->call_rcu_in_progress, 0); +} + +static void check_free_by_rcu(struct bpf_mem_cache *c) +{ + struct llist_node *llnode, *t; + unsigned long flags; + + /* drain free_llist_extra_rcu */ + if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) { + inc_active(c, &flags); + llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu)) + if (__llist_add(llnode, &c->free_by_rcu)) + c->free_by_rcu_tail = llnode; + dec_active(c, &flags); + } + + if (llist_empty(&c->free_by_rcu)) + return; + + if (atomic_xchg(&c->call_rcu_in_progress, 1)) { + /* + * Instead of kmalloc-ing new rcu_head and triggering 10k + * call_rcu() to hit rcutree.qhimark and force RCU to notice + * the overload just ask RCU to hurry up. There could be many + * objects in free_by_rcu list. + * This hint reduces memory consumption for an artificial + * benchmark from 2 Gbyte to 150 Mbyte. + */ + rcu_request_urgent_qs_task(current); + return; + } + + WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp)); + + inc_active(c, &flags); + WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu)); + c->waiting_for_gp_tail = c->free_by_rcu_tail; + dec_active(c, &flags); + + if (unlikely(READ_ONCE(c->draining))) { + free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size); + atomic_set(&c->call_rcu_in_progress, 0); + } else { + call_rcu_hurry(&c->rcu, __free_by_rcu); + } } static void bpf_mem_refill(struct irq_work *work) @@ -321,9 +437,11 @@ static void bpf_mem_refill(struct irq_work *work) /* irq_work runs on this cpu and kmalloc will allocate * from the current numa node which is what we want here. */ - alloc_bulk(c, c->batch, NUMA_NO_NODE); + alloc_bulk(c, c->batch, NUMA_NO_NODE, true); else if (cnt > c->high_watermark) free_bulk(c); + + check_free_by_rcu(c); } static void notrace irq_work_raise(struct bpf_mem_cache *c) @@ -345,8 +463,7 @@ static void notrace irq_work_raise(struct bpf_mem_cache *c) * Typical case will be between 11K and 116K closer to 11K. * bpf progs can and should share bpf_mem_cache when possible. */ - -static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) +static void init_refill_work(struct bpf_mem_cache *c) { init_irq_work(&c->refill_work, bpf_mem_refill); if (c->unit_size <= 256) { @@ -362,12 +479,36 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) c->high_watermark = max(96 * 256 / c->unit_size, 3); } c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1); +} +static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) +{ /* To avoid consuming memory assume that 1st run of bpf * prog won't be doing more than 4 map_update_elem from * irq disabled region */ - alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu)); + alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false); +} + +static int check_obj_size(struct bpf_mem_cache *c, unsigned int idx) +{ + struct llist_node *first; + unsigned int obj_size; + + first = c->free_llist.first; + if (!first) + return 0; + + if (c->percpu_size) + obj_size = pcpu_alloc_size(((void **)first)[1]); + else + obj_size = ksize(first); + if (obj_size != c->unit_size) { + WARN_ONCE(1, "bpf_mem_cache[%u]: percpu %d, unexpected object size %u, expect %u\n", + idx, c->percpu_size, obj_size, c->unit_size); + return -EINVAL; + } + return 0; } /* When size != 0 bpf_mem_cache for each cpu. @@ -380,20 +521,22 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) { static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; + int cpu, i, err, unit_size, percpu_size = 0; struct bpf_mem_caches *cc, __percpu *pcc; struct bpf_mem_cache *c, __percpu *pc; struct obj_cgroup *objcg = NULL; - int cpu, i, unit_size, percpu_size = 0; + + /* room for llist_node and per-cpu pointer */ + if (percpu) + percpu_size = LLIST_NODE_SZ + sizeof(void *); + ma->percpu = percpu; if (size) { pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL); if (!pc) return -ENOMEM; - if (percpu) - /* room for llist_node and per-cpu pointer */ - percpu_size = LLIST_NODE_SZ + sizeof(void *); - else + if (!percpu) size += LLIST_NODE_SZ; /* room for llist_node */ unit_size = size; @@ -406,19 +549,18 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) c->unit_size = unit_size; c->objcg = objcg; c->percpu_size = percpu_size; + c->tgt = c; + init_refill_work(c); prefill_mem_cache(c, cpu); } ma->cache = pc; return 0; } - /* size == 0 && percpu is an invalid combination */ - if (WARN_ON_ONCE(percpu)) - return -EINVAL; - pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL); if (!pcc) return -ENOMEM; + err = 0; #ifdef CONFIG_MEMCG_KMEM objcg = get_obj_cgroup_from_current(); #endif @@ -428,11 +570,32 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) c = &cc->cache[i]; c->unit_size = sizes[i]; c->objcg = objcg; + c->percpu_size = percpu_size; + c->tgt = c; + + init_refill_work(c); + /* Another bpf_mem_cache will be used when allocating + * c->unit_size in bpf_mem_alloc(), so doesn't prefill + * for the bpf_mem_cache because these free objects will + * never be used. + */ + if (i != bpf_mem_cache_idx(c->unit_size)) + continue; prefill_mem_cache(c, cpu); + err = check_obj_size(c, i); + if (err) + goto out; } } + +out: ma->caches = pcc; - return 0; + /* refill_work is either zeroed or initialized, so it is safe to + * call irq_work_sync(). + */ + if (err) + bpf_mem_alloc_destroy(ma); + return err; } static void drain_mem_cache(struct bpf_mem_cache *c) @@ -441,19 +604,57 @@ static void drain_mem_cache(struct bpf_mem_cache *c) /* No progs are using this bpf_mem_cache, but htab_map_free() called * bpf_mem_cache_free() for all remaining elements and they can be in - * free_by_rcu or in waiting_for_gp lists, so drain those lists now. + * free_by_rcu_ttrace or in waiting_for_gp_ttrace lists, so drain those lists now. * - * Except for waiting_for_gp list, there are no concurrent operations + * Except for waiting_for_gp_ttrace list, there are no concurrent operations * on these lists, so it is safe to use __llist_del_all(). */ - free_all(__llist_del_all(&c->free_by_rcu), percpu); - free_all(llist_del_all(&c->waiting_for_gp), percpu); + free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu); + free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu); free_all(__llist_del_all(&c->free_llist), percpu); free_all(__llist_del_all(&c->free_llist_extra), percpu); + free_all(__llist_del_all(&c->free_by_rcu), percpu); + free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu); + free_all(llist_del_all(&c->waiting_for_gp), percpu); +} + +static void check_mem_cache(struct bpf_mem_cache *c) +{ + WARN_ON_ONCE(!llist_empty(&c->free_by_rcu_ttrace)); + WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace)); + WARN_ON_ONCE(!llist_empty(&c->free_llist)); + WARN_ON_ONCE(!llist_empty(&c->free_llist_extra)); + WARN_ON_ONCE(!llist_empty(&c->free_by_rcu)); + WARN_ON_ONCE(!llist_empty(&c->free_llist_extra_rcu)); + WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp)); +} + +static void check_leaked_objs(struct bpf_mem_alloc *ma) +{ + struct bpf_mem_caches *cc; + struct bpf_mem_cache *c; + int cpu, i; + + if (ma->cache) { + for_each_possible_cpu(cpu) { + c = per_cpu_ptr(ma->cache, cpu); + check_mem_cache(c); + } + } + if (ma->caches) { + for_each_possible_cpu(cpu) { + cc = per_cpu_ptr(ma->caches, cpu); + for (i = 0; i < NUM_CACHES; i++) { + c = &cc->cache[i]; + check_mem_cache(c); + } + } + } } static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) { + check_leaked_objs(ma); free_percpu(ma->cache); free_percpu(ma->caches); ma->cache = NULL; @@ -462,8 +663,8 @@ static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) static void free_mem_alloc(struct bpf_mem_alloc *ma) { - /* waiting_for_gp lists was drained, but __free_rcu might - * still execute. Wait for it now before we freeing percpu caches. + /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks + * might still execute. Wait for them. * * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(), * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used @@ -472,7 +673,8 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma) * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by * using rcu_trace_implies_rcu_gp() as well. */ - rcu_barrier_tasks_trace(); + rcu_barrier(); /* wait for __free_by_rcu */ + rcu_barrier_tasks_trace(); /* wait for __free_rcu */ if (!rcu_trace_implies_rcu_gp()) rcu_barrier(); free_mem_alloc_no_barrier(ma); @@ -498,7 +700,7 @@ static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress) return; } - copy = kmalloc(sizeof(*ma), GFP_KERNEL); + copy = kmemdup(ma, sizeof(*ma), GFP_KERNEL); if (!copy) { /* Slow path with inline barrier-s */ free_mem_alloc(ma); @@ -506,10 +708,7 @@ static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress) } /* Defer barriers into worker to let the rest of map memory to be freed */ - copy->cache = ma->cache; - ma->cache = NULL; - copy->caches = ma->caches; - ma->caches = NULL; + memset(ma, 0, sizeof(*ma)); INIT_WORK(©->work, free_mem_alloc_deferred); queue_work(system_unbound_wq, ©->work); } @@ -524,17 +723,10 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) rcu_in_progress = 0; for_each_possible_cpu(cpu) { c = per_cpu_ptr(ma->cache, cpu); - /* - * refill_work may be unfinished for PREEMPT_RT kernel - * in which irq work is invoked in a per-CPU RT thread. - * It is also possible for kernel with - * arch_irq_work_has_interrupt() being false and irq - * work is invoked in timer interrupt. So waiting for - * the completion of irq work to ease the handling of - * concurrency. - */ + WRITE_ONCE(c->draining, true); irq_work_sync(&c->refill_work); drain_mem_cache(c); + rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress); rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } /* objcg is the same across cpus */ @@ -548,8 +740,10 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) cc = per_cpu_ptr(ma->caches, cpu); for (i = 0; i < NUM_CACHES; i++) { c = &cc->cache[i]; + WRITE_ONCE(c->draining, true); irq_work_sync(&c->refill_work); drain_mem_cache(c); + rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress); rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } } @@ -581,16 +775,23 @@ static void notrace *unit_alloc(struct bpf_mem_cache *c) local_irq_save(flags); if (local_inc_return(&c->active) == 1) { llnode = __llist_del_first(&c->free_llist); - if (llnode) + if (llnode) { cnt = --c->free_cnt; + *(struct bpf_mem_cache **)llnode = c; + } } local_dec(&c->active); - local_irq_restore(flags); WARN_ON(cnt < 0); if (cnt < c->low_watermark) irq_work_raise(c); + /* Enable IRQ after the enqueue of irq work completes, so irq work + * will run after IRQ is enabled and free_llist may be refilled by + * irq work before other task preempts current task. + */ + local_irq_restore(flags); + return llnode; } @@ -606,6 +807,12 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr) BUILD_BUG_ON(LLIST_NODE_SZ > 8); + /* + * Remember bpf_mem_cache that allocated this object. + * The hint is not accurate. + */ + c->tgt = *(struct bpf_mem_cache **)llnode; + local_irq_save(flags); if (local_inc_return(&c->active) == 1) { __llist_add(llnode, &c->free_llist); @@ -620,11 +827,37 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr) llist_add(llnode, &c->free_llist_extra); } local_dec(&c->active); - local_irq_restore(flags); if (cnt > c->high_watermark) /* free few objects from current cpu into global kmalloc pool */ irq_work_raise(c); + /* Enable IRQ after irq_work_raise() completes, otherwise when current + * task is preempted by task which does unit_alloc(), unit_alloc() may + * return NULL unexpectedly because irq work is already pending but can + * not been triggered and free_llist can not be refilled timely. + */ + local_irq_restore(flags); +} + +static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr) +{ + struct llist_node *llnode = ptr - LLIST_NODE_SZ; + unsigned long flags; + + c->tgt = *(struct bpf_mem_cache **)llnode; + + local_irq_save(flags); + if (local_inc_return(&c->active) == 1) { + if (__llist_add(llnode, &c->free_by_rcu)) + c->free_by_rcu_tail = llnode; + } else { + llist_add(llnode, &c->free_llist_extra_rcu); + } + local_dec(&c->active); + + if (!atomic_read(&c->call_rcu_in_progress)) + irq_work_raise(c); + local_irq_restore(flags); } /* Called from BPF program or from sys_bpf syscall. @@ -646,6 +879,17 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size) return !ret ? NULL : ret + LLIST_NODE_SZ; } +static notrace int bpf_mem_free_idx(void *ptr, bool percpu) +{ + size_t size; + + if (percpu) + size = pcpu_alloc_size(*((void **)ptr)); + else + size = ksize(ptr - LLIST_NODE_SZ); + return bpf_mem_cache_idx(size); +} + void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) { int idx; @@ -653,13 +897,27 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) if (!ptr) return; - idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ)); + idx = bpf_mem_free_idx(ptr, ma->percpu); if (idx < 0) return; unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr); } +void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr) +{ + int idx; + + if (!ptr) + return; + + idx = bpf_mem_free_idx(ptr, ma->percpu); + if (idx < 0) + return; + + unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr); +} + void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma) { void *ret; @@ -676,6 +934,14 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr) unit_free(this_cpu_ptr(ma->cache), ptr); } +void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr) +{ + if (!ptr) + return; + + unit_free_rcu(this_cpu_ptr(ma->cache), ptr); +} + /* Directly does a kfree() without putting 'ptr' back to the free_llist * for reuse and without waiting for a rcu_tasks_trace gp. * The caller must first go through the rcu_tasks_trace gp for 'ptr' @@ -712,9 +978,49 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) memcg = get_memcg(c); old_memcg = set_active_memcg(memcg); ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT); + if (ret) + *(struct bpf_mem_cache **)ret = c; set_active_memcg(old_memcg); mem_cgroup_put(memcg); } return !ret ? NULL : ret + LLIST_NODE_SZ; } + +/* The alignment of dynamic per-cpu area is 8, so c->unit_size and the + * actual size of dynamic per-cpu area will always be matched and there is + * no need to adjust size_index for per-cpu allocation. However for the + * simplicity of the implementation, use an unified size_index for both + * kmalloc and per-cpu allocation. + */ +static __init int bpf_mem_cache_adjust_size(void) +{ + unsigned int size; + + /* Adjusting the indexes in size_index() according to the object_size + * of underlying slab cache, so bpf_mem_alloc() will select a + * bpf_mem_cache with unit_size equal to the object_size of + * the underlying slab cache. + * + * The maximal value of KMALLOC_MIN_SIZE and __kmalloc_minalign() is + * 256-bytes, so only do adjustment for [8-bytes, 192-bytes]. + */ + for (size = 192; size >= 8; size -= 8) { + unsigned int kmalloc_size, index; + + kmalloc_size = kmalloc_size_roundup(size); + if (kmalloc_size == size) + continue; + + if (kmalloc_size <= 192) + index = size_index[(kmalloc_size - 1) / 8]; + else + index = fls(kmalloc_size - 1) - 1; + /* Only overwrite if necessary */ + if (size_index[(size - 1) / 8] != index) + size_index[(size - 1) / 8] = index; + } + + return 0; +} +subsys_initcall(bpf_mem_cache_adjust_size); diff --git a/kernel/bpf/mprog.c b/kernel/bpf/mprog.c new file mode 100644 index 000000000000..1394168062e8 --- /dev/null +++ b/kernel/bpf/mprog.c @@ -0,0 +1,452 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2023 Isovalent */ + +#include <linux/bpf.h> +#include <linux/bpf_mprog.h> + +static int bpf_mprog_link(struct bpf_tuple *tuple, + u32 id_or_fd, u32 flags, + enum bpf_prog_type type) +{ + struct bpf_link *link = ERR_PTR(-EINVAL); + bool id = flags & BPF_F_ID; + + if (id) + link = bpf_link_by_id(id_or_fd); + else if (id_or_fd) + link = bpf_link_get_from_fd(id_or_fd); + if (IS_ERR(link)) + return PTR_ERR(link); + if (type && link->prog->type != type) { + bpf_link_put(link); + return -EINVAL; + } + + tuple->link = link; + tuple->prog = link->prog; + return 0; +} + +static int bpf_mprog_prog(struct bpf_tuple *tuple, + u32 id_or_fd, u32 flags, + enum bpf_prog_type type) +{ + struct bpf_prog *prog = ERR_PTR(-EINVAL); + bool id = flags & BPF_F_ID; + + if (id) + prog = bpf_prog_by_id(id_or_fd); + else if (id_or_fd) + prog = bpf_prog_get(id_or_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + if (type && prog->type != type) { + bpf_prog_put(prog); + return -EINVAL; + } + + tuple->link = NULL; + tuple->prog = prog; + return 0; +} + +static int bpf_mprog_tuple_relative(struct bpf_tuple *tuple, + u32 id_or_fd, u32 flags, + enum bpf_prog_type type) +{ + bool link = flags & BPF_F_LINK; + bool id = flags & BPF_F_ID; + + memset(tuple, 0, sizeof(*tuple)); + if (link) + return bpf_mprog_link(tuple, id_or_fd, flags, type); + /* If no relevant flag is set and no id_or_fd was passed, then + * tuple link/prog is just NULLed. This is the case when before/ + * after selects first/last position without passing fd. + */ + if (!id && !id_or_fd) + return 0; + return bpf_mprog_prog(tuple, id_or_fd, flags, type); +} + +static void bpf_mprog_tuple_put(struct bpf_tuple *tuple) +{ + if (tuple->link) + bpf_link_put(tuple->link); + else if (tuple->prog) + bpf_prog_put(tuple->prog); +} + +/* The bpf_mprog_{replace,delete}() operate on exact idx position with the + * one exception that for deletion we support delete from front/back. In + * case of front idx is -1, in case of back idx is bpf_mprog_total(entry). + * Adjustment to first and last entry is trivial. The bpf_mprog_insert() + * we have to deal with the following cases: + * + * idx + before: + * + * Insert P4 before P3: idx for old array is 1, idx for new array is 2, + * hence we adjust target idx for the new array, so that memmove copies + * P1 and P2 to the new entry, and we insert P4 into idx 2. Inserting + * before P1 would have old idx -1 and new idx 0. + * + * +--+--+--+ +--+--+--+--+ +--+--+--+--+ + * |P1|P2|P3| ==> |P1|P2| |P3| ==> |P1|P2|P4|P3| + * +--+--+--+ +--+--+--+--+ +--+--+--+--+ + * + * idx + after: + * + * Insert P4 after P2: idx for old array is 2, idx for new array is 2. + * Again, memmove copies P1 and P2 to the new entry, and we insert P4 + * into idx 2. Inserting after P3 would have both old/new idx at 4 aka + * bpf_mprog_total(entry). + * + * +--+--+--+ +--+--+--+--+ +--+--+--+--+ + * |P1|P2|P3| ==> |P1|P2| |P3| ==> |P1|P2|P4|P3| + * +--+--+--+ +--+--+--+--+ +--+--+--+--+ + */ +static int bpf_mprog_replace(struct bpf_mprog_entry *entry, + struct bpf_mprog_entry **entry_new, + struct bpf_tuple *ntuple, int idx) +{ + struct bpf_mprog_fp *fp; + struct bpf_mprog_cp *cp; + struct bpf_prog *oprog; + + bpf_mprog_read(entry, idx, &fp, &cp); + oprog = READ_ONCE(fp->prog); + bpf_mprog_write(fp, cp, ntuple); + if (!ntuple->link) { + WARN_ON_ONCE(cp->link); + bpf_prog_put(oprog); + } + *entry_new = entry; + return 0; +} + +static int bpf_mprog_insert(struct bpf_mprog_entry *entry, + struct bpf_mprog_entry **entry_new, + struct bpf_tuple *ntuple, int idx, u32 flags) +{ + int total = bpf_mprog_total(entry); + struct bpf_mprog_entry *peer; + struct bpf_mprog_fp *fp; + struct bpf_mprog_cp *cp; + + peer = bpf_mprog_peer(entry); + bpf_mprog_entry_copy(peer, entry); + if (idx == total) + goto insert; + else if (flags & BPF_F_BEFORE) + idx += 1; + bpf_mprog_entry_grow(peer, idx); +insert: + bpf_mprog_read(peer, idx, &fp, &cp); + bpf_mprog_write(fp, cp, ntuple); + bpf_mprog_inc(peer); + *entry_new = peer; + return 0; +} + +static int bpf_mprog_delete(struct bpf_mprog_entry *entry, + struct bpf_mprog_entry **entry_new, + struct bpf_tuple *dtuple, int idx) +{ + int total = bpf_mprog_total(entry); + struct bpf_mprog_entry *peer; + + peer = bpf_mprog_peer(entry); + bpf_mprog_entry_copy(peer, entry); + if (idx == -1) + idx = 0; + else if (idx == total) + idx = total - 1; + bpf_mprog_entry_shrink(peer, idx); + bpf_mprog_dec(peer); + bpf_mprog_mark_for_release(peer, dtuple); + *entry_new = peer; + return 0; +} + +/* In bpf_mprog_pos_*() we evaluate the target position for the BPF + * program/link that needs to be replaced, inserted or deleted for + * each "rule" independently. If all rules agree on that position + * or existing element, then enact replacement, addition or deletion. + * If this is not the case, then the request cannot be satisfied and + * we bail out with an error. + */ +static int bpf_mprog_pos_exact(struct bpf_mprog_entry *entry, + struct bpf_tuple *tuple) +{ + struct bpf_mprog_fp *fp; + struct bpf_mprog_cp *cp; + int i; + + for (i = 0; i < bpf_mprog_total(entry); i++) { + bpf_mprog_read(entry, i, &fp, &cp); + if (tuple->prog == READ_ONCE(fp->prog)) + return tuple->link == cp->link ? i : -EBUSY; + } + return -ENOENT; +} + +static int bpf_mprog_pos_before(struct bpf_mprog_entry *entry, + struct bpf_tuple *tuple) +{ + struct bpf_mprog_fp *fp; + struct bpf_mprog_cp *cp; + int i; + + for (i = 0; i < bpf_mprog_total(entry); i++) { + bpf_mprog_read(entry, i, &fp, &cp); + if (tuple->prog == READ_ONCE(fp->prog) && + (!tuple->link || tuple->link == cp->link)) + return i - 1; + } + return tuple->prog ? -ENOENT : -1; +} + +static int bpf_mprog_pos_after(struct bpf_mprog_entry *entry, + struct bpf_tuple *tuple) +{ + struct bpf_mprog_fp *fp; + struct bpf_mprog_cp *cp; + int i; + + for (i = 0; i < bpf_mprog_total(entry); i++) { + bpf_mprog_read(entry, i, &fp, &cp); + if (tuple->prog == READ_ONCE(fp->prog) && + (!tuple->link || tuple->link == cp->link)) + return i + 1; + } + return tuple->prog ? -ENOENT : bpf_mprog_total(entry); +} + +int bpf_mprog_attach(struct bpf_mprog_entry *entry, + struct bpf_mprog_entry **entry_new, + struct bpf_prog *prog_new, struct bpf_link *link, + struct bpf_prog *prog_old, + u32 flags, u32 id_or_fd, u64 revision) +{ + struct bpf_tuple rtuple, ntuple = { + .prog = prog_new, + .link = link, + }, otuple = { + .prog = prog_old, + .link = link, + }; + int ret, idx = -ERANGE, tidx; + + if (revision && revision != bpf_mprog_revision(entry)) + return -ESTALE; + if (bpf_mprog_exists(entry, prog_new)) + return -EEXIST; + ret = bpf_mprog_tuple_relative(&rtuple, id_or_fd, + flags & ~BPF_F_REPLACE, + prog_new->type); + if (ret) + return ret; + if (flags & BPF_F_REPLACE) { + tidx = bpf_mprog_pos_exact(entry, &otuple); + if (tidx < 0) { + ret = tidx; + goto out; + } + idx = tidx; + } else if (bpf_mprog_total(entry) == bpf_mprog_max()) { + ret = -ERANGE; + goto out; + } + if (flags & BPF_F_BEFORE) { + tidx = bpf_mprog_pos_before(entry, &rtuple); + if (tidx < -1 || (idx >= -1 && tidx != idx)) { + ret = tidx < -1 ? tidx : -ERANGE; + goto out; + } + idx = tidx; + } + if (flags & BPF_F_AFTER) { + tidx = bpf_mprog_pos_after(entry, &rtuple); + if (tidx < -1 || (idx >= -1 && tidx != idx)) { + ret = tidx < 0 ? tidx : -ERANGE; + goto out; + } + idx = tidx; + } + if (idx < -1) { + if (rtuple.prog || flags) { + ret = -EINVAL; + goto out; + } + idx = bpf_mprog_total(entry); + flags = BPF_F_AFTER; + } + if (idx >= bpf_mprog_max()) { + ret = -ERANGE; + goto out; + } + if (flags & BPF_F_REPLACE) + ret = bpf_mprog_replace(entry, entry_new, &ntuple, idx); + else + ret = bpf_mprog_insert(entry, entry_new, &ntuple, idx, flags); +out: + bpf_mprog_tuple_put(&rtuple); + return ret; +} + +static int bpf_mprog_fetch(struct bpf_mprog_entry *entry, + struct bpf_tuple *tuple, int idx) +{ + int total = bpf_mprog_total(entry); + struct bpf_mprog_cp *cp; + struct bpf_mprog_fp *fp; + struct bpf_prog *prog; + struct bpf_link *link; + + if (idx == -1) + idx = 0; + else if (idx == total) + idx = total - 1; + bpf_mprog_read(entry, idx, &fp, &cp); + prog = READ_ONCE(fp->prog); + link = cp->link; + /* The deletion request can either be without filled tuple in which + * case it gets populated here based on idx, or with filled tuple + * where the only thing we end up doing is the WARN_ON_ONCE() assert. + * If we hit a BPF link at the given index, it must not be removed + * from opts path. + */ + if (link && !tuple->link) + return -EBUSY; + WARN_ON_ONCE(tuple->prog && tuple->prog != prog); + WARN_ON_ONCE(tuple->link && tuple->link != link); + tuple->prog = prog; + tuple->link = link; + return 0; +} + +int bpf_mprog_detach(struct bpf_mprog_entry *entry, + struct bpf_mprog_entry **entry_new, + struct bpf_prog *prog, struct bpf_link *link, + u32 flags, u32 id_or_fd, u64 revision) +{ + struct bpf_tuple rtuple, dtuple = { + .prog = prog, + .link = link, + }; + int ret, idx = -ERANGE, tidx; + + if (flags & BPF_F_REPLACE) + return -EINVAL; + if (revision && revision != bpf_mprog_revision(entry)) + return -ESTALE; + if (!bpf_mprog_total(entry)) + return -ENOENT; + ret = bpf_mprog_tuple_relative(&rtuple, id_or_fd, flags, + prog ? prog->type : + BPF_PROG_TYPE_UNSPEC); + if (ret) + return ret; + if (dtuple.prog) { + tidx = bpf_mprog_pos_exact(entry, &dtuple); + if (tidx < 0) { + ret = tidx; + goto out; + } + idx = tidx; + } + if (flags & BPF_F_BEFORE) { + tidx = bpf_mprog_pos_before(entry, &rtuple); + if (tidx < -1 || (idx >= -1 && tidx != idx)) { + ret = tidx < -1 ? tidx : -ERANGE; + goto out; + } + idx = tidx; + } + if (flags & BPF_F_AFTER) { + tidx = bpf_mprog_pos_after(entry, &rtuple); + if (tidx < -1 || (idx >= -1 && tidx != idx)) { + ret = tidx < 0 ? tidx : -ERANGE; + goto out; + } + idx = tidx; + } + if (idx < -1) { + if (rtuple.prog || flags) { + ret = -EINVAL; + goto out; + } + idx = bpf_mprog_total(entry); + flags = BPF_F_AFTER; + } + if (idx >= bpf_mprog_max()) { + ret = -ERANGE; + goto out; + } + ret = bpf_mprog_fetch(entry, &dtuple, idx); + if (ret) + goto out; + ret = bpf_mprog_delete(entry, entry_new, &dtuple, idx); +out: + bpf_mprog_tuple_put(&rtuple); + return ret; +} + +int bpf_mprog_query(const union bpf_attr *attr, union bpf_attr __user *uattr, + struct bpf_mprog_entry *entry) +{ + u32 __user *uprog_flags, *ulink_flags; + u32 __user *uprog_id, *ulink_id; + struct bpf_mprog_fp *fp; + struct bpf_mprog_cp *cp; + struct bpf_prog *prog; + const u32 flags = 0; + u32 id, count = 0; + u64 revision = 1; + int i, ret = 0; + + if (attr->query.query_flags || attr->query.attach_flags) + return -EINVAL; + if (entry) { + revision = bpf_mprog_revision(entry); + count = bpf_mprog_total(entry); + } + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) + return -EFAULT; + if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision))) + return -EFAULT; + if (copy_to_user(&uattr->query.count, &count, sizeof(count))) + return -EFAULT; + uprog_id = u64_to_user_ptr(attr->query.prog_ids); + uprog_flags = u64_to_user_ptr(attr->query.prog_attach_flags); + ulink_id = u64_to_user_ptr(attr->query.link_ids); + ulink_flags = u64_to_user_ptr(attr->query.link_attach_flags); + if (attr->query.count == 0 || !uprog_id || !count) + return 0; + if (attr->query.count < count) { + count = attr->query.count; + ret = -ENOSPC; + } + for (i = 0; i < bpf_mprog_max(); i++) { + bpf_mprog_read(entry, i, &fp, &cp); + prog = READ_ONCE(fp->prog); + if (!prog) + break; + id = prog->aux->id; + if (copy_to_user(uprog_id + i, &id, sizeof(id))) + return -EFAULT; + if (uprog_flags && + copy_to_user(uprog_flags + i, &flags, sizeof(flags))) + return -EFAULT; + id = cp->link ? cp->link->id : 0; + if (ulink_id && + copy_to_user(ulink_id + i, &id, sizeof(id))) + return -EFAULT; + if (ulink_flags && + copy_to_user(ulink_flags + i, &flags, sizeof(flags))) + return -EFAULT; + if (i + 1 == count) + break; + } + return ret; +} diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 8a26cd8814c1..1a4fec330eaa 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -25,6 +25,7 @@ #include <linux/rhashtable.h> #include <linux/rtnetlink.h> #include <linux/rwsem.h> +#include <net/xdp.h> /* Protects offdevs, members of bpf_offload_netdev and offload members * of all progs. @@ -198,12 +199,14 @@ static int __bpf_prog_dev_bound_init(struct bpf_prog *prog, struct net_device *n offload->netdev = netdev; ondev = bpf_offload_find_netdev(offload->netdev); + /* When program is offloaded require presence of "true" + * bpf_offload_netdev, avoid the one created for !ondev case below. + */ + if (bpf_prog_is_offloaded(prog->aux) && (!ondev || !ondev->offdev)) { + err = -EINVAL; + goto err_free; + } if (!ondev) { - if (bpf_prog_is_offloaded(prog->aux)) { - err = -EINVAL; - goto err_free; - } - /* When only binding to the device, explicitly * create an entry in the hashtable. */ @@ -231,7 +234,14 @@ int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr) attr->prog_type != BPF_PROG_TYPE_XDP) return -EINVAL; - if (attr->prog_flags & ~BPF_F_XDP_DEV_BOUND_ONLY) + if (attr->prog_flags & ~(BPF_F_XDP_DEV_BOUND_ONLY | BPF_F_XDP_HAS_FRAGS)) + return -EINVAL; + + /* Frags are allowed only if program is dev-bound-only, but not + * if it is requesting bpf offload. + */ + if (attr->prog_flags & BPF_F_XDP_HAS_FRAGS && + !(attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY)) return -EINVAL; if (attr->prog_type == BPF_PROG_TYPE_SCHED_CLS && @@ -844,10 +854,11 @@ void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id) if (!ops) goto out; - if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP)) - p = ops->xmo_rx_timestamp; - else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_HASH)) - p = ops->xmo_rx_hash; +#define XDP_METADATA_KFUNC(name, _, __, xmo) \ + if (func_id == bpf_xdp_metadata_kfunc_id(name)) p = ops->xmo; + XDP_METADATA_KFUNC_xxx +#undef XDP_METADATA_KFUNC + out: up_read(&bpf_devs_lock); diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile index 8937dc6bc8d0..b83c2f5e9be1 100644 --- a/kernel/bpf/preload/iterators/Makefile +++ b/kernel/bpf/preload/iterators/Makefile @@ -50,7 +50,7 @@ iterators.lskel-%.h: $(OUTPUT)/%/iterators.bpf.o | $(BPFTOOL) $(OUTPUT)/%/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT) $(call msg,BPF,$@) $(Q)mkdir -p $(@D) - $(Q)$(CLANG) -g -O2 -target bpf -m$* $(INCLUDES) \ + $(Q)$(CLANG) -g -O2 --target=bpf -m$* $(INCLUDES) \ -c $(filter %.c,$^) -o $@ && \ $(LLVM_STRIP) -g $@ diff --git a/kernel/bpf/preload/iterators/iterators.bpf.c b/kernel/bpf/preload/iterators/iterators.bpf.c index 03af863314ea..b78968b63fab 100644 --- a/kernel/bpf/preload/iterators/iterators.bpf.c +++ b/kernel/bpf/preload/iterators/iterators.bpf.c @@ -73,6 +73,8 @@ static const char *get_name(struct btf *btf, long btf_id, const char *fallback) return str + name_off; } +__s64 bpf_map_sum_elem_count(struct bpf_map *map) __ksym; + SEC("iter/bpf_map") int dump_bpf_map(struct bpf_iter__bpf_map *ctx) { @@ -84,9 +86,12 @@ int dump_bpf_map(struct bpf_iter__bpf_map *ctx) return 0; if (seq_num == 0) - BPF_SEQ_PRINTF(seq, " id name max_entries\n"); + BPF_SEQ_PRINTF(seq, " id name max_entries cur_entries\n"); + + BPF_SEQ_PRINTF(seq, "%4u %-16s %10d %10lld\n", + map->id, map->name, map->max_entries, + bpf_map_sum_elem_count(map)); - BPF_SEQ_PRINTF(seq, "%4u %-16s%6d\n", map->id, map->name, map->max_entries); return 0; } diff --git a/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h b/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h index 70f236a82fe1..5b98ab02025e 100644 --- a/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h +++ b/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ -/* THIS FILE IS AUTOGENERATED! */ +/* THIS FILE IS AUTOGENERATED BY BPFTOOL! */ #ifndef __ITERATORS_BPF_SKEL_H__ #define __ITERATORS_BPF_SKEL_H__ @@ -18,8 +18,6 @@ struct iterators_bpf { int dump_bpf_map_fd; int dump_bpf_prog_fd; } links; - struct iterators_bpf__rodata { - } *rodata; }; static inline int @@ -68,7 +66,6 @@ iterators_bpf__destroy(struct iterators_bpf *skel) iterators_bpf__detach(skel); skel_closenz(skel->progs.dump_bpf_map.prog_fd); skel_closenz(skel->progs.dump_bpf_prog.prog_fd); - skel_free_map_data(skel->rodata, skel->maps.rodata.initial_value, 4096); skel_closenz(skel->maps.rodata.map_fd); skel_free(skel); } @@ -81,15 +78,6 @@ iterators_bpf__open(void) if (!skel) goto cleanup; skel->ctx.sz = (void *)&skel->links - (void *)skel; - skel->rodata = skel_prep_map_data((void *)"\ -\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\ -\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\ -\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\ -\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\ -\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0", 4096, 98); - if (!skel->rodata) - goto cleanup; - skel->maps.rodata.initial_value = (__u64) (long) skel->rodata; return skel; cleanup: iterators_bpf__destroy(skel); @@ -103,7 +91,7 @@ iterators_bpf__load(struct iterators_bpf *skel) int err; opts.ctx = (struct bpf_loader_ctx *)skel; - opts.data_sz = 6056; + opts.data_sz = 6208; opts.data = (void *)"\ \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ @@ -138,190 +126,197 @@ iterators_bpf__load(struct iterators_bpf *skel) \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x9f\xeb\x01\0\ -\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\xf9\x04\0\0\0\0\0\0\0\0\0\x02\x02\0\ +\x18\0\0\0\0\0\0\0\x80\x04\0\0\x80\x04\0\0\x31\x05\0\0\0\0\0\0\0\0\0\x02\x02\0\ \0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\0\x04\ \0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\0\0\0\0\0\ \0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\0\0\0\x20\ -\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xa3\0\0\0\x03\0\0\x04\x18\0\0\0\xb1\0\ -\0\0\x09\0\0\0\0\0\0\0\xb5\0\0\0\x0b\0\0\0\x40\0\0\0\xc0\0\0\0\x0b\0\0\0\x80\0\ -\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xc8\0\0\0\0\0\0\x07\0\0\0\0\xd1\0\0\0\0\0\0\ -\x08\x0c\0\0\0\xd7\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\x94\x01\0\0\x03\0\0\x04\ -\x18\0\0\0\x9c\x01\0\0\x0e\0\0\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xa4\ -\x01\0\0\x0e\0\0\0\xa0\0\0\0\xb0\x01\0\0\0\0\0\x08\x0f\0\0\0\xb6\x01\0\0\0\0\0\ -\x01\x04\0\0\0\x20\0\0\0\xc3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\0\0\0\ -\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xc8\x01\0\0\0\0\0\x01\x04\0\0\0\ -\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x2c\x02\0\0\x02\0\0\x04\x10\0\0\0\x13\0\ -\0\0\x03\0\0\0\0\0\0\0\x3f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x18\0\ -\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x44\x02\0\0\x01\0\0\x0c\ -\x16\0\0\0\x90\x02\0\0\x01\0\0\x04\x08\0\0\0\x99\x02\0\0\x19\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\x02\x1a\0\0\0\xea\x02\0\0\x06\0\0\x04\x38\0\0\0\x9c\x01\0\0\x0e\0\0\ -\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xf7\x02\0\0\x1b\0\0\0\xc0\0\0\0\x08\ -\x03\0\0\x15\0\0\0\0\x01\0\0\x11\x03\0\0\x1d\0\0\0\x40\x01\0\0\x1b\x03\0\0\x1e\ -\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\0\0\0\0\ -\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x65\x03\0\0\x02\0\0\x04\ -\x08\0\0\0\x73\x03\0\0\x0e\0\0\0\0\0\0\0\x7c\x03\0\0\x0e\0\0\0\x20\0\0\0\x1b\ -\x03\0\0\x03\0\0\x04\x18\0\0\0\x86\x03\0\0\x1b\0\0\0\0\0\0\0\x8e\x03\0\0\x21\0\ -\0\0\x40\0\0\0\x94\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\0\0\0\ -\0\0\0\0\0\x02\x24\0\0\0\x98\x03\0\0\x01\0\0\x04\x04\0\0\0\xa3\x03\0\0\x0e\0\0\ -\0\0\0\0\0\x0c\x04\0\0\x01\0\0\x04\x04\0\0\0\x15\x04\0\0\x0e\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x8b\x04\0\0\0\0\0\x0e\x25\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\x9f\x04\ -\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\ -\x20\0\0\0\xb5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\ -\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xca\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xe1\x04\0\0\0\0\0\x0e\x2d\0\0\ -\0\x01\0\0\0\xe9\x04\0\0\x04\0\0\x0f\x62\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\0\x28\ -\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\0\0\ -\x11\0\0\0\xf1\x04\0\0\x01\0\0\x0f\x04\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\x62\ -\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\ -\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\x70\ -\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x30\ -\x3a\x30\0\x2f\x77\x2f\x6e\x65\x74\x2d\x6e\x65\x78\x74\x2f\x6b\x65\x72\x6e\x65\ -\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\ -\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\ -\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\ -\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\ -\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\ -\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\ -\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\x6e\ -\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\x74\ -\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\ -\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\ -\x29\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x20\x63\ -\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\x3b\0\x30\ -\x3a\x32\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\ -\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\ -\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\ -\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\ -\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\ -\0\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\ -\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\ -\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\ -\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\ -\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\ -\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\ -\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\ -\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\ -\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\ -\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\ -\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\ -\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\ -\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\ -\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\ -\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\ -\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\ -\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\ -\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\ -\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\ -\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\ -\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\ -\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\ -\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\ -\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\ -\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\ -\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\ -\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\ -\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\ -\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\ -\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\ -\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\ -\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\ -\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\ -\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\ -\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\ -\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\ -\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\ -\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ -\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ -\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\ -\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\ -\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\ -\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x2d\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\ -\0\x04\0\0\0\x62\0\0\0\x01\0\0\0\x80\x04\0\0\0\0\0\0\0\0\0\0\x69\x74\x65\x72\ -\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\x2f\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\ -\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\ -\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\ -\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\ -\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\ -\x25\x73\x20\x25\x73\x0a\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\ -\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\x1b\0\0\ -\0\0\0\x79\x11\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\ -\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\ -\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\ -\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\ -\x7b\x1a\xf8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\ -\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x23\0\0\0\xb7\x03\0\0\x0e\0\0\0\ -\xb7\x05\0\0\x18\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\ -\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x3c\x01\0\x01\0\0\0\x42\0\0\ -\0\x7b\0\0\0\x24\x3c\x01\0\x02\0\0\0\x42\0\0\0\xee\0\0\0\x1d\x44\x01\0\x03\0\0\ -\0\x42\0\0\0\x0f\x01\0\0\x06\x4c\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\x40\ -\x01\0\x05\0\0\0\x42\0\0\0\x1a\x01\0\0\x1d\x40\x01\0\x06\0\0\0\x42\0\0\0\x43\ -\x01\0\0\x06\x58\x01\0\x08\0\0\0\x42\0\0\0\x56\x01\0\0\x03\x5c\x01\0\x0f\0\0\0\ -\x42\0\0\0\xdc\x01\0\0\x02\x64\x01\0\x1f\0\0\0\x42\0\0\0\x2a\x02\0\0\x01\x6c\ -\x01\0\0\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\ -\0\x10\0\0\0\x02\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\ -\x28\0\0\0\x08\0\0\0\x3f\x01\0\0\0\0\0\0\x78\0\0\0\x0d\0\0\0\x3e\0\0\0\0\0\0\0\ -\x88\0\0\0\x0d\0\0\0\xea\0\0\0\0\0\0\0\xa8\0\0\0\x0d\0\0\0\x3f\x01\0\0\0\0\0\0\ -\x1a\0\0\0\x21\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\ -\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\ -\0\0\0\0\0\x0a\0\0\0\x01\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x6d\ -\x61\x70\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\ -\0\0\0\0\x79\x12\x08\0\0\0\0\0\x15\x02\x3c\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x79\ -\x27\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\ -\0\x07\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\ -\x31\0\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\ -\x6a\xc8\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\ -\x04\0\0\0\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\ -\x78\x30\0\0\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\ -\0\x61\x11\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\ -\0\0\0\0\0\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\ -\xb7\x02\0\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\ -\0\0\0\0\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\ -\xb7\x02\0\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\ -\xff\0\0\0\0\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\ -\xbf\x69\0\0\0\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\ -\xff\0\0\0\0\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\ -\x1a\xe8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\ -\xc8\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x51\0\0\0\xb7\x03\0\0\x11\0\0\0\ -\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\ -\0\0\0\0\x17\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x80\x01\0\x01\0\0\0\x42\0\0\ -\0\x7b\0\0\0\x24\x80\x01\0\x02\0\0\0\x42\0\0\0\x60\x02\0\0\x1f\x88\x01\0\x03\0\ -\0\0\x42\0\0\0\x84\x02\0\0\x06\x94\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\ -\x84\x01\0\x05\0\0\0\x42\0\0\0\x9d\x02\0\0\x0e\xa0\x01\0\x06\0\0\0\x42\0\0\0\ -\x1a\x01\0\0\x1d\x84\x01\0\x07\0\0\0\x42\0\0\0\x43\x01\0\0\x06\xa4\x01\0\x09\0\ -\0\0\x42\0\0\0\xaf\x02\0\0\x03\xa8\x01\0\x11\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\ -\xb0\x01\0\x18\0\0\0\x42\0\0\0\x5a\x03\0\0\x06\x04\x01\0\x1b\0\0\0\x42\0\0\0\0\ -\0\0\0\0\0\0\0\x1c\0\0\0\x42\0\0\0\xab\x03\0\0\x0f\x10\x01\0\x1d\0\0\0\x42\0\0\ -\0\xc0\x03\0\0\x2d\x14\x01\0\x1f\0\0\0\x42\0\0\0\xf7\x03\0\0\x0d\x0c\x01\0\x21\ -\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x22\0\0\0\x42\0\0\0\xc0\x03\0\0\x02\x14\x01\0\ -\x25\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x28\0\0\0\x42\0\0\0\0\0\0\0\0\0\ -\0\0\x29\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x2c\0\0\0\x42\0\0\0\x1e\x04\ -\0\0\x0d\x18\x01\0\x2d\0\0\0\x42\0\0\0\x4c\x04\0\0\x1b\x1c\x01\0\x2e\0\0\0\x42\ -\0\0\0\x4c\x04\0\0\x06\x1c\x01\0\x2f\0\0\0\x42\0\0\0\x6f\x04\0\0\x0d\x24\x01\0\ -\x31\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\xb0\x01\0\x40\0\0\0\x42\0\0\0\x2a\x02\0\0\ -\x01\xc0\x01\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\ -\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x14\0\0\0\x3e\0\0\0\ -\0\0\0\0\x28\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x30\0\0\0\x08\0\0\0\x3f\x01\0\0\ -\0\0\0\0\x88\0\0\0\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x98\0\0\0\x1a\0\0\0\xea\0\0\0\0\ -\0\0\0\xb0\0\0\0\x1a\0\0\0\x52\x03\0\0\0\0\0\0\xb8\0\0\0\x1a\0\0\0\x56\x03\0\0\ -\0\0\0\0\xc8\0\0\0\x1f\0\0\0\x84\x03\0\0\0\0\0\0\xe0\0\0\0\x20\0\0\0\xea\0\0\0\ -\0\0\0\0\xf8\0\0\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x20\x01\0\0\x24\0\0\0\x3e\0\0\0\ -\0\0\0\0\x58\x01\0\0\x1a\0\0\0\xea\0\0\0\0\0\0\0\x68\x01\0\0\x20\0\0\0\x46\x04\ -\0\0\0\0\0\0\x90\x01\0\0\x1a\0\0\0\x3f\x01\0\0\0\0\0\0\xa0\x01\0\0\x1a\0\0\0\ -\x87\x04\0\0\0\0\0\0\xa8\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x42\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\x1c\0\0\ -\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x1a\0\ -\0\0\x01\0\0\0\0\0\0\0\x13\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\ -\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\ -\0\0\0\0"; - opts.insns_sz = 2216; +\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xb0\0\0\0\x03\0\0\x04\x18\0\0\0\xbe\0\ +\0\0\x09\0\0\0\0\0\0\0\xc2\0\0\0\x0b\0\0\0\x40\0\0\0\xcd\0\0\0\x0b\0\0\0\x80\0\ +\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xd5\0\0\0\0\0\0\x07\0\0\0\0\xde\0\0\0\0\0\0\ +\x08\x0c\0\0\0\xe4\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\xae\x01\0\0\x03\0\0\x04\ +\x18\0\0\0\xb6\x01\0\0\x0e\0\0\0\0\0\0\0\xb9\x01\0\0\x11\0\0\0\x20\0\0\0\xbe\ +\x01\0\0\x0e\0\0\0\xa0\0\0\0\xca\x01\0\0\0\0\0\x08\x0f\0\0\0\xd0\x01\0\0\0\0\0\ +\x01\x04\0\0\0\x20\0\0\0\xdd\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\0\0\0\ +\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xe2\x01\0\0\0\0\0\x01\x04\0\0\0\ +\x20\0\0\0\0\0\0\0\x01\0\0\x0d\x14\0\0\0\x26\x05\0\0\x04\0\0\0\x2b\x02\0\0\0\0\ +\0\x08\x15\0\0\0\x31\x02\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\x01\x3b\x02\0\0\x01\0\ +\0\x0c\x13\0\0\0\0\0\0\0\0\0\0\x02\x18\0\0\0\x52\x02\0\0\x02\0\0\x04\x10\0\0\0\ +\x13\0\0\0\x03\0\0\0\0\0\0\0\x65\x02\0\0\x19\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\ +\x1c\0\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x17\0\0\0\x6a\x02\0\0\x01\0\ +\0\x0c\x1a\0\0\0\xb6\x02\0\0\x01\0\0\x04\x08\0\0\0\xbf\x02\0\0\x1d\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\x02\x1e\0\0\0\x10\x03\0\0\x06\0\0\x04\x38\0\0\0\xb6\x01\0\0\ +\x0e\0\0\0\0\0\0\0\xb9\x01\0\0\x11\0\0\0\x20\0\0\0\x1d\x03\0\0\x1f\0\0\0\xc0\0\ +\0\0\x2e\x03\0\0\x19\0\0\0\0\x01\0\0\x37\x03\0\0\x21\0\0\0\x40\x01\0\0\x41\x03\ +\0\0\x22\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\ +\0\0\0\0\0\0\0\0\0\x02\x23\0\0\0\0\0\0\0\0\0\0\x02\x24\0\0\0\x8b\x03\0\0\x02\0\ +\0\x04\x08\0\0\0\x99\x03\0\0\x0e\0\0\0\0\0\0\0\xa2\x03\0\0\x0e\0\0\0\x20\0\0\0\ +\x41\x03\0\0\x03\0\0\x04\x18\0\0\0\xac\x03\0\0\x1f\0\0\0\0\0\0\0\xb4\x03\0\0\ +\x25\0\0\0\x40\0\0\0\xba\x03\0\0\x27\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x26\0\0\ +\0\0\0\0\0\0\0\0\x02\x28\0\0\0\xbe\x03\0\0\x01\0\0\x04\x04\0\0\0\xc9\x03\0\0\ +\x0e\0\0\0\0\0\0\0\x32\x04\0\0\x01\0\0\x04\x04\0\0\0\x3b\x04\0\0\x0e\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x20\0\0\0\x12\0\0\0\x30\0\0\0\xb1\x04\0\0\0\0\0\ +\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x20\0\0\0\x12\0\0\0\x1a\0\0\0\ +\xc5\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x20\0\0\0\ +\x12\0\0\0\x20\0\0\0\xdb\x04\0\0\0\0\0\x0e\x2d\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\ +\0\0\0\0\x20\0\0\0\x12\0\0\0\x11\0\0\0\xf0\x04\0\0\0\0\0\x0e\x2f\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\x07\x05\0\0\0\0\0\x0e\ +\x31\0\0\0\x01\0\0\0\x0f\x05\0\0\x01\0\0\x0f\x04\0\0\0\x36\0\0\0\0\0\0\0\x04\0\ +\0\0\x16\x05\0\0\x04\0\0\x0f\x7b\0\0\0\x2a\0\0\0\0\0\0\0\x30\0\0\0\x2c\0\0\0\ +\x30\0\0\0\x1a\0\0\0\x2e\0\0\0\x4a\0\0\0\x20\0\0\0\x30\0\0\0\x6a\0\0\0\x11\0\0\ +\0\x1e\x05\0\0\x01\0\0\x0f\x04\0\0\0\x32\0\0\0\0\0\0\0\x04\0\0\0\x26\x05\0\0\0\ +\0\0\x0e\x06\0\0\0\x01\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\ +\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\ +\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\ +\x62\x70\x66\x5f\x6d\x61\x70\0\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x2f\x61\x73\ +\x70\x73\x6b\x2f\x73\x72\x63\x2f\x62\x70\x66\x2d\x6e\x65\x78\x74\x2f\x6b\x65\ +\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\ +\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\ +\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\ +\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\ +\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\ +\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\ +\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\ +\x69\x67\x6e\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\ +\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\ +\x70\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\ +\x6d\x61\x70\x29\0\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\ +\x6e\x75\x6d\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\ +\x71\x5f\x6e\x75\x6d\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\ +\x3d\x3d\x20\x30\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\ +\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\ +\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\ +\x72\x69\x65\x73\x20\x20\x63\x75\x72\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\x6e\ +\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\x6d\ +\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\x73\ +\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\x52\ +\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\x5f\ +\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\ +\x75\x20\x25\x2d\x31\x36\x73\x20\x20\x25\x31\x30\x64\x20\x20\x20\x25\x31\x30\ +\x6c\x6c\x64\x5c\x6e\x22\x2c\0\x7d\0\x5f\x5f\x73\x36\x34\0\x6c\x6f\x6e\x67\x20\ +\x6c\x6f\x6e\x67\0\x62\x70\x66\x5f\x6d\x61\x70\x5f\x73\x75\x6d\x5f\x65\x6c\x65\ +\x6d\x5f\x63\x6f\x75\x6e\x74\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\ +\x66\x5f\x70\x72\x6f\x67\0\x70\x72\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\ +\x5f\x70\x72\x6f\x67\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\ +\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\ +\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\ +\x20\x28\x21\x70\x72\x6f\x67\x29\0\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\ +\x78\0\x09\x61\x75\x78\x20\x3d\x20\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\ +\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\ +\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\ +\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\ +\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\ +\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\ +\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\ +\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\ +\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\ +\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\ +\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\ +\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\ +\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\ +\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\ +\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\ +\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\ +\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\x29\x2c\x20\x74\x79\x70\x65\x73\x20\ +\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\x09\x73\x74\x72\x20\x3d\x20\x62\x74\ +\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\ +\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\ +\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\ +\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\x30\x3a\x32\x3a\x30\0\x09\x69\x66\ +\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3e\x3d\x20\x62\x74\x66\x2d\x3e\ +\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\x29\0\x09\x72\x65\x74\x75\x72\x6e\ +\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\ +\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\ +\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\ +\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\ +\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\ +\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\x4e\x53\x45\0\x2e\x6b\x73\x79\x6d\ +\x73\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x64\x75\x6d\ +\x6d\x79\x5f\x6b\x73\x79\x6d\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\xc9\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\0\x04\0\0\0\x7b\0\0\0\x01\0\0\0\ +\x80\0\0\0\0\0\0\0\0\0\0\0\x69\x74\x65\x72\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\ +\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\x34\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\ +\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\ +\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x20\x20\x63\x75\x72\x5f\x65\ +\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x20\x25\ +\x31\x30\x64\x20\x20\x20\x25\x31\x30\x6c\x6c\x64\x0a\0\x20\x20\x69\x64\x20\x6e\ +\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\ +\x61\x63\x68\x65\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\ +\x25\x73\x0a\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\ +\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\x1d\0\0\0\0\0\x79\x21\ +\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe0\xff\ +\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\ +\x30\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\ +\xe0\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\ +\x7b\x2a\xe8\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\x7b\x1a\xf0\xff\0\0\0\0\xbf\x71\ +\0\0\0\0\0\0\x85\x20\0\0\0\0\0\0\x7b\x0a\xf8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\ +\x07\x04\0\0\xe0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\ +\x30\0\0\0\xb7\x03\0\0\x1a\0\0\0\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\ +\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x88\0\0\0\ +\x1e\x44\x01\0\x01\0\0\0\x42\0\0\0\x88\0\0\0\x24\x44\x01\0\x02\0\0\0\x42\0\0\0\ +\xfb\0\0\0\x1d\x4c\x01\0\x03\0\0\0\x42\0\0\0\x1c\x01\0\0\x06\x54\x01\0\x04\0\0\ +\0\x42\0\0\0\x2b\x01\0\0\x1d\x48\x01\0\x05\0\0\0\x42\0\0\0\x50\x01\0\0\x06\x60\ +\x01\0\x07\0\0\0\x42\0\0\0\x63\x01\0\0\x03\x64\x01\0\x0e\0\0\0\x42\0\0\0\xf6\ +\x01\0\0\x02\x6c\x01\0\x21\0\0\0\x42\0\0\0\x29\x02\0\0\x01\x80\x01\0\0\0\0\0\ +\x02\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\ +\x02\0\0\0\xf7\0\0\0\0\0\0\0\x20\0\0\0\x08\0\0\0\x27\x01\0\0\0\0\0\0\x70\0\0\0\ +\x0d\0\0\0\x3e\0\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\0\xf7\0\0\0\0\0\0\0\xa0\0\0\0\ +\x0d\0\0\0\x27\x01\0\0\0\0\0\0\x1a\0\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\ +\x70\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\ +\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x09\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\x69\x74\ +\x65\x72\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x62\x70\x66\x5f\x6d\ +\x61\x70\x5f\x73\x75\x6d\x5f\x65\x6c\x65\x6d\x5f\x63\x6f\x75\x6e\x74\0\0\x47\ +\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x11\x08\0\0\0\0\ +\0\x15\x01\x3b\0\0\0\0\0\x79\x17\0\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\ +\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\ +\x18\x62\0\0\0\0\0\0\0\0\0\0\x4a\0\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\ +\0\x85\0\0\0\x7e\0\0\0\x7b\x6a\xc8\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\ +\xff\0\0\0\0\xb7\x03\0\0\x04\0\0\0\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\ +\x71\x28\0\0\0\0\0\x79\x78\x30\0\0\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\ +\0\0\x0f\x21\0\0\0\0\0\0\x61\x11\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\ +\0\x03\0\0\0\x0f\x13\0\0\0\0\0\0\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\ +\x01\0\0\xf8\xff\xff\xff\xb7\x02\0\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\ +\0\0\0\0\x79\xa3\xf8\xff\0\0\0\0\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\ +\x01\0\0\xf4\xff\xff\xff\xb7\x02\0\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\ +\x04\0\0\0\x61\xa1\xf4\xff\0\0\0\0\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\ +\x0f\x16\0\0\0\0\0\0\xbf\x69\0\0\0\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\ +\0\0\0\0\x7b\x1a\xe0\xff\0\0\0\0\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\ +\x31\0\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\ +\xff\xff\xff\x79\xa1\xc8\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x6a\0\0\0\xb7\ +\x03\0\0\x11\0\0\0\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\ +\x95\0\0\0\0\0\0\0\0\0\0\0\x1b\0\0\0\0\0\0\0\x42\0\0\0\x88\0\0\0\x1e\x94\x01\0\ +\x01\0\0\0\x42\0\0\0\x88\0\0\0\x24\x94\x01\0\x02\0\0\0\x42\0\0\0\x86\x02\0\0\ +\x1f\x9c\x01\0\x03\0\0\0\x42\0\0\0\xaa\x02\0\0\x06\xa8\x01\0\x04\0\0\0\x42\0\0\ +\0\xc3\x02\0\0\x0e\xb4\x01\0\x05\0\0\0\x42\0\0\0\x2b\x01\0\0\x1d\x98\x01\0\x06\ +\0\0\0\x42\0\0\0\x50\x01\0\0\x06\xb8\x01\0\x08\0\0\0\x42\0\0\0\xd5\x02\0\0\x03\ +\xbc\x01\0\x10\0\0\0\x42\0\0\0\x45\x03\0\0\x02\xc4\x01\0\x17\0\0\0\x42\0\0\0\ +\x80\x03\0\0\x06\x04\x01\0\x1a\0\0\0\x42\0\0\0\x45\x03\0\0\x02\xc4\x01\0\x1b\0\ +\0\0\x42\0\0\0\xd1\x03\0\0\x0f\x10\x01\0\x1c\0\0\0\x42\0\0\0\xe6\x03\0\0\x2d\ +\x14\x01\0\x1e\0\0\0\x42\0\0\0\x1d\x04\0\0\x0d\x0c\x01\0\x20\0\0\0\x42\0\0\0\ +\x45\x03\0\0\x02\xc4\x01\0\x21\0\0\0\x42\0\0\0\xe6\x03\0\0\x02\x14\x01\0\x24\0\ +\0\0\x42\0\0\0\x44\x04\0\0\x0d\x18\x01\0\x27\0\0\0\x42\0\0\0\x45\x03\0\0\x02\ +\xc4\x01\0\x28\0\0\0\x42\0\0\0\x44\x04\0\0\x0d\x18\x01\0\x2b\0\0\0\x42\0\0\0\ +\x44\x04\0\0\x0d\x18\x01\0\x2c\0\0\0\x42\0\0\0\x72\x04\0\0\x1b\x1c\x01\0\x2d\0\ +\0\0\x42\0\0\0\x72\x04\0\0\x06\x1c\x01\0\x2e\0\0\0\x42\0\0\0\x95\x04\0\0\x0d\ +\x24\x01\0\x30\0\0\0\x42\0\0\0\x45\x03\0\0\x02\xc4\x01\0\x3f\0\0\0\x42\0\0\0\ +\x29\x02\0\0\x01\xd4\x01\0\0\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\ +\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x18\0\0\0\xf7\0\0\0\0\0\0\0\x20\0\0\0\x1c\0\0\ +\0\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\0\x27\x01\0\0\0\0\0\0\x80\0\0\0\x1e\0\0\ +\0\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1e\0\0\0\xf7\0\0\0\0\0\0\0\xa8\0\0\0\x1e\0\0\0\ +\x78\x03\0\0\0\0\0\0\xb0\0\0\0\x1e\0\0\0\x7c\x03\0\0\0\0\0\0\xc0\0\0\0\x23\0\0\ +\0\xaa\x03\0\0\0\0\0\0\xd8\0\0\0\x24\0\0\0\xf7\0\0\0\0\0\0\0\xf0\0\0\0\x24\0\0\ +\0\x3e\0\0\0\0\0\0\0\x18\x01\0\0\x28\0\0\0\x3e\0\0\0\0\0\0\0\x50\x01\0\0\x1e\0\ +\0\0\xf7\0\0\0\0\0\0\0\x60\x01\0\0\x24\0\0\0\x6c\x04\0\0\0\0\0\0\x88\x01\0\0\ +\x1e\0\0\0\x27\x01\0\0\0\0\0\0\x98\x01\0\0\x1e\0\0\0\xad\x04\0\0\0\0\0\0\xa0\ +\x01\0\0\x1c\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x41\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\ +\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\ +\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x19\0\0\0\x01\0\0\0\0\0\0\0\ +\x12\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\ +\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0"; + opts.insns_sz = 2456; opts.insns = (void *)"\ \xbf\x16\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\x78\xff\xff\xff\xb7\x02\0\ \0\x88\0\0\0\xb7\x03\0\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x05\0\x14\0\0\0\0\0\x61\ @@ -331,79 +326,83 @@ iterators_bpf__load(struct iterators_bpf *skel) \0\0\0\x85\0\0\0\xa8\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\ \0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xbf\x70\0\0\ \0\0\0\0\x95\0\0\0\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\ -\x48\x0e\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\ -\0\0\x44\x0e\0\0\x63\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\ -\0\0\0\0\x38\x0e\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x05\0\0\ -\x18\x61\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x12\0\ -\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\xb7\x03\0\0\x1c\0\0\0\x85\0\0\0\ +\xe8\x0e\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\ +\0\0\xe4\x0e\0\0\x63\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\ +\0\0\0\0\xd8\x0e\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x05\0\0\ +\x18\x61\0\0\0\0\0\0\0\0\0\0\xd0\x0e\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x12\0\ +\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\xd0\x0e\0\0\xb7\x03\0\0\x1c\0\0\0\x85\0\0\0\ \xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\xd4\xff\0\0\0\0\x63\x7a\x78\xff\0\0\0\0\ -\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x0e\0\0\x63\x01\0\0\0\ +\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x20\x0f\0\0\x63\x01\0\0\0\ \0\0\0\x61\x60\x1c\0\0\0\0\0\x15\0\x03\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\ -\x5c\x0e\0\0\x63\x01\0\0\0\0\0\0\xb7\x01\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\ -\0\x50\x0e\0\0\xb7\x03\0\0\x48\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\ +\xfc\x0e\0\0\x63\x01\0\0\0\0\0\0\xb7\x01\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\ +\0\xf0\x0e\0\0\xb7\x03\0\0\x48\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\ \xc5\x07\xc3\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x71\0\0\0\0\0\ -\0\x79\x63\x20\0\0\0\0\0\x15\x03\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x98\ -\x0e\0\0\xb7\x02\0\0\x62\0\0\0\x61\x60\x04\0\0\0\0\0\x45\0\x02\0\x01\0\0\0\x85\ +\0\x79\x63\x20\0\0\0\0\0\x15\x03\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x38\ +\x0f\0\0\xb7\x02\0\0\x7b\0\0\0\x61\x60\x04\0\0\0\0\0\x45\0\x02\0\x01\0\0\0\x85\ \0\0\0\x94\0\0\0\x05\0\x01\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x18\x62\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\x63\ -\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\ -\0\0\x10\x0f\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x98\x0e\0\0\ -\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x0f\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x02\0\ -\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\xb7\x03\0\0\x20\0\0\0\x85\0\0\0\ +\0\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xc0\x0f\0\0\x63\ +\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xb8\x0f\0\0\x18\x61\0\0\0\0\0\0\0\ +\0\0\0\xc8\x0f\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x38\x0f\0\0\ +\x18\x61\0\0\0\0\0\0\0\0\0\0\xd0\x0f\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x02\0\ +\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\xc0\x0f\0\0\xb7\x03\0\0\x20\0\0\0\x85\0\0\0\ \xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x9f\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\x63\ -\x01\0\0\0\0\0\0\xb7\x01\0\0\x16\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\ +\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\x0f\0\0\x63\ +\x01\0\0\0\0\0\0\xb7\x01\0\0\x16\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\xe0\x0f\0\0\ \xb7\x03\0\0\x04\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x92\xff\ -\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\ -\x78\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x38\x0f\0\0\x18\ -\x61\0\0\0\0\0\0\0\0\0\0\x70\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\ -\0\0\0\x40\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x11\0\0\x7b\x01\0\0\0\0\0\0\ -\x18\x60\0\0\0\0\0\0\0\0\0\0\x48\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xc8\x11\0\ -\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x10\0\0\x18\x61\0\0\0\0\ -\0\0\0\0\0\0\xe8\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\x60\x08\0\0\ -\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x11\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\ -\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x84\x11\0\0\x63\x01\0\0\0\0\0\0\x79\x60\ -\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x88\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\ -\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb0\x11\0\0\x63\x01\0\0\0\0\0\ -\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xf8\x11\0\0\xb7\x02\0\0\x11\0\0\0\xb7\x03\0\0\ +\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\ +\x20\x12\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xf0\x0f\0\0\x18\ +\x61\0\0\0\0\0\0\0\0\0\0\x18\x12\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\ +\0\0\0\x08\x11\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x60\x12\0\0\x7b\x01\0\0\0\0\0\0\ +\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x11\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x70\x12\0\ +\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xa0\x11\0\0\x18\x61\0\0\0\0\ +\0\0\0\0\0\0\x90\x12\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x88\x12\0\0\x7b\x01\0\0\0\0\0\0\x61\x60\x08\0\0\ +\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x12\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\ +\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x2c\x12\0\0\x63\x01\0\0\0\0\0\0\x79\x60\ +\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x30\x12\0\0\x7b\x01\0\0\0\0\0\0\x61\ +\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x58\x12\0\0\x63\x01\0\0\0\0\0\ +\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xa0\x12\0\0\xb7\x02\0\0\x11\0\0\0\xb7\x03\0\0\ \x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\ -\x5c\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x68\x11\0\0\x63\x70\x6c\0\0\0\0\0\ -\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\0\0\0\x18\x62\0\0\ -\0\0\0\0\0\0\0\0\x68\x11\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\ -\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd8\x11\0\0\x61\x01\0\0\0\0\0\0\xd5\ -\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x4a\xff\0\0\ -\0\0\x63\x7a\x80\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\x18\x61\0\ -\0\0\0\0\0\0\0\0\0\x10\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\ -\x18\x12\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\ -\x60\0\0\0\0\0\0\0\0\0\0\x28\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x50\x17\0\0\ -\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x14\0\0\x18\x61\0\0\0\0\0\ -\0\0\0\0\0\x60\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd0\x15\ -\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x78\x17\0\0\x7b\x01\0\0\0\0\ -\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x17\0\0\x63\x01\0\0\ -\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x1c\x17\0\0\x63\x01\ -\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\x7b\ -\x01\0\0\0\0\0\0\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x48\x17\0\ -\0\x63\x01\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x90\x17\0\0\xb7\x02\0\0\x12\ -\0\0\0\xb7\x03\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\ -\0\0\0\0\0\xc5\x07\x13\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\x63\ -\x70\x6c\0\0\0\0\0\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\ -\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\ -\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x70\x17\0\0\x61\x01\ -\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\ -\x07\x01\xff\0\0\0\0\x63\x7a\x84\xff\0\0\0\0\x61\xa1\x78\xff\0\0\0\0\xd5\x01\ -\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa0\x80\xff\0\0\0\0\ -\x63\x06\x28\0\0\0\0\0\x61\xa0\x84\xff\0\0\0\0\x63\x06\x2c\0\0\0\0\0\x18\x61\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\0\0\x63\x06\x18\0\0\0\0\0\xb7\0\0\0\ -\0\0\0\0\x95\0\0\0\0\0\0\0"; +\x5c\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\x63\x70\x6c\0\0\0\0\0\ +\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\x18\x68\0\0\0\0\0\0\0\0\0\0\xa8\ +\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x12\0\0\xb7\x02\0\0\x17\0\0\0\xb7\x03\ +\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\ +\x07\x4d\xff\0\0\0\0\x75\x07\x03\0\0\0\0\0\x62\x08\x04\0\0\0\0\0\x6a\x08\x02\0\ +\0\0\0\0\x05\0\x0a\0\0\0\0\0\x63\x78\x04\0\0\0\0\0\xbf\x79\0\0\0\0\0\0\x77\x09\ +\0\0\x20\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\x63\x90\0\0\0\0\0\0\x55\ +\x09\x02\0\0\0\0\0\x6a\x08\x02\0\0\0\0\0\x05\0\x01\0\0\0\0\0\x6a\x08\x02\0\x40\ +\0\0\0\xb7\x01\0\0\x05\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\xb7\x03\0\ +\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\ +\0\0\x01\0\0\x61\x01\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\ +\0\0\0\xa8\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x80\x12\0\0\x61\x01\0\0\0\0\0\0\ +\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x2c\xff\ +\0\0\0\0\x63\x7a\x80\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd0\x12\0\0\x18\ +\x61\0\0\0\0\0\0\0\0\0\0\xa8\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\ +\0\0\0\xd8\x12\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xa0\x17\0\0\x7b\x01\0\0\0\0\0\0\ +\x18\x60\0\0\0\0\0\0\0\0\0\0\xe0\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe8\x17\0\ +\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x14\0\0\x18\x61\0\0\0\0\ +\0\0\0\0\0\0\xf8\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x78\ +\x16\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x18\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x10\x18\0\0\x7b\x01\0\0\ +\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb0\x17\0\0\x63\x01\ +\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb4\x17\0\0\x63\ +\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x17\0\0\ +\x7b\x01\0\0\0\0\0\0\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\ +\x17\0\0\x63\x01\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x18\0\0\xb7\x02\0\ +\0\x12\0\0\0\xb7\x03\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\ +\x07\0\0\0\0\0\0\xc5\x07\xf5\xfe\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x98\x17\0\ +\0\x63\x70\x6c\0\0\0\0\0\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\ +\0\x05\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x98\x17\0\0\xb7\x03\0\0\x8c\0\0\0\x85\ +\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x08\x18\0\0\ +\x61\x01\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\ +\0\0\xc5\x07\xe3\xfe\0\0\0\0\x63\x7a\x84\xff\0\0\0\0\x61\xa1\x78\xff\0\0\0\0\ +\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa0\x80\xff\ +\0\0\0\0\x63\x06\x28\0\0\0\0\0\x61\xa0\x84\xff\0\0\0\0\x63\x06\x2c\0\0\0\0\0\ +\x18\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\0\0\x63\x06\x18\0\0\0\0\0\ +\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0"; err = bpf_load_and_run(&opts); if (err < 0) return err; - skel->rodata = skel_finalize_map_data(&skel->maps.rodata.initial_value, - 4096, PROT_READ, skel->maps.rodata.map_fd); - if (!skel->rodata) - return -ENOMEM; return 0; } @@ -422,4 +421,15 @@ iterators_bpf__open_and_load(void) return skel; } +__attribute__((unused)) static void +iterators_bpf__assert(struct iterators_bpf *s __attribute__((unused))) +{ +#ifdef __cplusplus +#define _Static_assert static_assert +#endif +#ifdef __cplusplus +#undef _Static_assert +#endif +} + #endif /* __ITERATORS_BPF_SKEL_H__ */ diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 8d2ddcb7566b..d869f51ea93a 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -98,7 +98,12 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete) int err = 0; void *ptr; - raw_spin_lock_irqsave(&qs->lock, flags); + if (in_nmi()) { + if (!raw_spin_trylock_irqsave(&qs->lock, flags)) + return -EBUSY; + } else { + raw_spin_lock_irqsave(&qs->lock, flags); + } if (queue_stack_map_is_empty(qs)) { memset(value, 0, qs->map.value_size); @@ -128,7 +133,12 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete) void *ptr; u32 index; - raw_spin_lock_irqsave(&qs->lock, flags); + if (in_nmi()) { + if (!raw_spin_trylock_irqsave(&qs->lock, flags)) + return -EBUSY; + } else { + raw_spin_lock_irqsave(&qs->lock, flags); + } if (queue_stack_map_is_empty(qs)) { memset(value, 0, qs->map.value_size); @@ -193,7 +203,12 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value, if (flags & BPF_NOEXIST || flags > BPF_EXIST) return -EINVAL; - raw_spin_lock_irqsave(&qs->lock, irq_flags); + if (in_nmi()) { + if (!raw_spin_trylock_irqsave(&qs->lock, irq_flags)) + return -EBUSY; + } else { + raw_spin_lock_irqsave(&qs->lock, irq_flags); + } if (queue_stack_map_is_full(qs)) { if (!replace) { diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 875ac9b698d9..0ee653a936ea 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -23,15 +23,6 @@ #define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4) -/* Maximum size of ring buffer area is limited by 32-bit page offset within - * record header, counted in pages. Reserve 8 bits for extensibility, and take - * into account few extra pages for consumer/producer pages and - * non-mmap()'able parts. This gives 64GB limit, which seems plenty for single - * ring buffer. - */ -#define RINGBUF_MAX_DATA_SZ \ - (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE) - struct bpf_ringbuf { wait_queue_head_t waitq; struct irq_work work; @@ -161,6 +152,17 @@ static void bpf_ringbuf_notify(struct irq_work *work) wake_up_all(&rb->waitq); } +/* Maximum size of ring buffer area is limited by 32-bit page offset within + * record header, counted in pages. Reserve 8 bits for extensibility, and + * take into account few extra pages for consumer/producer pages and + * non-mmap()'able parts, the current maximum size would be: + * + * (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE) + * + * This gives 64GB limit, which seems plenty for single ring buffer. Now + * considering that the maximum value of data_sz is (4GB - 1), there + * will be no overflow, so just note the size limit in the comments. + */ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) { struct bpf_ringbuf *rb; @@ -193,12 +195,6 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) !PAGE_ALIGNED(attr->max_entries)) return ERR_PTR(-EINVAL); -#ifdef CONFIG_64BIT - /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */ - if (attr->max_entries > RINGBUF_MAX_DATA_SZ) - return ERR_PTR(-E2BIG); -#endif - rb_map = bpf_map_area_alloc(sizeof(*rb_map), NUMA_NO_NODE); if (!rb_map) return ERR_PTR(-ENOMEM); @@ -774,8 +770,7 @@ schedule_work_return: /* Prevent the clearing of the busy-bit from being reordered before the * storing of any rb consumer or producer positions. */ - smp_mb__before_atomic(); - atomic_set(&rb->busy, 0); + atomic_set_release(&rb->busy, 0); if (flags & BPF_RB_FORCE_WAKEUP) irq_work_queue(&rb->work); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 458bb80b14d5..d6b277482085 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -28,7 +28,7 @@ struct bpf_stack_map { void *elems; struct pcpu_freelist freelist; u32 n_buckets; - struct stack_map_bucket *buckets[]; + struct stack_map_bucket *buckets[] __counted_by(n_buckets); }; static inline bool stack_map_use_build_id(struct bpf_map *map) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a2aef900519c..0ed286b8a0f0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -35,7 +35,10 @@ #include <linux/rcupdate_trace.h> #include <linux/memcontrol.h> #include <linux/trace_events.h> + #include <net/netfilter/nf_bpf_link.h> +#include <net/netkit.h> +#include <net/tcx.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -512,6 +515,7 @@ void btf_record_free(struct btf_record *rec) switch (rec->fields[i].type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: if (rec->fields[i].kptr.module) module_put(rec->fields[i].kptr.module); btf_put(rec->fields[i].kptr.btf); @@ -558,6 +562,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) switch (fields[i].type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: btf_get(fields[i].kptr.btf); if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) { ret = -ENXIO; @@ -622,8 +627,6 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj) bpf_timer_cancel_and_free(obj + rec->timer_off); } -extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec); - void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; @@ -648,6 +651,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) WRITE_ONCE(*(u64 *)field_ptr, 0); break; case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0); if (!xchgd_field) break; @@ -655,11 +659,10 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) if (!btf_is_kernel(field->kptr.btf)) { pointee_struct_meta = btf_find_struct_meta(field->kptr.btf, field->kptr.btf_id); - WARN_ON_ONCE(!pointee_struct_meta); migrate_disable(); __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ? - pointee_struct_meta->record : - NULL); + pointee_struct_meta->record : NULL, + fields[i].type == BPF_KPTR_PERCPU); migrate_enable(); } else { field->kptr.dtor(xchgd_field); @@ -1044,6 +1047,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: case BPF_REFCOUNT: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_PERCPU_HASH && @@ -2441,14 +2445,19 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: return 0; default: return -EINVAL; @@ -2744,7 +2753,7 @@ free_used_maps: * period before we can tear down JIT memory since symbols * are already exposed under kallsyms. */ - __bpf_prog_put_noref(prog, prog->aux->func_cnt); + __bpf_prog_put_noref(prog, prog->aux->real_func_cnt); return err; free_prog_sec: free_uid(prog->aux->user); @@ -2813,10 +2822,12 @@ static void bpf_link_free_id(int id) /* Clean up bpf_link and corresponding anon_inode file and FD. After * anon_inode is created, bpf_link can't be just kfree()'d due to deferred - * anon_inode's release() call. This helper marksbpf_link as + * anon_inode's release() call. This helper marks bpf_link as * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt * is not decremented, it's the responsibility of a calling code that failed * to complete bpf_link initialization. + * This helper eventually calls link's dealloc callback, but does not call + * link's release callback. */ void bpf_link_cleanup(struct bpf_link_primer *primer) { @@ -3295,6 +3306,25 @@ static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link, raw_tp_link->btp->tp->name); } +static int bpf_copy_to_user(char __user *ubuf, const char *buf, u32 ulen, + u32 len) +{ + if (ulen >= len + 1) { + if (copy_to_user(ubuf, buf, len + 1)) + return -EFAULT; + } else { + char zero = '\0'; + + if (copy_to_user(ubuf, buf, ulen - 1)) + return -EFAULT; + if (put_user(zero, ubuf + ulen - 1)) + return -EFAULT; + return -ENOSPC; + } + + return 0; +} + static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { @@ -3313,20 +3343,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, if (!ubuf) return 0; - if (ulen >= tp_len + 1) { - if (copy_to_user(ubuf, tp_name, tp_len + 1)) - return -EFAULT; - } else { - char zero = '\0'; - - if (copy_to_user(ubuf, tp_name, ulen - 1)) - return -EFAULT; - if (put_user(zero, ubuf + ulen - 1)) - return -EFAULT; - return -ENOSPC; - } - - return 0; + return bpf_copy_to_user(ubuf, tp_name, ulen, tp_len); } static const struct bpf_link_ops bpf_raw_tp_link_lops = { @@ -3358,9 +3375,156 @@ static void bpf_perf_link_dealloc(struct bpf_link *link) kfree(perf_link); } +static int bpf_perf_link_fill_common(const struct perf_event *event, + char __user *uname, u32 ulen, + u64 *probe_offset, u64 *probe_addr, + u32 *fd_type, unsigned long *missed) +{ + const char *buf; + u32 prog_id; + size_t len; + int err; + + if (!ulen ^ !uname) + return -EINVAL; + + err = bpf_get_perf_event_info(event, &prog_id, fd_type, &buf, + probe_offset, probe_addr, missed); + if (err) + return err; + if (!uname) + return 0; + if (buf) { + len = strlen(buf); + err = bpf_copy_to_user(uname, buf, ulen, len); + if (err) + return err; + } else { + char zero = '\0'; + + if (put_user(zero, uname)) + return -EFAULT; + } + return 0; +} + +#ifdef CONFIG_KPROBE_EVENTS +static int bpf_perf_link_fill_kprobe(const struct perf_event *event, + struct bpf_link_info *info) +{ + unsigned long missed; + char __user *uname; + u64 addr, offset; + u32 ulen, type; + int err; + + uname = u64_to_user_ptr(info->perf_event.kprobe.func_name); + ulen = info->perf_event.kprobe.name_len; + err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr, + &type, &missed); + if (err) + return err; + if (type == BPF_FD_TYPE_KRETPROBE) + info->perf_event.type = BPF_PERF_EVENT_KRETPROBE; + else + info->perf_event.type = BPF_PERF_EVENT_KPROBE; + + info->perf_event.kprobe.offset = offset; + info->perf_event.kprobe.missed = missed; + if (!kallsyms_show_value(current_cred())) + addr = 0; + info->perf_event.kprobe.addr = addr; + return 0; +} +#endif + +#ifdef CONFIG_UPROBE_EVENTS +static int bpf_perf_link_fill_uprobe(const struct perf_event *event, + struct bpf_link_info *info) +{ + char __user *uname; + u64 addr, offset; + u32 ulen, type; + int err; + + uname = u64_to_user_ptr(info->perf_event.uprobe.file_name); + ulen = info->perf_event.uprobe.name_len; + err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr, + &type, NULL); + if (err) + return err; + + if (type == BPF_FD_TYPE_URETPROBE) + info->perf_event.type = BPF_PERF_EVENT_URETPROBE; + else + info->perf_event.type = BPF_PERF_EVENT_UPROBE; + info->perf_event.uprobe.offset = offset; + return 0; +} +#endif + +static int bpf_perf_link_fill_probe(const struct perf_event *event, + struct bpf_link_info *info) +{ +#ifdef CONFIG_KPROBE_EVENTS + if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE) + return bpf_perf_link_fill_kprobe(event, info); +#endif +#ifdef CONFIG_UPROBE_EVENTS + if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE) + return bpf_perf_link_fill_uprobe(event, info); +#endif + return -EOPNOTSUPP; +} + +static int bpf_perf_link_fill_tracepoint(const struct perf_event *event, + struct bpf_link_info *info) +{ + char __user *uname; + u32 ulen; + + uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name); + ulen = info->perf_event.tracepoint.name_len; + info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT; + return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL, NULL); +} + +static int bpf_perf_link_fill_perf_event(const struct perf_event *event, + struct bpf_link_info *info) +{ + info->perf_event.event.type = event->attr.type; + info->perf_event.event.config = event->attr.config; + info->perf_event.type = BPF_PERF_EVENT_EVENT; + return 0; +} + +static int bpf_perf_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_perf_link *perf_link; + const struct perf_event *event; + + perf_link = container_of(link, struct bpf_perf_link, link); + event = perf_get_event(perf_link->perf_file); + if (IS_ERR(event)) + return PTR_ERR(event); + + switch (event->prog->type) { + case BPF_PROG_TYPE_PERF_EVENT: + return bpf_perf_link_fill_perf_event(event, info); + case BPF_PROG_TYPE_TRACEPOINT: + return bpf_perf_link_fill_tracepoint(event, info); + case BPF_PROG_TYPE_KPROBE: + return bpf_perf_link_fill_probe(event, info); + default: + return -EOPNOTSUPP; + } +} + static const struct bpf_link_ops bpf_perf_link_lops = { .release = bpf_perf_link_release, .dealloc = bpf_perf_link_dealloc, + .fill_link_info = bpf_perf_link_fill_link_info, }; static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) @@ -3502,34 +3666,6 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) return fd; } -static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, - enum bpf_attach_type attach_type) -{ - switch (prog->type) { - case BPF_PROG_TYPE_CGROUP_SOCK: - case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: - case BPF_PROG_TYPE_CGROUP_SOCKOPT: - case BPF_PROG_TYPE_SK_LOOKUP: - return attach_type == prog->expected_attach_type ? 0 : -EINVAL; - case BPF_PROG_TYPE_CGROUP_SKB: - if (!capable(CAP_NET_ADMIN)) - /* cg-skb progs can be loaded by unpriv user. - * check permissions at attach time. - */ - return -EPERM; - return prog->enforce_expected_attach_type && - prog->expected_attach_type != attach_type ? - -EINVAL : 0; - case BPF_PROG_TYPE_KPROBE: - if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI && - attach_type != BPF_TRACE_KPROBE_MULTI) - return -EINVAL; - return 0; - default: - return 0; - } -} - static enum bpf_prog_type attach_type_to_prog_type(enum bpf_attach_type attach_type) { @@ -3546,14 +3682,19 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: return BPF_PROG_TYPE_CGROUP_SOCK_ADDR; case BPF_CGROUP_SOCK_OPS: return BPF_PROG_TYPE_SOCK_OPS; @@ -3588,15 +3729,87 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_XDP; case BPF_LSM_CGROUP: return BPF_PROG_TYPE_LSM; + case BPF_TCX_INGRESS: + case BPF_TCX_EGRESS: + case BPF_NETKIT_PRIMARY: + case BPF_NETKIT_PEER: + return BPF_PROG_TYPE_SCHED_CLS; default: return BPF_PROG_TYPE_UNSPEC; } } -#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd +static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, + enum bpf_attach_type attach_type) +{ + enum bpf_prog_type ptype; -#define BPF_F_ATTACH_MASK \ - (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE) + switch (prog->type) { + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_SK_LOOKUP: + return attach_type == prog->expected_attach_type ? 0 : -EINVAL; + case BPF_PROG_TYPE_CGROUP_SKB: + if (!capable(CAP_NET_ADMIN)) + /* cg-skb progs can be loaded by unpriv user. + * check permissions at attach time. + */ + return -EPERM; + return prog->enforce_expected_attach_type && + prog->expected_attach_type != attach_type ? + -EINVAL : 0; + case BPF_PROG_TYPE_EXT: + return 0; + case BPF_PROG_TYPE_NETFILTER: + if (attach_type != BPF_NETFILTER) + return -EINVAL; + return 0; + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_TRACEPOINT: + if (attach_type != BPF_PERF_EVENT) + return -EINVAL; + return 0; + case BPF_PROG_TYPE_KPROBE: + if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI && + attach_type != BPF_TRACE_KPROBE_MULTI) + return -EINVAL; + if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI && + attach_type != BPF_TRACE_UPROBE_MULTI) + return -EINVAL; + if (attach_type != BPF_PERF_EVENT && + attach_type != BPF_TRACE_KPROBE_MULTI && + attach_type != BPF_TRACE_UPROBE_MULTI) + return -EINVAL; + return 0; + case BPF_PROG_TYPE_SCHED_CLS: + if (attach_type != BPF_TCX_INGRESS && + attach_type != BPF_TCX_EGRESS && + attach_type != BPF_NETKIT_PRIMARY && + attach_type != BPF_NETKIT_PEER) + return -EINVAL; + return 0; + default: + ptype = attach_type_to_prog_type(attach_type); + if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) + return -EINVAL; + return 0; + } +} + +#define BPF_PROG_ATTACH_LAST_FIELD expected_revision + +#define BPF_F_ATTACH_MASK_BASE \ + (BPF_F_ALLOW_OVERRIDE | \ + BPF_F_ALLOW_MULTI | \ + BPF_F_REPLACE) + +#define BPF_F_ATTACH_MASK_MPROG \ + (BPF_F_REPLACE | \ + BPF_F_BEFORE | \ + BPF_F_AFTER | \ + BPF_F_ID | \ + BPF_F_LINK) static int bpf_prog_attach(const union bpf_attr *attr) { @@ -3607,12 +3820,19 @@ static int bpf_prog_attach(const union bpf_attr *attr) if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; - if (attr->attach_flags & ~BPF_F_ATTACH_MASK) - return -EINVAL; - ptype = attach_type_to_prog_type(attr->attach_type); if (ptype == BPF_PROG_TYPE_UNSPEC) return -EINVAL; + if (bpf_mprog_supported(ptype)) { + if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG) + return -EINVAL; + } else { + if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE) + return -EINVAL; + if (attr->relative_fd || + attr->expected_revision) + return -EINVAL; + } prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) @@ -3648,6 +3868,13 @@ static int bpf_prog_attach(const union bpf_attr *attr) else ret = cgroup_bpf_prog_attach(attr, ptype, prog); break; + case BPF_PROG_TYPE_SCHED_CLS: + if (attr->attach_type == BPF_TCX_INGRESS || + attr->attach_type == BPF_TCX_EGRESS) + ret = tcx_prog_attach(attr, prog); + else + ret = netkit_prog_attach(attr, prog); + break; default: ret = -EINVAL; } @@ -3657,25 +3884,45 @@ static int bpf_prog_attach(const union bpf_attr *attr) return ret; } -#define BPF_PROG_DETACH_LAST_FIELD attach_type +#define BPF_PROG_DETACH_LAST_FIELD expected_revision static int bpf_prog_detach(const union bpf_attr *attr) { + struct bpf_prog *prog = NULL; enum bpf_prog_type ptype; + int ret; if (CHECK_ATTR(BPF_PROG_DETACH)) return -EINVAL; ptype = attach_type_to_prog_type(attr->attach_type); + if (bpf_mprog_supported(ptype)) { + if (ptype == BPF_PROG_TYPE_UNSPEC) + return -EINVAL; + if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG) + return -EINVAL; + if (attr->attach_bpf_fd) { + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + return PTR_ERR(prog); + } + } else if (attr->attach_flags || + attr->relative_fd || + attr->expected_revision) { + return -EINVAL; + } switch (ptype) { case BPF_PROG_TYPE_SK_MSG: case BPF_PROG_TYPE_SK_SKB: - return sock_map_prog_detach(attr, ptype); + ret = sock_map_prog_detach(attr, ptype); + break; case BPF_PROG_TYPE_LIRC_MODE2: - return lirc_prog_detach(attr); + ret = lirc_prog_detach(attr); + break; case BPF_PROG_TYPE_FLOW_DISSECTOR: - return netns_bpf_prog_detach(attr, ptype); + ret = netns_bpf_prog_detach(attr, ptype); + break; case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: @@ -3684,13 +3931,25 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_LSM: - return cgroup_bpf_prog_detach(attr, ptype); + ret = cgroup_bpf_prog_detach(attr, ptype); + break; + case BPF_PROG_TYPE_SCHED_CLS: + if (attr->attach_type == BPF_TCX_INGRESS || + attr->attach_type == BPF_TCX_EGRESS) + ret = tcx_prog_detach(attr, prog); + else + ret = netkit_prog_detach(attr, prog); + break; default: - return -EINVAL; + ret = -EINVAL; } + + if (prog) + bpf_prog_put(prog); + return ret; } -#define BPF_PROG_QUERY_LAST_FIELD query.prog_attach_flags +#define BPF_PROG_QUERY_LAST_FIELD query.revision static int bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -3713,14 +3972,19 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: case BPF_CGROUP_SYSCTL: @@ -3738,6 +4002,12 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_SK_MSG_VERDICT: case BPF_SK_SKB_VERDICT: return sock_map_bpf_prog_query(attr, uattr); + case BPF_TCX_INGRESS: + case BPF_TCX_EGRESS: + return tcx_prog_query(attr, uattr); + case BPF_NETKIT_PRIMARY: + case BPF_NETKIT_PEER: + return netkit_prog_query(attr, uattr); default: return -EINVAL; } @@ -4583,7 +4853,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr, err = bpf_get_perf_event_info(event, &prog_id, &fd_type, &buf, &probe_offset, - &probe_addr); + &probe_addr, NULL); if (!err) err = bpf_task_fd_query_copy(attr, uattr, prog_id, fd_type, buf, @@ -4655,10 +4925,9 @@ err_put: return err; } -#define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies +#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid static int link_create(union bpf_attr *attr, bpfptr_t uattr) { - enum bpf_prog_type ptype; struct bpf_prog *prog; int ret; @@ -4678,38 +4947,6 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) goto out; switch (prog->type) { - case BPF_PROG_TYPE_EXT: - break; - case BPF_PROG_TYPE_NETFILTER: - if (attr->link_create.attach_type != BPF_NETFILTER) { - ret = -EINVAL; - goto out; - } - break; - case BPF_PROG_TYPE_PERF_EVENT: - case BPF_PROG_TYPE_TRACEPOINT: - if (attr->link_create.attach_type != BPF_PERF_EVENT) { - ret = -EINVAL; - goto out; - } - break; - case BPF_PROG_TYPE_KPROBE: - if (attr->link_create.attach_type != BPF_PERF_EVENT && - attr->link_create.attach_type != BPF_TRACE_KPROBE_MULTI) { - ret = -EINVAL; - goto out; - } - break; - default: - ptype = attach_type_to_prog_type(attr->link_create.attach_type); - if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { - ret = -EINVAL; - goto out; - } - break; - } - - switch (prog->type) { case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: @@ -4751,6 +4988,13 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_XDP: ret = bpf_xdp_link_attach(attr, prog); break; + case BPF_PROG_TYPE_SCHED_CLS: + if (attr->link_create.attach_type == BPF_TCX_INGRESS || + attr->link_create.attach_type == BPF_TCX_EGRESS) + ret = tcx_link_attach(attr, prog); + else + ret = netkit_link_attach(attr, prog); + break; case BPF_PROG_TYPE_NETFILTER: ret = bpf_nf_link_attach(attr, prog); break; @@ -4762,8 +5006,10 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_KPROBE: if (attr->link_create.attach_type == BPF_PERF_EVENT) ret = bpf_perf_link_attach(attr, prog); - else + else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI) ret = bpf_kprobe_multi_link_attach(attr, prog); + else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI) + ret = bpf_uprobe_multi_link_attach(attr, prog); break; default: ret = -EINVAL; @@ -5304,9 +5550,9 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) } run_ctx.bpf_cookie = 0; - run_ctx.saved_run_ctx = NULL; if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) { /* recursion detected */ + __bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx); bpf_prog_put(prog); return -EBUSY; } diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index c4ab9d6cdbe9..26082b97894d 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -7,7 +7,9 @@ #include <linux/fs.h> #include <linux/fdtable.h> #include <linux/filter.h> +#include <linux/bpf_mem_alloc.h> #include <linux/btf_ids.h> +#include <linux/mm_types.h> #include "mmap_unlock_work.h" static const char * const iter_task_type_names[] = { @@ -35,16 +37,13 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm u32 *tid, bool skip_if_dup_files) { - struct task_struct *task, *next_task; + struct task_struct *task; struct pid *pid; - u32 saved_tid; + u32 next_tid; if (!*tid) { /* The first time, the iterator calls this function. */ pid = find_pid_ns(common->pid, common->ns); - if (!pid) - return NULL; - task = get_pid_task(pid, PIDTYPE_TGID); if (!task) return NULL; @@ -66,44 +65,27 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm return task; } - pid = find_pid_ns(common->pid_visiting, common->ns); - if (!pid) - return NULL; - - task = get_pid_task(pid, PIDTYPE_PID); + task = find_task_by_pid_ns(common->pid_visiting, common->ns); if (!task) return NULL; retry: - if (!pid_alive(task)) { - put_task_struct(task); - return NULL; - } + task = next_thread(task); - next_task = next_thread(task); - put_task_struct(task); - if (!next_task) - return NULL; - - saved_tid = *tid; - *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns); - if (!*tid || *tid == common->pid) { + next_tid = __task_pid_nr_ns(task, PIDTYPE_PID, common->ns); + if (!next_tid || next_tid == common->pid) { /* Run out of tasks of a process. The tasks of a * thread_group are linked as circular linked list. */ - *tid = saved_tid; return NULL; } - get_task_struct(next_task); - common->pid_visiting = *tid; - - if (skip_if_dup_files && task->files == task->group_leader->files) { - task = next_task; + if (skip_if_dup_files && task->files == task->group_leader->files) goto retry; - } - return next_task; + *tid = common->pid_visiting = next_tid; + get_task_struct(task); + return task; } static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common, @@ -308,11 +290,9 @@ again: rcu_read_lock(); for (;; curr_fd++) { struct file *f; - f = task_lookup_next_fd_rcu(curr_task, &curr_fd); + f = task_lookup_next_fdget_rcu(curr_task, &curr_fd); if (!f) break; - if (!get_file_rcu(f)) - continue; /* set info->fd */ info->fd = curr_fd; @@ -724,7 +704,7 @@ static struct bpf_iter_reg task_reg_info = { .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__task, task), - PTR_TO_BTF_ID_OR_NULL }, + PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, }, .seq_info = &task_seq_info, .fill_link_info = bpf_iter_fill_link_info, @@ -823,6 +803,244 @@ const struct bpf_func_proto bpf_find_vma_proto = { .arg5_type = ARG_ANYTHING, }; +struct bpf_iter_task_vma_kern_data { + struct task_struct *task; + struct mm_struct *mm; + struct mmap_unlock_irq_work *work; + struct vma_iterator vmi; +}; + +struct bpf_iter_task_vma { + /* opaque iterator state; having __u64 here allows to preserve correct + * alignment requirements in vmlinux.h, generated from BTF + */ + __u64 __opaque[1]; +} __attribute__((aligned(8))); + +/* Non-opaque version of bpf_iter_task_vma */ +struct bpf_iter_task_vma_kern { + struct bpf_iter_task_vma_kern_data *data; +} __attribute__((aligned(8))); + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it, + struct task_struct *task, u64 addr) +{ + struct bpf_iter_task_vma_kern *kit = (void *)it; + bool irq_work_busy = false; + int err; + + BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma)); + BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma)); + + /* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized + * before, so non-NULL kit->data doesn't point to previously + * bpf_mem_alloc'd bpf_iter_task_vma_kern_data + */ + kit->data = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_iter_task_vma_kern_data)); + if (!kit->data) + return -ENOMEM; + + kit->data->task = get_task_struct(task); + kit->data->mm = task->mm; + if (!kit->data->mm) { + err = -ENOENT; + goto err_cleanup_iter; + } + + /* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */ + irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work); + if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) { + err = -EBUSY; + goto err_cleanup_iter; + } + + vma_iter_init(&kit->data->vmi, kit->data->mm, addr); + return 0; + +err_cleanup_iter: + if (kit->data->task) + put_task_struct(kit->data->task); + bpf_mem_free(&bpf_global_ma, kit->data); + /* NULL kit->data signals failed bpf_iter_task_vma initialization */ + kit->data = NULL; + return err; +} + +__bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it) +{ + struct bpf_iter_task_vma_kern *kit = (void *)it; + + if (!kit->data) /* bpf_iter_task_vma_new failed */ + return NULL; + return vma_next(&kit->data->vmi); +} + +__bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) +{ + struct bpf_iter_task_vma_kern *kit = (void *)it; + + if (kit->data) { + bpf_mmap_unlock_mm(kit->data->work, kit->data->mm); + put_task_struct(kit->data->task); + bpf_mem_free(&bpf_global_ma, kit->data); + } +} + +__bpf_kfunc_end_defs(); + +#ifdef CONFIG_CGROUPS + +struct bpf_iter_css_task { + __u64 __opaque[1]; +} __attribute__((aligned(8))); + +struct bpf_iter_css_task_kern { + struct css_task_iter *css_it; +} __attribute__((aligned(8))); + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_iter_css_task_new(struct bpf_iter_css_task *it, + struct cgroup_subsys_state *css, unsigned int flags) +{ + struct bpf_iter_css_task_kern *kit = (void *)it; + + BUILD_BUG_ON(sizeof(struct bpf_iter_css_task_kern) != sizeof(struct bpf_iter_css_task)); + BUILD_BUG_ON(__alignof__(struct bpf_iter_css_task_kern) != + __alignof__(struct bpf_iter_css_task)); + kit->css_it = NULL; + switch (flags) { + case CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED: + case CSS_TASK_ITER_PROCS: + case 0: + break; + default: + return -EINVAL; + } + + kit->css_it = bpf_mem_alloc(&bpf_global_ma, sizeof(struct css_task_iter)); + if (!kit->css_it) + return -ENOMEM; + css_task_iter_start(css, flags, kit->css_it); + return 0; +} + +__bpf_kfunc struct task_struct *bpf_iter_css_task_next(struct bpf_iter_css_task *it) +{ + struct bpf_iter_css_task_kern *kit = (void *)it; + + if (!kit->css_it) + return NULL; + return css_task_iter_next(kit->css_it); +} + +__bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it) +{ + struct bpf_iter_css_task_kern *kit = (void *)it; + + if (!kit->css_it) + return; + css_task_iter_end(kit->css_it); + bpf_mem_free(&bpf_global_ma, kit->css_it); +} + +__bpf_kfunc_end_defs(); + +#endif /* CONFIG_CGROUPS */ + +struct bpf_iter_task { + __u64 __opaque[3]; +} __attribute__((aligned(8))); + +struct bpf_iter_task_kern { + struct task_struct *task; + struct task_struct *pos; + unsigned int flags; +} __attribute__((aligned(8))); + +enum { + /* all process in the system */ + BPF_TASK_ITER_ALL_PROCS, + /* all threads in the system */ + BPF_TASK_ITER_ALL_THREADS, + /* all threads of a specific process */ + BPF_TASK_ITER_PROC_THREADS +}; + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it, + struct task_struct *task__nullable, unsigned int flags) +{ + struct bpf_iter_task_kern *kit = (void *)it; + + BUILD_BUG_ON(sizeof(struct bpf_iter_task_kern) > sizeof(struct bpf_iter_task)); + BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) != + __alignof__(struct bpf_iter_task)); + + kit->task = kit->pos = NULL; + switch (flags) { + case BPF_TASK_ITER_ALL_THREADS: + case BPF_TASK_ITER_ALL_PROCS: + break; + case BPF_TASK_ITER_PROC_THREADS: + if (!task__nullable) + return -EINVAL; + break; + default: + return -EINVAL; + } + + if (flags == BPF_TASK_ITER_PROC_THREADS) + kit->task = task__nullable; + else + kit->task = &init_task; + kit->pos = kit->task; + kit->flags = flags; + return 0; +} + +__bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it) +{ + struct bpf_iter_task_kern *kit = (void *)it; + struct task_struct *pos; + unsigned int flags; + + flags = kit->flags; + pos = kit->pos; + + if (!pos) + return pos; + + if (flags == BPF_TASK_ITER_ALL_PROCS) + goto get_next_task; + + kit->pos = next_thread(kit->pos); + if (kit->pos == kit->task) { + if (flags == BPF_TASK_ITER_PROC_THREADS) { + kit->pos = NULL; + return pos; + } + } else + return pos; + +get_next_task: + kit->pos = next_task(kit->pos); + kit->task = kit->pos; + if (kit->pos == &init_task) + kit->pos = NULL; + + return pos; +} + +__bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it) +{ +} + +__bpf_kfunc_end_defs(); + DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work); static void do_mmap_read_unlock(struct irq_work *entry) diff --git a/kernel/bpf/tcx.c b/kernel/bpf/tcx.c new file mode 100644 index 000000000000..2e4885e7781f --- /dev/null +++ b/kernel/bpf/tcx.c @@ -0,0 +1,346 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2023 Isovalent */ + +#include <linux/bpf.h> +#include <linux/bpf_mprog.h> +#include <linux/netdevice.h> + +#include <net/tcx.h> + +int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + bool created, ingress = attr->attach_type == BPF_TCX_INGRESS; + struct net *net = current->nsproxy->net_ns; + struct bpf_mprog_entry *entry, *entry_new; + struct bpf_prog *replace_prog = NULL; + struct net_device *dev; + int ret; + + rtnl_lock(); + dev = __dev_get_by_index(net, attr->target_ifindex); + if (!dev) { + ret = -ENODEV; + goto out; + } + if (attr->attach_flags & BPF_F_REPLACE) { + replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, + prog->type); + if (IS_ERR(replace_prog)) { + ret = PTR_ERR(replace_prog); + replace_prog = NULL; + goto out; + } + } + entry = tcx_entry_fetch_or_create(dev, ingress, &created); + if (!entry) { + ret = -ENOMEM; + goto out; + } + ret = bpf_mprog_attach(entry, &entry_new, prog, NULL, replace_prog, + attr->attach_flags, attr->relative_fd, + attr->expected_revision); + if (!ret) { + if (entry != entry_new) { + tcx_entry_update(dev, entry_new, ingress); + tcx_entry_sync(); + tcx_skeys_inc(ingress); + } + bpf_mprog_commit(entry); + } else if (created) { + tcx_entry_free(entry); + } +out: + if (replace_prog) + bpf_prog_put(replace_prog); + rtnl_unlock(); + return ret; +} + +int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + bool ingress = attr->attach_type == BPF_TCX_INGRESS; + struct net *net = current->nsproxy->net_ns; + struct bpf_mprog_entry *entry, *entry_new; + struct net_device *dev; + int ret; + + rtnl_lock(); + dev = __dev_get_by_index(net, attr->target_ifindex); + if (!dev) { + ret = -ENODEV; + goto out; + } + entry = tcx_entry_fetch(dev, ingress); + if (!entry) { + ret = -ENOENT; + goto out; + } + ret = bpf_mprog_detach(entry, &entry_new, prog, NULL, attr->attach_flags, + attr->relative_fd, attr->expected_revision); + if (!ret) { + if (!tcx_entry_is_active(entry_new)) + entry_new = NULL; + tcx_entry_update(dev, entry_new, ingress); + tcx_entry_sync(); + tcx_skeys_dec(ingress); + bpf_mprog_commit(entry); + if (!entry_new) + tcx_entry_free(entry); + } +out: + rtnl_unlock(); + return ret; +} + +void tcx_uninstall(struct net_device *dev, bool ingress) +{ + struct bpf_mprog_entry *entry, *entry_new = NULL; + struct bpf_tuple tuple = {}; + struct bpf_mprog_fp *fp; + struct bpf_mprog_cp *cp; + bool active; + + entry = tcx_entry_fetch(dev, ingress); + if (!entry) + return; + active = tcx_entry(entry)->miniq_active; + if (active) + bpf_mprog_clear_all(entry, &entry_new); + tcx_entry_update(dev, entry_new, ingress); + tcx_entry_sync(); + bpf_mprog_foreach_tuple(entry, fp, cp, tuple) { + if (tuple.link) + tcx_link(tuple.link)->dev = NULL; + else + bpf_prog_put(tuple.prog); + tcx_skeys_dec(ingress); + } + if (!active) + tcx_entry_free(entry); +} + +int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) +{ + bool ingress = attr->query.attach_type == BPF_TCX_INGRESS; + struct net *net = current->nsproxy->net_ns; + struct net_device *dev; + int ret; + + rtnl_lock(); + dev = __dev_get_by_index(net, attr->query.target_ifindex); + if (!dev) { + ret = -ENODEV; + goto out; + } + ret = bpf_mprog_query(attr, uattr, tcx_entry_fetch(dev, ingress)); +out: + rtnl_unlock(); + return ret; +} + +static int tcx_link_prog_attach(struct bpf_link *link, u32 flags, u32 id_or_fd, + u64 revision) +{ + struct tcx_link *tcx = tcx_link(link); + bool created, ingress = tcx->location == BPF_TCX_INGRESS; + struct bpf_mprog_entry *entry, *entry_new; + struct net_device *dev = tcx->dev; + int ret; + + ASSERT_RTNL(); + entry = tcx_entry_fetch_or_create(dev, ingress, &created); + if (!entry) + return -ENOMEM; + ret = bpf_mprog_attach(entry, &entry_new, link->prog, link, NULL, flags, + id_or_fd, revision); + if (!ret) { + if (entry != entry_new) { + tcx_entry_update(dev, entry_new, ingress); + tcx_entry_sync(); + tcx_skeys_inc(ingress); + } + bpf_mprog_commit(entry); + } else if (created) { + tcx_entry_free(entry); + } + return ret; +} + +static void tcx_link_release(struct bpf_link *link) +{ + struct tcx_link *tcx = tcx_link(link); + bool ingress = tcx->location == BPF_TCX_INGRESS; + struct bpf_mprog_entry *entry, *entry_new; + struct net_device *dev; + int ret = 0; + + rtnl_lock(); + dev = tcx->dev; + if (!dev) + goto out; + entry = tcx_entry_fetch(dev, ingress); + if (!entry) { + ret = -ENOENT; + goto out; + } + ret = bpf_mprog_detach(entry, &entry_new, link->prog, link, 0, 0, 0); + if (!ret) { + if (!tcx_entry_is_active(entry_new)) + entry_new = NULL; + tcx_entry_update(dev, entry_new, ingress); + tcx_entry_sync(); + tcx_skeys_dec(ingress); + bpf_mprog_commit(entry); + if (!entry_new) + tcx_entry_free(entry); + tcx->dev = NULL; + } +out: + WARN_ON_ONCE(ret); + rtnl_unlock(); +} + +static int tcx_link_update(struct bpf_link *link, struct bpf_prog *nprog, + struct bpf_prog *oprog) +{ + struct tcx_link *tcx = tcx_link(link); + bool ingress = tcx->location == BPF_TCX_INGRESS; + struct bpf_mprog_entry *entry, *entry_new; + struct net_device *dev; + int ret = 0; + + rtnl_lock(); + dev = tcx->dev; + if (!dev) { + ret = -ENOLINK; + goto out; + } + if (oprog && link->prog != oprog) { + ret = -EPERM; + goto out; + } + oprog = link->prog; + if (oprog == nprog) { + bpf_prog_put(nprog); + goto out; + } + entry = tcx_entry_fetch(dev, ingress); + if (!entry) { + ret = -ENOENT; + goto out; + } + ret = bpf_mprog_attach(entry, &entry_new, nprog, link, oprog, + BPF_F_REPLACE | BPF_F_ID, + link->prog->aux->id, 0); + if (!ret) { + WARN_ON_ONCE(entry != entry_new); + oprog = xchg(&link->prog, nprog); + bpf_prog_put(oprog); + bpf_mprog_commit(entry); + } +out: + rtnl_unlock(); + return ret; +} + +static void tcx_link_dealloc(struct bpf_link *link) +{ + kfree(tcx_link(link)); +} + +static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq) +{ + const struct tcx_link *tcx = tcx_link(link); + u32 ifindex = 0; + + rtnl_lock(); + if (tcx->dev) + ifindex = tcx->dev->ifindex; + rtnl_unlock(); + + seq_printf(seq, "ifindex:\t%u\n", ifindex); + seq_printf(seq, "attach_type:\t%u (%s)\n", + tcx->location, + tcx->location == BPF_TCX_INGRESS ? "ingress" : "egress"); +} + +static int tcx_link_fill_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + const struct tcx_link *tcx = tcx_link(link); + u32 ifindex = 0; + + rtnl_lock(); + if (tcx->dev) + ifindex = tcx->dev->ifindex; + rtnl_unlock(); + + info->tcx.ifindex = ifindex; + info->tcx.attach_type = tcx->location; + return 0; +} + +static int tcx_link_detach(struct bpf_link *link) +{ + tcx_link_release(link); + return 0; +} + +static const struct bpf_link_ops tcx_link_lops = { + .release = tcx_link_release, + .detach = tcx_link_detach, + .dealloc = tcx_link_dealloc, + .update_prog = tcx_link_update, + .show_fdinfo = tcx_link_fdinfo, + .fill_link_info = tcx_link_fill_info, +}; + +static int tcx_link_init(struct tcx_link *tcx, + struct bpf_link_primer *link_primer, + const union bpf_attr *attr, + struct net_device *dev, + struct bpf_prog *prog) +{ + bpf_link_init(&tcx->link, BPF_LINK_TYPE_TCX, &tcx_link_lops, prog); + tcx->location = attr->link_create.attach_type; + tcx->dev = dev; + return bpf_link_prime(&tcx->link, link_primer); +} + +int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_link_primer link_primer; + struct net_device *dev; + struct tcx_link *tcx; + int ret; + + rtnl_lock(); + dev = __dev_get_by_index(net, attr->link_create.target_ifindex); + if (!dev) { + ret = -ENODEV; + goto out; + } + tcx = kzalloc(sizeof(*tcx), GFP_USER); + if (!tcx) { + ret = -ENOMEM; + goto out; + } + ret = tcx_link_init(tcx, &link_primer, attr, dev, prog); + if (ret) { + kfree(tcx); + goto out; + } + ret = tcx_link_prog_attach(&tcx->link, attr->link_create.flags, + attr->link_create.tcx.relative_fd, + attr->link_create.tcx.expected_revision); + if (ret) { + tcx->dev = NULL; + bpf_link_cleanup(&link_primer); + goto out; + } + ret = bpf_link_settle(&link_primer); +out: + rtnl_unlock(); + return ret; +} diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 78acf28d4873..e97aeda3a86b 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -415,8 +415,8 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut goto out; } - /* clear all bits except SHARE_IPMODIFY */ - tr->flags &= BPF_TRAMP_F_SHARE_IPMODIFY; + /* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */ + tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX); if (tlinks[BPF_TRAMP_FEXIT].nr_links || tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) { @@ -926,13 +926,12 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, migrate_disable(); might_fault(); + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); return 0; } - - run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); - return bpf_prog_start_time(); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 02a021c524ab..af2819d5c8ee 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -25,6 +25,9 @@ #include <linux/btf_ids.h> #include <linux/poison.h> #include <linux/module.h> +#include <linux/cpumask.h> +#include <linux/bpf_mem_alloc.h> +#include <net/xdp.h> #include "disasm.h" @@ -39,6 +42,9 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { #undef BPF_LINK_TYPE }; +struct bpf_mem_alloc bpf_global_percpu_ma; +static bool bpf_global_percpu_ma_set; + /* bpf_check() is a static code analyzer that walks eBPF program * instruction by instruction and updates register/stack state. * All paths of conditional branches are analyzed until 'bpf_exit' insn. @@ -302,7 +308,7 @@ struct bpf_kfunc_call_arg_meta { /* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling, * generally to pass info about user-defined local kptr types to later * verification logic - * bpf_obj_drop + * bpf_obj_drop/bpf_percpu_obj_drop * Record the local kptr type to be drop'd * bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type) * Record the local kptr type to be refcount_incr'd and use @@ -334,6 +340,7 @@ struct bpf_kfunc_call_arg_meta { struct btf *btf_vmlinux; static DEFINE_MUTEX(bpf_verifier_lock); +static DEFINE_MUTEX(bpf_percpu_ma_lock); static const struct bpf_line_info * find_linfo(const struct bpf_verifier_env *env, u32 insn_off) @@ -540,12 +547,12 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_dynptr_data; } -static bool is_callback_calling_kfunc(u32 btf_id); +static bool is_sync_callback_calling_kfunc(u32 btf_id); +static bool is_bpf_throw_kfunc(struct bpf_insn *insn); -static bool is_callback_calling_function(enum bpf_func_id func_id) +static bool is_sync_callback_calling_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_for_each_map_elem || - func_id == BPF_FUNC_timer_set_callback || func_id == BPF_FUNC_find_vma || func_id == BPF_FUNC_loop || func_id == BPF_FUNC_user_ringbuf_drain; @@ -556,6 +563,18 @@ static bool is_async_callback_calling_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_timer_set_callback; } +static bool is_callback_calling_function(enum bpf_func_id func_id) +{ + return is_sync_callback_calling_function(func_id) || + is_async_callback_calling_function(func_id); +} + +static bool is_sync_callback_calling_insn(struct bpf_insn *insn) +{ + return (bpf_helper_call(insn) && is_sync_callback_calling_function(insn->imm)) || + (bpf_pseudo_kfunc_call(insn) && is_sync_callback_calling_kfunc(insn->imm)); +} + static bool is_storage_get_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_storage_get || @@ -1170,7 +1189,12 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg static void __mark_reg_known_zero(struct bpf_reg_state *reg); +static bool in_rcu_cs(struct bpf_verifier_env *env); + +static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta); + static int mark_stack_slots_iter(struct bpf_verifier_env *env, + struct bpf_kfunc_call_arg_meta *meta, struct bpf_reg_state *reg, int insn_idx, struct btf *btf, u32 btf_id, int nr_slots) { @@ -1191,6 +1215,12 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, __mark_reg_known_zero(st); st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ + if (is_kfunc_rcu_protected(meta)) { + if (in_rcu_cs(env)) + st->type |= MEM_RCU; + else + st->type |= PTR_UNTRUSTED; + } st->live |= REG_LIVE_WRITTEN; st->ref_obj_id = i == 0 ? id : 0; st->iter.btf = btf; @@ -1265,7 +1295,7 @@ static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env, return true; } -static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg, +static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg, struct btf *btf, u32 btf_id, int nr_slots) { struct bpf_func_state *state = func(env, reg); @@ -1273,26 +1303,28 @@ static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_ spi = iter_get_spi(env, reg, nr_slots); if (spi < 0) - return false; + return -EINVAL; for (i = 0; i < nr_slots; i++) { struct bpf_stack_state *slot = &state->stack[spi - i]; struct bpf_reg_state *st = &slot->spilled_ptr; + if (st->type & PTR_UNTRUSTED) + return -EPROTO; /* only main (first) slot has ref_obj_id set */ if (i == 0 && !st->ref_obj_id) - return false; + return -EINVAL; if (i != 0 && st->ref_obj_id) - return false; + return -EINVAL; if (st->iter.btf != btf || st->iter.btf_id != btf_id) - return false; + return -EINVAL; for (j = 0; j < BPF_REG_SIZE; j++) if (slot->slot_type[j] != STACK_ITER) - return false; + return -EINVAL; } - return true; + return 0; } /* Check if given stack slot is "special": @@ -1339,6 +1371,50 @@ static void scrub_spilled_slot(u8 *stype) *stype = STACK_MISC; } +static void print_scalar_ranges(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + const char **sep) +{ + struct { + const char *name; + u64 val; + bool omit; + } minmaxs[] = { + {"smin", reg->smin_value, reg->smin_value == S64_MIN}, + {"smax", reg->smax_value, reg->smax_value == S64_MAX}, + {"umin", reg->umin_value, reg->umin_value == 0}, + {"umax", reg->umax_value, reg->umax_value == U64_MAX}, + {"smin32", (s64)reg->s32_min_value, reg->s32_min_value == S32_MIN}, + {"smax32", (s64)reg->s32_max_value, reg->s32_max_value == S32_MAX}, + {"umin32", reg->u32_min_value, reg->u32_min_value == 0}, + {"umax32", reg->u32_max_value, reg->u32_max_value == U32_MAX}, + }, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)]; + bool neg1, neg2; + + for (m1 = &minmaxs[0]; m1 < mend; m1++) { + if (m1->omit) + continue; + + neg1 = m1->name[0] == 's' && (s64)m1->val < 0; + + verbose(env, "%s%s=", *sep, m1->name); + *sep = ","; + + for (m2 = m1 + 2; m2 < mend; m2 += 2) { + if (m2->omit || m2->val != m1->val) + continue; + /* don't mix negatives with positives */ + neg2 = m2->name[0] == 's' && (s64)m2->val < 0; + if (neg2 != neg1) + continue; + m2->omit = true; + verbose(env, "%s=", m2->name); + } + + verbose(env, m1->name[0] == 's' ? "%lld" : "%llu", m1->val); + } +} + static void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_state *state, bool print_all) @@ -1402,34 +1478,13 @@ static void print_verifier_state(struct bpf_verifier_env *env, */ verbose_a("imm=%llx", reg->var_off.value); } else { - if (reg->smin_value != reg->umin_value && - reg->smin_value != S64_MIN) - verbose_a("smin=%lld", (long long)reg->smin_value); - if (reg->smax_value != reg->umax_value && - reg->smax_value != S64_MAX) - verbose_a("smax=%lld", (long long)reg->smax_value); - if (reg->umin_value != 0) - verbose_a("umin=%llu", (unsigned long long)reg->umin_value); - if (reg->umax_value != U64_MAX) - verbose_a("umax=%llu", (unsigned long long)reg->umax_value); + print_scalar_ranges(env, reg, &sep); if (!tnum_is_unknown(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose_a("var_off=%s", tn_buf); } - if (reg->s32_min_value != reg->smin_value && - reg->s32_min_value != S32_MIN) - verbose_a("s32_min=%d", (int)(reg->s32_min_value)); - if (reg->s32_max_value != reg->smax_value && - reg->s32_max_value != S32_MAX) - verbose_a("s32_max=%d", (int)(reg->s32_max_value)); - if (reg->u32_min_value != reg->umin_value && - reg->u32_min_value != U32_MIN) - verbose_a("u32_min=%d", (int)(reg->u32_min_value)); - if (reg->u32_max_value != reg->umax_value && - reg->u32_max_value != U32_MAX) - verbose_a("u32_max=%d", (int)(reg->u32_max_value)); } #undef verbose_a @@ -1513,7 +1568,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (state->in_async_callback_fn) verbose(env, " async_cb"); verbose(env, "\n"); - mark_verifier_state_clean(env); + if (!print_all) + mark_verifier_state_clean(env); } static inline u32 vlog_alignment(u32 pos) @@ -1746,7 +1802,9 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, return -ENOMEM; dst_state->jmp_history_cnt = src->jmp_history_cnt; - /* if dst has more stack frames then src frame, free them */ + /* if dst has more stack frames then src frame, free them, this is also + * necessary in case of exceptional exits using bpf_throw. + */ for (i = src->curframe + 1; i <= dst_state->curframe; i++) { free_func_state(dst_state->frame[i]); dst_state->frame[i] = NULL; @@ -1760,6 +1818,9 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->parent = src->parent; dst_state->first_insn_idx = src->first_insn_idx; dst_state->last_insn_idx = src->last_insn_idx; + dst_state->dfs_depth = src->dfs_depth; + dst_state->callback_unroll_depth = src->callback_unroll_depth; + dst_state->used_as_loop_entry = src->used_as_loop_entry; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -1775,11 +1836,203 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, return 0; } +static u32 state_htab_size(struct bpf_verifier_env *env) +{ + return env->prog->len; +} + +static struct bpf_verifier_state_list **explored_state(struct bpf_verifier_env *env, int idx) +{ + struct bpf_verifier_state *cur = env->cur_state; + struct bpf_func_state *state = cur->frame[cur->curframe]; + + return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)]; +} + +static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_state *b) +{ + int fr; + + if (a->curframe != b->curframe) + return false; + + for (fr = a->curframe; fr >= 0; fr--) + if (a->frame[fr]->callsite != b->frame[fr]->callsite) + return false; + + return true; +} + +/* Open coded iterators allow back-edges in the state graph in order to + * check unbounded loops that iterators. + * + * In is_state_visited() it is necessary to know if explored states are + * part of some loops in order to decide whether non-exact states + * comparison could be used: + * - non-exact states comparison establishes sub-state relation and uses + * read and precision marks to do so, these marks are propagated from + * children states and thus are not guaranteed to be final in a loop; + * - exact states comparison just checks if current and explored states + * are identical (and thus form a back-edge). + * + * Paper "A New Algorithm for Identifying Loops in Decompilation" + * by Tao Wei, Jian Mao, Wei Zou and Yu Chen [1] presents a convenient + * algorithm for loop structure detection and gives an overview of + * relevant terminology. It also has helpful illustrations. + * + * [1] https://api.semanticscholar.org/CorpusID:15784067 + * + * We use a similar algorithm but because loop nested structure is + * irrelevant for verifier ours is significantly simpler and resembles + * strongly connected components algorithm from Sedgewick's textbook. + * + * Define topmost loop entry as a first node of the loop traversed in a + * depth first search starting from initial state. The goal of the loop + * tracking algorithm is to associate topmost loop entries with states + * derived from these entries. + * + * For each step in the DFS states traversal algorithm needs to identify + * the following situations: + * + * initial initial initial + * | | | + * V V V + * ... ... .---------> hdr + * | | | | + * V V | V + * cur .-> succ | .------... + * | | | | | | + * V | V | V V + * succ '-- cur | ... ... + * | | | + * | V V + * | succ <- cur + * | | + * | V + * | ... + * | | + * '----' + * + * (A) successor state of cur (B) successor state of cur or it's entry + * not yet traversed are in current DFS path, thus cur and succ + * are members of the same outermost loop + * + * initial initial + * | | + * V V + * ... ... + * | | + * V V + * .------... .------... + * | | | | + * V V V V + * .-> hdr ... ... ... + * | | | | | + * | V V V V + * | succ <- cur succ <- cur + * | | | + * | V V + * | ... ... + * | | | + * '----' exit + * + * (C) successor state of cur is a part of some loop but this loop + * does not include cur or successor state is not in a loop at all. + * + * Algorithm could be described as the following python code: + * + * traversed = set() # Set of traversed nodes + * entries = {} # Mapping from node to loop entry + * depths = {} # Depth level assigned to graph node + * path = set() # Current DFS path + * + * # Find outermost loop entry known for n + * def get_loop_entry(n): + * h = entries.get(n, None) + * while h in entries and entries[h] != h: + * h = entries[h] + * return h + * + * # Update n's loop entry if h's outermost entry comes + * # before n's outermost entry in current DFS path. + * def update_loop_entry(n, h): + * n1 = get_loop_entry(n) or n + * h1 = get_loop_entry(h) or h + * if h1 in path and depths[h1] <= depths[n1]: + * entries[n] = h1 + * + * def dfs(n, depth): + * traversed.add(n) + * path.add(n) + * depths[n] = depth + * for succ in G.successors(n): + * if succ not in traversed: + * # Case A: explore succ and update cur's loop entry + * # only if succ's entry is in current DFS path. + * dfs(succ, depth + 1) + * h = get_loop_entry(succ) + * update_loop_entry(n, h) + * else: + * # Case B or C depending on `h1 in path` check in update_loop_entry(). + * update_loop_entry(n, succ) + * path.remove(n) + * + * To adapt this algorithm for use with verifier: + * - use st->branch == 0 as a signal that DFS of succ had been finished + * and cur's loop entry has to be updated (case A), handle this in + * update_branch_counts(); + * - use st->branch > 0 as a signal that st is in the current DFS path; + * - handle cases B and C in is_state_visited(); + * - update topmost loop entry for intermediate states in get_loop_entry(). + */ +static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_state *st) +{ + struct bpf_verifier_state *topmost = st->loop_entry, *old; + + while (topmost && topmost->loop_entry && topmost != topmost->loop_entry) + topmost = topmost->loop_entry; + /* Update loop entries for intermediate states to avoid this + * traversal in future get_loop_entry() calls. + */ + while (st && st->loop_entry != topmost) { + old = st->loop_entry; + st->loop_entry = topmost; + st = old; + } + return topmost; +} + +static void update_loop_entry(struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr) +{ + struct bpf_verifier_state *cur1, *hdr1; + + cur1 = get_loop_entry(cur) ?: cur; + hdr1 = get_loop_entry(hdr) ?: hdr; + /* The head1->branches check decides between cases B and C in + * comment for get_loop_entry(). If hdr1->branches == 0 then + * head's topmost loop entry is not in current DFS path, + * hence 'cur' and 'hdr' are not in the same loop and there is + * no need to update cur->loop_entry. + */ + if (hdr1->branches && hdr1->dfs_depth <= cur1->dfs_depth) { + cur->loop_entry = hdr; + hdr->used_as_loop_entry = true; + } +} + static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { while (st) { u32 br = --st->branches; + /* br == 0 signals that DFS exploration for 'st' is finished, + * thus it is necessary to update parent's loop entry if it + * turned out that st is a part of some loop. + * This is a part of 'case A' in get_loop_entry() comment. + */ + if (br == 0 && st->parent && st->loop_entry) + update_loop_entry(st->parent, st->loop_entry); + /* WARN_ON(br > 1) technically makes sense here, * but see comment in push_stack(), hence: */ @@ -2452,6 +2705,68 @@ static int add_subprog(struct bpf_verifier_env *env, int off) return env->subprog_cnt - 1; } +static int bpf_find_exception_callback_insn_off(struct bpf_verifier_env *env) +{ + struct bpf_prog_aux *aux = env->prog->aux; + struct btf *btf = aux->btf; + const struct btf_type *t; + u32 main_btf_id, id; + const char *name; + int ret, i; + + /* Non-zero func_info_cnt implies valid btf */ + if (!aux->func_info_cnt) + return 0; + main_btf_id = aux->func_info[0].type_id; + + t = btf_type_by_id(btf, main_btf_id); + if (!t) { + verbose(env, "invalid btf id for main subprog in func_info\n"); + return -EINVAL; + } + + name = btf_find_decl_tag_value(btf, t, -1, "exception_callback:"); + if (IS_ERR(name)) { + ret = PTR_ERR(name); + /* If there is no tag present, there is no exception callback */ + if (ret == -ENOENT) + ret = 0; + else if (ret == -EEXIST) + verbose(env, "multiple exception callback tags for main subprog\n"); + return ret; + } + + ret = btf_find_by_name_kind(btf, name, BTF_KIND_FUNC); + if (ret < 0) { + verbose(env, "exception callback '%s' could not be found in BTF\n", name); + return ret; + } + id = ret; + t = btf_type_by_id(btf, id); + if (btf_func_linkage(t) != BTF_FUNC_GLOBAL) { + verbose(env, "exception callback '%s' must have global linkage\n", name); + return -EINVAL; + } + ret = 0; + for (i = 0; i < aux->func_info_cnt; i++) { + if (aux->func_info[i].type_id != id) + continue; + ret = aux->func_info[i].insn_off; + /* Further func_info and subprog checks will also happen + * later, so assume this is the right insn_off for now. + */ + if (!ret) { + verbose(env, "invalid exception callback insn_off in func_info: 0\n"); + ret = -EINVAL; + } + } + if (!ret) { + verbose(env, "exception callback type id not found in func_info\n"); + ret = -EINVAL; + } + return ret; +} + #define MAX_KFUNC_DESCS 256 #define MAX_KFUNC_BTFS 256 @@ -2791,8 +3106,8 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog, static int add_subprog_and_kfunc(struct bpf_verifier_env *env) { struct bpf_subprog_info *subprog = env->subprog_info; + int i, ret, insn_cnt = env->prog->len, ex_cb_insn; struct bpf_insn *insn = env->prog->insnsi; - int i, ret, insn_cnt = env->prog->len; /* Add entry function. */ ret = add_subprog(env, 0); @@ -2818,6 +3133,26 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env) return ret; } + ret = bpf_find_exception_callback_insn_off(env); + if (ret < 0) + return ret; + ex_cb_insn = ret; + + /* If ex_cb_insn > 0, this means that the main program has a subprog + * marked using BTF decl tag to serve as the exception callback. + */ + if (ex_cb_insn) { + ret = add_subprog(env, ex_cb_insn); + if (ret < 0) + return ret; + for (i = 1; i < env->subprog_cnt; i++) { + if (env->subprog_info[i].start != ex_cb_insn) + continue; + env->exception_callback_subprog = i; + break; + } + } + /* Add a fake 'exit' subprog which could simplify subprog iteration * logic. 'subprog_cnt' should not be increased. */ @@ -2854,7 +3189,10 @@ static int check_subprogs(struct bpf_verifier_env *env) goto next; if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) goto next; - off = i + insn[i].off + 1; + if (code == (BPF_JMP32 | BPF_JA)) + off = i + insn[i].imm + 1; + else + off = i + insn[i].off + 1; if (off < subprog_start || off >= subprog_end) { verbose(env, "jump out of range from insn %d to %d\n", i, off); return -EINVAL; @@ -2863,9 +3201,10 @@ next: if (i == subprog_end - 1) { /* to avoid fall-through from one subprog into another * the last insn of the subprog should be either exit - * or unconditional jump back + * or unconditional jump back or bpf_throw call */ if (code != (BPF_JMP | BPF_EXIT) && + code != (BPF_JMP32 | BPF_JA) && code != (BPF_JMP | BPF_JA)) { verbose(env, "last insn is not an exit or jmp\n"); return -EINVAL; @@ -3011,8 +3350,10 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, } } + if (class == BPF_ALU64 && op == BPF_END && (insn->imm == 16 || insn->imm == 32)) + return false; + if (class == BPF_ALU64 || class == BPF_JMP || - /* BPF_END always use BPF_ALU class. */ (class == BPF_ALU && op == BPF_END && insn->imm == 64)) return true; @@ -3021,7 +3362,7 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, if (class == BPF_LDX) { if (t != SRC_OP) - return BPF_SIZE(code) == BPF_DW; + return BPF_SIZE(code) == BPF_DW || BPF_MODE(code) == BPF_MEMSX; /* LDX source must be ptr. */ return true; } @@ -3110,13 +3451,11 @@ static void mark_insn_zext(struct bpf_verifier_env *env, reg->subreg_def = DEF_NOT_SUBREG; } -static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, - enum reg_arg_type t) +static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno, + enum reg_arg_type t) { - struct bpf_verifier_state *vstate = env->cur_state; - struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_insn *insn = env->prog->insnsi + env->insn_idx; - struct bpf_reg_state *reg, *regs = state->regs; + struct bpf_reg_state *reg; bool rw64; if (regno >= MAX_BPF_REG) { @@ -3157,6 +3496,15 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, return 0; } +static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, + enum reg_arg_type t) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + + return __check_reg_arg(env, state->regs, regno, t); +} + static void mark_jmp_point(struct bpf_verifier_env *env, int idx) { env->insn_aux_data[idx].jmp_point = true; @@ -3192,12 +3540,29 @@ static int push_jmp_history(struct bpf_verifier_env *env, /* Backtrack one insn at a time. If idx is not at the top of recorded * history then previous instruction came from straight line execution. + * Return -ENOENT if we exhausted all instructions within given state. + * + * It's legal to have a bit of a looping with the same starting and ending + * insn index within the same state, e.g.: 3->4->5->3, so just because current + * instruction index is the same as state's first_idx doesn't mean we are + * done. If there is still some jump history left, we should keep going. We + * need to take into account that we might have a jump history between given + * state's parent and itself, due to checkpointing. In this case, we'll have + * history entry recording a jump from last instruction of parent state and + * first instruction of given state. */ static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, u32 *history) { u32 cnt = *history; + if (i == st->first_insn_idx) { + if (cnt == 0) + return -ENOENT; + if (cnt == 1 && st->jmp_history[0].idx == i) + return -ENOENT; + } + if (cnt && st->jmp_history[cnt - 1].idx == i) { i = st->jmp_history[cnt - 1].prev_idx; (*history)--; @@ -3378,6 +3743,8 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) } } +static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); + /* For given verifier state backtrack_insn() is called from the last insn to * the first insn. Its purpose is to compute a bitmask of registers and * stack slots that needs precision in the parent verifier state. @@ -3418,9 +3785,14 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, if (class == BPF_ALU || class == BPF_ALU64) { if (!bt_is_reg_set(bt, dreg)) return 0; - if (opcode == BPF_MOV) { + if (opcode == BPF_END || opcode == BPF_NEG) { + /* sreg is reserved and unused + * dreg still need precision before this insn + */ + return 0; + } else if (opcode == BPF_MOV) { if (BPF_SRC(insn->code) == BPF_X) { - /* dreg = sreg + /* dreg = sreg or dreg = (s8, s16, s32)sreg * dreg needs precision after this insn * sreg needs precision before this insn */ @@ -3548,16 +3920,13 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, return -EFAULT; return 0; } - } else if ((bpf_helper_call(insn) && - is_callback_calling_function(insn->imm) && - !is_async_callback_calling_function(insn->imm)) || - (bpf_pseudo_kfunc_call(insn) && is_callback_calling_kfunc(insn->imm))) { - /* callback-calling helper or kfunc call, which means - * we are exiting from subprog, but unlike the subprog - * call handling above, we shouldn't propagate - * precision of r1-r5 (if any requested), as they are - * not actually arguments passed directly to callback - * subprogs + } else if (is_sync_callback_calling_insn(insn) && idx != subseq_idx - 1) { + /* exit from callback subprog to callback-calling helper or + * kfunc call. Use idx/subseq_idx check to discern it from + * straight line code backtracking. + * Unlike the subprog call handling above, we shouldn't + * propagate precision of r1-r5 (if any requested), as they are + * not actually arguments passed directly to callback subprogs */ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); @@ -3592,10 +3961,18 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, } else if (opcode == BPF_EXIT) { bool r0_precise; + /* Backtracking to a nested function call, 'idx' is a part of + * the inner frame 'subseq_idx' is a part of the outer frame. + * In case of a regular function call, instructions giving + * precision to registers R1-R5 should have been found already. + * In case of a callback, it is ok to have R1-R5 marked for + * backtracking, as these registers are set by the function + * invoking callback. + */ + if (subseq_idx >= 0 && calls_callback(env, subseq_idx)) + for (i = BPF_REG_1; i <= BPF_REG_5; i++) + bt_clear_reg(bt, i); if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { - /* if backtracing was looking for registers R1-R5 - * they should have been found already. - */ verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); WARN_ONCE(1, "verifier backtracking bug"); return -EFAULT; @@ -4039,11 +4416,9 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) bitmap_from_u64(mask, bt_reg_mask(bt)); for_each_set_bit(i, mask, 32) { reg = &st->frame[0]->regs[i]; - if (reg->type != SCALAR_VALUE) { - bt_clear_reg(bt, i); - continue; - } - reg->precise = true; + bt_clear_reg(bt, i); + if (reg->type == SCALAR_VALUE) + reg->precise = true; } return 0; } @@ -4074,10 +4449,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) * Nothing to be tracked further in the parent state. */ return 0; - if (i == first_idx) - break; subseq_idx = i; i = get_prev_insn_idx(st, i, &history); + if (i == -ENOENT) + break; if (i >= env->prog->len) { /* This can happen if backtracking reached insn 0 * and there are still reg_mask or stack_mask @@ -4352,7 +4727,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, insn->imm != 0 && env->bpf_capable) { struct bpf_reg_state fake_reg = {}; - __mark_reg_known(&fake_reg, (u32)insn->imm); + __mark_reg_known(&fake_reg, insn->imm); fake_reg.type = SCALAR_VALUE; save_register_state(state, spi, &fake_reg, size); } else if (reg && is_spillable_regtype(reg->type)) { @@ -4982,20 +5357,24 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno) { const char *targ_name = btf_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id); - int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU; + int perm_flags; const char *reg_name = ""; - /* Only unreferenced case accepts untrusted pointers */ - if (kptr_field->type == BPF_KPTR_UNREF) - perm_flags |= PTR_UNTRUSTED; + if (btf_is_kernel(reg->btf)) { + perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU; + + /* Only unreferenced case accepts untrusted pointers */ + if (kptr_field->type == BPF_KPTR_UNREF) + perm_flags |= PTR_UNTRUSTED; + } else { + perm_flags = PTR_MAYBE_NULL | MEM_ALLOC; + if (kptr_field->type == BPF_KPTR_PERCPU) + perm_flags |= MEM_PERCPU; + } if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags)) goto bad_type; - if (!btf_is_kernel(reg->btf)) { - verbose(env, "R%d must point to kernel BTF\n", regno); - return -EINVAL; - } /* We need to verify reg->type and reg->btf, before accessing reg->btf */ reg_name = btf_type_name(reg->btf, reg->btf_id); @@ -5008,7 +5387,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, if (__check_ptr_off_reg(env, reg, regno, true)) return -EACCES; - /* A full type match is needed, as BTF can be vmlinux or module BTF, and + /* A full type match is needed, as BTF can be vmlinux, module or prog BTF, and * we also need to take into account the reg->off. * * We want to support cases like: @@ -5034,7 +5413,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, */ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, kptr_field->kptr.btf, kptr_field->kptr.btf_id, - kptr_field->type == BPF_KPTR_REF)) + kptr_field->type != BPF_KPTR_UNREF)) goto bad_type; return 0; bad_type: @@ -5054,13 +5433,17 @@ bad_type: */ static bool in_rcu_cs(struct bpf_verifier_env *env) { - return env->cur_state->active_rcu_lock || !env->prog->aux->sleepable; + return env->cur_state->active_rcu_lock || + env->cur_state->active_lock.ptr || + !env->prog->aux->sleepable; } /* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */ BTF_SET_START(rcu_protected_types) BTF_ID(struct, prog_test_ref_kfunc) +#ifdef CONFIG_CGROUPS BTF_ID(struct, cgroup) +#endif BTF_ID(struct, bpf_cpumask) BTF_ID(struct, task_struct) BTF_SET_END(rcu_protected_types) @@ -5076,7 +5459,18 @@ static bool rcu_safe_kptr(const struct btf_field *field) { const struct btf_field_kptr *kptr = &field->kptr; - return field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id); + return field->type == BPF_KPTR_PERCPU || + (field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id)); +} + +static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field) +{ + if (rcu_safe_kptr(kptr_field) && in_rcu_cs(env)) { + if (kptr_field->type != BPF_KPTR_PERCPU) + return PTR_MAYBE_NULL | MEM_RCU; + return PTR_MAYBE_NULL | MEM_RCU | MEM_PERCPU; + } + return PTR_MAYBE_NULL | PTR_UNTRUSTED; } static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, @@ -5102,7 +5496,8 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, /* We only allow loading referenced kptr, since it will be marked as * untrusted, similar to unreferenced kptr. */ - if (class != BPF_LDX && kptr_field->type == BPF_KPTR_REF) { + if (class != BPF_LDX && + (kptr_field->type == BPF_KPTR_REF || kptr_field->type == BPF_KPTR_PERCPU)) { verbose(env, "store to referenced kptr disallowed\n"); return -EACCES; } @@ -5113,10 +5508,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, * value from map as PTR_TO_BTF_ID, with the correct type. */ mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf, - kptr_field->kptr.btf_id, - rcu_safe_kptr(kptr_field) && in_rcu_cs(env) ? - PTR_MAYBE_NULL | MEM_RCU : - PTR_MAYBE_NULL | PTR_UNTRUSTED); + kptr_field->kptr.btf_id, btf_ld_kptr_type(env, kptr_field)); /* For mark_ptr_or_null_reg */ val_reg->id = ++env->id_gen; } else if (class == BPF_STX) { @@ -5170,6 +5562,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, switch (field->type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: if (src != ACCESS_DIRECT) { verbose(env, "kptr cannot be accessed indirectly by helper\n"); return -EACCES; @@ -5412,12 +5805,25 @@ static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno) return reg->type == PTR_TO_FLOW_KEYS; } +static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { +#ifdef CONFIG_NET + [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], + [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP], +#endif + [CONST_PTR_TO_MAP] = btf_bpf_map_id, +}; + static bool is_trusted_reg(const struct bpf_reg_state *reg) { /* A referenced register is always trusted. */ if (reg->ref_obj_id) return true; + /* Types listed in the reg2btf_ids are always trusted */ + if (reg2btf_ids[base_type(reg->type)]) + return true; + /* If a register is not referenced, it is trusted if it has the * MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the * other type modifiers may be safe, but we elect to take an opt-in @@ -5624,6 +6030,27 @@ continue_func: for (; i < subprog_end; i++) { int next_insn, sidx; + if (bpf_pseudo_kfunc_call(insn + i) && !insn[i].off) { + bool err = false; + + if (!is_bpf_throw_kfunc(insn + i)) + continue; + if (subprog[idx].is_cb) + err = true; + for (int c = 0; c < frame && !err; c++) { + if (subprog[ret_prog[c]].is_cb) { + err = true; + break; + } + } + if (!err) + continue; + verbose(env, + "bpf_throw kfunc (insn %d) cannot be called from callback subprog %d\n", + i, idx); + return -EINVAL; + } + if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i)) continue; /* remember insn and function to return to */ @@ -5646,6 +6073,10 @@ continue_func: /* async callbacks don't increase bpf prog stack size unless called directly */ if (!bpf_pseudo_call(insn + i)) continue; + if (subprog[sidx].is_exception_cb) { + verbose(env, "insn %d cannot call exception cb directly\n", i); + return -EINVAL; + } } i = next_insn; idx = sidx; @@ -5667,8 +6098,13 @@ continue_func: * tail call counter throughout bpf2bpf calls combined with tailcalls */ if (tail_call_reachable) - for (j = 0; j < frame; j++) + for (j = 0; j < frame; j++) { + if (subprog[ret_prog[j]].is_exception_cb) { + verbose(env, "cannot tail call within exception cb\n"); + return -EINVAL; + } subprog[ret_prog[j]].tail_call_reachable = true; + } if (subprog[0].tail_call_reachable) env->prog->aux->tail_call_reachable = true; @@ -5813,6 +6249,147 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) __reg_combine_64_into_32(reg); } +static void set_sext64_default_val(struct bpf_reg_state *reg, int size) +{ + if (size == 1) { + reg->smin_value = reg->s32_min_value = S8_MIN; + reg->smax_value = reg->s32_max_value = S8_MAX; + } else if (size == 2) { + reg->smin_value = reg->s32_min_value = S16_MIN; + reg->smax_value = reg->s32_max_value = S16_MAX; + } else { + /* size == 4 */ + reg->smin_value = reg->s32_min_value = S32_MIN; + reg->smax_value = reg->s32_max_value = S32_MAX; + } + reg->umin_value = reg->u32_min_value = 0; + reg->umax_value = U64_MAX; + reg->u32_max_value = U32_MAX; + reg->var_off = tnum_unknown; +} + +static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size) +{ + s64 init_s64_max, init_s64_min, s64_max, s64_min, u64_cval; + u64 top_smax_value, top_smin_value; + u64 num_bits = size * 8; + + if (tnum_is_const(reg->var_off)) { + u64_cval = reg->var_off.value; + if (size == 1) + reg->var_off = tnum_const((s8)u64_cval); + else if (size == 2) + reg->var_off = tnum_const((s16)u64_cval); + else + /* size == 4 */ + reg->var_off = tnum_const((s32)u64_cval); + + u64_cval = reg->var_off.value; + reg->smax_value = reg->smin_value = u64_cval; + reg->umax_value = reg->umin_value = u64_cval; + reg->s32_max_value = reg->s32_min_value = u64_cval; + reg->u32_max_value = reg->u32_min_value = u64_cval; + return; + } + + top_smax_value = ((u64)reg->smax_value >> num_bits) << num_bits; + top_smin_value = ((u64)reg->smin_value >> num_bits) << num_bits; + + if (top_smax_value != top_smin_value) + goto out; + + /* find the s64_min and s64_min after sign extension */ + if (size == 1) { + init_s64_max = (s8)reg->smax_value; + init_s64_min = (s8)reg->smin_value; + } else if (size == 2) { + init_s64_max = (s16)reg->smax_value; + init_s64_min = (s16)reg->smin_value; + } else { + init_s64_max = (s32)reg->smax_value; + init_s64_min = (s32)reg->smin_value; + } + + s64_max = max(init_s64_max, init_s64_min); + s64_min = min(init_s64_max, init_s64_min); + + /* both of s64_max/s64_min positive or negative */ + if ((s64_max >= 0) == (s64_min >= 0)) { + reg->smin_value = reg->s32_min_value = s64_min; + reg->smax_value = reg->s32_max_value = s64_max; + reg->umin_value = reg->u32_min_value = s64_min; + reg->umax_value = reg->u32_max_value = s64_max; + reg->var_off = tnum_range(s64_min, s64_max); + return; + } + +out: + set_sext64_default_val(reg, size); +} + +static void set_sext32_default_val(struct bpf_reg_state *reg, int size) +{ + if (size == 1) { + reg->s32_min_value = S8_MIN; + reg->s32_max_value = S8_MAX; + } else { + /* size == 2 */ + reg->s32_min_value = S16_MIN; + reg->s32_max_value = S16_MAX; + } + reg->u32_min_value = 0; + reg->u32_max_value = U32_MAX; +} + +static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size) +{ + s32 init_s32_max, init_s32_min, s32_max, s32_min, u32_val; + u32 top_smax_value, top_smin_value; + u32 num_bits = size * 8; + + if (tnum_is_const(reg->var_off)) { + u32_val = reg->var_off.value; + if (size == 1) + reg->var_off = tnum_const((s8)u32_val); + else + reg->var_off = tnum_const((s16)u32_val); + + u32_val = reg->var_off.value; + reg->s32_min_value = reg->s32_max_value = u32_val; + reg->u32_min_value = reg->u32_max_value = u32_val; + return; + } + + top_smax_value = ((u32)reg->s32_max_value >> num_bits) << num_bits; + top_smin_value = ((u32)reg->s32_min_value >> num_bits) << num_bits; + + if (top_smax_value != top_smin_value) + goto out; + + /* find the s32_min and s32_min after sign extension */ + if (size == 1) { + init_s32_max = (s8)reg->s32_max_value; + init_s32_min = (s8)reg->s32_min_value; + } else { + /* size == 2 */ + init_s32_max = (s16)reg->s32_max_value; + init_s32_min = (s16)reg->s32_min_value; + } + s32_max = max(init_s32_max, init_s32_min); + s32_min = min(init_s32_max, init_s32_min); + + if ((s32_min >= 0) == (s32_max >= 0)) { + reg->s32_min_value = s32_min; + reg->s32_max_value = s32_max; + reg->u32_min_value = (u32)s32_min; + reg->u32_max_value = (u32)s32_max; + return; + } + +out: + set_sext32_default_val(reg, size); +} + static bool bpf_map_is_rdonly(const struct bpf_map *map) { /* A map is considered read-only if the following condition are true: @@ -5833,7 +6410,8 @@ static bool bpf_map_is_rdonly(const struct bpf_map *map) !bpf_map_write_active(map); } -static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) +static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val, + bool is_ldsx) { void *ptr; u64 addr; @@ -5846,13 +6424,13 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) switch (size) { case sizeof(u8): - *val = (u64)*(u8 *)ptr; + *val = is_ldsx ? (s64)*(s8 *)ptr : (u64)*(u8 *)ptr; break; case sizeof(u16): - *val = (u64)*(u16 *)ptr; + *val = is_ldsx ? (s64)*(s16 *)ptr : (u64)*(u16 *)ptr; break; case sizeof(u32): - *val = (u64)*(u32 *)ptr; + *val = is_ldsx ? (s64)*(s32 *)ptr : (u64)*(u32 *)ptr; break; case sizeof(u64): *val = *(u64 *)ptr; @@ -6042,7 +6620,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, } if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) && - !reg->ref_obj_id) { + !(reg->type & MEM_RCU) && !reg->ref_obj_id) { verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n"); return -EFAULT; } @@ -6085,6 +6663,11 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, type_is_rcu_or_null(env, reg, field_name, btf_id)) { /* __rcu tagged pointers can be NULL */ flag |= MEM_RCU | PTR_MAYBE_NULL; + + /* We always trust them */ + if (type_is_rcu_or_null(env, reg, field_name, btf_id) && + flag & PTR_UNTRUSTED) + flag &= ~PTR_UNTRUSTED; } else if (flag & (MEM_PERCPU | MEM_USER)) { /* keep as-is */ } else { @@ -6266,7 +6849,7 @@ static int check_stack_access_within_bounds( */ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off, int bpf_size, enum bpf_access_type t, - int value_regno, bool strict_alignment_once) + int value_regno, bool strict_alignment_once, bool is_ldsx) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; @@ -6327,7 +6910,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn u64 val = 0; err = bpf_map_direct_read(map, map_off, size, - &val); + &val, is_ldsx); if (err) return err; @@ -6497,8 +7080,11 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ && regs[value_regno].type == SCALAR_VALUE) { - /* b/h/w load zero-extends, mark upper bits as known 0 */ - coerce_reg_to_size(®s[value_regno], size); + if (!is_ldsx) + /* b/h/w load zero-extends, mark upper bits as known 0 */ + coerce_reg_to_size(®s[value_regno], size); + else + coerce_reg_to_size_sx(®s[value_regno], size); } return err; } @@ -6590,17 +7176,17 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i * case to simulate the register fill. */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, -1, true); + BPF_SIZE(insn->code), BPF_READ, -1, true, false); if (!err && load_reg >= 0) err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, load_reg, - true); + true, false); if (err) return err; /* Check whether we can write into the same memory. */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, -1, true); + BPF_SIZE(insn->code), BPF_WRITE, -1, true, false); if (err) return err; @@ -6846,7 +7432,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return zero_size_allowed ? 0 : -EACCES; return check_mem_access(env, env->insn_idx, regno, offset, BPF_B, - atype, -1, false); + atype, -1, false, false); } fallthrough; @@ -7145,7 +7731,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, verbose(env, "off=%d doesn't point to kptr\n", kptr_off); return -EACCES; } - if (kptr_field->type != BPF_KPTR_REF) { + if (kptr_field->type != BPF_KPTR_REF && kptr_field->type != BPF_KPTR_PERCPU) { verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off); return -EACCES; } @@ -7218,7 +7804,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn /* we write BPF_DW bits (8 bytes) at a time */ for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { err = check_mem_access(env, insn_idx, regno, - i, BPF_DW, BPF_WRITE, -1, false); + i, BPF_DW, BPF_WRITE, -1, false, false); if (err) return err; } @@ -7311,20 +7897,29 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) { err = check_mem_access(env, insn_idx, regno, - i, BPF_DW, BPF_WRITE, -1, false); + i, BPF_DW, BPF_WRITE, -1, false, false); if (err) return err; } - err = mark_stack_slots_iter(env, reg, insn_idx, meta->btf, btf_id, nr_slots); + err = mark_stack_slots_iter(env, meta, reg, insn_idx, meta->btf, btf_id, nr_slots); if (err) return err; } else { /* iter_next() or iter_destroy() expect initialized iter state*/ - if (!is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots)) { + err = is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots); + switch (err) { + case 0: + break; + case -EINVAL: verbose(env, "expected an initialized iter_%s as arg #%d\n", iter_type_str(meta->btf, btf_id), regno); - return -EINVAL; + return err; + case -EPROTO: + verbose(env, "expected an RCU CS when using %s\n", meta->func_name); + return err; + default: + return err; } spi = iter_get_spi(env, reg, nr_slots); @@ -7350,6 +7945,81 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id return 0; } +/* Look for a previous loop entry at insn_idx: nearest parent state + * stopped at insn_idx with callsites matching those in cur->frame. + */ +static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env, + struct bpf_verifier_state *cur, + int insn_idx) +{ + struct bpf_verifier_state_list *sl; + struct bpf_verifier_state *st; + + /* Explored states are pushed in stack order, most recent states come first */ + sl = *explored_state(env, insn_idx); + for (; sl; sl = sl->next) { + /* If st->branches != 0 state is a part of current DFS verification path, + * hence cur & st for a loop. + */ + st = &sl->state; + if (st->insn_idx == insn_idx && st->branches && same_callsites(st, cur) && + st->dfs_depth < cur->dfs_depth) + return st; + } + + return NULL; +} + +static void reset_idmap_scratch(struct bpf_verifier_env *env); +static bool regs_exact(const struct bpf_reg_state *rold, + const struct bpf_reg_state *rcur, + struct bpf_idmap *idmap); + +static void maybe_widen_reg(struct bpf_verifier_env *env, + struct bpf_reg_state *rold, struct bpf_reg_state *rcur, + struct bpf_idmap *idmap) +{ + if (rold->type != SCALAR_VALUE) + return; + if (rold->type != rcur->type) + return; + if (rold->precise || rcur->precise || regs_exact(rold, rcur, idmap)) + return; + __mark_reg_unknown(env, rcur); +} + +static int widen_imprecise_scalars(struct bpf_verifier_env *env, + struct bpf_verifier_state *old, + struct bpf_verifier_state *cur) +{ + struct bpf_func_state *fold, *fcur; + int i, fr; + + reset_idmap_scratch(env); + for (fr = old->curframe; fr >= 0; fr--) { + fold = old->frame[fr]; + fcur = cur->frame[fr]; + + for (i = 0; i < MAX_BPF_REG; i++) + maybe_widen_reg(env, + &fold->regs[i], + &fcur->regs[i], + &env->idmap_scratch); + + for (i = 0; i < fold->allocated_stack / BPF_REG_SIZE; i++) { + if (!is_spilled_reg(&fold->stack[i]) || + !is_spilled_reg(&fcur->stack[i])) + continue; + + maybe_widen_reg(env, + &fold->stack[i].spilled_ptr, + &fcur->stack[i].spilled_ptr, + &env->idmap_scratch); + } + } + return 0; +} + /* process_iter_next_call() is called when verifier gets to iterator's next * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer * to it as just "iter_next()" in comments below. @@ -7391,25 +8061,47 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id * is some statically known limit on number of iterations (e.g., if there is * an explicit `if n > 100 then break;` statement somewhere in the loop). * - * One very subtle but very important aspect is that we *always* simulate NULL - * condition first (as the current state) before we simulate non-NULL case. - * This has to do with intricacies of scalar precision tracking. By simulating - * "exit condition" of iter_next() returning NULL first, we make sure all the - * relevant precision marks *that will be set **after** we exit iterator loop* - * are propagated backwards to common parent state of NULL and non-NULL - * branches. Thanks to that, state equivalence checks done later in forked - * state, when reaching iter_next() for ACTIVE iterator, can assume that - * precision marks are finalized and won't change. Because simulating another - * ACTIVE iterator iteration won't change them (because given same input - * states we'll end up with exactly same output states which we are currently - * comparing; and verification after the loop already propagated back what - * needs to be **additionally** tracked as precise). It's subtle, grok - * precision tracking for more intuitive understanding. + * Iteration convergence logic in is_state_visited() relies on exact + * states comparison, which ignores read and precision marks. + * This is necessary because read and precision marks are not finalized + * while in the loop. Exact comparison might preclude convergence for + * simple programs like below: + * + * i = 0; + * while(iter_next(&it)) + * i++; + * + * At each iteration step i++ would produce a new distinct state and + * eventually instruction processing limit would be reached. + * + * To avoid such behavior speculatively forget (widen) range for + * imprecise scalar registers, if those registers were not precise at the + * end of the previous iteration and do not match exactly. + * + * This is a conservative heuristic that allows to verify wide range of programs, + * however it precludes verification of programs that conjure an + * imprecise value on the first loop iteration and use it as precise on a second. + * For example, the following safe program would fail to verify: + * + * struct bpf_num_iter it; + * int arr[10]; + * int i = 0, a = 0; + * bpf_iter_num_new(&it, 0, 10); + * while (bpf_iter_num_next(&it)) { + * if (a == 0) { + * a = 1; + * i = 7; // Because i changed verifier would forget + * // it's range on second loop entry. + * } else { + * arr[i] = 42; // This would fail to verify. + * } + * } + * bpf_iter_num_destroy(&it); */ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, struct bpf_kfunc_call_arg_meta *meta) { - struct bpf_verifier_state *cur_st = env->cur_state, *queued_st; + struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st; struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr; struct bpf_reg_state *cur_iter, *queued_iter; int iter_frameno = meta->iter.frameno; @@ -7427,6 +8119,19 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, } if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) { + /* Because iter_next() call is a checkpoint is_state_visitied() + * should guarantee parent state with same call sites and insn_idx. + */ + if (!cur_st->parent || cur_st->parent->insn_idx != insn_idx || + !same_callsites(cur_st->parent, cur_st)) { + verbose(env, "bug: bad parent state for iter next call"); + return -EFAULT; + } + /* Note cur_st->parent in the call below, it is necessary to skip + * checkpoint created for cur_st by is_state_visited() + * right at this instruction. + */ + prev_st = find_prev_entry(env, cur_st->parent, insn_idx); /* branch out active iter state */ queued_st = push_stack(env, insn_idx + 1, insn_idx, false); if (!queued_st) @@ -7435,6 +8140,8 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr; queued_iter->iter.state = BPF_ITER_STATE_ACTIVE; queued_iter->iter.depth++; + if (prev_st) + widen_imprecise_scalars(env, prev_st, queued_st); queued_fr = queued_st->frame[queued_st->curframe]; mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]); @@ -7578,6 +8285,7 @@ static const struct bpf_reg_types btf_ptr_types = { static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_BTF_ID | MEM_PERCPU, + PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU, PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED, } }; @@ -7656,8 +8364,10 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, if (base_type(arg_type) == ARG_PTR_TO_MEM) type &= ~DYNPTR_TYPE_FLAG_MASK; - if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type)) + if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type)) { type &= ~MEM_ALLOC; + type &= ~MEM_PERCPU; + } for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { expected = compatible->types[i]; @@ -7740,14 +8450,19 @@ found: break; } case PTR_TO_BTF_ID | MEM_ALLOC: + case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC: if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock && meta->func_id != BPF_FUNC_kptr_xchg) { verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n"); return -EFAULT; } - /* Handled by helper specific checks */ + if (meta->func_id == BPF_FUNC_kptr_xchg) { + if (map_kptr_match_type(env, meta->kptr_field, reg, regno)) + return -EACCES; + } break; case PTR_TO_BTF_ID | MEM_PERCPU: + case PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU: case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED: /* Handled by helper specific checks */ break; @@ -7797,17 +8512,6 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK) return 0; - if ((type_is_ptr_alloc_obj(type) || type_is_non_owning_ref(type)) && reg->off) { - if (reg_find_field_offset(reg, reg->off, BPF_GRAPH_NODE_OR_ROOT)) - return __check_ptr_off_reg(env, reg, regno, true); - - verbose(env, "R%d must have zero offset when passed to release func\n", - regno); - verbose(env, "No graph node or root found at R%d type:%s off:%d\n", regno, - btf_type_name(reg->btf, reg->btf_id), reg->off); - return -EINVAL; - } - /* Doing check_ptr_off_reg check for the offset will catch this * because fixed_off_ok is false, but checking here allows us * to give the user a better error message. @@ -7842,6 +8546,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, case PTR_TO_BTF_ID | PTR_TRUSTED: case PTR_TO_BTF_ID | MEM_RCU: case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF: + case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU: /* When referenced PTR_TO_BTF_ID is passed to release function, * its fixed offset must be 0. In the other cases, fixed offset * can be non-zero. This was already checked above. So pass @@ -8671,7 +9376,7 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env, /* after the call registers r0 - r5 were scratched */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); - check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); + __check_reg_arg(env, regs, caller_saved[i], DST_OP_NO_MARK); } } @@ -8684,11 +9389,10 @@ static int set_callee_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx); -static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, - int *insn_idx, int subprog, - set_callee_state_fn set_callee_state_cb) +static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int callsite, + set_callee_state_fn set_callee_state_cb, + struct bpf_verifier_state *state) { - struct bpf_verifier_state *state = env->cur_state; struct bpf_func_state *caller, *callee; int err; @@ -8698,53 +9402,72 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -E2BIG; } - caller = state->frame[state->curframe]; if (state->frame[state->curframe + 1]) { verbose(env, "verifier bug. Frame %d already allocated\n", state->curframe + 1); return -EFAULT; } + caller = state->frame[state->curframe]; + callee = kzalloc(sizeof(*callee), GFP_KERNEL); + if (!callee) + return -ENOMEM; + state->frame[state->curframe + 1] = callee; + + /* callee cannot access r0, r6 - r9 for reading and has to write + * into its own stack before reading from it. + * callee can read/write into caller's stack + */ + init_func_state(env, callee, + /* remember the callsite, it will be used by bpf_exit */ + callsite, + state->curframe + 1 /* frameno within this callchain */, + subprog /* subprog number within this prog */); + /* Transfer references to the callee */ + err = copy_reference_state(callee, caller); + err = err ?: set_callee_state_cb(env, caller, callee, callsite); + if (err) + goto err_out; + + /* only increment it after check_reg_arg() finished */ + state->curframe++; + + return 0; + +err_out: + free_func_state(callee); + state->frame[state->curframe + 1] = NULL; + return err; +} + +static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int insn_idx, int subprog, + set_callee_state_fn set_callee_state_cb) +{ + struct bpf_verifier_state *state = env->cur_state, *callback_state; + struct bpf_func_state *caller, *callee; + int err; + + caller = state->frame[state->curframe]; err = btf_check_subprog_call(env, subprog, caller->regs); if (err == -EFAULT) return err; - if (subprog_is_global(env, subprog)) { - if (err) { - verbose(env, "Caller passes invalid args into func#%d\n", - subprog); - return err; - } else { - if (env->log.level & BPF_LOG_LEVEL) - verbose(env, - "Func#%d is global and valid. Skipping.\n", - subprog); - clear_caller_saved_regs(env, caller->regs); - - /* All global functions return a 64-bit SCALAR_VALUE */ - mark_reg_unknown(env, caller->regs, BPF_REG_0); - caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; - - /* continue with next insn after call */ - return 0; - } - } /* set_callee_state is used for direct subprog calls, but we are * interested in validating only BPF helpers that can call subprogs as * callbacks */ - if (set_callee_state_cb != set_callee_state) { - if (bpf_pseudo_kfunc_call(insn) && - !is_callback_calling_kfunc(insn->imm)) { - verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n", - func_id_name(insn->imm), insn->imm); - return -EFAULT; - } else if (!bpf_pseudo_kfunc_call(insn) && - !is_callback_calling_function(insn->imm)) { /* helper */ - verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n", - func_id_name(insn->imm), insn->imm); - return -EFAULT; - } + env->subprog_info[subprog].is_cb = true; + if (bpf_pseudo_kfunc_call(insn) && + !is_sync_callback_calling_kfunc(insn->imm)) { + verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n", + func_id_name(insn->imm), insn->imm); + return -EFAULT; + } else if (!bpf_pseudo_kfunc_call(insn) && + !is_callback_calling_function(insn->imm)) { /* helper */ + verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n", + func_id_name(insn->imm), insn->imm); + return -EFAULT; } if (insn->code == (BPF_JMP | BPF_CALL) && @@ -8755,53 +9478,83 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* there is no real recursion here. timer callbacks are async */ env->subprog_info[subprog].is_async_cb = true; async_cb = push_async_cb(env, env->subprog_info[subprog].start, - *insn_idx, subprog); + insn_idx, subprog); if (!async_cb) return -EFAULT; callee = async_cb->frame[0]; callee->async_entry_cnt = caller->async_entry_cnt + 1; /* Convert bpf_timer_set_callback() args into timer callback args */ - err = set_callee_state_cb(env, caller, callee, *insn_idx); + err = set_callee_state_cb(env, caller, callee, insn_idx); if (err) return err; + return 0; + } + + /* for callback functions enqueue entry to callback and + * proceed with next instruction within current frame. + */ + callback_state = push_stack(env, env->subprog_info[subprog].start, insn_idx, false); + if (!callback_state) + return -ENOMEM; + + err = setup_func_entry(env, subprog, insn_idx, set_callee_state_cb, + callback_state); + if (err) + return err; + + callback_state->callback_unroll_depth++; + callback_state->frame[callback_state->curframe - 1]->callback_depth++; + caller->callback_depth = 0; + return 0; +} + +static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx) +{ + struct bpf_verifier_state *state = env->cur_state; + struct bpf_func_state *caller; + int err, subprog, target_insn; + + target_insn = *insn_idx + insn->imm + 1; + subprog = find_subprog(env, target_insn); + if (subprog < 0) { + verbose(env, "verifier bug. No program starts at insn %d\n", target_insn); + return -EFAULT; + } + + caller = state->frame[state->curframe]; + err = btf_check_subprog_call(env, subprog, caller->regs); + if (err == -EFAULT) + return err; + if (subprog_is_global(env, subprog)) { + if (err) { + verbose(env, "Caller passes invalid args into func#%d\n", subprog); + return err; + } + + if (env->log.level & BPF_LOG_LEVEL) + verbose(env, "Func#%d is global and valid. Skipping.\n", subprog); clear_caller_saved_regs(env, caller->regs); + + /* All global functions return a 64-bit SCALAR_VALUE */ mark_reg_unknown(env, caller->regs, BPF_REG_0); caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; + /* continue with next insn after call */ return 0; } - callee = kzalloc(sizeof(*callee), GFP_KERNEL); - if (!callee) - return -ENOMEM; - state->frame[state->curframe + 1] = callee; - - /* callee cannot access r0, r6 - r9 for reading and has to write - * into its own stack before reading from it. - * callee can read/write into caller's stack + /* for regular function entry setup new frame and continue + * from that frame. */ - init_func_state(env, callee, - /* remember the callsite, it will be used by bpf_exit */ - *insn_idx /* callsite */, - state->curframe + 1 /* frameno within this callchain */, - subprog /* subprog number within this prog */); - - /* Transfer references to the callee */ - err = copy_reference_state(callee, caller); + err = setup_func_entry(env, subprog, *insn_idx, set_callee_state, state); if (err) - goto err_out; - - err = set_callee_state_cb(env, caller, callee, *insn_idx); - if (err) - goto err_out; + return err; clear_caller_saved_regs(env, caller->regs); - /* only increment it after check_reg_arg() finished */ - state->curframe++; - /* and go analyze first insn of the callee */ *insn_idx = env->subprog_info[subprog].start - 1; @@ -8809,14 +9562,10 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn verbose(env, "caller:\n"); print_verifier_state(env, caller, true); verbose(env, "callee:\n"); - print_verifier_state(env, callee, true); + print_verifier_state(env, state->frame[state->curframe], true); } - return 0; -err_out: - free_func_state(callee); - state->frame[state->curframe + 1] = NULL; - return err; + return 0; } int map_set_for_each_callback_args(struct bpf_verifier_env *env, @@ -8860,22 +9609,6 @@ static int set_callee_state(struct bpf_verifier_env *env, return 0; } -static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, - int *insn_idx) -{ - int subprog, target_insn; - - target_insn = *insn_idx + insn->imm + 1; - subprog = find_subprog(env, target_insn); - if (subprog < 0) { - verbose(env, "verifier bug. No program starts at insn %d\n", - target_insn); - return -EFAULT; - } - - return __check_func_call(env, insn, insn_idx, subprog, set_callee_state); -} - static int set_map_elem_callback_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, @@ -9068,9 +9801,10 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env) static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { - struct bpf_verifier_state *state = env->cur_state; + struct bpf_verifier_state *state = env->cur_state, *prev_st; struct bpf_func_state *caller, *callee; struct bpf_reg_state *r0; + bool in_callback_fn; int err; callee = state->frame[state->curframe]; @@ -9099,6 +9833,11 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) verbose_invalid_scalar(env, r0, &range, "callback return", "R0"); return -EINVAL; } + if (!calls_callback(env, callee->callsite)) { + verbose(env, "BUG: in callback at %d, callsite %d !calls_callback\n", + *insn_idx, callee->callsite); + return -EFAULT; + } } else { /* return to the caller whatever r0 had in the callee */ caller->regs[BPF_REG_0] = *r0; @@ -9116,16 +9855,44 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return err; } - *insn_idx = callee->callsite + 1; + /* for callbacks like bpf_loop or bpf_for_each_map_elem go back to callsite, + * there function call logic would reschedule callback visit. If iteration + * converges is_state_visited() would prune that visit eventually. + */ + in_callback_fn = callee->in_callback_fn; + if (in_callback_fn) + *insn_idx = callee->callsite; + else + *insn_idx = callee->callsite + 1; + if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "returning from callee:\n"); print_verifier_state(env, callee, true); verbose(env, "to caller at %d:\n", *insn_idx); print_verifier_state(env, caller, true); } - /* clear everything in the callee */ + /* clear everything in the callee. In case of exceptional exits using + * bpf_throw, this will be done by copy_verifier_state for extra frames. */ free_func_state(callee); state->frame[state->curframe--] = NULL; + + /* for callbacks widen imprecise scalars to make programs like below verify: + * + * struct ctx { int i; } + * void cb(int idx, struct ctx *ctx) { ctx->i++; ... } + * ... + * struct ctx = { .i = 0; } + * bpf_loop(100, cb, &ctx, 0); + * + * This is similar to what is done in process_iter_next_call() for open + * coded iterators. + */ + prev_st = in_callback_fn ? find_prev_entry(env, state, *insn_idx) : NULL; + if (prev_st) { + err = widen_imprecise_scalars(env, prev_st, state); + if (err) + return err; + } return 0; } @@ -9135,19 +9902,33 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type, { struct bpf_reg_state *ret_reg = ®s[BPF_REG_0]; - if (ret_type != RET_INTEGER || - (func_id != BPF_FUNC_get_stack && - func_id != BPF_FUNC_get_task_stack && - func_id != BPF_FUNC_probe_read_str && - func_id != BPF_FUNC_probe_read_kernel_str && - func_id != BPF_FUNC_probe_read_user_str)) + if (ret_type != RET_INTEGER) return; - ret_reg->smax_value = meta->msize_max_value; - ret_reg->s32_max_value = meta->msize_max_value; - ret_reg->smin_value = -MAX_ERRNO; - ret_reg->s32_min_value = -MAX_ERRNO; - reg_bounds_sync(ret_reg); + switch (func_id) { + case BPF_FUNC_get_stack: + case BPF_FUNC_get_task_stack: + case BPF_FUNC_probe_read_str: + case BPF_FUNC_probe_read_kernel_str: + case BPF_FUNC_probe_read_user_str: + ret_reg->smax_value = meta->msize_max_value; + ret_reg->s32_max_value = meta->msize_max_value; + ret_reg->smin_value = -MAX_ERRNO; + ret_reg->s32_min_value = -MAX_ERRNO; + reg_bounds_sync(ret_reg); + break; + case BPF_FUNC_get_smp_processor_id: + ret_reg->umax_value = nr_cpu_ids - 1; + ret_reg->u32_max_value = nr_cpu_ids - 1; + ret_reg->smax_value = nr_cpu_ids - 1; + ret_reg->s32_max_value = nr_cpu_ids - 1; + ret_reg->umin_value = 0; + ret_reg->u32_min_value = 0; + ret_reg->smin_value = 0; + ret_reg->s32_min_value = 0; + reg_bounds_sync(ret_reg); + break; + } } static int @@ -9233,17 +10014,17 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return 0; } -static int check_reference_leak(struct bpf_verifier_env *env) +static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exit) { struct bpf_func_state *state = cur_func(env); bool refs_lingering = false; int i; - if (state->frameno && !state->in_callback_fn) + if (!exception_exit && state->frameno && !state->in_callback_fn) return 0; for (i = 0; i < state->acquired_refs; i++) { - if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno) + if (!exception_exit && state->in_callback_fn && state->refs[i].callback_ref != state->frameno) continue; verbose(env, "Unreleased reference id=%d alloc_insn=%d\n", state->refs[i].id, state->refs[i].insn_idx); @@ -9350,6 +10131,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn int *insn_idx_p) { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); + bool returns_cpu_specific_alloc_ptr = false; const struct bpf_func_proto *fn = NULL; enum bpf_return_type ret_type; enum bpf_type_flag ret_flag; @@ -9441,7 +10223,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn */ for (i = 0; i < meta.access_size; i++) { err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, - BPF_WRITE, -1, false); + BPF_WRITE, -1, false, false); if (err) return err; } @@ -9460,6 +10242,26 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EFAULT; } err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); + } else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj_id) { + u32 ref_obj_id = meta.ref_obj_id; + bool in_rcu = in_rcu_cs(env); + struct bpf_func_state *state; + struct bpf_reg_state *reg; + + err = release_reference_state(cur_func(env), ref_obj_id); + if (!err) { + bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ + if (reg->ref_obj_id == ref_obj_id) { + if (in_rcu && (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) { + reg->ref_obj_id = 0; + reg->type &= ~MEM_ALLOC; + reg->type |= MEM_RCU; + } else { + mark_reg_invalid(env, reg); + } + } + })); + } } else if (meta.ref_obj_id) { err = release_reference(env, meta.ref_obj_id); } else if (register_is_null(®s[meta.release_regno])) { @@ -9477,7 +10279,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn switch (func_id) { case BPF_FUNC_tail_call: - err = check_reference_leak(env); + err = check_reference_leak(env, false); if (err) { verbose(env, "tail_call would lead to reference leak\n"); return err; @@ -9493,24 +10295,37 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } break; case BPF_FUNC_for_each_map_elem: - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, - set_map_elem_callback_state); + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_map_elem_callback_state); break; case BPF_FUNC_timer_set_callback: - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, - set_timer_callback_state); + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_timer_callback_state); break; case BPF_FUNC_find_vma: - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, - set_find_vma_callback_state); + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_find_vma_callback_state); break; case BPF_FUNC_snprintf: err = check_bpf_snprintf_call(env, regs); break; case BPF_FUNC_loop: update_loop_inline_state(env, meta.subprogno); - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, - set_loop_callback_state); + /* Verifier relies on R1 value to determine if bpf_loop() iteration + * is finished, thus mark it precise. + */ + err = mark_chain_precision(env, BPF_REG_1); + if (err) + return err; + if (cur_func(env)->callback_depth < regs[BPF_REG_1].umax_value) { + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_loop_callback_state); + } else { + cur_func(env)->callback_depth = 0; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "frame%d bpf_loop iteration limit reached\n", + env->cur_state->curframe); + } break; case BPF_FUNC_dynptr_from_mem: if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) { @@ -9588,9 +10403,26 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn break; } + case BPF_FUNC_per_cpu_ptr: + case BPF_FUNC_this_cpu_ptr: + { + struct bpf_reg_state *reg = ®s[BPF_REG_1]; + const struct btf_type *type; + + if (reg->type & MEM_RCU) { + type = btf_type_by_id(reg->btf, reg->btf_id); + if (!type || !btf_type_is_struct(type)) { + verbose(env, "Helper has invalid btf/btf_id in R1\n"); + return -EFAULT; + } + returns_cpu_specific_alloc_ptr = true; + env->insn_aux_data[insn_idx].call_with_percpu_alloc_ptr = true; + } + break; + } case BPF_FUNC_user_ringbuf_drain: - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, - set_user_ringbuf_callback_state); + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_user_ringbuf_callback_state); break; } @@ -9677,14 +10509,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = tsize; } else { - /* MEM_RDONLY may be carried from ret_flag, but it - * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise - * it will confuse the check of PTR_TO_BTF_ID in - * check_mem_access(). - */ - ret_flag &= ~MEM_RDONLY; + if (returns_cpu_specific_alloc_ptr) { + regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC | MEM_RCU; + } else { + /* MEM_RDONLY may be carried from ret_flag, but it + * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise + * it will confuse the check of PTR_TO_BTF_ID in + * check_mem_access(). + */ + ret_flag &= ~MEM_RDONLY; + regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; + } - regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; regs[BPF_REG_0].btf = meta.ret_btf; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } @@ -9700,8 +10536,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (func_id == BPF_FUNC_kptr_xchg) { ret_btf = meta.kptr_field->kptr.btf; ret_btf_id = meta.kptr_field->kptr.btf_id; - if (!btf_is_kernel(ret_btf)) + if (!btf_is_kernel(ret_btf)) { regs[BPF_REG_0].type |= MEM_ALLOC; + if (meta.kptr_field->type == BPF_KPTR_PERCPU) + regs[BPF_REG_0].type |= MEM_PERCPU; + } } else { if (fn->ret_btf_id == BPF_PTR_POISON) { verbose(env, "verifier internal error:"); @@ -9848,6 +10687,11 @@ static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta) return meta->kfunc_flags & KF_RCU; } +static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_RCU_PROTECTED; +} + static bool __kfunc_param_match_suffix(const struct btf *btf, const struct btf_param *arg, const char *suffix) @@ -9922,6 +10766,11 @@ static bool is_kfunc_arg_refcounted_kptr(const struct btf *btf, const struct btf return __kfunc_param_match_suffix(btf, arg, "__refcounted_kptr"); } +static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param *arg) +{ + return __kfunc_param_match_suffix(btf, arg, "__nullable"); +} + static bool is_kfunc_arg_scalar_with_name(const struct btf *btf, const struct btf_param *arg, const char *name) @@ -10050,15 +10899,6 @@ static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env, return true; } - -static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { -#ifdef CONFIG_NET - [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], - [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], - [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP], -#endif -}; - enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_CTX, KF_ARG_PTR_TO_ALLOC_BTF_ID, /* Allocated object */ @@ -10073,6 +10913,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_CALLBACK, KF_ARG_PTR_TO_RB_ROOT, KF_ARG_PTR_TO_RB_NODE, + KF_ARG_PTR_TO_NULL, }; enum special_kfunc_type { @@ -10095,6 +10936,10 @@ enum special_kfunc_type { KF_bpf_dynptr_slice, KF_bpf_dynptr_slice_rdwr, KF_bpf_dynptr_clone, + KF_bpf_percpu_obj_new_impl, + KF_bpf_percpu_obj_drop_impl, + KF_bpf_throw, + KF_bpf_iter_css_task_new, }; BTF_SET_START(special_kfunc_set) @@ -10115,6 +10960,12 @@ BTF_ID(func, bpf_dynptr_from_xdp) BTF_ID(func, bpf_dynptr_slice) BTF_ID(func, bpf_dynptr_slice_rdwr) BTF_ID(func, bpf_dynptr_clone) +BTF_ID(func, bpf_percpu_obj_new_impl) +BTF_ID(func, bpf_percpu_obj_drop_impl) +BTF_ID(func, bpf_throw) +#ifdef CONFIG_CGROUPS +BTF_ID(func, bpf_iter_css_task_new) +#endif BTF_SET_END(special_kfunc_set) BTF_ID_LIST(special_kfunc_list) @@ -10137,6 +10988,14 @@ BTF_ID(func, bpf_dynptr_from_xdp) BTF_ID(func, bpf_dynptr_slice) BTF_ID(func, bpf_dynptr_slice_rdwr) BTF_ID(func, bpf_dynptr_clone) +BTF_ID(func, bpf_percpu_obj_new_impl) +BTF_ID(func, bpf_percpu_obj_drop_impl) +BTF_ID(func, bpf_throw) +#ifdef CONFIG_CGROUPS +BTF_ID(func, bpf_iter_css_task_new) +#else +BTF_ID_UNUSED +#endif static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -10217,6 +11076,8 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_callback(env, meta->btf, &args[argno])) return KF_ARG_PTR_TO_CALLBACK; + if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg)) + return KF_ARG_PTR_TO_NULL; if (argno + 1 < nargs && (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]) || @@ -10303,6 +11164,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_verifier_state *state = env->cur_state; + struct btf_record *rec = reg_btf_record(reg); if (!state->active_lock.ptr) { verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n"); @@ -10315,6 +11177,9 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state } reg->type |= NON_OWN_REF; + if (rec->refcount_off >= 0) + reg->type |= MEM_RCU; + return 0; } @@ -10445,11 +11310,17 @@ static bool is_bpf_graph_api_kfunc(u32 btf_id) btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]; } -static bool is_callback_calling_kfunc(u32 btf_id) +static bool is_sync_callback_calling_kfunc(u32 btf_id) { return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]; } +static bool is_bpf_throw_kfunc(struct bpf_insn *insn) +{ + return bpf_pseudo_kfunc_call(insn) && insn->off == 0 && + insn->imm == special_kfunc_list[KF_bpf_throw]; +} + static bool is_rbtree_lock_required_kfunc(u32 btf_id) { return is_bpf_rbtree_api_kfunc(btf_id); @@ -10657,6 +11528,28 @@ static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env, &meta->arg_rbtree_root.field); } +/* + * css_task iter allowlist is needed to avoid dead locking on css_set_lock. + * LSM hooks and iters (both sleepable and non-sleepable) are safe. + * Any sleepable progs are also safe since bpf_check_attach_target() enforce + * them can only be attached to some specific hook points. + */ +static bool check_css_task_iter_allowlist(struct bpf_verifier_env *env) +{ + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); + + switch (prog_type) { + case BPF_PROG_TYPE_LSM: + return true; + case BPF_PROG_TYPE_TRACING: + if (env->prog->expected_attach_type == BPF_TRACE_ITER) + return true; + fallthrough; + default: + return env->prog->aux->sleepable; + } +} + static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, int insn_idx) { @@ -10743,7 +11636,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) && - (register_is_null(reg) || type_may_be_null(reg->type))) { + (register_is_null(reg) || type_may_be_null(reg->type)) && + !is_kfunc_arg_nullable(meta->btf, &args[i])) { verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); return -EACCES; } @@ -10768,6 +11662,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return kf_arg_type; switch (kf_arg_type) { + case KF_ARG_PTR_TO_NULL: + continue; case KF_ARG_PTR_TO_ALLOC_BTF_ID: case KF_ARG_PTR_TO_BTF_ID: if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta)) @@ -10827,7 +11723,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } break; case KF_ARG_PTR_TO_ALLOC_BTF_ID: - if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC)) { + if (meta->func_id != special_kfunc_list[KF_bpf_obj_drop_impl]) { + verbose(env, "arg#%d expected for bpf_obj_drop_impl()\n", i); + return -EINVAL; + } + } else if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC | MEM_PERCPU)) { + if (meta->func_id != special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) { + verbose(env, "arg#%d expected for bpf_percpu_obj_drop_impl()\n", i); + return -EINVAL; + } + } else { verbose(env, "arg#%d expected pointer to allocated object\n", i); return -EINVAL; } @@ -10835,8 +11741,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "allocated object must be referenced\n"); return -EINVAL; } - if (meta->btf == btf_vmlinux && - meta->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { + if (meta->btf == btf_vmlinux) { meta->arg_btf = reg->btf; meta->arg_btf_id = reg->btf_id; } @@ -10898,6 +11803,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; } case KF_ARG_PTR_TO_ITER: + if (meta->func_id == special_kfunc_list[KF_bpf_iter_css_task_new]) { + if (!check_css_task_iter_allowlist(env)) { + verbose(env, "css_task_iter is only allowed in bpf_lsm, bpf_iter and sleepable progs\n"); + return -EINVAL; + } + } ret = process_iter_arg(env, regno, insn_idx, meta); if (ret < 0) return ret; @@ -11027,6 +11938,10 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; } case KF_ARG_PTR_TO_CALLBACK: + if (reg->type != PTR_TO_FUNC) { + verbose(env, "arg%d expected pointer to func\n", i); + return -EINVAL; + } meta->subprogno = reg->subprogno; break; case KF_ARG_PTR_TO_REFCOUNTED_KPTR: @@ -11047,10 +11962,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "arg#%d doesn't point to a type with bpf_refcount field\n", i); return -EINVAL; } - if (rec->refcount_off >= 0) { - verbose(env, "bpf_refcount_acquire calls are disabled for now\n"); - return -EINVAL; - } + meta->arg_btf = reg->btf; meta->arg_btf_id = reg->btf_id; break; @@ -11108,6 +12020,8 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env, return 0; } +static int check_return_code(struct bpf_verifier_env *env, int regno); + static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -11148,18 +12062,39 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EACCES; } + /* Check the arguments */ + err = check_kfunc_args(env, &meta, insn_idx); + if (err < 0) + return err; + + if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_rbtree_add_callback_state); + if (err) { + verbose(env, "kfunc %s#%d failed callback verification\n", + func_name, meta.func_id); + return err; + } + } + rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); if (env->cur_state->active_rcu_lock) { struct bpf_func_state *state; struct bpf_reg_state *reg; + u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER); + + if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) { + verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n"); + return -EACCES; + } if (rcu_lock) { verbose(env, "nested rcu read lock (kernel function %s)\n", func_name); return -EINVAL; } else if (rcu_unlock) { - bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ + bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({ if (reg->type & MEM_RCU) { reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL); reg->type |= PTR_UNTRUSTED; @@ -11177,10 +12112,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EINVAL; } - /* Check the arguments */ - err = check_kfunc_args(env, &meta, insn_idx); - if (err < 0) - return err; /* In case of release function, we get register number of refcounted * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. */ @@ -11214,13 +12145,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } - if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, - set_rbtree_add_callback_state); - if (err) { - verbose(env, "kfunc %s#%d failed callback verification\n", + if (meta.func_id == special_kfunc_list[KF_bpf_throw]) { + if (!bpf_jit_supports_exceptions()) { + verbose(env, "JIT does not support calling kfunc %s#%d\n", func_name, meta.func_id); - return err; + return -ENOTSUPP; + } + env->seen_exception = true; + + /* In the case of the default callback, the cookie value passed + * to bpf_throw becomes the return value of the program. + */ + if (!env->exception_callback_subprog) { + err = check_return_code(env, BPF_REG_1); + if (err < 0) + return err; } } @@ -11234,6 +12173,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* Only exception is bpf_obj_new_impl */ if (meta.btf != btf_vmlinux || (meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl] && + meta.func_id != special_kfunc_list[KF_bpf_percpu_obj_new_impl] && meta.func_id != special_kfunc_list[KF_bpf_refcount_acquire_impl])) { verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); return -EINVAL; @@ -11247,13 +12187,29 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id); if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { - if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl]) { + if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] || + meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + struct btf_struct_meta *struct_meta; struct btf *ret_btf; u32 ret_btf_id; - if (unlikely(!bpf_global_ma_set)) + if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set) return -ENOMEM; + if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + if (!bpf_global_percpu_ma_set) { + mutex_lock(&bpf_percpu_ma_lock); + if (!bpf_global_percpu_ma_set) { + err = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true); + if (!err) + bpf_global_percpu_ma_set = true; + } + mutex_unlock(&bpf_percpu_ma_lock); + if (err) + return err; + } + } + if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) { verbose(env, "local type ID argument must be in range [0, U32_MAX]\n"); return -EINVAL; @@ -11264,24 +12220,38 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* This may be NULL due to user not supplying a BTF */ if (!ret_btf) { - verbose(env, "bpf_obj_new requires prog BTF\n"); + verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n"); return -EINVAL; } ret_t = btf_type_by_id(ret_btf, ret_btf_id); if (!ret_t || !__btf_type_is_struct(ret_t)) { - verbose(env, "bpf_obj_new type ID argument must be of a struct\n"); + verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n"); return -EINVAL; } + struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id); + if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) { + verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n"); + return -EINVAL; + } + + if (struct_meta) { + verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n"); + return -EINVAL; + } + } + mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; regs[BPF_REG_0].btf = ret_btf; regs[BPF_REG_0].btf_id = ret_btf_id; + if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) + regs[BPF_REG_0].type |= MEM_PERCPU; insn_aux->obj_new_size = ret_t->size; - insn_aux->kptr_struct_meta = - btf_find_struct_meta(ret_btf, ret_btf_id); + insn_aux->kptr_struct_meta = struct_meta; } else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; @@ -11418,7 +12388,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].id = ++env->id_gen; } else if (btf_type_is_void(t)) { if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { - if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { + if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl] || + meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) { insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id); @@ -12907,7 +13878,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0 || (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) || - BPF_CLASS(insn->code) == BPF_ALU64) { + (BPF_CLASS(insn->code) == BPF_ALU64 && + BPF_SRC(insn->code) != BPF_TO_LE)) { verbose(env, "BPF_END uses reserved fields\n"); return -EINVAL; } @@ -12932,11 +13904,24 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else if (opcode == BPF_MOV) { if (BPF_SRC(insn->code) == BPF_X) { - if (insn->imm != 0 || insn->off != 0) { + if (insn->imm != 0) { verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } + if (BPF_CLASS(insn->code) == BPF_ALU) { + if (insn->off != 0 && insn->off != 8 && insn->off != 16) { + verbose(env, "BPF_MOV uses reserved fields\n"); + return -EINVAL; + } + } else { + if (insn->off != 0 && insn->off != 8 && insn->off != 16 && + insn->off != 32) { + verbose(env, "BPF_MOV uses reserved fields\n"); + return -EINVAL; + } + } + /* check src operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) @@ -12960,18 +13945,42 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) !tnum_is_const(src_reg->var_off); if (BPF_CLASS(insn->code) == BPF_ALU64) { - /* case: R1 = R2 - * copy register state to dest reg - */ - if (need_id) - /* Assign src and dst registers the same ID - * that will be used by find_equal_scalars() - * to propagate min/max range. + if (insn->off == 0) { + /* case: R1 = R2 + * copy register state to dest reg */ - src_reg->id = ++env->id_gen; - copy_register_state(dst_reg, src_reg); - dst_reg->live |= REG_LIVE_WRITTEN; - dst_reg->subreg_def = DEF_NOT_SUBREG; + if (need_id) + /* Assign src and dst registers the same ID + * that will be used by find_equal_scalars() + * to propagate min/max range. + */ + src_reg->id = ++env->id_gen; + copy_register_state(dst_reg, src_reg); + dst_reg->live |= REG_LIVE_WRITTEN; + dst_reg->subreg_def = DEF_NOT_SUBREG; + } else { + /* case: R1 = (s8, s16 s32)R2 */ + if (is_pointer_value(env, insn->src_reg)) { + verbose(env, + "R%d sign-extension part of pointer\n", + insn->src_reg); + return -EACCES; + } else if (src_reg->type == SCALAR_VALUE) { + bool no_sext; + + no_sext = src_reg->umax_value < (1ULL << (insn->off - 1)); + if (no_sext && need_id) + src_reg->id = ++env->id_gen; + copy_register_state(dst_reg, src_reg); + if (!no_sext) + dst_reg->id = 0; + coerce_reg_to_size_sx(dst_reg, insn->off >> 3); + dst_reg->live |= REG_LIVE_WRITTEN; + dst_reg->subreg_def = DEF_NOT_SUBREG; + } else { + mark_reg_unknown(env, regs, insn->dst_reg); + } + } } else { /* R1 = (u32) R2 */ if (is_pointer_value(env, insn->src_reg)) { @@ -12980,19 +13989,33 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) insn->src_reg); return -EACCES; } else if (src_reg->type == SCALAR_VALUE) { - bool is_src_reg_u32 = src_reg->umax_value <= U32_MAX; - - if (is_src_reg_u32 && need_id) - src_reg->id = ++env->id_gen; - copy_register_state(dst_reg, src_reg); - /* Make sure ID is cleared if src_reg is not in u32 range otherwise - * dst_reg min/max could be incorrectly - * propagated into src_reg by find_equal_scalars() - */ - if (!is_src_reg_u32) - dst_reg->id = 0; - dst_reg->live |= REG_LIVE_WRITTEN; - dst_reg->subreg_def = env->insn_idx + 1; + if (insn->off == 0) { + bool is_src_reg_u32 = src_reg->umax_value <= U32_MAX; + + if (is_src_reg_u32 && need_id) + src_reg->id = ++env->id_gen; + copy_register_state(dst_reg, src_reg); + /* Make sure ID is cleared if src_reg is not in u32 + * range otherwise dst_reg min/max could be incorrectly + * propagated into src_reg by find_equal_scalars() + */ + if (!is_src_reg_u32) + dst_reg->id = 0; + dst_reg->live |= REG_LIVE_WRITTEN; + dst_reg->subreg_def = env->insn_idx + 1; + } else { + /* case: W1 = (s8, s16)W2 */ + bool no_sext = src_reg->umax_value < (1ULL << (insn->off - 1)); + + if (no_sext && need_id) + src_reg->id = ++env->id_gen; + copy_register_state(dst_reg, src_reg); + if (!no_sext) + dst_reg->id = 0; + dst_reg->live |= REG_LIVE_WRITTEN; + dst_reg->subreg_def = env->insn_idx + 1; + coerce_subreg_to_size_sx(dst_reg, insn->off >> 3); + } } else { mark_reg_unknown(env, regs, insn->dst_reg); @@ -13023,7 +14046,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else { /* all other ALU ops: and, sub, xor, add, ... */ if (BPF_SRC(insn->code) == BPF_X) { - if (insn->imm != 0 || insn->off != 0) { + if (insn->imm != 0 || insn->off > 1 || + (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) { verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; } @@ -13032,7 +14056,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; } else { - if (insn->src_reg != BPF_REG_0 || insn->off != 0) { + if (insn->src_reg != BPF_REG_0 || insn->off > 1 || + (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) { verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; } @@ -13160,12 +14185,16 @@ static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode) return !!tnum_equals_const(subreg, val); else if (val < reg->u32_min_value || val > reg->u32_max_value) return 0; + else if (sval < reg->s32_min_value || sval > reg->s32_max_value) + return 0; break; case BPF_JNE: if (tnum_is_const(subreg)) return !tnum_equals_const(subreg, val); else if (val < reg->u32_min_value || val > reg->u32_max_value) return 1; + else if (sval < reg->s32_min_value || sval > reg->s32_max_value) + return 1; break; case BPF_JSET: if ((~subreg.mask & subreg.value) & val) @@ -13237,12 +14266,16 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) return !!tnum_equals_const(reg->var_off, val); else if (val < reg->umin_value || val > reg->umax_value) return 0; + else if (sval < reg->smin_value || sval > reg->smax_value) + return 0; break; case BPF_JNE: if (tnum_is_const(reg->var_off)) return !tnum_equals_const(reg->var_off, val); else if (val < reg->umin_value || val > reg->umax_value) return 1; + else if (sval < reg->smin_value || sval > reg->smax_value) + return 1; break; case BPF_JSET: if ((~reg->var_off.mask & reg->var_off.value) & val) @@ -13817,6 +14850,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return -EINVAL; } + /* check src2 operand */ + err = check_reg_arg(env, insn->dst_reg, SRC_OP); + if (err) + return err; + + dst_reg = ®s[insn->dst_reg]; if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0) { verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); @@ -13828,12 +14867,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (err) return err; - if (is_pointer_value(env, insn->src_reg)) { + src_reg = ®s[insn->src_reg]; + if (!(reg_is_pkt_pointer_any(dst_reg) && reg_is_pkt_pointer_any(src_reg)) && + is_pointer_value(env, insn->src_reg)) { verbose(env, "R%d pointer comparison prohibited\n", insn->src_reg); return -EACCES; } - src_reg = ®s[insn->src_reg]; } else { if (insn->src_reg != BPF_REG_0) { verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); @@ -13841,12 +14881,6 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } } - /* check src2 operand */ - err = check_reg_arg(env, insn->dst_reg, SRC_OP); - if (err) - return err; - - dst_reg = ®s[insn->dst_reg]; is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; if (BPF_SRC(insn->code) == BPF_K) { @@ -13903,6 +14937,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, !sanitize_speculative_path(env, insn, *insn_idx + 1, *insn_idx)) return -EFAULT; + if (env->log.level & BPF_LOG_LEVEL) + print_insn_state(env, this_branch->frame[this_branch->curframe]); *insn_idx += insn->off; return 0; } else if (pred == 0) { @@ -13915,6 +14951,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, *insn_idx + insn->off + 1, *insn_idx)) return -EFAULT; + if (env->log.level & BPF_LOG_LEVEL) + print_insn_state(env, this_branch->frame[this_branch->curframe]); return 0; } @@ -14193,7 +15231,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) * gen_ld_abs() may terminate the program at runtime, leading to * reference leak. */ - err = check_reference_leak(env); + err = check_reference_leak(env, false); if (err) { verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n"); return err; @@ -14242,19 +15280,19 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } -static int check_return_code(struct bpf_verifier_env *env) +static int check_return_code(struct bpf_verifier_env *env, int regno) { struct tnum enforce_attach_type_range = tnum_unknown; const struct bpf_prog *prog = env->prog; struct bpf_reg_state *reg; - struct tnum range = tnum_range(0, 1); + struct tnum range = tnum_range(0, 1), const_0 = tnum_const(0); enum bpf_prog_type prog_type = resolve_prog_type(env->prog); int err; struct bpf_func_state *frame = env->cur_state->frame[0]; const bool is_subprog = frame->subprogno; /* LSM and struct_ops func-ptr's return type could be "void" */ - if (!is_subprog) { + if (!is_subprog || frame->in_exception_callback_fn) { switch (prog_type) { case BPF_PROG_TYPE_LSM: if (prog->expected_attach_type == BPF_LSM_CGROUP) @@ -14276,36 +15314,36 @@ static int check_return_code(struct bpf_verifier_env *env) * of bpf_exit, which means that program wrote * something into it earlier */ - err = check_reg_arg(env, BPF_REG_0, SRC_OP); + err = check_reg_arg(env, regno, SRC_OP); if (err) return err; - if (is_pointer_value(env, BPF_REG_0)) { - verbose(env, "R0 leaks addr as return value\n"); + if (is_pointer_value(env, regno)) { + verbose(env, "R%d leaks addr as return value\n", regno); return -EACCES; } - reg = cur_regs(env) + BPF_REG_0; + reg = cur_regs(env) + regno; if (frame->in_async_callback_fn) { /* enforce return zero from async callbacks like timer */ if (reg->type != SCALAR_VALUE) { - verbose(env, "In async callback the register R0 is not a known value (%s)\n", - reg_type_str(env, reg->type)); + verbose(env, "In async callback the register R%d is not a known value (%s)\n", + regno, reg_type_str(env, reg->type)); return -EINVAL; } - if (!tnum_in(tnum_const(0), reg->var_off)) { - verbose_invalid_scalar(env, reg, &range, "async callback", "R0"); + if (!tnum_in(const_0, reg->var_off)) { + verbose_invalid_scalar(env, reg, &const_0, "async callback", "R0"); return -EINVAL; } return 0; } - if (is_subprog) { + if (is_subprog && !frame->in_exception_callback_fn) { if (reg->type != SCALAR_VALUE) { - verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n", - reg_type_str(env, reg->type)); + verbose(env, "At subprogram exit the register R%d is not a scalar value (%s)\n", + regno, reg_type_str(env, reg->type)); return -EINVAL; } return 0; @@ -14315,10 +15353,13 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG || + env->prog->expected_attach_type == BPF_CGROUP_UNIX_RECVMSG || env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME || env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETPEERNAME || env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || - env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME) + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME || + env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETSOCKNAME) range = tnum_range(1, 1); if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND || env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND) @@ -14387,8 +15428,8 @@ static int check_return_code(struct bpf_verifier_env *env) } if (reg->type != SCALAR_VALUE) { - verbose(env, "At program exit the register R0 is not a known value (%s)\n", - reg_type_str(env, reg->type)); + verbose(env, "At program exit the register R%d is not a known value (%s)\n", + regno, reg_type_str(env, reg->type)); return -EINVAL; } @@ -14447,21 +15488,6 @@ enum { BRANCH = 2, }; -static u32 state_htab_size(struct bpf_verifier_env *env) -{ - return env->prog->len; -} - -static struct bpf_verifier_state_list **explored_state( - struct bpf_verifier_env *env, - int idx) -{ - struct bpf_verifier_state *cur = env->cur_state; - struct bpf_func_state *state = cur->frame[cur->curframe]; - - return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)]; -} - static void mark_prune_point(struct bpf_verifier_env *env, int idx) { env->insn_aux_data[idx].prune_point = true; @@ -14482,6 +15508,15 @@ static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx) return env->insn_aux_data[insn_idx].force_checkpoint; } +static void mark_calls_callback(struct bpf_verifier_env *env, int idx) +{ + env->insn_aux_data[idx].calls_callback = true; +} + +static bool calls_callback(struct bpf_verifier_env *env, int insn_idx) +{ + return env->insn_aux_data[insn_idx].calls_callback; +} enum { DONE_EXPLORING = 0, @@ -14493,8 +15528,7 @@ enum { * w - next instruction * e - edge */ -static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, - bool loop_ok) +static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) { int *insn_stack = env->cfg.insn_stack; int *insn_state = env->cfg.insn_state; @@ -14526,7 +15560,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, insn_stack[env->cfg.cur_stack++] = w; return KEEP_EXPLORING; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - if (loop_ok && env->bpf_capable) + if (env->bpf_capable) return DONE_EXPLORING; verbose_linfo(env, t, "%d: ", t); verbose_linfo(env, w, "%d: ", w); @@ -14546,24 +15580,20 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, struct bpf_verifier_env *env, bool visit_callee) { - int ret; + int ret, insn_sz; - ret = push_insn(t, t + 1, FALLTHROUGH, env, false); + insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1; + ret = push_insn(t, t + insn_sz, FALLTHROUGH, env); if (ret) return ret; - mark_prune_point(env, t + 1); + mark_prune_point(env, t + insn_sz); /* when we exit from subprog, we need to record non-linear history */ - mark_jmp_point(env, t + 1); + mark_jmp_point(env, t + insn_sz); if (visit_callee) { mark_prune_point(env, t); - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env, - /* It's ok to allow recursion from CFG point of - * view. __check_func_call() will do the actual - * check. - */ - bpf_pseudo_func(insns + t)); + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); } return ret; } @@ -14576,15 +15606,17 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, static int visit_insn(int t, struct bpf_verifier_env *env) { struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t]; - int ret; + int ret, off, insn_sz; if (bpf_pseudo_func(insn)) return visit_func_call_insn(t, insns, env, true); /* All non-branch instructions have a single fall-through edge. */ if (BPF_CLASS(insn->code) != BPF_JMP && - BPF_CLASS(insn->code) != BPF_JMP32) - return push_insn(t, t + 1, FALLTHROUGH, env, false); + BPF_CLASS(insn->code) != BPF_JMP32) { + insn_sz = bpf_is_ldimm64(insn) ? 2 : 1; + return push_insn(t, t + insn_sz, FALLTHROUGH, env); + } switch (BPF_OP(insn->code)) { case BPF_EXIT: @@ -14598,6 +15630,21 @@ static int visit_insn(int t, struct bpf_verifier_env *env) * async state will be pushed for further exploration. */ mark_prune_point(env, t); + /* For functions that invoke callbacks it is not known how many times + * callback would be called. Verifier models callback calling functions + * by repeatedly visiting callback bodies and returning to origin call + * instruction. + * In order to stop such iteration verifier needs to identify when a + * state identical some state from a previous iteration is reached. + * Check below forces creation of checkpoint before callback calling + * instruction to allow search for such identical states. + */ + if (is_sync_callback_calling_insn(insn)) { + mark_calls_callback(env, t); + mark_force_checkpoint(env, t); + mark_prune_point(env, t); + mark_jmp_point(env, t); + } if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { struct bpf_kfunc_call_arg_meta meta; @@ -14624,14 +15671,18 @@ static int visit_insn(int t, struct bpf_verifier_env *env) if (BPF_SRC(insn->code) != BPF_K) return -EINVAL; + if (BPF_CLASS(insn->code) == BPF_JMP) + off = insn->off; + else + off = insn->imm; + /* unconditional jump with single edge */ - ret = push_insn(t, t + insn->off + 1, FALLTHROUGH, env, - true); + ret = push_insn(t, t + off + 1, FALLTHROUGH, env); if (ret) return ret; - mark_prune_point(env, t + insn->off + 1); - mark_jmp_point(env, t + insn->off + 1); + mark_prune_point(env, t + off + 1); + mark_jmp_point(env, t + off + 1); return ret; @@ -14639,11 +15690,11 @@ static int visit_insn(int t, struct bpf_verifier_env *env) /* conditional jump with two edges */ mark_prune_point(env, t); - ret = push_insn(t, t + 1, FALLTHROUGH, env, true); + ret = push_insn(t, t + 1, FALLTHROUGH, env); if (ret) return ret; - return push_insn(t, t + insn->off + 1, BRANCH, env, true); + return push_insn(t, t + insn->off + 1, BRANCH, env); } } @@ -14654,8 +15705,8 @@ static int check_cfg(struct bpf_verifier_env *env) { int insn_cnt = env->prog->len; int *insn_stack, *insn_state; - int ret = 0; - int i; + int ex_insn_beg, i, ret = 0; + bool ex_done = false; insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) @@ -14671,6 +15722,7 @@ static int check_cfg(struct bpf_verifier_env *env) insn_stack[0] = 0; /* 0 is the first instruction */ env->cfg.cur_stack = 1; +walk_cfg: while (env->cfg.cur_stack > 0) { int t = insn_stack[env->cfg.cur_stack - 1]; @@ -14697,12 +15749,32 @@ static int check_cfg(struct bpf_verifier_env *env) goto err_free; } + if (env->exception_callback_subprog && !ex_done) { + ex_insn_beg = env->subprog_info[env->exception_callback_subprog].start; + + insn_state[ex_insn_beg] = DISCOVERED; + insn_stack[0] = ex_insn_beg; + env->cfg.cur_stack = 1; + ex_done = true; + goto walk_cfg; + } + for (i = 0; i < insn_cnt; i++) { + struct bpf_insn *insn = &env->prog->insnsi[i]; + if (insn_state[i] != EXPLORED) { verbose(env, "unreachable insn %d\n", i); ret = -EINVAL; goto err_free; } + if (bpf_is_ldimm64(insn)) { + if (insn_state[i + 1] != 0) { + verbose(env, "jump into the middle of ldimm64 insn %d\n", i); + ret = -EINVAL; + goto err_free; + } + i++; /* skip second half of ldimm64 */ + } } ret = 0; /* cfg looks good */ @@ -14734,20 +15806,18 @@ static int check_abnormal_return(struct bpf_verifier_env *env) #define MIN_BPF_FUNCINFO_SIZE 8 #define MAX_FUNCINFO_REC_SIZE 252 -static int check_btf_func(struct bpf_verifier_env *env, - const union bpf_attr *attr, - bpfptr_t uattr) +static int check_btf_func_early(struct bpf_verifier_env *env, + const union bpf_attr *attr, + bpfptr_t uattr) { - const struct btf_type *type, *func_proto, *ret_type; - u32 i, nfuncs, urec_size, min_size; u32 krec_size = sizeof(struct bpf_func_info); + const struct btf_type *type, *func_proto; + u32 i, nfuncs, urec_size, min_size; struct bpf_func_info *krecord; - struct bpf_func_info_aux *info_aux = NULL; struct bpf_prog *prog; const struct btf *btf; - bpfptr_t urecord; u32 prev_offset = 0; - bool scalar_return; + bpfptr_t urecord; int ret = -ENOMEM; nfuncs = attr->func_info_cnt; @@ -14757,11 +15827,6 @@ static int check_btf_func(struct bpf_verifier_env *env, return 0; } - if (nfuncs != env->subprog_cnt) { - verbose(env, "number of funcs in func_info doesn't match number of subprogs\n"); - return -EINVAL; - } - urec_size = attr->func_info_rec_size; if (urec_size < MIN_BPF_FUNCINFO_SIZE || urec_size > MAX_FUNCINFO_REC_SIZE || @@ -14779,9 +15844,6 @@ static int check_btf_func(struct bpf_verifier_env *env, krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN); if (!krecord) return -ENOMEM; - info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN); - if (!info_aux) - goto err_free; for (i = 0; i < nfuncs; i++) { ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size); @@ -14820,11 +15882,6 @@ static int check_btf_func(struct bpf_verifier_env *env, goto err_free; } - if (env->subprog_info[i].start != krecord[i].insn_off) { - verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n"); - goto err_free; - } - /* check type_id */ type = btf_type_by_id(btf, krecord[i].type_id); if (!type || !btf_type_is_func(type)) { @@ -14832,12 +15889,77 @@ static int check_btf_func(struct bpf_verifier_env *env, krecord[i].type_id); goto err_free; } - info_aux[i].linkage = BTF_INFO_VLEN(type->info); func_proto = btf_type_by_id(btf, type->type); if (unlikely(!func_proto || !btf_type_is_func_proto(func_proto))) /* btf_func_check() already verified it during BTF load */ goto err_free; + + prev_offset = krecord[i].insn_off; + bpfptr_add(&urecord, urec_size); + } + + prog->aux->func_info = krecord; + prog->aux->func_info_cnt = nfuncs; + return 0; + +err_free: + kvfree(krecord); + return ret; +} + +static int check_btf_func(struct bpf_verifier_env *env, + const union bpf_attr *attr, + bpfptr_t uattr) +{ + const struct btf_type *type, *func_proto, *ret_type; + u32 i, nfuncs, urec_size; + struct bpf_func_info *krecord; + struct bpf_func_info_aux *info_aux = NULL; + struct bpf_prog *prog; + const struct btf *btf; + bpfptr_t urecord; + bool scalar_return; + int ret = -ENOMEM; + + nfuncs = attr->func_info_cnt; + if (!nfuncs) { + if (check_abnormal_return(env)) + return -EINVAL; + return 0; + } + if (nfuncs != env->subprog_cnt) { + verbose(env, "number of funcs in func_info doesn't match number of subprogs\n"); + return -EINVAL; + } + + urec_size = attr->func_info_rec_size; + + prog = env->prog; + btf = prog->aux->btf; + + urecord = make_bpfptr(attr->func_info, uattr.is_kernel); + + krecord = prog->aux->func_info; + info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN); + if (!info_aux) + return -ENOMEM; + + for (i = 0; i < nfuncs; i++) { + /* check insn_off */ + ret = -EINVAL; + + if (env->subprog_info[i].start != krecord[i].insn_off) { + verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n"); + goto err_free; + } + + /* Already checked type_id */ + type = btf_type_by_id(btf, krecord[i].type_id); + info_aux[i].linkage = BTF_INFO_VLEN(type->info); + /* Already checked func_proto */ + func_proto = btf_type_by_id(btf, type->type); + ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL); scalar_return = btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type); @@ -14850,17 +15972,13 @@ static int check_btf_func(struct bpf_verifier_env *env, goto err_free; } - prev_offset = krecord[i].insn_off; bpfptr_add(&urecord, urec_size); } - prog->aux->func_info = krecord; - prog->aux->func_info_cnt = nfuncs; prog->aux->func_info_aux = info_aux; return 0; err_free: - kvfree(krecord); kfree(info_aux); return ret; } @@ -14873,7 +15991,8 @@ static void adjust_btf_func(struct bpf_verifier_env *env) if (!aux->func_info) return; - for (i = 0; i < env->subprog_cnt; i++) + /* func_info is not available for hidden subprogs */ + for (i = 0; i < env->subprog_cnt - env->hidden_subprog_cnt; i++) aux->func_info[i].insn_off = env->subprog_info[i].start; } @@ -15077,9 +16196,9 @@ static int check_core_relo(struct bpf_verifier_env *env, return err; } -static int check_btf_info(struct bpf_verifier_env *env, - const union bpf_attr *attr, - bpfptr_t uattr) +static int check_btf_info_early(struct bpf_verifier_env *env, + const union bpf_attr *attr, + bpfptr_t uattr) { struct btf *btf; int err; @@ -15099,6 +16218,24 @@ static int check_btf_info(struct bpf_verifier_env *env, } env->prog->aux->btf = btf; + err = check_btf_func_early(env, attr, uattr); + if (err) + return err; + return 0; +} + +static int check_btf_info(struct bpf_verifier_env *env, + const union bpf_attr *attr, + bpfptr_t uattr) +{ + int err; + + if (!attr->func_info_cnt && !attr->line_info_cnt) { + if (check_abnormal_return(env)) + return -EINVAL; + return 0; + } + err = check_btf_func(env, attr, uattr); if (err) return err; @@ -15257,18 +16394,14 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, struct bpf_verifier_state *cur) { struct bpf_verifier_state_list *sl; - int i; sl = *explored_state(env, insn); while (sl) { if (sl->state.branches) goto next; if (sl->state.insn_idx != insn || - sl->state.curframe != cur->curframe) + !same_callsites(&sl->state, cur)) goto next; - for (i = 0; i <= cur->curframe; i++) - if (sl->state.frame[i]->callsite != cur->frame[i]->callsite) - goto next; clean_verifier_state(env, &sl->state); next: sl = sl->next; @@ -15286,8 +16419,11 @@ static bool regs_exact(const struct bpf_reg_state *rold, /* Returns true if (rold safe implies rcur safe) */ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, - struct bpf_reg_state *rcur, struct bpf_idmap *idmap) + struct bpf_reg_state *rcur, struct bpf_idmap *idmap, bool exact) { + if (exact) + return regs_exact(rold, rcur, idmap); + if (!(rold->live & REG_LIVE_READ)) /* explored state didn't use this */ return true; @@ -15404,7 +16540,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, } static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, - struct bpf_func_state *cur, struct bpf_idmap *idmap) + struct bpf_func_state *cur, struct bpf_idmap *idmap, bool exact) { int i, spi; @@ -15417,7 +16553,12 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, spi = i / BPF_REG_SIZE; - if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) { + if (exact && + old->stack[spi].slot_type[i % BPF_REG_SIZE] != + cur->stack[spi].slot_type[i % BPF_REG_SIZE]) + return false; + + if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ) && !exact) { i += BPF_REG_SIZE - 1; /* explored state didn't use this */ continue; @@ -15467,7 +16608,7 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, * return false to continue verification of this path */ if (!regsafe(env, &old->stack[spi].spilled_ptr, - &cur->stack[spi].spilled_ptr, idmap)) + &cur->stack[spi].spilled_ptr, idmap, exact)) return false; break; case STACK_DYNPTR: @@ -15549,16 +16690,16 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur, * the current state will reach 'bpf_exit' instruction safely */ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old, - struct bpf_func_state *cur) + struct bpf_func_state *cur, bool exact) { int i; for (i = 0; i < MAX_BPF_REG; i++) if (!regsafe(env, &old->regs[i], &cur->regs[i], - &env->idmap_scratch)) + &env->idmap_scratch, exact)) return false; - if (!stacksafe(env, old, cur, &env->idmap_scratch)) + if (!stacksafe(env, old, cur, &env->idmap_scratch, exact)) return false; if (!refsafe(old, cur, &env->idmap_scratch)) @@ -15567,17 +16708,23 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat return true; } +static void reset_idmap_scratch(struct bpf_verifier_env *env) +{ + env->idmap_scratch.tmp_id_gen = env->id_gen; + memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map)); +} + static bool states_equal(struct bpf_verifier_env *env, struct bpf_verifier_state *old, - struct bpf_verifier_state *cur) + struct bpf_verifier_state *cur, + bool exact) { int i; if (old->curframe != cur->curframe) return false; - env->idmap_scratch.tmp_id_gen = env->id_gen; - memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map)); + reset_idmap_scratch(env); /* Verification state from speculative execution simulation * must never prune a non-speculative execution one. @@ -15607,7 +16754,7 @@ static bool states_equal(struct bpf_verifier_env *env, for (i = 0; i <= old->curframe; i++) { if (old->frame[i]->callsite != cur->frame[i]->callsite) return false; - if (!func_states_equal(env, old->frame[i], cur->frame[i])) + if (!func_states_equal(env, old->frame[i], cur->frame[i], exact)) return false; } return true; @@ -15861,10 +17008,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl, **pprev; - struct bpf_verifier_state *cur = env->cur_state, *new; - int i, j, err, states_cnt = 0; + struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry; + int i, j, n, err, states_cnt = 0; bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx); bool add_new_state = force_new_state; + bool force_exact; /* bpf progs typically have pruning point every 4 instructions * http://vger.kernel.org/bpfconf2019.html#session-1 @@ -15917,9 +17065,33 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * It's safe to assume that iterator loop will finish, taking into * account iter_next() contract of eventually returning * sticky NULL result. + * + * Note, that states have to be compared exactly in this case because + * read and precision marks might not be finalized inside the loop. + * E.g. as in the program below: + * + * 1. r7 = -16 + * 2. r6 = bpf_get_prandom_u32() + * 3. while (bpf_iter_num_next(&fp[-8])) { + * 4. if (r6 != 42) { + * 5. r7 = -32 + * 6. r6 = bpf_get_prandom_u32() + * 7. continue + * 8. } + * 9. r0 = r10 + * 10. r0 += r7 + * 11. r8 = *(u64 *)(r0 + 0) + * 12. r6 = bpf_get_prandom_u32() + * 13. } + * + * Here verifier would first visit path 1-3, create a checkpoint at 3 + * with r7=-16, continue to 4-7,3. Existing checkpoint at 3 does + * not have read or precision mark for r7 yet, thus inexact states + * comparison would discard current state with r7=-32 + * => unsafe memory access at 11 would not be caught. */ if (is_iter_next_insn(env, insn_idx)) { - if (states_equal(env, &sl->state, cur)) { + if (states_equal(env, &sl->state, cur, true)) { struct bpf_func_state *cur_frame; struct bpf_reg_state *iter_state, *iter_reg; int spi; @@ -15935,17 +17107,29 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) */ spi = __get_spi(iter_reg->off + iter_reg->var_off.value); iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr; - if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) + if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) { + update_loop_entry(cur, &sl->state); goto hit; + } } goto skip_inf_loop_check; } + if (calls_callback(env, insn_idx)) { + if (states_equal(env, &sl->state, cur, true)) + goto hit; + goto skip_inf_loop_check; + } /* attempt to detect infinite loop to avoid unnecessary doomed work */ if (states_maybe_looping(&sl->state, cur) && - states_equal(env, &sl->state, cur) && - !iter_active_depths_differ(&sl->state, cur)) { + states_equal(env, &sl->state, cur, false) && + !iter_active_depths_differ(&sl->state, cur) && + sl->state.callback_unroll_depth == cur->callback_unroll_depth) { verbose_linfo(env, insn_idx, "; "); verbose(env, "infinite loop detected at insn %d\n", insn_idx); + verbose(env, "cur state:"); + print_verifier_state(env, cur->frame[cur->curframe], true); + verbose(env, "old state:"); + print_verifier_state(env, sl->state.frame[cur->curframe], true); return -EINVAL; } /* if the verifier is processing a loop, avoid adding new state @@ -15967,7 +17151,36 @@ skip_inf_loop_check: add_new_state = false; goto miss; } - if (states_equal(env, &sl->state, cur)) { + /* If sl->state is a part of a loop and this loop's entry is a part of + * current verification path then states have to be compared exactly. + * 'force_exact' is needed to catch the following case: + * + * initial Here state 'succ' was processed first, + * | it was eventually tracked to produce a + * V state identical to 'hdr'. + * .---------> hdr All branches from 'succ' had been explored + * | | and thus 'succ' has its .branches == 0. + * | V + * | .------... Suppose states 'cur' and 'succ' correspond + * | | | to the same instruction + callsites. + * | V V In such case it is necessary to check + * | ... ... if 'succ' and 'cur' are states_equal(). + * | | | If 'succ' and 'cur' are a part of the + * | V V same loop exact flag has to be set. + * | succ <- cur To check if that is the case, verify + * | | if loop entry of 'succ' is in current + * | V DFS path. + * | ... + * | | + * '----' + * + * Additional details are in the comment before get_loop_entry(). + */ + loop_entry = get_loop_entry(&sl->state); + force_exact = loop_entry && loop_entry->branches > 0; + if (states_equal(env, &sl->state, cur, force_exact)) { + if (force_exact) + update_loop_entry(cur, loop_entry); hit: sl->hit_cnt++; /* reached equivalent register/stack state, @@ -16006,13 +17219,18 @@ miss: * to keep checking from state equivalence point of view. * Higher numbers increase max_states_per_insn and verification time, * but do not meaningfully decrease insn_processed. + * 'n' controls how many times state could miss before eviction. + * Use bigger 'n' for checkpoints because evicting checkpoint states + * too early would hinder iterator convergence. */ - if (sl->miss_cnt > sl->hit_cnt * 3 + 3) { + n = is_force_checkpoint(env, insn_idx) && sl->state.branches > 0 ? 64 : 3; + if (sl->miss_cnt > sl->hit_cnt * n + n) { /* the state is unlikely to be useful. Remove it to * speed up verification */ *pprev = sl->next; - if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) { + if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE && + !sl->state.used_as_loop_entry) { u32 br = sl->state.branches; WARN_ONCE(br, @@ -16081,6 +17299,7 @@ next: cur->parent = new; cur->first_insn_idx = insn_idx; + cur->dfs_depth = new->dfs_depth + 1; clear_jmp_history(cur); new_sl->next = *explored_state(env, insn_idx); *explored_state(env, insn_idx) = new_sl; @@ -16178,7 +17397,7 @@ static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type typ * Have to support a use case when one path through * the program yields TRUSTED pointer while another * is UNTRUSTED. Fallback to UNTRUSTED to generate - * BPF_PROBE_MEM. + * BPF_PROBE_MEM/BPF_PROBE_MEMSX. */ *prev_type = PTR_TO_BTF_ID | PTR_UNTRUSTED; } else { @@ -16201,6 +17420,7 @@ static int do_check(struct bpf_verifier_env *env) int prev_insn_idx = -1; for (;;) { + bool exception_exit = false; struct bpf_insn *insn; u8 class; int err; @@ -16319,7 +17539,8 @@ static int do_check(struct bpf_verifier_env *env) */ err = check_mem_access(env, env->insn_idx, insn->src_reg, insn->off, BPF_SIZE(insn->code), - BPF_READ, insn->dst_reg, false); + BPF_READ, insn->dst_reg, false, + BPF_MODE(insn->code) == BPF_MEMSX); if (err) return err; @@ -16356,7 +17577,7 @@ static int do_check(struct bpf_verifier_env *env) /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), - BPF_WRITE, insn->src_reg, false); + BPF_WRITE, insn->src_reg, false, false); if (err) return err; @@ -16381,7 +17602,7 @@ static int do_check(struct bpf_verifier_env *env) /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), - BPF_WRITE, -1, false); + BPF_WRITE, -1, false, false); if (err) return err; @@ -16414,27 +17635,35 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } } - if (insn->src_reg == BPF_PSEUDO_CALL) + if (insn->src_reg == BPF_PSEUDO_CALL) { err = check_func_call(env, insn, &env->insn_idx); - else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) + } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { err = check_kfunc_call(env, insn, &env->insn_idx); - else + if (!err && is_bpf_throw_kfunc(insn)) { + exception_exit = true; + goto process_bpf_exit_full; + } + } else { err = check_helper_call(env, insn, &env->insn_idx); + } if (err) return err; mark_reg_scratched(env, BPF_REG_0); } else if (opcode == BPF_JA) { if (BPF_SRC(insn->code) != BPF_K || - insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0 || - class == BPF_JMP32) { + (class == BPF_JMP && insn->imm != 0) || + (class == BPF_JMP32 && insn->off != 0)) { verbose(env, "BPF_JA uses reserved fields\n"); return -EINVAL; } - env->insn_idx += insn->off + 1; + if (class == BPF_JMP) + env->insn_idx += insn->off + 1; + else + env->insn_idx += insn->imm + 1; continue; } else if (opcode == BPF_EXIT) { @@ -16446,14 +17675,15 @@ static int do_check(struct bpf_verifier_env *env) verbose(env, "BPF_EXIT uses reserved fields\n"); return -EINVAL; } - +process_bpf_exit_full: if (env->cur_state->active_lock.ptr && !in_rbtree_lock_required_cb(env)) { verbose(env, "bpf_spin_unlock is missing\n"); return -EINVAL; } - if (env->cur_state->active_rcu_lock) { + if (env->cur_state->active_rcu_lock && + !in_rbtree_lock_required_cb(env)) { verbose(env, "bpf_rcu_read_unlock is missing\n"); return -EINVAL; } @@ -16464,10 +17694,23 @@ static int do_check(struct bpf_verifier_env *env) * function, for which reference_state must * match caller reference state when it exits. */ - err = check_reference_leak(env); + err = check_reference_leak(env, exception_exit); if (err) return err; + /* The side effect of the prepare_func_exit + * which is being skipped is that it frees + * bpf_func_state. Typically, process_bpf_exit + * will only be hit with outermost exit. + * copy_verifier_state in pop_stack will handle + * freeing of any extra bpf_func_state left over + * from not processing all nested function + * exits. We also skip return code checks as + * they are not needed for exceptional exits. + */ + if (exception_exit) + goto process_bpf_exit; + if (state->curframe) { /* exit from nested function */ err = prepare_func_exit(env, &env->insn_idx); @@ -16477,7 +17720,7 @@ static int do_check(struct bpf_verifier_env *env) continue; } - err = check_return_code(env); + err = check_return_code(env, BPF_REG_0); if (err) return err; process_bpf_exit: @@ -16733,11 +17976,6 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); return -EINVAL; } - - if (prog->aux->sleepable) { - verbose(env, "sleepable progs cannot use bpf_spin_lock yet\n"); - return -EINVAL; - } } if (btf_record_has_field(map->record, BPF_TIMER)) { @@ -16809,7 +18047,8 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && - (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) { + ((BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) || + insn->imm != 0)) { verbose(env, "BPF_LDX uses reserved fields\n"); return -EINVAL; } @@ -17280,13 +18519,13 @@ static bool insn_is_cond_jump(u8 code) { u8 op; + op = BPF_OP(code); if (BPF_CLASS(code) == BPF_JMP32) - return true; + return op != BPF_JA; if (BPF_CLASS(code) != BPF_JMP) return false; - op = BPF_OP(code); return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL; } @@ -17503,11 +18742,15 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { bpf_convert_ctx_access_t convert_ctx_access; + u8 mode; if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || insn->code == (BPF_LDX | BPF_MEM | BPF_H) || insn->code == (BPF_LDX | BPF_MEM | BPF_W) || - insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) { + insn->code == (BPF_LDX | BPF_MEM | BPF_DW) || + insn->code == (BPF_LDX | BPF_MEMSX | BPF_B) || + insn->code == (BPF_LDX | BPF_MEMSX | BPF_H) || + insn->code == (BPF_LDX | BPF_MEMSX | BPF_W)) { type = BPF_READ; } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || insn->code == (BPF_STX | BPF_MEM | BPF_H) || @@ -17566,8 +18809,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) */ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED: if (type == BPF_READ) { - insn->code = BPF_LDX | BPF_PROBE_MEM | - BPF_SIZE((insn)->code); + if (BPF_MODE(insn->code) == BPF_MEM) + insn->code = BPF_LDX | BPF_PROBE_MEM | + BPF_SIZE((insn)->code); + else + insn->code = BPF_LDX | BPF_PROBE_MEMSX | + BPF_SIZE((insn)->code); env->prog->aux->num_exentries++; } continue; @@ -17577,6 +18824,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size; size = BPF_LDST_BYTES(insn); + mode = BPF_MODE(insn->code); /* If the read access is a narrower load of the field, * convert to a 4/8-byte load, to minimum program type specific @@ -17636,6 +18884,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) (1ULL << size * 8) - 1); } } + if (mode == BPF_MEMSX) + insn_buf[cnt++] = BPF_RAW_INSN(BPF_ALU64 | BPF_MOV | BPF_X, + insn->dst_reg, insn->dst_reg, + size * 8, 0); new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) @@ -17755,11 +19007,15 @@ static int jit_subprogs(struct bpf_verifier_env *env) insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && - BPF_MODE(insn->code) == BPF_PROBE_MEM) + (BPF_MODE(insn->code) == BPF_PROBE_MEM || + BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) num_exentries++; } func[i]->aux->num_exentries = num_exentries; func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; + func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb; + if (!i) + func[i]->aux->exception_boundary = env->seen_exception; func[i] = bpf_int_jit_compile(func[i]); if (!func[i]->jited) { err = -ENOTSUPP; @@ -17799,7 +19055,8 @@ static int jit_subprogs(struct bpf_verifier_env *env) * the call instruction, as an index for this list */ func[i]->aux->func = func; - func[i]->aux->func_cnt = env->subprog_cnt; + func[i]->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt; + func[i]->aux->real_func_cnt = env->subprog_cnt; } for (i = 0; i < env->subprog_cnt; i++) { old_bpf_func = func[i]->bpf_func; @@ -17845,7 +19102,10 @@ static int jit_subprogs(struct bpf_verifier_env *env) prog->aux->extable = func[0]->aux->extable; prog->aux->num_exentries = func[0]->aux->num_exentries; prog->aux->func = func; - prog->aux->func_cnt = env->subprog_cnt; + prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt; + prog->aux->real_func_cnt = env->subprog_cnt; + prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func; + prog->aux->exception_boundary = func[0]->aux->exception_boundary; bpf_prog_jit_attempt_done(prog); return 0; out_free: @@ -18012,21 +19272,42 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn->imm = BPF_CALL_IMM(desc->addr); if (insn->off) return 0; - if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl]) { + if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] || + desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size; + if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && kptr_struct_meta) { + verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n", + insn_idx); + return -EFAULT; + } + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size); insn_buf[1] = addr[0]; insn_buf[2] = addr[1]; insn_buf[3] = *insn; *cnt = 4; } else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl] || + desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] || desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) { struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; + if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] && kptr_struct_meta) { + verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n", + insn_idx); + return -EFAULT; + } + + if (desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] && + !kptr_struct_meta) { + verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n", + insn_idx); + return -EFAULT; + } + insn_buf[0] = addr[0]; insn_buf[1] = addr[1]; insn_buf[2] = *insn; @@ -18034,6 +19315,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } else if (desc->func_id == special_kfunc_list[KF_bpf_list_push_back_impl] || desc->func_id == special_kfunc_list[KF_bpf_list_push_front_impl] || desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { + struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; int struct_meta_reg = BPF_REG_3; int node_offset_reg = BPF_REG_4; @@ -18043,6 +19325,12 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, node_offset_reg = BPF_REG_5; } + if (!kptr_struct_meta) { + verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n", + insn_idx); + return -EFAULT; + } + __fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg, node_offset_reg, insn, insn_buf, cnt); } else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || @@ -18053,6 +19341,33 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return 0; } +/* The function requires that first instruction in 'patch' is insnsi[prog->len - 1] */ +static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len) +{ + struct bpf_subprog_info *info = env->subprog_info; + int cnt = env->subprog_cnt; + struct bpf_prog *prog; + + /* We only reserve one slot for hidden subprogs in subprog_info. */ + if (env->hidden_subprog_cnt) { + verbose(env, "verifier internal error: only one hidden subprog supported\n"); + return -EFAULT; + } + /* We're not patching any existing instruction, just appending the new + * ones for the hidden subprog. Hence all of the adjustment operations + * in bpf_patch_insn_data are no-ops. + */ + prog = bpf_patch_insn_data(env, env->prog->len - 1, patch, len); + if (!prog) + return -ENOMEM; + env->prog = prog; + info[cnt + 1].start = info[cnt].start; + info[cnt].start = prog->len - len + 1; + env->subprog_cnt++; + env->hidden_subprog_cnt++; + return 0; +} + /* Do various post-verification rewrites in a single program pass. * These rewrites simplify JIT and interpreter implementations. */ @@ -18071,6 +19386,26 @@ static int do_misc_fixups(struct bpf_verifier_env *env) struct bpf_map *map_ptr; int i, ret, cnt, delta = 0; + if (env->seen_exception && !env->exception_callback_subprog) { + struct bpf_insn patch[] = { + env->prog->insnsi[insn_cnt - 1], + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }; + + ret = add_hidden_subprog(env, patch, ARRAY_SIZE(patch)); + if (ret < 0) + return ret; + prog = env->prog; + insn = prog->insnsi; + + env->exception_callback_subprog = env->subprog_cnt - 1; + /* Don't update insn_cnt, as add_hidden_subprog always appends insns */ + env->subprog_info[env->exception_callback_subprog].is_cb = true; + env->subprog_info[env->exception_callback_subprog].is_async_cb = true; + env->subprog_info[env->exception_callback_subprog].is_exception_cb = true; + } + for (i = 0; i < insn_cnt; i++, insn++) { /* Make divide-by-zero exceptions impossible. */ if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || @@ -18340,6 +19675,25 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto patch_call_imm; } + /* bpf_per_cpu_ptr() and bpf_this_cpu_ptr() */ + if (env->insn_aux_data[i + delta].call_with_percpu_alloc_ptr) { + /* patch with 'r1 = *(u64 *)(r1 + 0)' since for percpu data, + * bpf_mem_alloc() returns a ptr to the percpu data ptr. + */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0); + insn_buf[1] = *insn; + cnt = 2; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto patch_call_imm; + } + /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup * and other inlining handlers are currently limited to 64 bit * only. @@ -18749,7 +20103,7 @@ static void free_states(struct bpf_verifier_env *env) } } -static int do_check_common(struct bpf_verifier_env *env, int subprog) +static int do_check_common(struct bpf_verifier_env *env, int subprog, bool is_ex_cb) { bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_verifier_state *state; @@ -18780,7 +20134,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) regs = state->frame[state->curframe]->regs; if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) { - ret = btf_prepare_func_args(env, subprog, regs); + ret = btf_prepare_func_args(env, subprog, regs, is_ex_cb); if (ret) goto out; for (i = BPF_REG_1; i <= BPF_REG_5; i++) { @@ -18796,6 +20150,12 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) regs[i].id = ++env->id_gen; } } + if (is_ex_cb) { + state->frame[0]->in_exception_callback_fn = true; + env->subprog_info[subprog].is_cb = true; + env->subprog_info[subprog].is_async_cb = true; + env->subprog_info[subprog].is_exception_cb = true; + } } else { /* 1st arg to a function */ regs[BPF_REG_1].type = PTR_TO_CTX; @@ -18860,7 +20220,7 @@ static int do_check_subprogs(struct bpf_verifier_env *env) continue; env->insn_idx = env->subprog_info[i].start; WARN_ON_ONCE(env->insn_idx == 0); - ret = do_check_common(env, i); + ret = do_check_common(env, i, env->exception_callback_subprog == i); if (ret) { return ret; } else if (env->log.level & BPF_LOG_LEVEL) { @@ -18877,7 +20237,7 @@ static int do_check_main(struct bpf_verifier_env *env) int ret; env->insn_idx = 0; - ret = do_check_common(env, 0); + ret = do_check_common(env, 0, false); if (!ret) env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; return ret; @@ -19046,6 +20406,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, bpf_log(log, "Subprog %s doesn't exist\n", tname); return -EINVAL; } + if (aux->func && aux->func[subprog]->aux->exception_cb) { + bpf_log(log, + "%s programs cannot attach to exception callback\n", + prog_extension ? "Extension" : "FENTRY/FEXIT"); + return -EINVAL; + } conservative = aux->func_info_aux[subprog].unreliable; if (prog_extension) { if (conservative) { @@ -19375,6 +20741,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) if (!tr) return -ENOMEM; + if (tgt_prog && tgt_prog->aux->tail_call_reachable) + tr->flags = BPF_TRAMP_F_TAIL_CALL_CTX; + prog->aux->dst_trampoline = tr; return 0; } @@ -19470,6 +20839,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (!env->explored_states) goto skip_full_check; + ret = check_btf_info_early(env, attr, uattr); + if (ret < 0) + goto skip_full_check; + ret = add_subprog_and_kfunc(env); if (ret < 0) goto skip_full_check; |