diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/btf.c | 2 | ||||
-rw-r--r-- | kernel/bpf/cgroup.c | 13 | ||||
-rw-r--r-- | kernel/bpf/memalloc.c | 94 | ||||
-rw-r--r-- | kernel/bpf/offload.c | 12 | ||||
-rw-r--r-- | kernel/bpf/queue_stack_maps.c | 21 | ||||
-rw-r--r-- | kernel/pid.c | 2 | ||||
-rw-r--r-- | kernel/sched/core.c | 2 | ||||
-rw-r--r-- | kernel/sched/idle.c | 1 | ||||
-rw-r--r-- | kernel/task_work.c | 1 | ||||
-rw-r--r-- | kernel/trace/bpf_trace.c | 20 | ||||
-rw-r--r-- | kernel/trace/ring_buffer.c | 28 |
11 files changed, 161 insertions, 35 deletions
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 1095bbe29859..8090d7fb11ef 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -8501,7 +8501,7 @@ bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, tname = btf_name_by_offset(btf, walk_type->name_off); ret = snprintf(safe_tname, sizeof(safe_tname), "%s%s", tname, suffix); - if (ret < 0) + if (ret >= sizeof(safe_tname)) return false; safe_id = btf_find_by_name_kind(btf, safe_tname, BTF_INFO_KIND(walk_type->info)); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 5b2741aa0d9b..03b3d4492980 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -785,7 +785,8 @@ found: * to descendants * @cgrp: The cgroup which descendants to traverse * @link: A link for which to replace BPF program - * @type: Type of attach operation + * @new_prog: &struct bpf_prog for the target BPF program with its refcnt + * incremented * * Must be called with cgroup_mutex held. */ @@ -1334,7 +1335,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * @sk: The socket sending or receiving traffic * @skb: The skb that is being sent or received - * @type: The type of program to be executed + * @atype: The type of program to be executed * * If no socket is passed, or the socket is not of type INET or INET6, * this function does nothing and returns 0. @@ -1424,7 +1425,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); /** * __cgroup_bpf_run_filter_sk() - Run a program on a sock * @sk: sock structure to manipulate - * @type: The type of program to be executed + * @atype: The type of program to be executed * * socket is passed is expected to be of type INET or INET6. * @@ -1449,7 +1450,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); * provided by user sockaddr * @sk: sock struct that will use sockaddr * @uaddr: sockaddr struct provided by user - * @type: The type of program to be executed + * @atype: The type of program to be executed * @t_ctx: Pointer to attach type specific context * @flags: Pointer to u32 which contains higher bits of BPF program * return value (OR'ed together). @@ -1496,7 +1497,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains * sk with connection information (IP addresses, etc.) May not contain * cgroup info if it is a req sock. - * @type: The type of program to be executed + * @atype: The type of program to be executed * * socket passed is expected to be of type INET or INET6. * @@ -1670,7 +1671,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { * @ppos: value-result argument: value is position at which read from or write * to sysctl is happening, result is new position if program overrode it, * initial value otherwise - * @type: type of program to be executed + * @atype: type of program to be executed * * Program is run when sysctl is being accessed, either read or written, and * can allow or deny such access. diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 9c49ae53deaf..cf1941516643 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -459,8 +459,7 @@ static void notrace irq_work_raise(struct bpf_mem_cache *c) * Typical case will be between 11K and 116K closer to 11K. * bpf progs can and should share bpf_mem_cache when possible. */ - -static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) +static void init_refill_work(struct bpf_mem_cache *c) { init_irq_work(&c->refill_work, bpf_mem_refill); if (c->unit_size <= 256) { @@ -476,7 +475,10 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) c->high_watermark = max(96 * 256 / c->unit_size, 3); } c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1); +} +static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) +{ /* To avoid consuming memory assume that 1st run of bpf * prog won't be doing more than 4 map_update_elem from * irq disabled region @@ -484,6 +486,31 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false); } +static int check_obj_size(struct bpf_mem_cache *c, unsigned int idx) +{ + struct llist_node *first; + unsigned int obj_size; + + /* For per-cpu allocator, the size of free objects in free list doesn't + * match with unit_size and now there is no way to get the size of + * per-cpu pointer saved in free object, so just skip the checking. + */ + if (c->percpu_size) + return 0; + + first = c->free_llist.first; + if (!first) + return 0; + + obj_size = ksize(first); + if (obj_size != c->unit_size) { + WARN_ONCE(1, "bpf_mem_cache[%u]: unexpected object size %u, expect %u\n", + idx, obj_size, c->unit_size); + return -EINVAL; + } + return 0; +} + /* When size != 0 bpf_mem_cache for each cpu. * This is typical bpf hash map use case when all elements have equal size. * @@ -494,10 +521,10 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) { static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; + int cpu, i, err, unit_size, percpu_size = 0; struct bpf_mem_caches *cc, __percpu *pcc; struct bpf_mem_cache *c, __percpu *pc; struct obj_cgroup *objcg = NULL; - int cpu, i, unit_size, percpu_size = 0; if (size) { pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL); @@ -521,6 +548,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) c->objcg = objcg; c->percpu_size = percpu_size; c->tgt = c; + init_refill_work(c); prefill_mem_cache(c, cpu); } ma->cache = pc; @@ -534,6 +562,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL); if (!pcc) return -ENOMEM; + err = 0; #ifdef CONFIG_MEMCG_KMEM objcg = get_obj_cgroup_from_current(); #endif @@ -544,11 +573,30 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) c->unit_size = sizes[i]; c->objcg = objcg; c->tgt = c; + + init_refill_work(c); + /* Another bpf_mem_cache will be used when allocating + * c->unit_size in bpf_mem_alloc(), so doesn't prefill + * for the bpf_mem_cache because these free objects will + * never be used. + */ + if (i != bpf_mem_cache_idx(c->unit_size)) + continue; prefill_mem_cache(c, cpu); + err = check_obj_size(c, i); + if (err) + goto out; } } + +out: ma->caches = pcc; - return 0; + /* refill_work is either zeroed or initialized, so it is safe to + * call irq_work_sync(). + */ + if (err) + bpf_mem_alloc_destroy(ma); + return err; } static void drain_mem_cache(struct bpf_mem_cache *c) @@ -916,3 +964,41 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) return !ret ? NULL : ret + LLIST_NODE_SZ; } + +/* Most of the logic is taken from setup_kmalloc_cache_index_table() */ +static __init int bpf_mem_cache_adjust_size(void) +{ + unsigned int size, index; + + /* Normally KMALLOC_MIN_SIZE is 8-bytes, but it can be + * up-to 256-bytes. + */ + size = KMALLOC_MIN_SIZE; + if (size <= 192) + index = size_index[(size - 1) / 8]; + else + index = fls(size - 1) - 1; + for (size = 8; size < KMALLOC_MIN_SIZE && size <= 192; size += 8) + size_index[(size - 1) / 8] = index; + + /* The minimal alignment is 64-bytes, so disable 96-bytes cache and + * use 128-bytes cache instead. + */ + if (KMALLOC_MIN_SIZE >= 64) { + index = size_index[(128 - 1) / 8]; + for (size = 64 + 8; size <= 96; size += 8) + size_index[(size - 1) / 8] = index; + } + + /* The minimal alignment is 128-bytes, so disable 192-bytes cache and + * use 256-bytes cache instead. + */ + if (KMALLOC_MIN_SIZE >= 128) { + index = fls(256 - 1) - 1; + for (size = 128 + 8; size <= 192; size += 8) + size_index[(size - 1) / 8] = index; + } + + return 0; +} +subsys_initcall(bpf_mem_cache_adjust_size); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 3e4f2ec1af06..87d6693d8233 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -199,12 +199,14 @@ static int __bpf_prog_dev_bound_init(struct bpf_prog *prog, struct net_device *n offload->netdev = netdev; ondev = bpf_offload_find_netdev(offload->netdev); + /* When program is offloaded require presence of "true" + * bpf_offload_netdev, avoid the one created for !ondev case below. + */ + if (bpf_prog_is_offloaded(prog->aux) && (!ondev || !ondev->offdev)) { + err = -EINVAL; + goto err_free; + } if (!ondev) { - if (bpf_prog_is_offloaded(prog->aux)) { - err = -EINVAL; - goto err_free; - } - /* When only binding to the device, explicitly * create an entry in the hashtable. */ diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 8d2ddcb7566b..d869f51ea93a 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -98,7 +98,12 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete) int err = 0; void *ptr; - raw_spin_lock_irqsave(&qs->lock, flags); + if (in_nmi()) { + if (!raw_spin_trylock_irqsave(&qs->lock, flags)) + return -EBUSY; + } else { + raw_spin_lock_irqsave(&qs->lock, flags); + } if (queue_stack_map_is_empty(qs)) { memset(value, 0, qs->map.value_size); @@ -128,7 +133,12 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete) void *ptr; u32 index; - raw_spin_lock_irqsave(&qs->lock, flags); + if (in_nmi()) { + if (!raw_spin_trylock_irqsave(&qs->lock, flags)) + return -EBUSY; + } else { + raw_spin_lock_irqsave(&qs->lock, flags); + } if (queue_stack_map_is_empty(qs)) { memset(value, 0, qs->map.value_size); @@ -193,7 +203,12 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value, if (flags & BPF_NOEXIST || flags > BPF_EXIST) return -EINVAL; - raw_spin_lock_irqsave(&qs->lock, irq_flags); + if (in_nmi()) { + if (!raw_spin_trylock_irqsave(&qs->lock, irq_flags)) + return -EBUSY; + } else { + raw_spin_lock_irqsave(&qs->lock, irq_flags); + } if (queue_stack_map_is_full(qs)) { if (!replace) { diff --git a/kernel/pid.c b/kernel/pid.c index fee14a4486a3..6500ef956f2f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -609,7 +609,7 @@ int pidfd_create(struct pid *pid, unsigned int flags) } /** - * pidfd_open() - Open new pid file descriptor. + * sys_pidfd_open() - Open new pid file descriptor. * * @pid: pid for which to retrieve a pidfd * @flags: flags to pass diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2299a5cfbfb9..802551e0009b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9269,7 +9269,7 @@ void __init init_idle(struct task_struct *idle, int cpu) * PF_KTHREAD should already be set at this point; regardless, make it * look like a proper per-CPU kthread. */ - idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY; + idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY; kthread_set_per_cpu(idle, cpu); #ifdef CONFIG_SMP diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 342f58a329f5..5007b25c5bc6 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -373,6 +373,7 @@ EXPORT_SYMBOL_GPL(play_idle_precise); void cpu_startup_entry(enum cpuhp_state state) { + current->flags |= PF_IDLE; arch_cpu_idle_prepare(); cpuhp_online_idle(state); while (1) diff --git a/kernel/task_work.c b/kernel/task_work.c index 065e1ef8fc8d..95a7e1b7f1da 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -78,6 +78,7 @@ int task_work_add(struct task_struct *task, struct callback_head *work, * task_work_cancel_match - cancel a pending work added by task_work_add() * @task: the task which should execute the work * @match: match function to call + * @data: data to be passed in to match function * * RETURNS: * The found work or NULL if not found. diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a7264b2c17ad..868008f56fec 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2853,6 +2853,17 @@ static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u3 return arr.mods_cnt; } +static int addrs_check_error_injection_list(unsigned long *addrs, u32 cnt) +{ + u32 i; + + for (i = 0; i < cnt; i++) { + if (!within_error_injection_list(addrs[i])) + return -EINVAL; + } + return 0; +} + int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_kprobe_multi_link *link = NULL; @@ -2930,6 +2941,11 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr goto error; } + if (prog->kprobe_override && addrs_check_error_injection_list(addrs, cnt)) { + err = -EINVAL; + goto error; + } + link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) { err = -ENOMEM; @@ -3207,8 +3223,10 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr rcu_read_lock(); task = get_pid_task(find_vpid(pid), PIDTYPE_PID); rcu_read_unlock(); - if (!task) + if (!task) { + err = -ESRCH; goto error_path_put; + } } err = -ENOMEM; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a1651edc48d5..28daf0ce95c5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -354,6 +354,11 @@ static void rb_init_page(struct buffer_data_page *bpage) local_set(&bpage->commit, 0); } +static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage) +{ + return local_read(&bpage->page->commit); +} + static void free_buffer_page(struct buffer_page *bpage) { free_page((unsigned long)bpage->page); @@ -2003,7 +2008,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) * Increment overrun to account for the lost events. */ local_add(page_entries, &cpu_buffer->overrun); - local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); + local_sub(rb_page_commit(to_remove_page), &cpu_buffer->entries_bytes); local_inc(&cpu_buffer->pages_lost); } @@ -2367,11 +2372,6 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->read); } -static __always_inline unsigned rb_page_commit(struct buffer_page *bpage) -{ - return local_read(&bpage->page->commit); -} - static struct ring_buffer_event * rb_iter_head_event(struct ring_buffer_iter *iter) { @@ -2517,7 +2517,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, * the counters. */ local_add(entries, &cpu_buffer->overrun); - local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); + local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes); local_inc(&cpu_buffer->pages_lost); /* @@ -2660,9 +2660,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, event = __rb_page_index(tail_page, tail); - /* account for padding bytes */ - local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); - /* * Save the original length to the meta data. * This will be used by the reader to add lost event @@ -2676,7 +2673,8 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, * write counter enough to allow another writer to slip * in on this page. * We put in a discarded commit instead, to make sure - * that this space is not used again. + * that this space is not used again, and this space will + * not be accounted into 'entries_bytes'. * * If we are less than the minimum size, we don't need to * worry about it. @@ -2701,6 +2699,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, /* time delta must be non zero */ event->time_delta = 1; + /* account for padding bytes */ + local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); + /* Make sure the padding is visible before the tail_page->write update */ smp_wmb(); @@ -4215,7 +4216,7 @@ u64 ring_buffer_oldest_event_ts(struct trace_buffer *buffer, int cpu) EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts); /** - * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer + * ring_buffer_bytes_cpu - get the number of bytes unconsumed in a cpu buffer * @buffer: The ring buffer * @cpu: The per CPU buffer to read from. */ @@ -4723,6 +4724,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) length = rb_event_length(event); cpu_buffer->reader_page->read += length; + cpu_buffer->read_bytes += length; } static void rb_advance_iter(struct ring_buffer_iter *iter) @@ -5816,7 +5818,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, } else { /* update the entry counter */ cpu_buffer->read += rb_page_entries(reader); - cpu_buffer->read_bytes += BUF_PAGE_SIZE; + cpu_buffer->read_bytes += rb_page_commit(reader); /* swap the pages */ rb_init_page(bpage); |