diff options
Diffstat (limited to 'kernel/bpf/memalloc.c')
-rw-r--r-- | kernel/bpf/memalloc.c | 116 |
1 files changed, 68 insertions, 48 deletions
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index cf1941516643..63b909d277d4 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -340,6 +340,7 @@ static void free_bulk(struct bpf_mem_cache *c) int cnt; WARN_ON_ONCE(tgt->unit_size != c->unit_size); + WARN_ON_ONCE(tgt->percpu_size != c->percpu_size); do { inc_active(c, &flags); @@ -365,6 +366,9 @@ static void __free_by_rcu(struct rcu_head *head) struct bpf_mem_cache *tgt = c->tgt; struct llist_node *llnode; + WARN_ON_ONCE(tgt->unit_size != c->unit_size); + WARN_ON_ONCE(tgt->percpu_size != c->percpu_size); + llnode = llist_del_all(&c->waiting_for_gp); if (!llnode) goto out; @@ -491,21 +495,17 @@ static int check_obj_size(struct bpf_mem_cache *c, unsigned int idx) struct llist_node *first; unsigned int obj_size; - /* For per-cpu allocator, the size of free objects in free list doesn't - * match with unit_size and now there is no way to get the size of - * per-cpu pointer saved in free object, so just skip the checking. 
- */ - if (c->percpu_size) - return 0; - first = c->free_llist.first; if (!first) return 0; - obj_size = ksize(first); + if (c->percpu_size) + obj_size = pcpu_alloc_size(((void **)first)[1]); + else + obj_size = ksize(first); if (obj_size != c->unit_size) { - WARN_ONCE(1, "bpf_mem_cache[%u]: unexpected object size %u, expect %u\n", - idx, obj_size, c->unit_size); + WARN_ONCE(1, "bpf_mem_cache[%u]: percpu %d, unexpected object size %u, expect %u\n", + idx, c->percpu_size, obj_size, c->unit_size); return -EINVAL; } return 0; @@ -526,15 +526,17 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) struct bpf_mem_cache *c, __percpu *pc; struct obj_cgroup *objcg = NULL; + /* room for llist_node and per-cpu pointer */ + if (percpu) + percpu_size = LLIST_NODE_SZ + sizeof(void *); + ma->percpu = percpu; + if (size) { pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL); if (!pc) return -ENOMEM; - if (percpu) - /* room for llist_node and per-cpu pointer */ - percpu_size = LLIST_NODE_SZ + sizeof(void *); - else + if (!percpu) size += LLIST_NODE_SZ; /* room for llist_node */ unit_size = size; @@ -555,10 +557,6 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) return 0; } - /* size == 0 && percpu is an invalid combination */ - if (WARN_ON_ONCE(percpu)) - return -EINVAL; - pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL); if (!pcc) return -ENOMEM; @@ -572,6 +570,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) c = &cc->cache[i]; c->unit_size = sizes[i]; c->objcg = objcg; + c->percpu_size = percpu_size; c->tgt = c; init_refill_work(c); @@ -782,12 +781,17 @@ static void notrace *unit_alloc(struct bpf_mem_cache *c) } } local_dec(&c->active); - local_irq_restore(flags); WARN_ON(cnt < 0); if (cnt < c->low_watermark) irq_work_raise(c); + /* Enable IRQ after the enqueue of irq work completes, so irq work + * will run after IRQ is enabled and free_llist may be refilled by + * irq work before other task 
preempts current task. + */ + local_irq_restore(flags); + return llnode; } @@ -823,11 +827,16 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr) llist_add(llnode, &c->free_llist_extra); } local_dec(&c->active); - local_irq_restore(flags); if (cnt > c->high_watermark) /* free few objects from current cpu into global kmalloc pool */ irq_work_raise(c); + /* Enable IRQ after irq_work_raise() completes, otherwise when current + * task is preempted by task which does unit_alloc(), unit_alloc() may + * return NULL unexpectedly because irq work is already pending but can + * not be triggered and free_llist can not be refilled timely. + */ + local_irq_restore(flags); } static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr) @@ -845,10 +854,10 @@ static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr) llist_add(llnode, &c->free_llist_extra_rcu); } local_dec(&c->active); - local_irq_restore(flags); if (!atomic_read(&c->call_rcu_in_progress)) irq_work_raise(c); + local_irq_restore(flags); } /* Called from BPF program or from sys_bpf syscall. @@ -870,6 +879,17 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size) return !ret ? 
NULL : ret + LLIST_NODE_SZ; } +static notrace int bpf_mem_free_idx(void *ptr, bool percpu) +{ + size_t size; + + if (percpu) + size = pcpu_alloc_size(*((void **)ptr)); + else + size = ksize(ptr - LLIST_NODE_SZ); + return bpf_mem_cache_idx(size); +} + void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) { int idx; @@ -877,7 +897,7 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) if (!ptr) return; - idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ)); + idx = bpf_mem_free_idx(ptr, ma->percpu); if (idx < 0) return; @@ -891,7 +911,7 @@ void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr) if (!ptr) return; - idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ)); + idx = bpf_mem_free_idx(ptr, ma->percpu); if (idx < 0) return; @@ -965,37 +985,37 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) return !ret ? NULL : ret + LLIST_NODE_SZ; } -/* Most of the logic is taken from setup_kmalloc_cache_index_table() */ +/* The alignment of dynamic per-cpu area is 8, so c->unit_size and the + * actual size of dynamic per-cpu area will always be matched and there is + * no need to adjust size_index for per-cpu allocation. However for the + * simplicity of the implementation, use a unified size_index for both + * kmalloc and per-cpu allocation. + */ static __init int bpf_mem_cache_adjust_size(void) { - unsigned int size, index; + unsigned int size; - /* Normally KMALLOC_MIN_SIZE is 8-bytes, but it can be - * up-to 256-bytes. + /* Adjust the indexes in size_index[] according to the object_size + * of underlying slab cache, so bpf_mem_alloc() will select a + * bpf_mem_cache with unit_size equal to the object_size of + * the underlying slab cache. + * + * The maximal value of KMALLOC_MIN_SIZE and __kmalloc_minalign() is + * 256-bytes, so only do adjustment for [8-bytes, 192-bytes]. 
*/ - size = KMALLOC_MIN_SIZE; - if (size <= 192) - index = size_index[(size - 1) / 8]; - else - index = fls(size - 1) - 1; - for (size = 8; size < KMALLOC_MIN_SIZE && size <= 192; size += 8) - size_index[(size - 1) / 8] = index; + for (size = 192; size >= 8; size -= 8) { + unsigned int kmalloc_size, index; - /* The minimal alignment is 64-bytes, so disable 96-bytes cache and - * use 128-bytes cache instead. - */ - if (KMALLOC_MIN_SIZE >= 64) { - index = size_index[(128 - 1) / 8]; - for (size = 64 + 8; size <= 96; size += 8) - size_index[(size - 1) / 8] = index; - } + kmalloc_size = kmalloc_size_roundup(size); + if (kmalloc_size == size) + continue; - /* The minimal alignment is 128-bytes, so disable 192-bytes cache and - * use 256-bytes cache instead. - */ - if (KMALLOC_MIN_SIZE >= 128) { - index = fls(256 - 1) - 1; - for (size = 128 + 8; size <= 192; size += 8) + if (kmalloc_size <= 192) + index = size_index[(kmalloc_size - 1) / 8]; + else + index = fls(kmalloc_size - 1) - 1; + /* Only overwrite if necessary */ + if (size_index[(size - 1) / 8] != index) size_index[(size - 1) / 8] = index; } |