diff options
Diffstat (limited to 'kernel/bpf/arraymap.c')
-rw-r--r-- | kernel/bpf/arraymap.c | 342 |
1 files changed, 310 insertions, 32 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index c72e0d8e1e65..f0d19bbb9211 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016,2017 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include <linux/bpf.h> #include <linux/btf.h> @@ -22,7 +14,7 @@ #include "map_in_map.h" #define ARRAY_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK) static void bpf_array_free_percpu(struct bpf_array *array) { @@ -63,9 +55,14 @@ int array_map_alloc_check(union bpf_attr *attr) if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size == 0 || attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags) || (percpu && numa_node != NUMA_NO_NODE)) return -EINVAL; + if (attr->map_type != BPF_MAP_TYPE_ARRAY && + attr->map_flags & BPF_F_MMAPABLE) + return -EINVAL; + if (attr->value_size > KMALLOC_MAX_SIZE) /* if value_size is bigger, the user space won't be able to * access the elements. @@ -82,6 +79,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) u32 elem_size, index_mask, max_entries; bool unpriv = !capable(CAP_SYS_ADMIN); u64 cost, array_size, mask64; + struct bpf_map_memory mem; struct bpf_array *array; elem_size = round_up(attr->value_size, 8); @@ -108,39 +106,58 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) } array_size = sizeof(*array); - if (percpu) + if (percpu) { array_size += (u64) max_entries * sizeof(void *); - else - array_size += (u64) max_entries * elem_size; + } else { + /* rely on vmalloc() to return page-aligned memory and + * ensure array->value is exactly page-aligned + */ + if (attr->map_flags & BPF_F_MMAPABLE) { + array_size = PAGE_ALIGN(array_size); + array_size += PAGE_ALIGN((u64) max_entries * elem_size); + } else { + array_size += (u64) max_entries * elem_size; + } + } /* make sure there is no u32 overflow later in round_up() */ cost = array_size; - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-ENOMEM); - if (percpu) { + if (percpu) cost += (u64)attr->max_entries * elem_size * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-ENOMEM); - } - cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - ret = bpf_map_precharge_memlock(cost); + ret = bpf_map_charge_init(&mem, cost); if (ret < 0) return ERR_PTR(ret); /* allocate all map elements and zero-initialize them */ - array = bpf_map_area_alloc(array_size, numa_node); - if (!array) + if (attr->map_flags & BPF_F_MMAPABLE) { + void *data; + + /* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */ + data = bpf_map_area_mmapable_alloc(array_size, numa_node); + if (!data) { + bpf_map_charge_finish(&mem); + return ERR_PTR(-ENOMEM); + } + array = data + PAGE_ALIGN(sizeof(struct bpf_array)) + - offsetof(struct bpf_array, value); + } else { + array = bpf_map_area_alloc(array_size, numa_node); + } + if (!array) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); + } array->index_mask = index_mask; array->map.unpriv_array = unpriv; /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); - array->map.pages = cost; + bpf_map_charge_move(&array->map.memory, &mem); array->elem_size = elem_size; if (percpu && bpf_array_alloc_percpu(array)) { + bpf_map_charge_finish(&array->map.memory); bpf_map_area_free(array); return ERR_PTR(-ENOMEM); } @@ -160,6 +177,36 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) return array->value + array->elem_size * (index & array->index_mask); } +static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, + u32 off) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + + if (map->max_entries != 1) + return -ENOTSUPP; + if (off >= map->value_size) + return -EINVAL; + + *imm = (unsigned long)array->value; + return 0; +} + +static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm, + u32 *off) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u64 base = (unsigned long)array->value; + u64 range = array->elem_size; + + if (map->max_entries != 1) + return -ENOTSUPP; + if (imm < base || imm >= base + range) + return -ENOENT; + + *off = imm - base; + return 0; +} + /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { @@ -329,6 +376,11 @@ static int array_map_delete_elem(struct bpf_map *map, void *key) return -EINVAL; } +static void *array_map_vmalloc_addr(struct bpf_array *array) +{ + return (void *)round_down((unsigned long)array, PAGE_SIZE); +} + /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void array_map_free(struct bpf_map *map) { @@ -344,7 +396,10 @@ static void array_map_free(struct bpf_map *map) if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) bpf_array_free_percpu(array); - bpf_map_area_free(array); + if (array->map.map_flags & BPF_F_MMAPABLE) + bpf_map_area_free(array_map_vmalloc_addr(array)); + else + bpf_map_area_free(array); } static void array_map_seq_show_elem(struct bpf_map *map, void *key, @@ -360,7 +415,8 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key, return; } - seq_printf(m, "%u: ", *(u32 *)key); + if (map->btf_key_type_id) + seq_printf(m, "%u: ", *(u32 *)key); btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); seq_puts(m, "\n"); @@ -397,6 +453,18 @@ static int array_map_check_btf(const struct bpf_map *map, { u32 int_data; + /* One exception for keyless BTF: .bss/.data/.rodata map */ + if (btf_type_is_void(key_type)) { + if (map->map_type != BPF_MAP_TYPE_ARRAY || + map->max_entries != 1) + return -EINVAL; + + if (BTF_INFO_KIND(value_type->info) != BTF_KIND_DATASEC) + return -EINVAL; + + return 0; + } + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) return -EINVAL; @@ -410,6 +478,17 @@ static int array_map_check_btf(const struct bpf_map *map, return 0; } +static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> PAGE_SHIFT; + + if (!(map->map_flags & BPF_F_MMAPABLE)) + return -EINVAL; + + return remap_vmalloc_range(vma, array_map_vmalloc_addr(array), pgoff); +} + const struct bpf_map_ops array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, @@ -419,6 +498,9 @@ const struct bpf_map_ops array_map_ops = { .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, .map_gen_lookup = array_map_gen_lookup, + .map_direct_value_addr = array_map_direct_value_addr, + .map_direct_value_meta = array_map_direct_value_meta, + .map_mmap = array_map_mmap, .map_seq_show_elem = array_map_seq_show_elem, .map_check_btf = array_map_check_btf, }; @@ -440,6 +522,9 @@ static int fd_array_map_alloc_check(union bpf_attr *attr) /* only file descriptors can be stored in this type of map */ if (attr->value_size != sizeof(u32)) return -EINVAL; + /* Program read-only/write-only not supported for special maps yet. */ + if (attr->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) + return -EINVAL; return array_map_alloc_check(attr); } @@ -501,10 +586,17 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, if (IS_ERR(new_ptr)) return PTR_ERR(new_ptr); - old_ptr = xchg(array->ptrs + index, new_ptr); + if (map->ops->map_poke_run) { + mutex_lock(&array->aux->poke_mutex); + old_ptr = xchg(array->ptrs + index, new_ptr); + map->ops->map_poke_run(map, index, old_ptr, new_ptr); + mutex_unlock(&array->aux->poke_mutex); + } else { + old_ptr = xchg(array->ptrs + index, new_ptr); + } + if (old_ptr) map->ops->map_fd_put_ptr(old_ptr); - return 0; } @@ -517,7 +609,15 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key) if (index >= array->map.max_entries) return -E2BIG; - old_ptr = xchg(array->ptrs + index, NULL); + if (map->ops->map_poke_run) { + mutex_lock(&array->aux->poke_mutex); + old_ptr = xchg(array->ptrs + index, NULL); + map->ops->map_poke_run(map, index, old_ptr, NULL); + mutex_unlock(&array->aux->poke_mutex); + } else { + old_ptr = xchg(array->ptrs + index, NULL); + } + if (old_ptr) { map->ops->map_fd_put_ptr(old_ptr); return 0; @@ -586,17 +686,195 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +struct prog_poke_elem { + struct list_head list; + struct bpf_prog_aux *aux; +}; + +static int prog_array_map_poke_track(struct bpf_map *map, + struct bpf_prog_aux *prog_aux) +{ + struct prog_poke_elem *elem; + struct bpf_array_aux *aux; + int ret = 0; + + aux = container_of(map, struct bpf_array, map)->aux; + mutex_lock(&aux->poke_mutex); + list_for_each_entry(elem, &aux->poke_progs, list) { + if (elem->aux == prog_aux) + goto out; + } + + elem = kmalloc(sizeof(*elem), GFP_KERNEL); + if (!elem) { + ret = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&elem->list); + /* We must track the program's aux info at this point in time + * since the program pointer itself may not be stable yet, see + * also comment in prog_array_map_poke_run(). + */ + elem->aux = prog_aux; + + list_add_tail(&elem->list, &aux->poke_progs); +out: + mutex_unlock(&aux->poke_mutex); + return ret; +} + +static void prog_array_map_poke_untrack(struct bpf_map *map, + struct bpf_prog_aux *prog_aux) +{ + struct prog_poke_elem *elem, *tmp; + struct bpf_array_aux *aux; + + aux = container_of(map, struct bpf_array, map)->aux; + mutex_lock(&aux->poke_mutex); + list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) { + if (elem->aux == prog_aux) { + list_del_init(&elem->list); + kfree(elem); + break; + } + } + mutex_unlock(&aux->poke_mutex); +} + +static void prog_array_map_poke_run(struct bpf_map *map, u32 key, + struct bpf_prog *old, + struct bpf_prog *new) +{ + struct prog_poke_elem *elem; + struct bpf_array_aux *aux; + + aux = container_of(map, struct bpf_array, map)->aux; + WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex)); + + list_for_each_entry(elem, &aux->poke_progs, list) { + struct bpf_jit_poke_descriptor *poke; + int i, ret; + + for (i = 0; i < elem->aux->size_poke_tab; i++) { + poke = &elem->aux->poke_tab[i]; + + /* Few things to be aware of: + * + * 1) We can only ever access aux in this context, but + * not aux->prog since it might not be stable yet and + * there could be danger of use after free otherwise. + * 2) Initially when we start tracking aux, the program + * is not JITed yet and also does not have a kallsyms + * entry. We skip these as poke->ip_stable is not + * active yet. The JIT will do the final fixup before + * setting it stable. The various poke->ip_stable are + * successively activated, so tail call updates can + * arrive from here while JIT is still finishing its + * final fixup for non-activated poke entries. + * 3) On program teardown, the program's kallsym entry gets + * removed out of RCU callback, but we can only untrack + * from sleepable context, therefore bpf_arch_text_poke() + * might not see that this is in BPF text section and + * bails out with -EINVAL. As these are unreachable since + * RCU grace period already passed, we simply skip them. + * 4) Also programs reaching refcount of zero while patching + * is in progress is okay since we're protected under + * poke_mutex and untrack the programs before the JIT + * buffer is freed. When we're still in the middle of + * patching and suddenly kallsyms entry of the program + * gets evicted, we just skip the rest which is fine due + * to point 3). + * 5) Any other error happening below from bpf_arch_text_poke() + * is a unexpected bug. + */ + if (!READ_ONCE(poke->ip_stable)) + continue; + if (poke->reason != BPF_POKE_REASON_TAIL_CALL) + continue; + if (poke->tail_call.map != map || + poke->tail_call.key != key) + continue; + + ret = bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP, + old ? (u8 *)old->bpf_func + + poke->adj_off : NULL, + new ? (u8 *)new->bpf_func + + poke->adj_off : NULL); + BUG_ON(ret < 0 && ret != -EINVAL); + } + } +} + +static void prog_array_map_clear_deferred(struct work_struct *work) +{ + struct bpf_map *map = container_of(work, struct bpf_array_aux, + work)->map; + bpf_fd_array_map_clear(map); + bpf_map_put(map); +} + +static void prog_array_map_clear(struct bpf_map *map) +{ + struct bpf_array_aux *aux = container_of(map, struct bpf_array, + map)->aux; + bpf_map_inc(map); + schedule_work(&aux->work); +} + +static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) +{ + struct bpf_array_aux *aux; + struct bpf_map *map; + + aux = kzalloc(sizeof(*aux), GFP_KERNEL); + if (!aux) + return ERR_PTR(-ENOMEM); + + INIT_WORK(&aux->work, prog_array_map_clear_deferred); + INIT_LIST_HEAD(&aux->poke_progs); + mutex_init(&aux->poke_mutex); + + map = array_map_alloc(attr); + if (IS_ERR(map)) { + kfree(aux); + return map; + } + + container_of(map, struct bpf_array, map)->aux = aux; + aux->map = map; + + return map; +} + +static void prog_array_map_free(struct bpf_map *map) +{ + struct prog_poke_elem *elem, *tmp; + struct bpf_array_aux *aux; + + aux = container_of(map, struct bpf_array, map)->aux; + list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) { + list_del_init(&elem->list); + kfree(elem); + } + kfree(aux); + fd_array_map_free(map); +} + const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, - .map_alloc = array_map_alloc, - .map_free = fd_array_map_free, + .map_alloc = prog_array_map_alloc, + .map_free = prog_array_map_free, + .map_poke_track = prog_array_map_poke_track, + .map_poke_untrack = prog_array_map_poke_untrack, + .map_poke_run = prog_array_map_poke_run, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, - .map_release_uref = bpf_fd_array_map_clear, + .map_release_uref = prog_array_map_clear, .map_seq_show_elem = prog_array_map_seq_show_elem, }; |