summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.preempt2
-rw-r--r--kernel/Makefile7
-rw-r--r--kernel/audit.c62
-rw-r--r--kernel/audit.h10
-rw-r--r--kernel/audit_fsnotify.c6
-rw-r--r--kernel/audit_tree.c498
-rw-r--r--kernel/audit_watch.c6
-rw-r--r--kernel/auditsc.c150
-rw-r--r--kernel/bpf/arraymap.c1
-rw-r--r--kernel/bpf/btf.c778
-rw-r--r--kernel/bpf/core.c198
-rw-r--r--kernel/bpf/cpumap.c2
-rw-r--r--kernel/bpf/hashtab.c13
-rw-r--r--kernel/bpf/local_storage.c87
-rw-r--r--kernel/bpf/lpm_trie.c60
-rw-r--r--kernel/bpf/offload.c76
-rw-r--r--kernel/bpf/syscall.c160
-rw-r--r--kernel/bpf/verifier.c586
-rw-r--r--kernel/cgroup/cgroup-internal.h2
-rw-r--r--kernel/cgroup/cgroup-v1.c14
-rw-r--r--kernel/cgroup/cgroup.c108
-rw-r--r--kernel/cgroup/cpuset.c948
-rw-r--r--kernel/cgroup/debug.c4
-rw-r--r--kernel/dma/Kconfig14
-rw-r--r--kernel/dma/Makefile5
-rw-r--r--kernel/dma/debug.c259
-rw-r--r--kernel/dma/direct.c222
-rw-r--r--kernel/dma/dummy.c39
-rw-r--r--kernel/dma/mapping.c223
-rw-r--r--kernel/dma/remap.c256
-rw-r--r--kernel/dma/swiotlb.c253
-rw-r--r--kernel/dma/virt.c2
-rw-r--r--kernel/events/uprobes.c10
-rw-r--r--kernel/fork.c7
-rw-r--r--kernel/futex.c207
-rw-r--r--kernel/futex_compat.c202
-rw-r--r--kernel/kexec_core.c5
-rw-r--r--kernel/memremap.c103
-rw-r--r--kernel/module.c134
-rw-r--r--kernel/module_signing.c3
-rw-r--r--kernel/padata.c2
-rw-r--r--kernel/panic.c6
-rw-r--r--kernel/pid.c6
-rw-r--r--kernel/power/snapshot.c2
-rw-r--r--kernel/printk/printk.c131
-rw-r--r--kernel/resource.c15
-rw-r--r--kernel/signal.c143
-rw-r--r--kernel/sys_ni.c2
-rw-r--r--kernel/sysctl.c8
-rw-r--r--kernel/time/ntp.c10
-rw-r--r--kernel/time/time.c36
-rw-r--r--kernel/time/timekeeping.c12
-rw-r--r--kernel/trace/blktrace.c4
-rw-r--r--kernel/trace/bpf_trace.c99
54 files changed, 4487 insertions, 1711 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index cd1655122ec0..0fee5fe6c899 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -57,4 +57,4 @@ config PREEMPT
endchoice
config PREEMPT_COUNT
- bool \ No newline at end of file
+ bool
diff --git a/kernel/Makefile b/kernel/Makefile
index 7343b3a9bff0..cde93d54c571 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -49,9 +49,6 @@ obj-$(CONFIG_PROFILING) += profile.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
obj-$(CONFIG_FUTEX) += futex.o
-ifeq ($(CONFIG_COMPAT),y)
-obj-$(CONFIG_FUTEX) += futex_compat.o
-endif
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += smp.o
ifneq ($(CONFIG_SMP),y)
@@ -76,9 +73,7 @@ obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_SMP) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
-obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
-obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o
-obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
+obj-$(CONFIG_AUDITSYSCALL) += auditsc.o audit_watch.o audit_fsnotify.o audit_tree.o
obj-$(CONFIG_GCOV_KERNEL) += gcov/
obj-$(CONFIG_KCOV) += kcov.o
obj-$(CONFIG_KPROBES) += kprobes.o
diff --git a/kernel/audit.c b/kernel/audit.c
index 2a8058764aa6..632d36059556 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -60,7 +60,6 @@
#include <linux/mutex.h>
#include <linux/gfp.h>
#include <linux/pid.h>
-#include <linux/slab.h>
#include <linux/audit.h>
@@ -400,7 +399,7 @@ static int audit_log_config_change(char *function_name, u32 new, u32 old,
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (unlikely(!ab))
return rc;
- audit_log_format(ab, "%s=%u old=%u", function_name, new, old);
+ audit_log_format(ab, "%s=%u old=%u ", function_name, new, old);
audit_log_session_info(ab);
rc = audit_log_task_context(ab);
if (rc)
@@ -1067,7 +1066,7 @@ static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
*ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
if (unlikely(!*ab))
return;
- audit_log_format(*ab, "pid=%d uid=%u", pid, uid);
+ audit_log_format(*ab, "pid=%d uid=%u ", pid, uid);
audit_log_session_info(*ab);
audit_log_task_context(*ab);
}
@@ -1096,10 +1095,11 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature
if (audit_enabled == AUDIT_OFF)
return;
+
ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_FEATURE_CHANGE);
if (!ab)
return;
- audit_log_task_info(ab, current);
+ audit_log_task_info(ab);
audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
audit_feature_names[which], !!old_feature, !!new_feature,
!!old_lock, !!new_lock, res);
@@ -2042,7 +2042,7 @@ void audit_log_session_info(struct audit_buffer *ab)
unsigned int sessionid = audit_get_sessionid(current);
uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current));
- audit_log_format(ab, " auid=%u ses=%u", auid, sessionid);
+ audit_log_format(ab, "auid=%u ses=%u", auid, sessionid);
}
void audit_log_key(struct audit_buffer *ab, char *key)
@@ -2058,11 +2058,13 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
{
int i;
- audit_log_format(ab, " %s=", prefix);
- CAP_FOR_EACH_U32(i) {
- audit_log_format(ab, "%08x",
- cap->cap[CAP_LAST_U32 - i]);
+ if (cap_isclear(*cap)) {
+ audit_log_format(ab, " %s=0", prefix);
+ return;
}
+ audit_log_format(ab, " %s=", prefix);
+ CAP_FOR_EACH_U32(i)
+ audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]);
}
static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
@@ -2177,22 +2179,21 @@ void audit_log_name(struct audit_context *context, struct audit_names *n,
}
/* log the audit_names record type */
- audit_log_format(ab, " nametype=");
switch(n->type) {
case AUDIT_TYPE_NORMAL:
- audit_log_format(ab, "NORMAL");
+ audit_log_format(ab, " nametype=NORMAL");
break;
case AUDIT_TYPE_PARENT:
- audit_log_format(ab, "PARENT");
+ audit_log_format(ab, " nametype=PARENT");
break;
case AUDIT_TYPE_CHILD_DELETE:
- audit_log_format(ab, "DELETE");
+ audit_log_format(ab, " nametype=DELETE");
break;
case AUDIT_TYPE_CHILD_CREATE:
- audit_log_format(ab, "CREATE");
+ audit_log_format(ab, " nametype=CREATE");
break;
default:
- audit_log_format(ab, "UNKNOWN");
+ audit_log_format(ab, " nametype=UNKNOWN");
break;
}
@@ -2247,15 +2248,15 @@ out_null:
audit_log_format(ab, " exe=(null)");
}
-struct tty_struct *audit_get_tty(struct task_struct *tsk)
+struct tty_struct *audit_get_tty(void)
{
struct tty_struct *tty = NULL;
unsigned long flags;
- spin_lock_irqsave(&tsk->sighand->siglock, flags);
- if (tsk->signal)
- tty = tty_kref_get(tsk->signal->tty);
- spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+ spin_lock_irqsave(&current->sighand->siglock, flags);
+ if (current->signal)
+ tty = tty_kref_get(current->signal->tty);
+ spin_unlock_irqrestore(&current->sighand->siglock, flags);
return tty;
}
@@ -2264,25 +2265,24 @@ void audit_put_tty(struct tty_struct *tty)
tty_kref_put(tty);
}
-void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
+void audit_log_task_info(struct audit_buffer *ab)
{
const struct cred *cred;
- char comm[sizeof(tsk->comm)];
+ char comm[sizeof(current->comm)];
struct tty_struct *tty;
if (!ab)
return;
- /* tsk == current */
cred = current_cred();
- tty = audit_get_tty(tsk);
+ tty = audit_get_tty();
audit_log_format(ab,
" ppid=%d pid=%d auid=%u uid=%u gid=%u"
" euid=%u suid=%u fsuid=%u"
" egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
- task_ppid_nr(tsk),
- task_tgid_nr(tsk),
- from_kuid(&init_user_ns, audit_get_loginuid(tsk)),
+ task_ppid_nr(current),
+ task_tgid_nr(current),
+ from_kuid(&init_user_ns, audit_get_loginuid(current)),
from_kuid(&init_user_ns, cred->uid),
from_kgid(&init_user_ns, cred->gid),
from_kuid(&init_user_ns, cred->euid),
@@ -2292,11 +2292,11 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
from_kgid(&init_user_ns, cred->sgid),
from_kgid(&init_user_ns, cred->fsgid),
tty ? tty_name(tty) : "(none)",
- audit_get_sessionid(tsk));
+ audit_get_sessionid(current));
audit_put_tty(tty);
audit_log_format(ab, " comm=");
- audit_log_untrustedstring(ab, get_task_comm(comm, tsk));
- audit_log_d_path_exe(ab, tsk->mm);
+ audit_log_untrustedstring(ab, get_task_comm(comm, current));
+ audit_log_d_path_exe(ab, current->mm);
audit_log_task_context(ab);
}
EXPORT_SYMBOL(audit_log_task_info);
@@ -2317,7 +2317,7 @@ void audit_log_link_denied(const char *operation)
if (!ab)
return;
audit_log_format(ab, "op=%s", operation);
- audit_log_task_info(ab, current);
+ audit_log_task_info(ab);
audit_log_format(ab, " res=0");
audit_log_end(ab);
}
diff --git a/kernel/audit.h b/kernel/audit.h
index 214e14948370..91421679a168 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -210,6 +210,8 @@ struct audit_context {
extern bool audit_ever_enabled;
+extern void audit_log_session_info(struct audit_buffer *ab);
+
extern void audit_copy_inode(struct audit_names *name,
const struct dentry *dentry,
struct inode *inode);
@@ -262,11 +264,11 @@ extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
extern void audit_log_d_path_exe(struct audit_buffer *ab,
struct mm_struct *mm);
-extern struct tty_struct *audit_get_tty(struct task_struct *tsk);
+extern struct tty_struct *audit_get_tty(void);
extern void audit_put_tty(struct tty_struct *tty);
/* audit watch functions */
-#ifdef CONFIG_AUDIT_WATCH
+#ifdef CONFIG_AUDITSYSCALL
extern void audit_put_watch(struct audit_watch *watch);
extern void audit_get_watch(struct audit_watch *watch);
extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
@@ -299,9 +301,9 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark
#define audit_mark_compare(m, i, d) 0
#define audit_exe_compare(t, m) (-EINVAL)
#define audit_dupe_exe(n, o) (-EINVAL)
-#endif /* CONFIG_AUDIT_WATCH */
+#endif /* CONFIG_AUDITSYSCALL */
-#ifdef CONFIG_AUDIT_TREE
+#ifdef CONFIG_AUDITSYSCALL
extern struct audit_chunk *audit_tree_lookup(const struct inode *inode);
extern void audit_put_chunk(struct audit_chunk *chunk);
extern bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree);
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index fba78047fb37..cf4512a33675 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -130,10 +130,8 @@ static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, c
ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
if (unlikely(!ab))
return;
- audit_log_format(ab, "auid=%u ses=%u op=%s",
- from_kuid(&init_user_ns, audit_get_loginuid(current)),
- audit_get_sessionid(current), op);
- audit_log_format(ab, " path=");
+ audit_log_session_info(ab);
+ audit_log_format(ab, " op=%s path=", op);
audit_log_untrustedstring(ab, audit_mark->path);
audit_log_key(ab, rule->filterkey);
audit_log_format(ab, " list=%d res=1", rule->listnr);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index ea43181cde4a..d4af4d97f847 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -24,9 +24,9 @@ struct audit_tree {
struct audit_chunk {
struct list_head hash;
- struct fsnotify_mark mark;
+ unsigned long key;
+ struct fsnotify_mark *mark;
struct list_head trees; /* with root here */
- int dead;
int count;
atomic_long_t refs;
struct rcu_head head;
@@ -37,13 +37,25 @@ struct audit_chunk {
} owners[];
};
+struct audit_tree_mark {
+ struct fsnotify_mark mark;
+ struct audit_chunk *chunk;
+};
+
static LIST_HEAD(tree_list);
static LIST_HEAD(prune_list);
static struct task_struct *prune_thread;
/*
- * One struct chunk is attached to each inode of interest.
- * We replace struct chunk on tagging/untagging.
+ * One struct chunk is attached to each inode of interest through
+ * audit_tree_mark (fsnotify mark). We replace struct chunk on tagging /
+ * untagging, the mark is stable as long as there is chunk attached. The
+ * association between mark and chunk is protected by hash_lock and
+ * audit_tree_group->mark_mutex. Thus as long as we hold
+ * audit_tree_group->mark_mutex and check that the mark is alive by
+ * FSNOTIFY_MARK_FLAG_ATTACHED flag check, we are sure the mark points to
+ * the current chunk.
+ *
* Rules have pointer to struct audit_tree.
* Rules have struct list_head rlist forming a list of rules over
* the same tree.
@@ -62,8 +74,12 @@ static struct task_struct *prune_thread;
* tree is refcounted; one reference for "some rules on rules_list refer to
* it", one for each chunk with pointer to it.
*
- * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount
- * of watch contributes 1 to .refs).
+ * chunk is refcounted by embedded .refs. Mark associated with the chunk holds
+ * one chunk reference. This reference is dropped either when a mark is going
+ * to be freed (corresponding inode goes away) or when chunk attached to the
+ * mark gets replaced. This reference must be dropped using
+ * audit_mark_put_chunk() to make sure the reference is dropped only after RCU
+ * grace period as it protects RCU readers of the hash table.
*
* node.index allows to get from node.list to containing chunk.
* MSB of that sucker is stolen to mark taggings that we might have to
@@ -72,6 +88,7 @@ static struct task_struct *prune_thread;
*/
static struct fsnotify_group *audit_tree_group;
+static struct kmem_cache *audit_tree_mark_cachep __read_mostly;
static struct audit_tree *alloc_tree(const char *s)
{
@@ -131,12 +148,43 @@ static void __put_chunk(struct rcu_head *rcu)
audit_put_chunk(chunk);
}
-static void audit_tree_destroy_watch(struct fsnotify_mark *entry)
+/*
+ * Drop reference to the chunk that was held by the mark. This is the reference
+ * that gets dropped after we've removed the chunk from the hash table and we
+ * use it to make sure chunk cannot be freed before RCU grace period expires.
+ */
+static void audit_mark_put_chunk(struct audit_chunk *chunk)
{
- struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
call_rcu(&chunk->head, __put_chunk);
}
+static inline struct audit_tree_mark *audit_mark(struct fsnotify_mark *mark)
+{
+ return container_of(mark, struct audit_tree_mark, mark);
+}
+
+static struct audit_chunk *mark_chunk(struct fsnotify_mark *mark)
+{
+ return audit_mark(mark)->chunk;
+}
+
+static void audit_tree_destroy_watch(struct fsnotify_mark *mark)
+{
+ kmem_cache_free(audit_tree_mark_cachep, audit_mark(mark));
+}
+
+static struct fsnotify_mark *alloc_mark(void)
+{
+ struct audit_tree_mark *amark;
+
+ amark = kmem_cache_zalloc(audit_tree_mark_cachep, GFP_KERNEL);
+ if (!amark)
+ return NULL;
+ fsnotify_init_mark(&amark->mark, audit_tree_group);
+ amark->mark.mask = FS_IN_IGNORED;
+ return &amark->mark;
+}
+
static struct audit_chunk *alloc_chunk(int count)
{
struct audit_chunk *chunk;
@@ -156,8 +204,6 @@ static struct audit_chunk *alloc_chunk(int count)
INIT_LIST_HEAD(&chunk->owners[i].list);
chunk->owners[i].index = i;
}
- fsnotify_init_mark(&chunk->mark, audit_tree_group);
- chunk->mark.mask = FS_IN_IGNORED;
return chunk;
}
@@ -172,36 +218,25 @@ static unsigned long inode_to_key(const struct inode *inode)
return (unsigned long)&inode->i_fsnotify_marks;
}
-/*
- * Function to return search key in our hash from chunk. Key 0 is special and
- * should never be present in the hash.
- */
-static unsigned long chunk_to_key(struct audit_chunk *chunk)
-{
- /*
- * We have a reference to the mark so it should be attached to a
- * connector.
- */
- if (WARN_ON_ONCE(!chunk->mark.connector))
- return 0;
- return (unsigned long)chunk->mark.connector->obj;
-}
-
static inline struct list_head *chunk_hash(unsigned long key)
{
unsigned long n = key / L1_CACHE_BYTES;
return chunk_hash_heads + n % HASH_SIZE;
}
-/* hash_lock & entry->lock is held by caller */
+/* hash_lock & mark->group->mark_mutex is held by caller */
static void insert_hash(struct audit_chunk *chunk)
{
- unsigned long key = chunk_to_key(chunk);
struct list_head *list;
- if (!(chunk->mark.flags & FSNOTIFY_MARK_FLAG_ATTACHED))
- return;
- list = chunk_hash(key);
+ /*
+ * Make sure chunk is fully initialized before making it visible in the
+ * hash. Pairs with a data dependency barrier in READ_ONCE() in
+ * audit_tree_lookup().
+ */
+ smp_wmb();
+ WARN_ON_ONCE(!chunk->key);
+ list = chunk_hash(chunk->key);
list_add_rcu(&chunk->hash, list);
}
@@ -213,7 +248,11 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
struct audit_chunk *p;
list_for_each_entry_rcu(p, list, hash) {
- if (chunk_to_key(p) == key) {
+ /*
+ * We use a data dependency barrier in READ_ONCE() to make sure
+ * the chunk we see is fully initialized.
+ */
+ if (READ_ONCE(p->key) == key) {
atomic_long_inc(&p->refs);
return p;
}
@@ -239,137 +278,159 @@ static struct audit_chunk *find_chunk(struct node *p)
return container_of(p, struct audit_chunk, owners[0]);
}
-static void untag_chunk(struct node *p)
+static void replace_mark_chunk(struct fsnotify_mark *mark,
+ struct audit_chunk *chunk)
+{
+ struct audit_chunk *old;
+
+ assert_spin_locked(&hash_lock);
+ old = mark_chunk(mark);
+ audit_mark(mark)->chunk = chunk;
+ if (chunk)
+ chunk->mark = mark;
+ if (old)
+ old->mark = NULL;
+}
+
+static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old)
{
- struct audit_chunk *chunk = find_chunk(p);
- struct fsnotify_mark *entry = &chunk->mark;
- struct audit_chunk *new = NULL;
struct audit_tree *owner;
- int size = chunk->count - 1;
int i, j;
- fsnotify_get_mark(entry);
+ new->key = old->key;
+ list_splice_init(&old->trees, &new->trees);
+ list_for_each_entry(owner, &new->trees, same_root)
+ owner->root = new;
+ for (i = j = 0; j < old->count; i++, j++) {
+ if (!old->owners[j].owner) {
+ i--;
+ continue;
+ }
+ owner = old->owners[j].owner;
+ new->owners[i].owner = owner;
+ new->owners[i].index = old->owners[j].index - j + i;
+ if (!owner) /* result of earlier fallback */
+ continue;
+ get_tree(owner);
+ list_replace_init(&old->owners[j].list, &new->owners[i].list);
+ }
+ replace_mark_chunk(old->mark, new);
+ /*
+ * Make sure chunk is fully initialized before making it visible in the
+ * hash. Pairs with a data dependency barrier in READ_ONCE() in
+ * audit_tree_lookup().
+ */
+ smp_wmb();
+ list_replace_rcu(&old->hash, &new->hash);
+}
- spin_unlock(&hash_lock);
+static void remove_chunk_node(struct audit_chunk *chunk, struct node *p)
+{
+ struct audit_tree *owner = p->owner;
+
+ if (owner->root == chunk) {
+ list_del_init(&owner->same_root);
+ owner->root = NULL;
+ }
+ list_del_init(&p->list);
+ p->owner = NULL;
+ put_tree(owner);
+}
- if (size)
- new = alloc_chunk(size);
+static int chunk_count_trees(struct audit_chunk *chunk)
+{
+ int i;
+ int ret = 0;
- mutex_lock(&entry->group->mark_mutex);
- spin_lock(&entry->lock);
+ for (i = 0; i < chunk->count; i++)
+ if (chunk->owners[i].owner)
+ ret++;
+ return ret;
+}
+
+static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark)
+{
+ struct audit_chunk *new;
+ int size;
+
+ mutex_lock(&audit_tree_group->mark_mutex);
/*
- * mark_mutex protects mark from getting detached and thus also from
- * mark->connector->obj getting NULL.
+ * mark_mutex stabilizes chunk attached to the mark so we can check
+ * whether it didn't change while we've dropped hash_lock.
*/
- if (chunk->dead || !(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
- spin_unlock(&entry->lock);
- mutex_unlock(&entry->group->mark_mutex);
- if (new)
- fsnotify_put_mark(&new->mark);
- goto out;
- }
-
- owner = p->owner;
+ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) ||
+ mark_chunk(mark) != chunk)
+ goto out_mutex;
+ size = chunk_count_trees(chunk);
if (!size) {
- chunk->dead = 1;
spin_lock(&hash_lock);
list_del_init(&chunk->trees);
- if (owner->root == chunk)
- owner->root = NULL;
- list_del_init(&p->list);
list_del_rcu(&chunk->hash);
+ replace_mark_chunk(mark, NULL);
spin_unlock(&hash_lock);
- spin_unlock(&entry->lock);
- mutex_unlock(&entry->group->mark_mutex);
- fsnotify_destroy_mark(entry, audit_tree_group);
- goto out;
+ fsnotify_detach_mark(mark);
+ mutex_unlock(&audit_tree_group->mark_mutex);
+ audit_mark_put_chunk(chunk);
+ fsnotify_free_mark(mark);
+ return;
}
+ new = alloc_chunk(size);
if (!new)
- goto Fallback;
+ goto out_mutex;
- if (fsnotify_add_mark_locked(&new->mark, entry->connector->obj,
- FSNOTIFY_OBJ_TYPE_INODE, 1)) {
- fsnotify_put_mark(&new->mark);
- goto Fallback;
- }
-
- chunk->dead = 1;
spin_lock(&hash_lock);
- list_replace_init(&chunk->trees, &new->trees);
- if (owner->root == chunk) {
- list_del_init(&owner->same_root);
- owner->root = NULL;
- }
-
- for (i = j = 0; j <= size; i++, j++) {
- struct audit_tree *s;
- if (&chunk->owners[j] == p) {
- list_del_init(&p->list);
- i--;
- continue;
- }
- s = chunk->owners[j].owner;
- new->owners[i].owner = s;
- new->owners[i].index = chunk->owners[j].index - j + i;
- if (!s) /* result of earlier fallback */
- continue;
- get_tree(s);
- list_replace_init(&chunk->owners[j].list, &new->owners[i].list);
- }
-
- list_replace_rcu(&chunk->hash, &new->hash);
- list_for_each_entry(owner, &new->trees, same_root)
- owner->root = new;
- spin_unlock(&hash_lock);
- spin_unlock(&entry->lock);
- mutex_unlock(&entry->group->mark_mutex);
- fsnotify_destroy_mark(entry, audit_tree_group);
- fsnotify_put_mark(&new->mark); /* drop initial reference */
- goto out;
-
-Fallback:
- // do the best we can
- spin_lock(&hash_lock);
- if (owner->root == chunk) {
- list_del_init(&owner->same_root);
- owner->root = NULL;
- }
- list_del_init(&p->list);
- p->owner = NULL;
- put_tree(owner);
+ /*
+ * This has to go last when updating chunk as once replace_chunk() is
+ * called, new RCU readers can see the new chunk.
+ */
+ replace_chunk(new, chunk);
spin_unlock(&hash_lock);
- spin_unlock(&entry->lock);
- mutex_unlock(&entry->group->mark_mutex);
-out:
- fsnotify_put_mark(entry);
- spin_lock(&hash_lock);
+ mutex_unlock(&audit_tree_group->mark_mutex);
+ audit_mark_put_chunk(chunk);
+ return;
+
+out_mutex:
+ mutex_unlock(&audit_tree_group->mark_mutex);
}
+/* Call with group->mark_mutex held, releases it */
static int create_chunk(struct inode *inode, struct audit_tree *tree)
{
- struct fsnotify_mark *entry;
+ struct fsnotify_mark *mark;
struct audit_chunk *chunk = alloc_chunk(1);
- if (!chunk)
+
+ if (!chunk) {
+ mutex_unlock(&audit_tree_group->mark_mutex);
return -ENOMEM;
+ }
- entry = &chunk->mark;
- if (fsnotify_add_inode_mark(entry, inode, 0)) {
- fsnotify_put_mark(entry);
+ mark = alloc_mark();
+ if (!mark) {
+ mutex_unlock(&audit_tree_group->mark_mutex);
+ kfree(chunk);
+ return -ENOMEM;
+ }
+
+ if (fsnotify_add_inode_mark_locked(mark, inode, 0)) {
+ mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_put_mark(mark);
+ kfree(chunk);
return -ENOSPC;
}
- spin_lock(&entry->lock);
spin_lock(&hash_lock);
if (tree->goner) {
spin_unlock(&hash_lock);
- chunk->dead = 1;
- spin_unlock(&entry->lock);
- fsnotify_destroy_mark(entry, audit_tree_group);
- fsnotify_put_mark(entry);
+ fsnotify_detach_mark(mark);
+ mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_free_mark(mark);
+ fsnotify_put_mark(mark);
+ kfree(chunk);
return 0;
}
+ replace_mark_chunk(mark, chunk);
chunk->owners[0].index = (1U << 31);
chunk->owners[0].owner = tree;
get_tree(tree);
@@ -378,35 +439,49 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
tree->root = chunk;
list_add(&tree->same_root, &chunk->trees);
}
+ chunk->key = inode_to_key(inode);
+ /*
+ * Inserting into the hash table has to go last as once we do that RCU
+ * readers can see the chunk.
+ */
insert_hash(chunk);
spin_unlock(&hash_lock);
- spin_unlock(&entry->lock);
- fsnotify_put_mark(entry); /* drop initial reference */
+ mutex_unlock(&audit_tree_group->mark_mutex);
+ /*
+ * Drop our initial reference. When mark we point to is getting freed,
+ * we get notification through ->freeing_mark callback and cleanup
+ * chunk pointing to this mark.
+ */
+ fsnotify_put_mark(mark);
return 0;
}
/* the first tagged inode becomes root of tree */
static int tag_chunk(struct inode *inode, struct audit_tree *tree)
{
- struct fsnotify_mark *old_entry, *chunk_entry;
- struct audit_tree *owner;
+ struct fsnotify_mark *mark;
struct audit_chunk *chunk, *old;
struct node *p;
int n;
- old_entry = fsnotify_find_mark(&inode->i_fsnotify_marks,
- audit_tree_group);
- if (!old_entry)
+ mutex_lock(&audit_tree_group->mark_mutex);
+ mark = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_tree_group);
+ if (!mark)
return create_chunk(inode, tree);
- old = container_of(old_entry, struct audit_chunk, mark);
-
+ /*
+ * Found mark is guaranteed to be attached and mark_mutex protects mark
+ * from getting detached and thus it makes sure there is chunk attached
+ * to the mark.
+ */
/* are we already there? */
spin_lock(&hash_lock);
+ old = mark_chunk(mark);
for (n = 0; n < old->count; n++) {
if (old->owners[n].owner == tree) {
spin_unlock(&hash_lock);
- fsnotify_put_mark(old_entry);
+ mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_put_mark(mark);
return 0;
}
}
@@ -414,83 +489,38 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
chunk = alloc_chunk(old->count + 1);
if (!chunk) {
- fsnotify_put_mark(old_entry);
+ mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_put_mark(mark);
return -ENOMEM;
}
- chunk_entry = &chunk->mark;
-
- mutex_lock(&old_entry->group->mark_mutex);
- spin_lock(&old_entry->lock);
- /*
- * mark_mutex protects mark from getting detached and thus also from
- * mark->connector->obj getting NULL.
- */
- if (!(old_entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
- /* old_entry is being shot, lets just lie */
- spin_unlock(&old_entry->lock);
- mutex_unlock(&old_entry->group->mark_mutex);
- fsnotify_put_mark(old_entry);
- fsnotify_put_mark(&chunk->mark);
- return -ENOENT;
- }
-
- if (fsnotify_add_mark_locked(chunk_entry, old_entry->connector->obj,
- FSNOTIFY_OBJ_TYPE_INODE, 1)) {
- spin_unlock(&old_entry->lock);
- mutex_unlock(&old_entry->group->mark_mutex);
- fsnotify_put_mark(chunk_entry);
- fsnotify_put_mark(old_entry);
- return -ENOSPC;
- }
-
- /* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */
- spin_lock(&chunk_entry->lock);
spin_lock(&hash_lock);
-
- /* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */
if (tree->goner) {
spin_unlock(&hash_lock);
- chunk->dead = 1;
- spin_unlock(&chunk_entry->lock);
- spin_unlock(&old_entry->lock);
- mutex_unlock(&old_entry->group->mark_mutex);
-
- fsnotify_destroy_mark(chunk_entry, audit_tree_group);
-
- fsnotify_put_mark(chunk_entry);
- fsnotify_put_mark(old_entry);
+ mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_put_mark(mark);
+ kfree(chunk);
return 0;
}
- list_replace_init(&old->trees, &chunk->trees);
- for (n = 0, p = chunk->owners; n < old->count; n++, p++) {
- struct audit_tree *s = old->owners[n].owner;
- p->owner = s;
- p->index = old->owners[n].index;
- if (!s) /* result of fallback in untag */
- continue;
- get_tree(s);
- list_replace_init(&old->owners[n].list, &p->list);
- }
+ p = &chunk->owners[chunk->count - 1];
p->index = (chunk->count - 1) | (1U<<31);
p->owner = tree;
get_tree(tree);
list_add(&p->list, &tree->chunks);
- list_replace_rcu(&old->hash, &chunk->hash);
- list_for_each_entry(owner, &chunk->trees, same_root)
- owner->root = chunk;
- old->dead = 1;
if (!tree->root) {
tree->root = chunk;
list_add(&tree->same_root, &chunk->trees);
}
+ /*
+ * This has to go last when updating chunk as once replace_chunk() is
+ * called, new RCU readers can see the new chunk.
+ */
+ replace_chunk(chunk, old);
spin_unlock(&hash_lock);
- spin_unlock(&chunk_entry->lock);
- spin_unlock(&old_entry->lock);
- mutex_unlock(&old_entry->group->mark_mutex);
- fsnotify_destroy_mark(old_entry, audit_tree_group);
- fsnotify_put_mark(chunk_entry); /* drop initial reference */
- fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
+ mutex_unlock(&audit_tree_group->mark_mutex);
+ fsnotify_put_mark(mark); /* pair to fsnotify_find_mark */
+ audit_mark_put_chunk(old);
+
return 0;
}
@@ -503,8 +533,7 @@ static void audit_tree_log_remove_rule(struct audit_krule *rule)
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (unlikely(!ab))
return;
- audit_log_format(ab, "op=remove_rule");
- audit_log_format(ab, " dir=");
+ audit_log_format(ab, "op=remove_rule dir=");
audit_log_untrustedstring(ab, rule->tree->pathname);
audit_log_key(ab, rule->filterkey);
audit_log_format(ab, " list=%d res=1", rule->listnr);
@@ -534,22 +563,48 @@ static void kill_rules(struct audit_tree *tree)
}
/*
- * finish killing struct audit_tree
+ * Remove tree from chunks. If 'tagged' is set, remove tree only from tagged
+ * chunks. The function expects tagged chunks are all at the beginning of the
+ * chunks list.
*/
-static void prune_one(struct audit_tree *victim)
+static void prune_tree_chunks(struct audit_tree *victim, bool tagged)
{
spin_lock(&hash_lock);
while (!list_empty(&victim->chunks)) {
struct node *p;
+ struct audit_chunk *chunk;
+ struct fsnotify_mark *mark;
+
+ p = list_first_entry(&victim->chunks, struct node, list);
+ /* have we run out of marked? */
+ if (tagged && !(p->index & (1U<<31)))
+ break;
+ chunk = find_chunk(p);
+ mark = chunk->mark;
+ remove_chunk_node(chunk, p);
+ /* Racing with audit_tree_freeing_mark()? */
+ if (!mark)
+ continue;
+ fsnotify_get_mark(mark);
+ spin_unlock(&hash_lock);
- p = list_entry(victim->chunks.next, struct node, list);
+ untag_chunk(chunk, mark);
+ fsnotify_put_mark(mark);
- untag_chunk(p);
+ spin_lock(&hash_lock);
}
spin_unlock(&hash_lock);
put_tree(victim);
}
+/*
+ * finish killing struct audit_tree
+ */
+static void prune_one(struct audit_tree *victim)
+{
+ prune_tree_chunks(victim, false);
+}
+
/* trim the uncommitted chunks from tree */
static void trim_marked(struct audit_tree *tree)
@@ -569,18 +624,11 @@ static void trim_marked(struct audit_tree *tree)
list_add(p, &tree->chunks);
}
}
+ spin_unlock(&hash_lock);
- while (!list_empty(&tree->chunks)) {
- struct node *node;
-
- node = list_entry(tree->chunks.next, struct node, list);
-
- /* have we run out of marked? */
- if (!(node->index & (1U<<31)))
- break;
+ prune_tree_chunks(tree, true);
- untag_chunk(node);
- }
+ spin_lock(&hash_lock);
if (!tree->root && !tree->goner) {
tree->goner = 1;
spin_unlock(&hash_lock);
@@ -661,7 +709,7 @@ void audit_trim_trees(void)
/* this could be NULL if the watch is dying else where... */
node->index |= 1U<<31;
if (iterate_mounts(compare_root,
- (void *)chunk_to_key(chunk),
+ (void *)(chunk->key),
root_mnt))
node->index &= ~(1U<<31);
}
@@ -959,10 +1007,6 @@ static void evict_chunk(struct audit_chunk *chunk)
int need_prune = 0;
int n;
- if (chunk->dead)
- return;
-
- chunk->dead = 1;
mutex_lock(&audit_filter_mutex);
spin_lock(&hash_lock);
while (!list_empty(&chunk->trees)) {
@@ -999,17 +1043,27 @@ static int audit_tree_handle_event(struct fsnotify_group *group,
return 0;
}
-static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
+static void audit_tree_freeing_mark(struct fsnotify_mark *mark,
+ struct fsnotify_group *group)
{
- struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
+ struct audit_chunk *chunk;
- evict_chunk(chunk);
+ mutex_lock(&mark->group->mark_mutex);
+ spin_lock(&hash_lock);
+ chunk = mark_chunk(mark);
+ replace_mark_chunk(mark, NULL);
+ spin_unlock(&hash_lock);
+ mutex_unlock(&mark->group->mark_mutex);
+ if (chunk) {
+ evict_chunk(chunk);
+ audit_mark_put_chunk(chunk);
+ }
/*
* We are guaranteed to have at least one reference to the mark from
* either the inode or the caller of fsnotify_destroy_mark().
*/
- BUG_ON(refcount_read(&entry->refcnt) < 1);
+ BUG_ON(refcount_read(&mark->refcnt) < 1);
}
static const struct fsnotify_ops audit_tree_ops = {
@@ -1022,6 +1076,8 @@ static int __init audit_tree_init(void)
{
int i;
+ audit_tree_mark_cachep = KMEM_CACHE(audit_tree_mark, SLAB_PANIC);
+
audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
if (IS_ERR(audit_tree_group))
audit_panic("cannot initialize fsnotify group for rectree watches");
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 787c7afdf829..20ef9ba134b0 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -245,10 +245,8 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
if (!ab)
return;
- audit_log_format(ab, "auid=%u ses=%u op=%s",
- from_kuid(&init_user_ns, audit_get_loginuid(current)),
- audit_get_sessionid(current), op);
- audit_log_format(ab, " path=");
+ audit_log_session_info(ab);
+ audit_log_format(ab, "op=%s path=", op);
audit_log_untrustedstring(ab, w->path);
audit_log_key(ab, r->filterkey);
audit_log_format(ab, " list=%d res=1", r->listnr);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b2d1f043f17f..6593a5207fb0 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -200,7 +200,6 @@ static int audit_match_filetype(struct audit_context *ctx, int val)
* References in it _are_ dropped - at the same time we free/drop aux stuff.
*/
-#ifdef CONFIG_AUDIT_TREE
static void audit_set_auditable(struct audit_context *ctx)
{
if (!ctx->prio) {
@@ -245,12 +244,10 @@ static int grow_tree_refs(struct audit_context *ctx)
ctx->tree_count = 31;
return 1;
}
-#endif
static void unroll_tree_refs(struct audit_context *ctx,
struct audit_tree_refs *p, int count)
{
-#ifdef CONFIG_AUDIT_TREE
struct audit_tree_refs *q;
int n;
if (!p) {
@@ -274,7 +271,6 @@ static void unroll_tree_refs(struct audit_context *ctx,
}
ctx->trees = p;
ctx->tree_count = count;
-#endif
}
static void free_tree_refs(struct audit_context *ctx)
@@ -288,7 +284,6 @@ static void free_tree_refs(struct audit_context *ctx)
static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
{
-#ifdef CONFIG_AUDIT_TREE
struct audit_tree_refs *p;
int n;
if (!tree)
@@ -305,7 +300,6 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
if (audit_tree_match(p->c[n], tree))
return 1;
}
-#endif
return 0;
}
@@ -836,44 +830,6 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
rcu_read_unlock();
}
-/* Transfer the audit context pointer to the caller, clearing it in the tsk's struct */
-static inline struct audit_context *audit_take_context(struct task_struct *tsk,
- int return_valid,
- long return_code)
-{
- struct audit_context *context = tsk->audit_context;
-
- if (!context)
- return NULL;
- context->return_valid = return_valid;
-
- /*
- * we need to fix up the return code in the audit logs if the actual
- * return codes are later going to be fixed up by the arch specific
- * signal handlers
- *
- * This is actually a test for:
- * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) ||
- * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK)
- *
- * but is faster than a bunch of ||
- */
- if (unlikely(return_code <= -ERESTARTSYS) &&
- (return_code >= -ERESTART_RESTARTBLOCK) &&
- (return_code != -ENOIOCTLCMD))
- context->return_code = -EINTR;
- else
- context->return_code = return_code;
-
- if (context->in_syscall && !context->dummy) {
- audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
- audit_filter_inodes(tsk, context);
- }
-
- audit_set_context(tsk, NULL);
- return context;
-}
-
static inline void audit_proctitle_free(struct audit_context *context)
{
kfree(context->proctitle.value);
@@ -1107,7 +1063,7 @@ static void audit_log_execve_info(struct audit_context *context,
}
/* write as much as we can to the audit log */
- if (len_buf > 0) {
+ if (len_buf >= 0) {
/* NOTE: some magic numbers here - basically if we
* can't fit a reasonable amount of data into the
* existing audit buffer, flush it and start with
@@ -1302,15 +1258,18 @@ static inline int audit_proctitle_rtrim(char *proctitle, int len)
return len;
}
-static void audit_log_proctitle(struct task_struct *tsk,
- struct audit_context *context)
+static void audit_log_proctitle(void)
{
int res;
char *buf;
char *msg = "(null)";
int len = strlen(msg);
+ struct audit_context *context = audit_context();
struct audit_buffer *ab;
+ if (!context || context->dummy)
+ return;
+
ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE);
if (!ab)
return; /* audit_panic or being filtered */
@@ -1323,7 +1282,7 @@ static void audit_log_proctitle(struct task_struct *tsk,
if (!buf)
goto out;
/* Historically called this from procfs naming */
- res = get_cmdline(tsk, buf, MAX_PROCTITLE_AUDIT_LEN);
+ res = get_cmdline(current, buf, MAX_PROCTITLE_AUDIT_LEN);
if (res == 0) {
kfree(buf);
goto out;
@@ -1343,15 +1302,15 @@ out:
audit_log_end(ab);
}
-static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
+static void audit_log_exit(void)
{
int i, call_panic = 0;
+ struct audit_context *context = audit_context();
struct audit_buffer *ab;
struct audit_aux_data *aux;
struct audit_names *n;
- /* tsk == current */
- context->personality = tsk->personality;
+ context->personality = current->personality;
ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
if (!ab)
@@ -1373,7 +1332,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
context->argv[3],
context->name_count);
- audit_log_task_info(ab, tsk);
+ audit_log_task_info(ab);
audit_log_key(ab, context->filterkey);
audit_log_end(ab);
@@ -1462,7 +1421,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
audit_log_name(context, n, NULL, i++, &call_panic);
}
- audit_log_proctitle(tsk, context);
+ audit_log_proctitle();
/* Send end of event record to help user space know we are finished */
ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
@@ -1480,22 +1439,31 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
*/
void __audit_free(struct task_struct *tsk)
{
- struct audit_context *context;
+ struct audit_context *context = tsk->audit_context;
- context = audit_take_context(tsk, 0, 0);
if (!context)
return;
- /* Check for system calls that do not go through the exit
- * function (e.g., exit_group), then free context block.
- * We use GFP_ATOMIC here because we might be doing this
- * in the context of the idle thread */
- /* that can happen only if we are called from do_exit() */
- if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
- audit_log_exit(context, tsk);
+ /* We are called either by do_exit() or the fork() error handling code;
+ * in the former case tsk == current and in the latter tsk is a
+ * random task_struct that doesn't doesn't have any meaningful data we
+ * need to log via audit_log_exit().
+ */
+ if (tsk == current && !context->dummy && context->in_syscall) {
+ context->return_valid = 0;
+ context->return_code = 0;
+
+ audit_filter_syscall(tsk, context,
+ &audit_filter_list[AUDIT_FILTER_EXIT]);
+ audit_filter_inodes(tsk, context);
+ if (context->current_state == AUDIT_RECORD_CONTEXT)
+ audit_log_exit();
+ }
+
if (!list_empty(&context->killed_trees))
audit_kill_trees(&context->killed_trees);
+ audit_set_context(tsk, NULL);
audit_free_context(context);
}
@@ -1565,17 +1533,40 @@ void __audit_syscall_exit(int success, long return_code)
{
struct audit_context *context;
- if (success)
- success = AUDITSC_SUCCESS;
- else
- success = AUDITSC_FAILURE;
-
- context = audit_take_context(current, success, return_code);
+ context = audit_context();
if (!context)
return;
- if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
- audit_log_exit(context, current);
+ if (!context->dummy && context->in_syscall) {
+ if (success)
+ context->return_valid = AUDITSC_SUCCESS;
+ else
+ context->return_valid = AUDITSC_FAILURE;
+
+ /*
+ * we need to fix up the return code in the audit logs if the
+ * actual return codes are later going to be fixed up by the
+ * arch specific signal handlers
+ *
+ * This is actually a test for:
+ * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) ||
+ * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK)
+ *
+ * but is faster than a bunch of ||
+ */
+ if (unlikely(return_code <= -ERESTARTSYS) &&
+ (return_code >= -ERESTART_RESTARTBLOCK) &&
+ (return_code != -ENOIOCTLCMD))
+ context->return_code = -EINTR;
+ else
+ context->return_code = return_code;
+
+ audit_filter_syscall(current, context,
+ &audit_filter_list[AUDIT_FILTER_EXIT]);
+ audit_filter_inodes(current, context);
+ if (context->current_state == AUDIT_RECORD_CONTEXT)
+ audit_log_exit();
+ }
context->in_syscall = 0;
context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
@@ -1597,12 +1588,10 @@ void __audit_syscall_exit(int success, long return_code)
kfree(context->filterkey);
context->filterkey = NULL;
}
- audit_set_context(current, context);
}
static inline void handle_one(const struct inode *inode)
{
-#ifdef CONFIG_AUDIT_TREE
struct audit_context *context;
struct audit_tree_refs *p;
struct audit_chunk *chunk;
@@ -1627,12 +1616,10 @@ static inline void handle_one(const struct inode *inode)
return;
}
put_tree_ref(context, chunk);
-#endif
}
static void handle_path(const struct dentry *dentry)
{
-#ifdef CONFIG_AUDIT_TREE
struct audit_context *context;
struct audit_tree_refs *p;
const struct dentry *d, *parent;
@@ -1685,7 +1672,6 @@ retry:
return;
}
rcu_read_unlock();
-#endif
}
static struct audit_names *audit_alloc_name(struct audit_context *context,
@@ -2035,7 +2021,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
uid = from_kuid(&init_user_ns, task_uid(current));
oldloginuid = from_kuid(&init_user_ns, koldloginuid);
loginuid = from_kuid(&init_user_ns, kloginuid),
- tty = audit_get_tty(current);
+ tty = audit_get_tty();
audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid);
audit_log_task_context(ab);
@@ -2056,7 +2042,6 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
*/
int audit_set_loginuid(kuid_t loginuid)
{
- struct task_struct *task = current;
unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET;
kuid_t oldloginuid;
int rc;
@@ -2075,8 +2060,8 @@ int audit_set_loginuid(kuid_t loginuid)
sessionid = (unsigned int)atomic_inc_return(&session_id);
}
- task->sessionid = sessionid;
- task->loginuid = loginuid;
+ current->sessionid = sessionid;
+ current->loginuid = loginuid;
out:
audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
return rc;
@@ -2513,10 +2498,9 @@ void audit_seccomp_actions_logged(const char *names, const char *old_names,
if (unlikely(!ab))
return;
- audit_log_format(ab, "op=seccomp-logging");
- audit_log_format(ab, " actions=%s", names);
- audit_log_format(ab, " old-actions=%s", old_names);
- audit_log_format(ab, " res=%d", res);
+ audit_log_format(ab,
+ "op=seccomp-logging actions=%s old-actions=%s res=%d",
+ names, old_names, res);
audit_log_end(ab);
}
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 24583da9ffd1..25632a75d630 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -382,6 +382,7 @@ static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key,
}
static int array_map_check_btf(const struct bpf_map *map,
+ const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
{
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 4da543d6bea2..715f9fcf4712 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -164,7 +164,7 @@
#define BITS_ROUNDUP_BYTES(bits) \
(BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits))
-#define BTF_INFO_MASK 0x0f00ffff
+#define BTF_INFO_MASK 0x8f00ffff
#define BTF_INT_MASK 0x0fffffff
#define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE)
#define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET)
@@ -260,6 +260,8 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_VOLATILE] = "VOLATILE",
[BTF_KIND_CONST] = "CONST",
[BTF_KIND_RESTRICT] = "RESTRICT",
+ [BTF_KIND_FUNC] = "FUNC",
+ [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO",
};
struct btf_kind_operations {
@@ -272,6 +274,10 @@ struct btf_kind_operations {
const struct btf_type *struct_type,
const struct btf_member *member,
const struct btf_type *member_type);
+ int (*check_kflag_member)(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type);
void (*log_details)(struct btf_verifier_env *env,
const struct btf_type *t);
void (*seq_show)(const struct btf *btf, const struct btf_type *t,
@@ -282,6 +288,9 @@ struct btf_kind_operations {
static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS];
static struct btf_type btf_void;
+static int btf_resolve(struct btf_verifier_env *env,
+ const struct btf_type *t, u32 type_id);
+
static bool btf_type_is_modifier(const struct btf_type *t)
{
/* Some of them is not strictly a C modifier
@@ -307,15 +316,33 @@ static bool btf_type_is_modifier(const struct btf_type *t)
static bool btf_type_is_void(const struct btf_type *t)
{
- /* void => no type and size info.
- * Hence, FWD is also treated as void.
- */
- return t == &btf_void || BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
+ return t == &btf_void;
+}
+
+static bool btf_type_is_fwd(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
+}
+
+static bool btf_type_is_func(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC;
+}
+
+static bool btf_type_is_func_proto(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO;
+}
+
+static bool btf_type_nosize(const struct btf_type *t)
+{
+ return btf_type_is_void(t) || btf_type_is_fwd(t) ||
+ btf_type_is_func(t) || btf_type_is_func_proto(t);
}
-static bool btf_type_is_void_or_null(const struct btf_type *t)
+static bool btf_type_nosize_or_null(const struct btf_type *t)
{
- return !t || btf_type_is_void(t);
+ return !t || btf_type_nosize(t);
}
/* union is only a special case of struct:
@@ -396,6 +423,25 @@ static u16 btf_type_vlen(const struct btf_type *t)
return BTF_INFO_VLEN(t->info);
}
+static bool btf_type_kflag(const struct btf_type *t)
+{
+ return BTF_INFO_KFLAG(t->info);
+}
+
+static u32 btf_member_bit_offset(const struct btf_type *struct_type,
+ const struct btf_member *member)
+{
+ return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset)
+ : member->offset;
+}
+
+static u32 btf_member_bitfield_size(const struct btf_type *struct_type,
+ const struct btf_member *member)
+{
+ return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset)
+ : 0;
+}
+
static u32 btf_type_int(const struct btf_type *t)
{
return *(u32 *)(t + 1);
@@ -421,7 +467,7 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
return kind_ops[BTF_INFO_KIND(t->info)];
}
-static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
+bool btf_name_offset_valid(const struct btf *btf, u32 offset)
{
return BTF_STR_OFFSET_VALID(offset) &&
offset < btf->hdr.str_len;
@@ -451,7 +497,7 @@ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
return !*src;
}
-static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
+static const char *__btf_name_by_offset(const struct btf *btf, u32 offset)
{
if (!offset)
return "(anon)";
@@ -461,7 +507,15 @@ static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
return "(invalid-name-offset)";
}
-static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
+const char *btf_name_by_offset(const struct btf *btf, u32 offset)
+{
+ if (offset < btf->hdr.str_len)
+ return &btf->strings[offset];
+
+ return NULL;
+}
+
+const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
{
if (type_id > btf->nr_types)
return NULL;
@@ -491,6 +545,47 @@ static bool btf_type_int_is_regular(const struct btf_type *t)
return true;
}
+/*
+ * Check that given struct member is a regular int with expected
+ * offset and size.
+ */
+bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
+ const struct btf_member *m,
+ u32 expected_offset, u32 expected_size)
+{
+ const struct btf_type *t;
+ u32 id, int_data;
+ u8 nr_bits;
+
+ id = m->type;
+ t = btf_type_id_size(btf, &id, NULL);
+ if (!t || !btf_type_is_int(t))
+ return false;
+
+ int_data = btf_type_int(t);
+ nr_bits = BTF_INT_BITS(int_data);
+ if (btf_type_kflag(s)) {
+ u32 bitfield_size = BTF_MEMBER_BITFIELD_SIZE(m->offset);
+ u32 bit_offset = BTF_MEMBER_BIT_OFFSET(m->offset);
+
+ /* if kflag set, int should be a regular int and
+ * bit offset should be at byte boundary.
+ */
+ return !bitfield_size &&
+ BITS_ROUNDUP_BYTES(bit_offset) == expected_offset &&
+ BITS_ROUNDUP_BYTES(nr_bits) == expected_size;
+ }
+
+ if (BTF_INT_OFFSET(int_data) ||
+ BITS_PER_BYTE_MASKED(m->offset) ||
+ BITS_ROUNDUP_BYTES(m->offset) != expected_offset ||
+ BITS_PER_BYTE_MASKED(nr_bits) ||
+ BITS_ROUNDUP_BYTES(nr_bits) != expected_size)
+ return false;
+
+ return true;
+}
+
__printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log,
const char *fmt, ...)
{
@@ -531,7 +626,7 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
__btf_verifier_log(log, "[%u] %s %s%s",
env->log_type_id,
btf_kind_str[kind],
- btf_name_by_offset(btf, t->name_off),
+ __btf_name_by_offset(btf, t->name_off),
log_details ? " " : "");
if (log_details)
@@ -574,9 +669,17 @@ static void btf_verifier_log_member(struct btf_verifier_env *env,
if (env->phase != CHECK_META)
btf_verifier_log_type(env, struct_type, NULL);
- __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u",
- btf_name_by_offset(btf, member->name_off),
- member->type, member->offset);
+ if (btf_type_kflag(struct_type))
+ __btf_verifier_log(log,
+ "\t%s type_id=%u bitfield_size=%u bits_offset=%u",
+ __btf_name_by_offset(btf, member->name_off),
+ member->type,
+ BTF_MEMBER_BITFIELD_SIZE(member->offset),
+ BTF_MEMBER_BIT_OFFSET(member->offset));
+ else
+ __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u",
+ __btf_name_by_offset(btf, member->name_off),
+ member->type, member->offset);
if (fmt && *fmt) {
__btf_verifier_log(log, " ");
@@ -765,11 +868,15 @@ static bool env_type_is_resolve_sink(const struct btf_verifier_env *env,
/* int, enum or void is a sink */
return !btf_type_needs_resolve(next_type);
case RESOLVE_PTR:
- /* int, enum, void, struct or array is a sink for ptr */
+ /* int, enum, void, struct, array, func or func_proto is a sink
+ * for ptr
+ */
return !btf_type_is_modifier(next_type) &&
!btf_type_is_ptr(next_type);
case RESOLVE_STRUCT_OR_ARRAY:
- /* int, enum, void or ptr is a sink for struct and array */
+ /* int, enum, void, ptr, func or func_proto is a sink
+ * for struct and array
+ */
return !btf_type_is_modifier(next_type) &&
!btf_type_is_array(next_type) &&
!btf_type_is_struct(next_type);
@@ -851,7 +958,7 @@ const struct btf_type *btf_type_id_size(const struct btf *btf,
u32 size = 0;
size_type = btf_type_by_id(btf, size_type_id);
- if (btf_type_is_void_or_null(size_type))
+ if (btf_type_nosize_or_null(size_type))
return NULL;
if (btf_type_has_size(size_type)) {
@@ -867,7 +974,7 @@ const struct btf_type *btf_type_id_size(const struct btf *btf,
size = btf->resolved_sizes[size_type_id];
size_type_id = btf->resolved_ids[size_type_id];
size_type = btf_type_by_id(btf, size_type_id);
- if (btf_type_is_void(size_type))
+ if (btf_type_nosize_or_null(size_type))
return NULL;
}
@@ -888,6 +995,38 @@ static int btf_df_check_member(struct btf_verifier_env *env,
return -EINVAL;
}
+static int btf_df_check_kflag_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ btf_verifier_log_basic(env, struct_type,
+ "Unsupported check_kflag_member");
+ return -EINVAL;
+}
+
+/* Used for ptr, array and struct/union type members.
+ * int, enum and modifier types have their specific callback functions.
+ */
+static int btf_generic_check_kflag_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ if (BTF_MEMBER_BITFIELD_SIZE(member->offset)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Invalid member bitfield_size");
+ return -EINVAL;
+ }
+
+ /* bitfield size is 0, so member->offset represents bit offset only.
+ * It is safe to call non kflag check_member variants.
+ */
+ return btf_type_ops(member_type)->check_member(env, struct_type,
+ member,
+ member_type);
+}
+
static int btf_df_resolve(struct btf_verifier_env *env,
const struct resolve_vertex *v)
{
@@ -940,6 +1079,62 @@ static int btf_int_check_member(struct btf_verifier_env *env,
return 0;
}
+static int btf_int_check_kflag_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u32 struct_bits_off, nr_bits, nr_int_data_bits, bytes_offset;
+ u32 int_data = btf_type_int(member_type);
+ u32 struct_size = struct_type->size;
+ u32 nr_copy_bits;
+
+ /* a regular int type is required for the kflag int member */
+ if (!btf_type_int_is_regular(member_type)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Invalid member base type");
+ return -EINVAL;
+ }
+
+ /* check sanity of bitfield size */
+ nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset);
+ struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset);
+ nr_int_data_bits = BTF_INT_BITS(int_data);
+ if (!nr_bits) {
+ /* Not a bitfield member, member offset must be at byte
+ * boundary.
+ */
+ if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Invalid member offset");
+ return -EINVAL;
+ }
+
+ nr_bits = nr_int_data_bits;
+ } else if (nr_bits > nr_int_data_bits) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Invalid member bitfield_size");
+ return -EINVAL;
+ }
+
+ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+ nr_copy_bits = nr_bits + BITS_PER_BYTE_MASKED(struct_bits_off);
+ if (nr_copy_bits > BITS_PER_U64) {
+ btf_verifier_log_member(env, struct_type, member,
+ "nr_copy_bits exceeds 64");
+ return -EINVAL;
+ }
+
+ if (struct_size < bytes_offset ||
+ struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static s32 btf_int_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
@@ -959,6 +1154,11 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
int_data = btf_type_int(t);
if (int_data & ~BTF_INT_MASK) {
btf_verifier_log_basic(env, t, "Invalid int_data:%x",
@@ -1011,26 +1211,16 @@ static void btf_int_log(struct btf_verifier_env *env,
btf_int_encoding_str(BTF_INT_ENCODING(int_data)));
}
-static void btf_int_bits_seq_show(const struct btf *btf,
- const struct btf_type *t,
- void *data, u8 bits_offset,
- struct seq_file *m)
+static void btf_bitfield_seq_show(void *data, u8 bits_offset,
+ u8 nr_bits, struct seq_file *m)
{
u16 left_shift_bits, right_shift_bits;
- u32 int_data = btf_type_int(t);
- u8 nr_bits = BTF_INT_BITS(int_data);
- u8 total_bits_offset;
u8 nr_copy_bytes;
u8 nr_copy_bits;
u64 print_num;
- /*
- * bits_offset is at most 7.
- * BTF_INT_OFFSET() cannot exceed 64 bits.
- */
- total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
- data += BITS_ROUNDDOWN_BYTES(total_bits_offset);
- bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset);
+ data += BITS_ROUNDDOWN_BYTES(bits_offset);
+ bits_offset = BITS_PER_BYTE_MASKED(bits_offset);
nr_copy_bits = nr_bits + bits_offset;
nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits);
@@ -1050,6 +1240,24 @@ static void btf_int_bits_seq_show(const struct btf *btf,
seq_printf(m, "0x%llx", print_num);
}
+
+static void btf_int_bits_seq_show(const struct btf *btf,
+ const struct btf_type *t,
+ void *data, u8 bits_offset,
+ struct seq_file *m)
+{
+ u32 int_data = btf_type_int(t);
+ u8 nr_bits = BTF_INT_BITS(int_data);
+ u8 total_bits_offset;
+
+ /*
+ * bits_offset is at most 7.
+ * BTF_INT_OFFSET() cannot exceed 64 bits.
+ */
+ total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
+ btf_bitfield_seq_show(data, total_bits_offset, nr_bits, m);
+}
+
static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t,
u32 type_id, void *data, u8 bits_offset,
struct seq_file *m)
@@ -1099,6 +1307,7 @@ static const struct btf_kind_operations int_ops = {
.check_meta = btf_int_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_int_check_member,
+ .check_kflag_member = btf_int_check_kflag_member,
.log_details = btf_int_log,
.seq_show = btf_int_seq_show,
};
@@ -1128,6 +1337,31 @@ static int btf_modifier_check_member(struct btf_verifier_env *env,
resolved_type);
}
+static int btf_modifier_check_kflag_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ const struct btf_type *resolved_type;
+ u32 resolved_type_id = member->type;
+ struct btf_member resolved_member;
+ struct btf *btf = env->btf;
+
+ resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL);
+ if (!resolved_type) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Invalid member");
+ return -EINVAL;
+ }
+
+ resolved_member = *member;
+ resolved_member.type = resolved_type_id;
+
+ return btf_type_ops(resolved_type)->check_kflag_member(env, struct_type,
+ &resolved_member,
+ resolved_type);
+}
+
static int btf_ptr_check_member(struct btf_verifier_env *env,
const struct btf_type *struct_type,
const struct btf_member *member,
@@ -1163,6 +1397,11 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
if (!BTF_TYPE_ID_VALID(t->type)) {
btf_verifier_log_type(env, t, "Invalid type_id");
return -EINVAL;
@@ -1204,10 +1443,6 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
return -EINVAL;
}
- /* "typedef void new_void", "const void"...etc */
- if (btf_type_is_void(next_type))
- goto resolved;
-
if (!env_type_is_resolve_sink(env, next_type) &&
!env_type_is_resolved(env, next_type_id))
return env_stack_push(env, next_type, next_type_id);
@@ -1218,13 +1453,18 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
* save us a few type-following when we use it later (e.g. in
* pretty print).
*/
- if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
- !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) {
- btf_verifier_log_type(env, v->t, "Invalid type_id");
- return -EINVAL;
+ if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) {
+ if (env_type_is_resolved(env, next_type_id))
+ next_type = btf_type_id_resolve(btf, &next_type_id);
+
+ /* "typedef void new_void", "const void"...etc */
+ if (!btf_type_is_void(next_type) &&
+ !btf_type_is_fwd(next_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
}
-resolved:
env_stack_pop_resolved(env, next_type_id, next_type_size);
return 0;
@@ -1237,7 +1477,6 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
const struct btf_type *t = v->t;
u32 next_type_id = t->type;
struct btf *btf = env->btf;
- u32 next_type_size = 0;
next_type = btf_type_by_id(btf, next_type_id);
if (!next_type) {
@@ -1245,10 +1484,6 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
return -EINVAL;
}
- /* "void *" */
- if (btf_type_is_void(next_type))
- goto resolved;
-
if (!env_type_is_resolve_sink(env, next_type) &&
!env_type_is_resolved(env, next_type_id))
return env_stack_push(env, next_type, next_type_id);
@@ -1275,13 +1510,18 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
resolved_type_id);
}
- if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
- !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) {
- btf_verifier_log_type(env, v->t, "Invalid type_id");
- return -EINVAL;
+ if (!btf_type_id_size(btf, &next_type_id, NULL)) {
+ if (env_type_is_resolved(env, next_type_id))
+ next_type = btf_type_id_resolve(btf, &next_type_id);
+
+ if (!btf_type_is_void(next_type) &&
+ !btf_type_is_fwd(next_type) &&
+ !btf_type_is_func_proto(next_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
}
-resolved:
env_stack_pop_resolved(env, next_type_id, 0);
return 0;
@@ -1315,6 +1555,7 @@ static struct btf_kind_operations modifier_ops = {
.check_meta = btf_ref_type_check_meta,
.resolve = btf_modifier_resolve,
.check_member = btf_modifier_check_member,
+ .check_kflag_member = btf_modifier_check_kflag_member,
.log_details = btf_ref_type_log,
.seq_show = btf_modifier_seq_show,
};
@@ -1323,6 +1564,7 @@ static struct btf_kind_operations ptr_ops = {
.check_meta = btf_ref_type_check_meta,
.resolve = btf_ptr_resolve,
.check_member = btf_ptr_check_member,
+ .check_kflag_member = btf_generic_check_kflag_member,
.log_details = btf_ref_type_log,
.seq_show = btf_ptr_seq_show,
};
@@ -1353,11 +1595,18 @@ static s32 btf_fwd_check_meta(struct btf_verifier_env *env,
return 0;
}
+static void btf_fwd_type_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ btf_verifier_log(env, "%s", btf_type_kflag(t) ? "union" : "struct");
+}
+
static struct btf_kind_operations fwd_ops = {
.check_meta = btf_fwd_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_df_check_member,
- .log_details = btf_ref_type_log,
+ .check_kflag_member = btf_df_check_kflag_member,
+ .log_details = btf_fwd_type_log,
.seq_show = btf_df_seq_show,
};
@@ -1415,6 +1664,11 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
if (t->size) {
btf_verifier_log_type(env, t, "size != 0");
return -EINVAL;
@@ -1450,7 +1704,7 @@ static int btf_array_resolve(struct btf_verifier_env *env,
/* Check array->index_type */
index_type_id = array->index_type;
index_type = btf_type_by_id(btf, index_type_id);
- if (btf_type_is_void_or_null(index_type)) {
+ if (btf_type_nosize_or_null(index_type)) {
btf_verifier_log_type(env, v->t, "Invalid index");
return -EINVAL;
}
@@ -1469,7 +1723,7 @@ static int btf_array_resolve(struct btf_verifier_env *env,
/* Check array->type */
elem_type_id = array->type;
elem_type = btf_type_by_id(btf, elem_type_id);
- if (btf_type_is_void_or_null(elem_type)) {
+ if (btf_type_nosize_or_null(elem_type)) {
btf_verifier_log_type(env, v->t,
"Invalid elem");
return -EINVAL;
@@ -1538,6 +1792,7 @@ static struct btf_kind_operations array_ops = {
.check_meta = btf_array_check_meta,
.resolve = btf_array_resolve,
.check_member = btf_array_check_member,
+ .check_kflag_member = btf_generic_check_kflag_member,
.log_details = btf_array_log,
.seq_show = btf_array_seq_show,
};
@@ -1576,6 +1831,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
u32 meta_needed, last_offset;
struct btf *btf = env->btf;
u32 struct_size = t->size;
+ u32 offset;
u16 i;
meta_needed = btf_type_vlen(t) * sizeof(*member);
@@ -1617,7 +1873,8 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- if (is_union && member->offset) {
+ offset = btf_member_bit_offset(t, member);
+ if (is_union && offset) {
btf_verifier_log_member(env, t, member,
"Invalid member bits_offset");
return -EINVAL;
@@ -1627,20 +1884,20 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
* ">" instead of ">=" because the last member could be
* "char a[0];"
*/
- if (last_offset > member->offset) {
+ if (last_offset > offset) {
btf_verifier_log_member(env, t, member,
"Invalid member bits_offset");
return -EINVAL;
}
- if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) {
+ if (BITS_ROUNDUP_BYTES(offset) > struct_size) {
btf_verifier_log_member(env, t, member,
- "Memmber bits_offset exceeds its struct size");
+ "Member bits_offset exceeds its struct size");
return -EINVAL;
}
btf_verifier_log_member(env, t, member, NULL);
- last_offset = member->offset;
+ last_offset = offset;
}
return meta_needed;
@@ -1670,9 +1927,14 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
last_member_type = btf_type_by_id(env->btf,
last_member_type_id);
- err = btf_type_ops(last_member_type)->check_member(env, v->t,
- last_member,
- last_member_type);
+ if (btf_type_kflag(v->t))
+ err = btf_type_ops(last_member_type)->check_kflag_member(env, v->t,
+ last_member,
+ last_member_type);
+ else
+ err = btf_type_ops(last_member_type)->check_member(env, v->t,
+ last_member,
+ last_member_type);
if (err)
return err;
}
@@ -1682,7 +1944,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
const struct btf_type *member_type = btf_type_by_id(env->btf,
member_type_id);
- if (btf_type_is_void_or_null(member_type)) {
+ if (btf_type_nosize_or_null(member_type)) {
btf_verifier_log_member(env, v->t, member,
"Invalid member");
return -EINVAL;
@@ -1694,9 +1956,14 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
return env_stack_push(env, member_type, member_type_id);
}
- err = btf_type_ops(member_type)->check_member(env, v->t,
- member,
- member_type);
+ if (btf_type_kflag(v->t))
+ err = btf_type_ops(member_type)->check_kflag_member(env, v->t,
+ member,
+ member_type);
+ else
+ err = btf_type_ops(member_type)->check_member(env, v->t,
+ member,
+ member_type);
if (err)
return err;
}
@@ -1724,17 +1991,26 @@ static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,
for_each_member(i, t, member) {
const struct btf_type *member_type = btf_type_by_id(btf,
member->type);
- u32 member_offset = member->offset;
- u32 bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset);
- u8 bits8_offset = BITS_PER_BYTE_MASKED(member_offset);
const struct btf_kind_operations *ops;
+ u32 member_offset, bitfield_size;
+ u32 bytes_offset;
+ u8 bits8_offset;
if (i)
seq_puts(m, seq);
- ops = btf_type_ops(member_type);
- ops->seq_show(btf, member_type, member->type,
- data + bytes_offset, bits8_offset, m);
+ member_offset = btf_member_bit_offset(t, member);
+ bitfield_size = btf_member_bitfield_size(t, member);
+ if (bitfield_size) {
+ btf_bitfield_seq_show(data, member_offset,
+ bitfield_size, m);
+ } else {
+ bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset);
+ bits8_offset = BITS_PER_BYTE_MASKED(member_offset);
+ ops = btf_type_ops(member_type);
+ ops->seq_show(btf, member_type, member->type,
+ data + bytes_offset, bits8_offset, m);
+ }
}
seq_puts(m, "}");
}
@@ -1743,6 +2019,7 @@ static struct btf_kind_operations struct_ops = {
.check_meta = btf_struct_check_meta,
.resolve = btf_struct_resolve,
.check_member = btf_struct_check_member,
+ .check_kflag_member = btf_generic_check_kflag_member,
.log_details = btf_struct_log,
.seq_show = btf_struct_seq_show,
};
@@ -1772,6 +2049,41 @@ static int btf_enum_check_member(struct btf_verifier_env *env,
return 0;
}
+static int btf_enum_check_kflag_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u32 struct_bits_off, nr_bits, bytes_end, struct_size;
+ u32 int_bitsize = sizeof(int) * BITS_PER_BYTE;
+
+ struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset);
+ nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset);
+ if (!nr_bits) {
+ if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member is not byte aligned");
+ return -EINVAL;
+ }
+
+ nr_bits = int_bitsize;
+ } else if (nr_bits > int_bitsize) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Invalid member bitfield_size");
+ return -EINVAL;
+ }
+
+ struct_size = struct_type->size;
+ bytes_end = BITS_ROUNDUP_BYTES(struct_bits_off + nr_bits);
+ if (struct_size < bytes_end) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static s32 btf_enum_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
@@ -1791,6 +2103,11 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
if (t->size != sizeof(int)) {
btf_verifier_log_type(env, t, "Expected size:%zu",
sizeof(int));
@@ -1822,7 +2139,7 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
btf_verifier_log(env, "\t%s val=%d\n",
- btf_name_by_offset(btf, enums[i].name_off),
+ __btf_name_by_offset(btf, enums[i].name_off),
enums[i].val);
}
@@ -1846,7 +2163,8 @@ static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t,
for (i = 0; i < nr_enums; i++) {
if (v == enums[i].val) {
seq_printf(m, "%s",
- btf_name_by_offset(btf, enums[i].name_off));
+ __btf_name_by_offset(btf,
+ enums[i].name_off));
return;
}
}
@@ -1858,10 +2176,249 @@ static struct btf_kind_operations enum_ops = {
.check_meta = btf_enum_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_enum_check_member,
+ .check_kflag_member = btf_enum_check_kflag_member,
.log_details = btf_enum_log,
.seq_show = btf_enum_seq_show,
};
+static s32 btf_func_proto_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ u32 meta_needed = btf_type_vlen(t) * sizeof(struct btf_param);
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (t->name_off) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return meta_needed;
+}
+
+static void btf_func_proto_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ const struct btf_param *args = (const struct btf_param *)(t + 1);
+ u16 nr_args = btf_type_vlen(t), i;
+
+ btf_verifier_log(env, "return=%u args=(", t->type);
+ if (!nr_args) {
+ btf_verifier_log(env, "void");
+ goto done;
+ }
+
+ if (nr_args == 1 && !args[0].type) {
+ /* Only one vararg */
+ btf_verifier_log(env, "vararg");
+ goto done;
+ }
+
+ btf_verifier_log(env, "%u %s", args[0].type,
+ __btf_name_by_offset(env->btf,
+ args[0].name_off));
+ for (i = 1; i < nr_args - 1; i++)
+ btf_verifier_log(env, ", %u %s", args[i].type,
+ __btf_name_by_offset(env->btf,
+ args[i].name_off));
+
+ if (nr_args > 1) {
+ const struct btf_param *last_arg = &args[nr_args - 1];
+
+ if (last_arg->type)
+ btf_verifier_log(env, ", %u %s", last_arg->type,
+ __btf_name_by_offset(env->btf,
+ last_arg->name_off));
+ else
+ btf_verifier_log(env, ", vararg");
+ }
+
+done:
+ btf_verifier_log(env, ")");
+}
+
+static struct btf_kind_operations func_proto_ops = {
+ .check_meta = btf_func_proto_check_meta,
+ .resolve = btf_df_resolve,
+ /*
+ * BTF_KIND_FUNC_PROTO cannot be directly referred by
+ * a struct's member.
+ *
+ * It should be a funciton pointer instead.
+ * (i.e. struct's member -> BTF_KIND_PTR -> BTF_KIND_FUNC_PROTO)
+ *
+ * Hence, there is no btf_func_check_member().
+ */
+ .check_member = btf_df_check_member,
+ .check_kflag_member = btf_df_check_kflag_member,
+ .log_details = btf_func_proto_log,
+ .seq_show = btf_df_seq_show,
+};
+
+static s32 btf_func_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ if (!t->name_off ||
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return 0;
+}
+
+static struct btf_kind_operations func_ops = {
+ .check_meta = btf_func_check_meta,
+ .resolve = btf_df_resolve,
+ .check_member = btf_df_check_member,
+ .check_kflag_member = btf_df_check_kflag_member,
+ .log_details = btf_ref_type_log,
+ .seq_show = btf_df_seq_show,
+};
+
+static int btf_func_proto_check(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ const struct btf_type *ret_type;
+ const struct btf_param *args;
+ const struct btf *btf;
+ u16 nr_args, i;
+ int err;
+
+ btf = env->btf;
+ args = (const struct btf_param *)(t + 1);
+ nr_args = btf_type_vlen(t);
+
+ /* Check func return type which could be "void" (t->type == 0) */
+ if (t->type) {
+ u32 ret_type_id = t->type;
+
+ ret_type = btf_type_by_id(btf, ret_type_id);
+ if (!ret_type) {
+ btf_verifier_log_type(env, t, "Invalid return type");
+ return -EINVAL;
+ }
+
+ if (btf_type_needs_resolve(ret_type) &&
+ !env_type_is_resolved(env, ret_type_id)) {
+ err = btf_resolve(env, ret_type, ret_type_id);
+ if (err)
+ return err;
+ }
+
+ /* Ensure the return type is a type that has a size */
+ if (!btf_type_id_size(btf, &ret_type_id, NULL)) {
+ btf_verifier_log_type(env, t, "Invalid return type");
+ return -EINVAL;
+ }
+ }
+
+ if (!nr_args)
+ return 0;
+
+ /* Last func arg type_id could be 0 if it is a vararg */
+ if (!args[nr_args - 1].type) {
+ if (args[nr_args - 1].name_off) {
+ btf_verifier_log_type(env, t, "Invalid arg#%u",
+ nr_args);
+ return -EINVAL;
+ }
+ nr_args--;
+ }
+
+ err = 0;
+ for (i = 0; i < nr_args; i++) {
+ const struct btf_type *arg_type;
+ u32 arg_type_id;
+
+ arg_type_id = args[i].type;
+ arg_type = btf_type_by_id(btf, arg_type_id);
+ if (!arg_type) {
+ btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
+ err = -EINVAL;
+ break;
+ }
+
+ if (args[i].name_off &&
+ (!btf_name_offset_valid(btf, args[i].name_off) ||
+ !btf_name_valid_identifier(btf, args[i].name_off))) {
+ btf_verifier_log_type(env, t,
+ "Invalid arg#%u", i + 1);
+ err = -EINVAL;
+ break;
+ }
+
+ if (btf_type_needs_resolve(arg_type) &&
+ !env_type_is_resolved(env, arg_type_id)) {
+ err = btf_resolve(env, arg_type, arg_type_id);
+ if (err)
+ break;
+ }
+
+ if (!btf_type_id_size(btf, &arg_type_id, NULL)) {
+ btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
+ err = -EINVAL;
+ break;
+ }
+ }
+
+ return err;
+}
+
+static int btf_func_check(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ const struct btf_type *proto_type;
+ const struct btf_param *args;
+ const struct btf *btf;
+ u16 nr_args, i;
+
+ btf = env->btf;
+ proto_type = btf_type_by_id(btf, t->type);
+
+ if (!proto_type || !btf_type_is_func_proto(proto_type)) {
+ btf_verifier_log_type(env, t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ args = (const struct btf_param *)(proto_type + 1);
+ nr_args = btf_type_vlen(proto_type);
+ for (i = 0; i < nr_args; i++) {
+ if (!args[i].name_off && args[i].type) {
+ btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
[BTF_KIND_INT] = &int_ops,
[BTF_KIND_PTR] = &ptr_ops,
@@ -1874,6 +2431,8 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
[BTF_KIND_VOLATILE] = &modifier_ops,
[BTF_KIND_CONST] = &modifier_ops,
[BTF_KIND_RESTRICT] = &modifier_ops,
+ [BTF_KIND_FUNC] = &func_ops,
+ [BTF_KIND_FUNC_PROTO] = &func_proto_ops,
};
static s32 btf_check_meta(struct btf_verifier_env *env,
@@ -1945,30 +2504,6 @@ static int btf_check_all_metas(struct btf_verifier_env *env)
return 0;
}
-static int btf_resolve(struct btf_verifier_env *env,
- const struct btf_type *t, u32 type_id)
-{
- const struct resolve_vertex *v;
- int err = 0;
-
- env->resolve_mode = RESOLVE_TBD;
- env_stack_push(env, t, type_id);
- while (!err && (v = env_stack_peak(env))) {
- env->log_type_id = v->type_id;
- err = btf_type_ops(v->t)->resolve(env, v);
- }
-
- env->log_type_id = type_id;
- if (err == -E2BIG)
- btf_verifier_log_type(env, t,
- "Exceeded max resolving depth:%u",
- MAX_RESOLVE_DEPTH);
- else if (err == -EEXIST)
- btf_verifier_log_type(env, t, "Loop detected");
-
- return err;
-}
-
static bool btf_resolve_valid(struct btf_verifier_env *env,
const struct btf_type *t,
u32 type_id)
@@ -2002,6 +2537,39 @@ static bool btf_resolve_valid(struct btf_verifier_env *env,
return false;
}
+static int btf_resolve(struct btf_verifier_env *env,
+ const struct btf_type *t, u32 type_id)
+{
+ u32 save_log_type_id = env->log_type_id;
+ const struct resolve_vertex *v;
+ int err = 0;
+
+ env->resolve_mode = RESOLVE_TBD;
+ env_stack_push(env, t, type_id);
+ while (!err && (v = env_stack_peak(env))) {
+ env->log_type_id = v->type_id;
+ err = btf_type_ops(v->t)->resolve(env, v);
+ }
+
+ env->log_type_id = type_id;
+ if (err == -E2BIG) {
+ btf_verifier_log_type(env, t,
+ "Exceeded max resolving depth:%u",
+ MAX_RESOLVE_DEPTH);
+ } else if (err == -EEXIST) {
+ btf_verifier_log_type(env, t, "Loop detected");
+ }
+
+ /* Final sanity check */
+ if (!err && !btf_resolve_valid(env, t, type_id)) {
+ btf_verifier_log_type(env, t, "Invalid resolve state");
+ err = -EINVAL;
+ }
+
+ env->log_type_id = save_log_type_id;
+ return err;
+}
+
static int btf_check_all_types(struct btf_verifier_env *env)
{
struct btf *btf = env->btf;
@@ -2024,10 +2592,16 @@ static int btf_check_all_types(struct btf_verifier_env *env)
return err;
}
- if (btf_type_needs_resolve(t) &&
- !btf_resolve_valid(env, t, type_id)) {
- btf_verifier_log_type(env, t, "Invalid resolve state");
- return -EINVAL;
+ if (btf_type_is_func_proto(t)) {
+ err = btf_func_proto_check(env, t);
+ if (err)
+ return err;
+ }
+
+ if (btf_type_is_func(t)) {
+ err = btf_func_check(env, t);
+ if (err)
+ return err;
}
}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b2890c268cb3..38de580abcc2 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -21,12 +21,14 @@
* Kris Katterjohn - Added many additional checks in bpf_check_classic()
*/
+#include <uapi/linux/btf.h>
#include <linux/filter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/random.h>
#include <linux/moduleloader.h>
#include <linux/bpf.h>
+#include <linux/btf.h>
#include <linux/frame.h>
#include <linux/rbtree_latch.h>
#include <linux/kallsyms.h>
@@ -103,6 +105,91 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
}
EXPORT_SYMBOL_GPL(bpf_prog_alloc);
+int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
+{
+ if (!prog->aux->nr_linfo || !prog->jit_requested)
+ return 0;
+
+ prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo,
+ sizeof(*prog->aux->jited_linfo),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!prog->aux->jited_linfo)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void bpf_prog_free_jited_linfo(struct bpf_prog *prog)
+{
+ kfree(prog->aux->jited_linfo);
+ prog->aux->jited_linfo = NULL;
+}
+
+void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog)
+{
+ if (prog->aux->jited_linfo && !prog->aux->jited_linfo[0])
+ bpf_prog_free_jited_linfo(prog);
+}
+
+/* The jit engine is responsible to provide an array
+ * for insn_off to the jited_off mapping (insn_to_jit_off).
+ *
+ * The idx to this array is the insn_off. Hence, the insn_off
+ * here is relative to the prog itself instead of the main prog.
+ * This array has one entry for each xlated bpf insn.
+ *
+ * jited_off is the byte off to the last byte of the jited insn.
+ *
+ * Hence, with
+ * insn_start:
+ * The first bpf insn off of the prog. The insn off
+ * here is relative to the main prog.
+ * e.g. if prog is a subprog, insn_start > 0
+ * linfo_idx:
+ * The prog's idx to prog->aux->linfo and jited_linfo
+ *
+ * jited_linfo[linfo_idx] = prog->bpf_func
+ *
+ * For i > linfo_idx,
+ *
+ * jited_linfo[i] = prog->bpf_func +
+ * insn_to_jit_off[linfo[i].insn_off - insn_start - 1]
+ */
+void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
+ const u32 *insn_to_jit_off)
+{
+ u32 linfo_idx, insn_start, insn_end, nr_linfo, i;
+ const struct bpf_line_info *linfo;
+ void **jited_linfo;
+
+ if (!prog->aux->jited_linfo)
+ /* Userspace did not provide linfo */
+ return;
+
+ linfo_idx = prog->aux->linfo_idx;
+ linfo = &prog->aux->linfo[linfo_idx];
+ insn_start = linfo[0].insn_off;
+ insn_end = insn_start + prog->len;
+
+ jited_linfo = &prog->aux->jited_linfo[linfo_idx];
+ jited_linfo[0] = prog->bpf_func;
+
+ nr_linfo = prog->aux->nr_linfo - linfo_idx;
+
+ for (i = 1; i < nr_linfo && linfo[i].insn_off < insn_end; i++)
+ /* The verifier ensures that linfo[i].insn_off is
+ * strictly increasing
+ */
+ jited_linfo[i] = prog->bpf_func +
+ insn_to_jit_off[linfo[i].insn_off - insn_start - 1];
+}
+
+void bpf_prog_free_linfo(struct bpf_prog *prog)
+{
+ bpf_prog_free_jited_linfo(prog);
+ kvfree(prog->aux->linfo);
+}
+
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
gfp_t gfp_extra_flags)
{
@@ -292,6 +379,26 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta,
return ret;
}
+static void bpf_adj_linfo(struct bpf_prog *prog, u32 off, u32 delta)
+{
+ struct bpf_line_info *linfo;
+ u32 i, nr_linfo;
+
+ nr_linfo = prog->aux->nr_linfo;
+ if (!nr_linfo || !delta)
+ return;
+
+ linfo = prog->aux->linfo;
+
+ for (i = 0; i < nr_linfo; i++)
+ if (off < linfo[i].insn_off)
+ break;
+
+ /* Push all off < linfo[i].insn_off by delta */
+ for (; i < nr_linfo; i++)
+ linfo[i].insn_off += delta;
+}
+
struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
const struct bpf_insn *patch, u32 len)
{
@@ -347,6 +454,8 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
*/
BUG_ON(bpf_adj_branches(prog_adj, off, insn_delta, false));
+ bpf_adj_linfo(prog_adj, off, insn_delta);
+
return prog_adj;
}
@@ -388,6 +497,8 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog,
static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
{
const char *end = sym + KSYM_NAME_LEN;
+ const struct btf_type *type;
+ const char *func_name;
BUILD_BUG_ON(sizeof("bpf_prog_") +
sizeof(prog->tag) * 2 +
@@ -402,6 +513,16 @@ static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
sym = bin2hex(sym, prog->tag, sizeof(prog->tag));
+
+ /* prog->aux->name will be ignored if full btf name is available */
+ if (prog->aux->func_info_cnt) {
+ type = btf_type_by_id(prog->aux->btf,
+ prog->aux->func_info[prog->aux->func_idx].type_id);
+ func_name = btf_name_by_offset(prog->aux->btf, type->name_off);
+ snprintf(sym, (size_t)(end - sym), "_%s", func_name);
+ return;
+ }
+
if (prog->aux->name[0])
snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
else
@@ -618,6 +739,16 @@ static void bpf_jit_uncharge_modmem(u32 pages)
atomic_long_sub(pages, &bpf_jit_current);
}
+void *__weak bpf_jit_alloc_exec(unsigned long size)
+{
+ return module_alloc(size);
+}
+
+void __weak bpf_jit_free_exec(void *addr)
+{
+ module_memfree(addr);
+}
+
struct bpf_binary_header *
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
unsigned int alignment,
@@ -635,7 +766,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
if (bpf_jit_charge_modmem(pages))
return NULL;
- hdr = module_alloc(size);
+ hdr = bpf_jit_alloc_exec(size);
if (!hdr) {
bpf_jit_uncharge_modmem(pages);
return NULL;
@@ -659,7 +790,7 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
{
u32 pages = hdr->pages;
- module_memfree(hdr);
+ bpf_jit_free_exec(hdr);
bpf_jit_uncharge_modmem(pages);
}
@@ -918,32 +1049,34 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
#define BPF_INSN_MAP(INSN_2, INSN_3) \
/* 32 bit ALU operations. */ \
/* Register based. */ \
- INSN_3(ALU, ADD, X), \
- INSN_3(ALU, SUB, X), \
- INSN_3(ALU, AND, X), \
- INSN_3(ALU, OR, X), \
- INSN_3(ALU, LSH, X), \
- INSN_3(ALU, RSH, X), \
- INSN_3(ALU, XOR, X), \
- INSN_3(ALU, MUL, X), \
- INSN_3(ALU, MOV, X), \
- INSN_3(ALU, DIV, X), \
- INSN_3(ALU, MOD, X), \
+ INSN_3(ALU, ADD, X), \
+ INSN_3(ALU, SUB, X), \
+ INSN_3(ALU, AND, X), \
+ INSN_3(ALU, OR, X), \
+ INSN_3(ALU, LSH, X), \
+ INSN_3(ALU, RSH, X), \
+ INSN_3(ALU, XOR, X), \
+ INSN_3(ALU, MUL, X), \
+ INSN_3(ALU, MOV, X), \
+ INSN_3(ALU, ARSH, X), \
+ INSN_3(ALU, DIV, X), \
+ INSN_3(ALU, MOD, X), \
INSN_2(ALU, NEG), \
INSN_3(ALU, END, TO_BE), \
INSN_3(ALU, END, TO_LE), \
/* Immediate based. */ \
- INSN_3(ALU, ADD, K), \
- INSN_3(ALU, SUB, K), \
- INSN_3(ALU, AND, K), \
- INSN_3(ALU, OR, K), \
- INSN_3(ALU, LSH, K), \
- INSN_3(ALU, RSH, K), \
- INSN_3(ALU, XOR, K), \
- INSN_3(ALU, MUL, K), \
- INSN_3(ALU, MOV, K), \
- INSN_3(ALU, DIV, K), \
- INSN_3(ALU, MOD, K), \
+ INSN_3(ALU, ADD, K), \
+ INSN_3(ALU, SUB, K), \
+ INSN_3(ALU, AND, K), \
+ INSN_3(ALU, OR, K), \
+ INSN_3(ALU, LSH, K), \
+ INSN_3(ALU, RSH, K), \
+ INSN_3(ALU, XOR, K), \
+ INSN_3(ALU, MUL, K), \
+ INSN_3(ALU, MOV, K), \
+ INSN_3(ALU, ARSH, K), \
+ INSN_3(ALU, DIV, K), \
+ INSN_3(ALU, MOD, K), \
/* 64 bit ALU operations. */ \
/* Register based. */ \
INSN_3(ALU64, ADD, X), \
@@ -1122,6 +1255,12 @@ select_insn:
DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
insn++;
CONT;
+ ALU_ARSH_X:
+ DST = (u64) (u32) ((*(s32 *) &DST) >> SRC);
+ CONT;
+ ALU_ARSH_K:
+ DST = (u64) (u32) ((*(s32 *) &DST) >> IMM);
+ CONT;
ALU64_ARSH_X:
(*(s64 *) &DST) >>= SRC;
CONT;
@@ -1568,13 +1707,20 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
* be JITed, but falls back to the interpreter.
*/
if (!bpf_prog_is_dev_bound(fp->aux)) {
+ *err = bpf_prog_alloc_jited_linfo(fp);
+ if (*err)
+ return fp;
+
fp = bpf_int_jit_compile(fp);
-#ifdef CONFIG_BPF_JIT_ALWAYS_ON
if (!fp->jited) {
+ bpf_prog_free_jited_linfo(fp);
+#ifdef CONFIG_BPF_JIT_ALWAYS_ON
*err = -ENOTSUPP;
return fp;
- }
#endif
+ } else {
+ bpf_prog_free_unused_jited_linfo(fp);
+ }
} else {
*err = bpf_prog_offload_compile(fp);
if (*err)
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 24aac0d0f412..8974b3755670 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -183,7 +183,7 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
* is not at a fixed memory location, with mixed length
* packets, which is bad for cache-line hotness.
*/
- frame_size = SKB_DATA_ALIGN(xdpf->len) + xdpf->headroom +
+ frame_size = SKB_DATA_ALIGN(xdpf->len + xdpf->headroom) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
pkt_data_start = xdpf->data - xdpf->headroom;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 2c1790288138..4b7c76765d9d 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -23,7 +23,7 @@
#define HTAB_CREATE_FLAG_MASK \
(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
- BPF_F_RDONLY | BPF_F_WRONLY)
+ BPF_F_RDONLY | BPF_F_WRONLY | BPF_F_ZERO_SEED)
struct bucket {
struct hlist_nulls_head head;
@@ -244,6 +244,7 @@ static int htab_map_alloc_check(union bpf_attr *attr)
*/
bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
+ bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
int numa_node = bpf_map_attr_numa_node(attr);
BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
@@ -257,6 +258,10 @@ static int htab_map_alloc_check(union bpf_attr *attr)
*/
return -EPERM;
+ if (zero_seed && !capable(CAP_SYS_ADMIN))
+ /* Guard against local DoS, and discourage production use. */
+ return -EPERM;
+
if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK)
/* reserved bits should not be used */
return -EINVAL;
@@ -373,7 +378,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (!htab->buckets)
goto free_htab;
- htab->hashrnd = get_random_int();
+ if (htab->map.map_flags & BPF_F_ZERO_SEED)
+ htab->hashrnd = 0;
+ else
+ htab->hashrnd = get_random_int();
+
for (i = 0; i < htab->n_buckets; i++) {
INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
raw_spin_lock_init(&htab->buckets[i].lock);
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index bed9d48a7ae9..07a34ef562a0 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -1,14 +1,15 @@
//SPDX-License-Identifier: GPL-2.0
#include <linux/bpf-cgroup.h>
#include <linux/bpf.h>
+#include <linux/btf.h>
#include <linux/bug.h>
#include <linux/filter.h>
#include <linux/mm.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
+#include <uapi/linux/btf.h>
-DEFINE_PER_CPU(struct bpf_cgroup_storage*,
- bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
#ifdef CONFIG_CGROUP_BPF
@@ -309,6 +310,85 @@ static int cgroup_storage_delete_elem(struct bpf_map *map, void *key)
return -EINVAL;
}
+static int cgroup_storage_check_btf(const struct bpf_map *map,
+ const struct btf *btf,
+ const struct btf_type *key_type,
+ const struct btf_type *value_type)
+{
+ struct btf_member *m;
+ u32 offset, size;
+
+ /* Key is expected to be of struct bpf_cgroup_storage_key type,
+ * which is:
+ * struct bpf_cgroup_storage_key {
+ * __u64 cgroup_inode_id;
+ * __u32 attach_type;
+ * };
+ */
+
+ /*
+ * Key_type must be a structure with two fields.
+ */
+ if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ||
+ BTF_INFO_VLEN(key_type->info) != 2)
+ return -EINVAL;
+
+ /*
+ * The first field must be a 64 bit integer at 0 offset.
+ */
+ m = (struct btf_member *)(key_type + 1);
+ size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, cgroup_inode_id);
+ if (!btf_member_is_reg_int(btf, key_type, m, 0, size))
+ return -EINVAL;
+
+ /*
+ * The second field must be a 32 bit integer at 64 bit offset.
+ */
+ m++;
+ offset = offsetof(struct bpf_cgroup_storage_key, attach_type);
+ size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, attach_type);
+ if (!btf_member_is_reg_int(btf, key_type, m, offset, size))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *_key,
+ struct seq_file *m)
+{
+ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
+ struct bpf_cgroup_storage_key *key = _key;
+ struct bpf_cgroup_storage *storage;
+ int cpu;
+
+ rcu_read_lock();
+ storage = cgroup_storage_lookup(map_to_storage(map), key, false);
+ if (!storage) {
+ rcu_read_unlock();
+ return;
+ }
+
+ btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
+ stype = cgroup_storage_type(map);
+ if (stype == BPF_CGROUP_STORAGE_SHARED) {
+ seq_puts(m, ": ");
+ btf_type_seq_show(map->btf, map->btf_value_type_id,
+ &READ_ONCE(storage->buf)->data[0], m);
+ seq_puts(m, "\n");
+ } else {
+ seq_puts(m, ": {\n");
+ for_each_possible_cpu(cpu) {
+ seq_printf(m, "\tcpu%d: ", cpu);
+ btf_type_seq_show(map->btf, map->btf_value_type_id,
+ per_cpu_ptr(storage->percpu_buf, cpu),
+ m);
+ seq_puts(m, "\n");
+ }
+ seq_puts(m, "}\n");
+ }
+ rcu_read_unlock();
+}
+
const struct bpf_map_ops cgroup_storage_map_ops = {
.map_alloc = cgroup_storage_map_alloc,
.map_free = cgroup_storage_map_free,
@@ -316,7 +396,8 @@ const struct bpf_map_ops cgroup_storage_map_ops = {
.map_lookup_elem = cgroup_storage_lookup_elem,
.map_update_elem = cgroup_storage_update_elem,
.map_delete_elem = cgroup_storage_delete_elem,
- .map_check_btf = map_check_no_btf,
+ .map_check_btf = cgroup_storage_check_btf,
+ .map_seq_show_elem = cgroup_storage_seq_show_elem,
};
int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map)
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 9058317ba9de..abf1002080df 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -168,20 +168,59 @@ static size_t longest_prefix_match(const struct lpm_trie *trie,
const struct lpm_trie_node *node,
const struct bpf_lpm_trie_key *key)
{
- size_t prefixlen = 0;
- size_t i;
+ u32 limit = min(node->prefixlen, key->prefixlen);
+ u32 prefixlen = 0, i = 0;
- for (i = 0; i < trie->data_size; i++) {
- size_t b;
+ BUILD_BUG_ON(offsetof(struct lpm_trie_node, data) % sizeof(u32));
+ BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key, data) % sizeof(u32));
- b = 8 - fls(node->data[i] ^ key->data[i]);
- prefixlen += b;
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(CONFIG_64BIT)
- if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen)
- return min(node->prefixlen, key->prefixlen);
+ /* data_size >= 16 has very small probability.
+ * We do not use a loop for optimal code generation.
+ */
+ if (trie->data_size >= 8) {
+ u64 diff = be64_to_cpu(*(__be64 *)node->data ^
+ *(__be64 *)key->data);
+
+ prefixlen = 64 - fls64(diff);
+ if (prefixlen >= limit)
+ return limit;
+ if (diff)
+ return prefixlen;
+ i = 8;
+ }
+#endif
+
+ while (trie->data_size >= i + 4) {
+ u32 diff = be32_to_cpu(*(__be32 *)&node->data[i] ^
+ *(__be32 *)&key->data[i]);
+
+ prefixlen += 32 - fls(diff);
+ if (prefixlen >= limit)
+ return limit;
+ if (diff)
+ return prefixlen;
+ i += 4;
+ }
- if (b < 8)
- break;
+ if (trie->data_size >= i + 2) {
+ u16 diff = be16_to_cpu(*(__be16 *)&node->data[i] ^
+ *(__be16 *)&key->data[i]);
+
+ prefixlen += 16 - fls(diff);
+ if (prefixlen >= limit)
+ return limit;
+ if (diff)
+ return prefixlen;
+ i += 2;
+ }
+
+ if (trie->data_size >= i + 1) {
+ prefixlen += 8 - fls(node->data[i] ^ key->data[i]);
+
+ if (prefixlen >= limit)
+ return limit;
}
return prefixlen;
@@ -689,6 +728,7 @@ free_stack:
}
static int trie_check_btf(const struct bpf_map *map,
+ const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
{
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 8e93c47f0779..54cf2b9c44a4 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -33,6 +33,7 @@
static DECLARE_RWSEM(bpf_devs_lock);
struct bpf_offload_dev {
+ const struct bpf_prog_offload_ops *ops;
struct list_head netdevs;
};
@@ -106,6 +107,7 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
err = -EINVAL;
goto err_unlock;
}
+ offload->offdev = ondev->offdev;
prog->aux->offload = offload;
list_add_tail(&offload->offloads, &ondev->progs);
dev_put(offload->netdev);
@@ -121,40 +123,20 @@ err_maybe_put:
return err;
}
-static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
- struct netdev_bpf *data)
+int bpf_prog_offload_verifier_prep(struct bpf_prog *prog)
{
- struct bpf_prog_offload *offload = prog->aux->offload;
- struct net_device *netdev;
-
- ASSERT_RTNL();
-
- if (!offload)
- return -ENODEV;
- netdev = offload->netdev;
-
- data->command = cmd;
-
- return netdev->netdev_ops->ndo_bpf(netdev, data);
-}
-
-int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
-{
- struct netdev_bpf data = {};
- int err;
-
- data.verifier.prog = env->prog;
+ struct bpf_prog_offload *offload;
+ int ret = -ENODEV;
- rtnl_lock();
- err = __bpf_offload_ndo(env->prog, BPF_OFFLOAD_VERIFIER_PREP, &data);
- if (err)
- goto exit_unlock;
+ down_read(&bpf_devs_lock);
+ offload = prog->aux->offload;
+ if (offload) {
+ ret = offload->offdev->ops->prepare(prog);
+ offload->dev_state = !ret;
+ }
+ up_read(&bpf_devs_lock);
- env->prog->aux->offload->dev_ops = data.verifier.ops;
- env->prog->aux->offload->dev_state = true;
-exit_unlock:
- rtnl_unlock();
- return err;
+ return ret;
}
int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
@@ -166,7 +148,8 @@ int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
down_read(&bpf_devs_lock);
offload = env->prog->aux->offload;
if (offload)
- ret = offload->dev_ops->insn_hook(env, insn_idx, prev_insn_idx);
+ ret = offload->offdev->ops->insn_hook(env, insn_idx,
+ prev_insn_idx);
up_read(&bpf_devs_lock);
return ret;
@@ -180,8 +163,8 @@ int bpf_prog_offload_finalize(struct bpf_verifier_env *env)
down_read(&bpf_devs_lock);
offload = env->prog->aux->offload;
if (offload) {
- if (offload->dev_ops->finalize)
- ret = offload->dev_ops->finalize(env);
+ if (offload->offdev->ops->finalize)
+ ret = offload->offdev->ops->finalize(env);
else
ret = 0;
}
@@ -193,12 +176,9 @@ int bpf_prog_offload_finalize(struct bpf_verifier_env *env)
static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
{
struct bpf_prog_offload *offload = prog->aux->offload;
- struct netdev_bpf data = {};
-
- data.offload.prog = prog;
if (offload->dev_state)
- WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data));
+ offload->offdev->ops->destroy(prog);
/* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */
bpf_prog_free_id(prog, true);
@@ -210,24 +190,22 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
void bpf_prog_offload_destroy(struct bpf_prog *prog)
{
- rtnl_lock();
down_write(&bpf_devs_lock);
if (prog->aux->offload)
__bpf_prog_offload_destroy(prog);
up_write(&bpf_devs_lock);
- rtnl_unlock();
}
static int bpf_prog_offload_translate(struct bpf_prog *prog)
{
- struct netdev_bpf data = {};
- int ret;
-
- data.offload.prog = prog;
+ struct bpf_prog_offload *offload;
+ int ret = -ENODEV;
- rtnl_lock();
- ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data);
- rtnl_unlock();
+ down_read(&bpf_devs_lock);
+ offload = prog->aux->offload;
+ if (offload)
+ ret = offload->offdev->ops->translate(prog);
+ up_read(&bpf_devs_lock);
return ret;
}
@@ -655,7 +633,8 @@ unlock:
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister);
-struct bpf_offload_dev *bpf_offload_dev_create(void)
+struct bpf_offload_dev *
+bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops)
{
struct bpf_offload_dev *offdev;
int err;
@@ -673,6 +652,7 @@ struct bpf_offload_dev *bpf_offload_dev_create(void)
if (!offdev)
return ERR_PTR(-ENOMEM);
+ offdev->ops = ops;
INIT_LIST_HEAD(&offdev->netdevs);
return offdev;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cf5040fd5434..0607db304def 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -456,6 +456,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
}
int map_check_no_btf(const struct bpf_map *map,
+ const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
{
@@ -478,7 +479,7 @@ static int map_check_btf(const struct bpf_map *map, const struct btf *btf,
return -EINVAL;
if (map->ops->map_check_btf)
- ret = map->ops->map_check_btf(map, key_type, value_type);
+ ret = map->ops->map_check_btf(map, btf, key_type, value_type);
return ret;
}
@@ -1213,6 +1214,9 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
/* bpf_prog_free_id() must be called first */
bpf_prog_free_id(prog, do_idr_lock);
bpf_prog_kallsyms_del_all(prog);
+ btf_put(prog->aux->btf);
+ kvfree(prog->aux->func_info);
+ bpf_prog_free_linfo(prog);
call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
}
@@ -1437,9 +1441,9 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
}
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD expected_attach_type
+#define BPF_PROG_LOAD_LAST_FIELD line_info_cnt
-static int bpf_prog_load(union bpf_attr *attr)
+static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
{
enum bpf_prog_type type = attr->prog_type;
struct bpf_prog *prog;
@@ -1450,9 +1454,14 @@ static int bpf_prog_load(union bpf_attr *attr)
if (CHECK_ATTR(BPF_PROG_LOAD))
return -EINVAL;
- if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT)
+ if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT))
return -EINVAL;
+ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
+ (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
+ !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
/* copy eBPF program license from user space */
if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
sizeof(license) - 1) < 0)
@@ -1464,11 +1473,6 @@ static int bpf_prog_load(union bpf_attr *attr)
if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS)
return -E2BIG;
-
- if (type == BPF_PROG_TYPE_KPROBE &&
- attr->kern_version != LINUX_VERSION_CODE)
- return -EINVAL;
-
if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
type != BPF_PROG_TYPE_CGROUP_SKB &&
!capable(CAP_SYS_ADMIN))
@@ -1525,7 +1529,7 @@ static int bpf_prog_load(union bpf_attr *attr)
goto free_prog;
/* run eBPF verifier */
- err = bpf_check(&prog, attr);
+ err = bpf_check(&prog, attr, uattr);
if (err < 0)
goto free_used_maps;
@@ -1553,6 +1557,9 @@ static int bpf_prog_load(union bpf_attr *attr)
return err;
free_used_maps:
+ bpf_prog_free_linfo(prog);
+ kvfree(prog->aux->func_info);
+ btf_put(prog->aux->btf);
bpf_prog_kallsyms_del_subprogs(prog);
free_used_maps(prog->aux);
free_prog:
@@ -1597,6 +1604,7 @@ static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp)
bpf_probe_unregister(raw_tp->btp, raw_tp->prog);
bpf_prog_put(raw_tp->prog);
}
+ bpf_put_raw_tracepoint(raw_tp->btp);
kfree(raw_tp);
return 0;
}
@@ -1622,13 +1630,15 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
return -EFAULT;
tp_name[sizeof(tp_name) - 1] = 0;
- btp = bpf_find_raw_tracepoint(tp_name);
+ btp = bpf_get_raw_tracepoint(tp_name);
if (!btp)
return -ENOENT;
raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER);
- if (!raw_tp)
- return -ENOMEM;
+ if (!raw_tp) {
+ err = -ENOMEM;
+ goto out_put_btp;
+ }
raw_tp->btp = btp;
prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd,
@@ -1656,6 +1666,8 @@ out_put_prog:
bpf_prog_put(prog);
out_free_tp:
kfree(raw_tp);
+out_put_btp:
+ bpf_put_raw_tracepoint(btp);
return err;
}
@@ -2020,18 +2032,42 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)
insns[i + 1].imm = 0;
continue;
}
-
- if (!bpf_dump_raw_ok() &&
- imm == (unsigned long)prog->aux) {
- insns[i].imm = 0;
- insns[i + 1].imm = 0;
- continue;
- }
}
return insns;
}
+static int set_info_rec_size(struct bpf_prog_info *info)
+{
+ /*
+ * Ensure info.*_rec_size is the same as kernel expected size
+ *
+ * or
+ *
+ * Only allow zero *_rec_size if both _rec_size and _cnt are
+ * zero. In this case, the kernel will set the expected
+ * _rec_size back to the info.
+ */
+
+ if ((info->nr_func_info || info->func_info_rec_size) &&
+ info->func_info_rec_size != sizeof(struct bpf_func_info))
+ return -EINVAL;
+
+ if ((info->nr_line_info || info->line_info_rec_size) &&
+ info->line_info_rec_size != sizeof(struct bpf_line_info))
+ return -EINVAL;
+
+ if ((info->nr_jited_line_info || info->jited_line_info_rec_size) &&
+ info->jited_line_info_rec_size != sizeof(__u64))
+ return -EINVAL;
+
+ info->func_info_rec_size = sizeof(struct bpf_func_info);
+ info->line_info_rec_size = sizeof(struct bpf_line_info);
+ info->jited_line_info_rec_size = sizeof(__u64);
+
+ return 0;
+}
+
static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -2074,11 +2110,18 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
return -EFAULT;
}
+ err = set_info_rec_size(&info);
+ if (err)
+ return err;
+
if (!capable(CAP_SYS_ADMIN)) {
info.jited_prog_len = 0;
info.xlated_prog_len = 0;
info.nr_jited_ksyms = 0;
info.nr_jited_func_lens = 0;
+ info.nr_func_info = 0;
+ info.nr_line_info = 0;
+ info.nr_jited_line_info = 0;
goto done;
}
@@ -2160,7 +2203,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
ulen = info.nr_jited_ksyms;
info.nr_jited_ksyms = prog->aux->func_cnt ? : 1;
- if (info.nr_jited_ksyms && ulen) {
+ if (ulen) {
if (bpf_dump_raw_ok()) {
unsigned long ksym_addr;
u64 __user *user_ksyms;
@@ -2191,7 +2234,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
ulen = info.nr_jited_func_lens;
info.nr_jited_func_lens = prog->aux->func_cnt ? : 1;
- if (info.nr_jited_func_lens && ulen) {
+ if (ulen) {
if (bpf_dump_raw_ok()) {
u32 __user *user_lens;
u32 func_len, i;
@@ -2216,6 +2259,77 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
}
}
+ if (prog->aux->btf)
+ info.btf_id = btf_id(prog->aux->btf);
+
+ ulen = info.nr_func_info;
+ info.nr_func_info = prog->aux->func_info_cnt;
+ if (info.nr_func_info && ulen) {
+ char __user *user_finfo;
+
+ user_finfo = u64_to_user_ptr(info.func_info);
+ ulen = min_t(u32, info.nr_func_info, ulen);
+ if (copy_to_user(user_finfo, prog->aux->func_info,
+ info.func_info_rec_size * ulen))
+ return -EFAULT;
+ }
+
+ ulen = info.nr_line_info;
+ info.nr_line_info = prog->aux->nr_linfo;
+ if (info.nr_line_info && ulen) {
+ __u8 __user *user_linfo;
+
+ user_linfo = u64_to_user_ptr(info.line_info);
+ ulen = min_t(u32, info.nr_line_info, ulen);
+ if (copy_to_user(user_linfo, prog->aux->linfo,
+ info.line_info_rec_size * ulen))
+ return -EFAULT;
+ }
+
+ ulen = info.nr_jited_line_info;
+ if (prog->aux->jited_linfo)
+ info.nr_jited_line_info = prog->aux->nr_linfo;
+ else
+ info.nr_jited_line_info = 0;
+ if (info.nr_jited_line_info && ulen) {
+ if (bpf_dump_raw_ok()) {
+ __u64 __user *user_linfo;
+ u32 i;
+
+ user_linfo = u64_to_user_ptr(info.jited_line_info);
+ ulen = min_t(u32, info.nr_jited_line_info, ulen);
+ for (i = 0; i < ulen; i++) {
+ if (put_user((__u64)(long)prog->aux->jited_linfo[i],
+ &user_linfo[i]))
+ return -EFAULT;
+ }
+ } else {
+ info.jited_line_info = 0;
+ }
+ }
+
+ ulen = info.nr_prog_tags;
+ info.nr_prog_tags = prog->aux->func_cnt ? : 1;
+ if (ulen) {
+ __u8 __user (*user_prog_tags)[BPF_TAG_SIZE];
+ u32 i;
+
+ user_prog_tags = u64_to_user_ptr(info.prog_tags);
+ ulen = min_t(u32, info.nr_prog_tags, ulen);
+ if (prog->aux->func_cnt) {
+ for (i = 0; i < ulen; i++) {
+ if (copy_to_user(user_prog_tags[i],
+ prog->aux->func[i]->tag,
+ BPF_TAG_SIZE))
+ return -EFAULT;
+ }
+ } else {
+ if (copy_to_user(user_prog_tags[0],
+ prog->tag, BPF_TAG_SIZE))
+ return -EFAULT;
+ }
+ }
+
done:
if (copy_to_user(uinfo, &info, info_len) ||
put_user(info_len, &uattr->info.info_len))
@@ -2501,7 +2615,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
err = map_get_next_key(&attr);
break;
case BPF_PROG_LOAD:
- err = bpf_prog_load(&attr);
+ err = bpf_prog_load(&attr, uattr);
break;
case BPF_OBJ_PIN:
err = bpf_obj_pin(&attr);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 51ba84d4d34a..71d86e3024ae 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -11,10 +11,12 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
+#include <uapi/linux/btf.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/bpf.h>
+#include <linux/btf.h>
#include <linux/bpf_verifier.h>
#include <linux/filter.h>
#include <net/netlink.h>
@@ -24,6 +26,7 @@
#include <linux/bsearch.h>
#include <linux/sort.h>
#include <linux/perf_event.h>
+#include <linux/ctype.h>
#include "disasm.h"
@@ -214,6 +217,27 @@ struct bpf_call_arg_meta {
static DEFINE_MUTEX(bpf_verifier_lock);
+static const struct bpf_line_info *
+find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
+{
+ const struct bpf_line_info *linfo;
+ const struct bpf_prog *prog;
+ u32 i, nr_linfo;
+
+ prog = env->prog;
+ nr_linfo = prog->aux->nr_linfo;
+
+ if (!nr_linfo || insn_off >= prog->len)
+ return NULL;
+
+ linfo = prog->aux->linfo;
+ for (i = 1; i < nr_linfo; i++)
+ if (insn_off < linfo[i].insn_off)
+ break;
+
+ return &linfo[i - 1];
+}
+
void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
va_list args)
{
@@ -264,6 +288,42 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
va_end(args);
}
+static const char *ltrim(const char *s)
+{
+ while (isspace(*s))
+ s++;
+
+ return s;
+}
+
+__printf(3, 4) static void verbose_linfo(struct bpf_verifier_env *env,
+ u32 insn_off,
+ const char *prefix_fmt, ...)
+{
+ const struct bpf_line_info *linfo;
+
+ if (!bpf_verifier_log_needed(&env->log))
+ return;
+
+ linfo = find_linfo(env, insn_off);
+ if (!linfo || linfo == env->prev_linfo)
+ return;
+
+ if (prefix_fmt) {
+ va_list args;
+
+ va_start(args, prefix_fmt);
+ bpf_verifier_vlog(&env->log, prefix_fmt, args);
+ va_end(args);
+ }
+
+ verbose(env, "%s\n",
+ ltrim(btf_name_by_offset(env->prog->aux->btf,
+ linfo->line_off)));
+
+ env->prev_linfo = linfo;
+}
+
static bool type_is_pkt_pointer(enum bpf_reg_type type)
{
return type == PTR_TO_PACKET ||
@@ -337,12 +397,14 @@ static char slot_type_char[] = {
static void print_liveness(struct bpf_verifier_env *env,
enum bpf_reg_liveness live)
{
- if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN))
+ if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE))
verbose(env, "_");
if (live & REG_LIVE_READ)
verbose(env, "r");
if (live & REG_LIVE_WRITTEN)
verbose(env, "w");
+ if (live & REG_LIVE_DONE)
+ verbose(env, "D");
}
static struct bpf_func_state *func(struct bpf_verifier_env *env,
@@ -1072,6 +1134,12 @@ static int mark_reg_read(struct bpf_verifier_env *env,
/* if read wasn't screened by an earlier write ... */
if (writes && state->live & REG_LIVE_WRITTEN)
break;
+ if (parent->live & REG_LIVE_DONE) {
+ verbose(env, "verifier BUG type %s var_off %lld off %d\n",
+ reg_type_str[parent->type],
+ parent->var_off.value, parent->off);
+ return -EFAULT;
+ }
/* ... then we depend on parent's value */
parent->live |= REG_LIVE_READ;
state = parent;
@@ -1218,6 +1286,10 @@ static int check_stack_write(struct bpf_verifier_env *env,
/* regular write of data into stack destroys any spilled ptr */
state->stack[spi].spilled_ptr.type = NOT_INIT;
+ /* Mark slots as STACK_MISC if they belonged to spilled ptr. */
+ if (state->stack[spi].slot_type[0] == STACK_SPILL)
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ state->stack[spi].slot_type[i] = STACK_MISC;
/* only mark the slot as written if all 8 bytes were written
* otherwise read propagation may incorrectly stop too soon
@@ -1235,6 +1307,7 @@ static int check_stack_write(struct bpf_verifier_env *env,
register_is_null(&cur->regs[value_regno]))
type = STACK_ZERO;
+ /* Mark slots affected by this stack write. */
for (i = 0; i < size; i++)
state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
type;
@@ -1456,6 +1529,17 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
verbose(env, "R%d offset is outside of the packet\n", regno);
return err;
}
+
+ /* __check_packet_access has made sure "off + size - 1" is within u16.
+ * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff,
+ * otherwise find_good_pkt_pointers would have refused to set range info
+ * that __check_packet_access would have rejected this pkt access.
+ * Therefore, "off + reg->umax_value + size - 1" won't overflow u32.
+ */
+ env->prog->aux->max_pkt_offset =
+ max_t(u32, env->prog->aux->max_pkt_offset,
+ off + reg->umax_value + size - 1);
+
return err;
}
@@ -3571,12 +3655,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
if (BPF_SRC(insn->code) == BPF_X) {
+ struct bpf_reg_state *src_reg = regs + insn->src_reg;
+ struct bpf_reg_state *dst_reg = regs + insn->dst_reg;
+
if (BPF_CLASS(insn->code) == BPF_ALU64) {
/* case: R1 = R2
* copy register state to dest reg
*/
- regs[insn->dst_reg] = regs[insn->src_reg];
- regs[insn->dst_reg].live |= REG_LIVE_WRITTEN;
+ *dst_reg = *src_reg;
+ dst_reg->live |= REG_LIVE_WRITTEN;
} else {
/* R1 = (u32) R2 */
if (is_pointer_value(env, insn->src_reg)) {
@@ -3584,9 +3671,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
"R%d partial copy of pointer\n",
insn->src_reg);
return -EACCES;
+ } else if (src_reg->type == SCALAR_VALUE) {
+ *dst_reg = *src_reg;
+ dst_reg->live |= REG_LIVE_WRITTEN;
+ } else {
+ mark_reg_unknown(env, regs,
+ insn->dst_reg);
}
- mark_reg_unknown(env, regs, insn->dst_reg);
- coerce_reg_to_size(&regs[insn->dst_reg], 4);
+ coerce_reg_to_size(dst_reg, 4);
}
} else {
/* case: R = imm
@@ -3637,11 +3729,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
- if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) {
- verbose(env, "BPF_ARSH not supported for 32 bit ALU\n");
- return -EINVAL;
- }
-
if ((opcode == BPF_LSH || opcode == BPF_RSH ||
opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
@@ -3772,6 +3859,12 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
if (tnum_is_const(reg->var_off))
return !tnum_equals_const(reg->var_off, val);
break;
+ case BPF_JSET:
+ if ((~reg->var_off.mask & reg->var_off.value) & val)
+ return 1;
+ if (!((reg->var_off.mask | reg->var_off.value) & val))
+ return 0;
+ break;
case BPF_JGT:
if (reg->umin_value > val)
return 1;
@@ -3856,6 +3949,13 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
*/
__mark_reg_known(false_reg, val);
break;
+ case BPF_JSET:
+ false_reg->var_off = tnum_and(false_reg->var_off,
+ tnum_const(~val));
+ if (is_power_of_2(val))
+ true_reg->var_off = tnum_or(true_reg->var_off,
+ tnum_const(val));
+ break;
case BPF_JGT:
false_reg->umax_value = min(false_reg->umax_value, val);
true_reg->umin_value = max(true_reg->umin_value, val + 1);
@@ -3928,6 +4028,13 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
*/
__mark_reg_known(false_reg, val);
break;
+ case BPF_JSET:
+ false_reg->var_off = tnum_and(false_reg->var_off,
+ tnum_const(~val));
+ if (is_power_of_2(val))
+ true_reg->var_off = tnum_or(true_reg->var_off,
+ tnum_const(val));
+ break;
case BPF_JGT:
true_reg->umax_value = min(true_reg->umax_value, val - 1);
false_reg->umin_value = max(false_reg->umin_value, val);
@@ -4545,6 +4652,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
return 0;
if (w < 0 || w >= env->prog->len) {
+ verbose_linfo(env, t, "%d: ", t);
verbose(env, "jump out of range from insn %d to %d\n", t, w);
return -EINVAL;
}
@@ -4562,6 +4670,8 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
insn_stack[cur_stack++] = w;
return 1;
} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
+ verbose_linfo(env, t, "%d: ", t);
+ verbose_linfo(env, w, "%d: ", w);
verbose(env, "back-edge from insn %d to %d\n", t, w);
return -EINVAL;
} else if (insn_state[w] == EXPLORED) {
@@ -4584,10 +4694,6 @@ static int check_cfg(struct bpf_verifier_env *env)
int ret = 0;
int i, t;
- ret = check_subprogs(env);
- if (ret < 0)
- return ret;
-
insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
if (!insn_state)
return -ENOMEM;
@@ -4696,6 +4802,277 @@ err_free:
return ret;
}
+/* The minimum supported BTF func info size */
+#define MIN_BPF_FUNCINFO_SIZE 8
+#define MAX_FUNCINFO_REC_SIZE 252
+
+static int check_btf_func(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ u32 i, nfuncs, urec_size, min_size, prev_offset;
+ u32 krec_size = sizeof(struct bpf_func_info);
+ struct bpf_func_info *krecord;
+ const struct btf_type *type;
+ struct bpf_prog *prog;
+ const struct btf *btf;
+ void __user *urecord;
+ int ret = 0;
+
+ nfuncs = attr->func_info_cnt;
+ if (!nfuncs)
+ return 0;
+
+ if (nfuncs != env->subprog_cnt) {
+ verbose(env, "number of funcs in func_info doesn't match number of subprogs\n");
+ return -EINVAL;
+ }
+
+ urec_size = attr->func_info_rec_size;
+ if (urec_size < MIN_BPF_FUNCINFO_SIZE ||
+ urec_size > MAX_FUNCINFO_REC_SIZE ||
+ urec_size % sizeof(u32)) {
+ verbose(env, "invalid func info rec size %u\n", urec_size);
+ return -EINVAL;
+ }
+
+ prog = env->prog;
+ btf = prog->aux->btf;
+
+ urecord = u64_to_user_ptr(attr->func_info);
+ min_size = min_t(u32, krec_size, urec_size);
+
+ krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN);
+ if (!krecord)
+ return -ENOMEM;
+
+ for (i = 0; i < nfuncs; i++) {
+ ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size);
+ if (ret) {
+ if (ret == -E2BIG) {
+ verbose(env, "nonzero tailing record in func info");
+ /* set the size kernel expects so loader can zero
+ * out the rest of the record.
+ */
+ if (put_user(min_size, &uattr->func_info_rec_size))
+ ret = -EFAULT;
+ }
+ goto err_free;
+ }
+
+ if (copy_from_user(&krecord[i], urecord, min_size)) {
+ ret = -EFAULT;
+ goto err_free;
+ }
+
+ /* check insn_off */
+ if (i == 0) {
+ if (krecord[i].insn_off) {
+ verbose(env,
+ "nonzero insn_off %u for the first func info record",
+ krecord[i].insn_off);
+ ret = -EINVAL;
+ goto err_free;
+ }
+ } else if (krecord[i].insn_off <= prev_offset) {
+ verbose(env,
+ "same or smaller insn offset (%u) than previous func info record (%u)",
+ krecord[i].insn_off, prev_offset);
+ ret = -EINVAL;
+ goto err_free;
+ }
+
+ if (env->subprog_info[i].start != krecord[i].insn_off) {
+ verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
+ ret = -EINVAL;
+ goto err_free;
+ }
+
+ /* check type_id */
+ type = btf_type_by_id(btf, krecord[i].type_id);
+ if (!type || BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) {
+ verbose(env, "invalid type id %d in func info",
+ krecord[i].type_id);
+ ret = -EINVAL;
+ goto err_free;
+ }
+
+ prev_offset = krecord[i].insn_off;
+ urecord += urec_size;
+ }
+
+ prog->aux->func_info = krecord;
+ prog->aux->func_info_cnt = nfuncs;
+ return 0;
+
+err_free:
+ kvfree(krecord);
+ return ret;
+}
+
+static void adjust_btf_func(struct bpf_verifier_env *env)
+{
+ int i;
+
+ if (!env->prog->aux->func_info)
+ return;
+
+ for (i = 0; i < env->subprog_cnt; i++)
+ env->prog->aux->func_info[i].insn_off = env->subprog_info[i].start;
+}
+
+#define MIN_BPF_LINEINFO_SIZE (offsetof(struct bpf_line_info, line_col) + \
+ sizeof(((struct bpf_line_info *)(0))->line_col))
+#define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE
+
+static int check_btf_line(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0;
+ struct bpf_subprog_info *sub;
+ struct bpf_line_info *linfo;
+ struct bpf_prog *prog;
+ const struct btf *btf;
+ void __user *ulinfo;
+ int err;
+
+ nr_linfo = attr->line_info_cnt;
+ if (!nr_linfo)
+ return 0;
+
+ rec_size = attr->line_info_rec_size;
+ if (rec_size < MIN_BPF_LINEINFO_SIZE ||
+ rec_size > MAX_LINEINFO_REC_SIZE ||
+ rec_size & (sizeof(u32) - 1))
+ return -EINVAL;
+
+ /* Need to zero it in case the userspace may
+ * pass in a smaller bpf_line_info object.
+ */
+ linfo = kvcalloc(nr_linfo, sizeof(struct bpf_line_info),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!linfo)
+ return -ENOMEM;
+
+ prog = env->prog;
+ btf = prog->aux->btf;
+
+ s = 0;
+ sub = env->subprog_info;
+ ulinfo = u64_to_user_ptr(attr->line_info);
+ expected_size = sizeof(struct bpf_line_info);
+ ncopy = min_t(u32, expected_size, rec_size);
+ for (i = 0; i < nr_linfo; i++) {
+ err = bpf_check_uarg_tail_zero(ulinfo, expected_size, rec_size);
+ if (err) {
+ if (err == -E2BIG) {
+ verbose(env, "nonzero tailing record in line_info");
+ if (put_user(expected_size,
+ &uattr->line_info_rec_size))
+ err = -EFAULT;
+ }
+ goto err_free;
+ }
+
+ if (copy_from_user(&linfo[i], ulinfo, ncopy)) {
+ err = -EFAULT;
+ goto err_free;
+ }
+
+ /*
+ * Check insn_off to ensure
+ * 1) strictly increasing AND
+ * 2) bounded by prog->len
+ *
+ * The linfo[0].insn_off == 0 check logically falls into
+ * the later "missing bpf_line_info for func..." case
+ * because the first linfo[0].insn_off must be the
+ * first sub also and the first sub must have
+ * subprog_info[0].start == 0.
+ */
+ if ((i && linfo[i].insn_off <= prev_offset) ||
+ linfo[i].insn_off >= prog->len) {
+ verbose(env, "Invalid line_info[%u].insn_off:%u (prev_offset:%u prog->len:%u)\n",
+ i, linfo[i].insn_off, prev_offset,
+ prog->len);
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ if (!prog->insnsi[linfo[i].insn_off].code) {
+ verbose(env,
+ "Invalid insn code at line_info[%u].insn_off\n",
+ i);
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ if (!btf_name_by_offset(btf, linfo[i].line_off) ||
+ !btf_name_by_offset(btf, linfo[i].file_name_off)) {
+ verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i);
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ if (s != env->subprog_cnt) {
+ if (linfo[i].insn_off == sub[s].start) {
+ sub[s].linfo_idx = i;
+ s++;
+ } else if (sub[s].start < linfo[i].insn_off) {
+ verbose(env, "missing bpf_line_info for func#%u\n", s);
+ err = -EINVAL;
+ goto err_free;
+ }
+ }
+
+ prev_offset = linfo[i].insn_off;
+ ulinfo += rec_size;
+ }
+
+ if (s != env->subprog_cnt) {
+ verbose(env, "missing bpf_line_info for %u funcs starting from func#%u\n",
+ env->subprog_cnt - s, s);
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ prog->aux->linfo = linfo;
+ prog->aux->nr_linfo = nr_linfo;
+
+ return 0;
+
+err_free:
+ kvfree(linfo);
+ return err;
+}
+
+static int check_btf_info(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct btf *btf;
+ int err;
+
+ if (!attr->func_info_cnt && !attr->line_info_cnt)
+ return 0;
+
+ btf = btf_get_by_fd(attr->prog_btf_fd);
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+ env->prog->aux->btf = btf;
+
+ err = check_btf_func(env, attr, uattr);
+ if (err)
+ return err;
+
+ err = check_btf_line(env, attr, uattr);
+ if (err)
+ return err;
+
+ return 0;
+}
+
/* check %cur's range satisfies %old's */
static bool range_within(struct bpf_reg_state *old,
struct bpf_reg_state *cur)
@@ -4742,6 +5119,102 @@ static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap)
return false;
}
+static void clean_func_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *st)
+{
+ enum bpf_reg_liveness live;
+ int i, j;
+
+ for (i = 0; i < BPF_REG_FP; i++) {
+ live = st->regs[i].live;
+ /* liveness must not touch this register anymore */
+ st->regs[i].live |= REG_LIVE_DONE;
+ if (!(live & REG_LIVE_READ))
+ /* since the register is unused, clear its state
+ * to make further comparison simpler
+ */
+ __mark_reg_not_init(&st->regs[i]);
+ }
+
+ for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) {
+ live = st->stack[i].spilled_ptr.live;
+ /* liveness must not touch this stack slot anymore */
+ st->stack[i].spilled_ptr.live |= REG_LIVE_DONE;
+ if (!(live & REG_LIVE_READ)) {
+ __mark_reg_not_init(&st->stack[i].spilled_ptr);
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ st->stack[i].slot_type[j] = STACK_INVALID;
+ }
+ }
+}
+
+static void clean_verifier_state(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st)
+{
+ int i;
+
+ if (st->frame[0]->regs[0].live & REG_LIVE_DONE)
+ /* all regs in this state in all frames were already marked */
+ return;
+
+ for (i = 0; i <= st->curframe; i++)
+ clean_func_state(env, st->frame[i]);
+}
+
+/* the parentage chains form a tree.
+ * the verifier states are added to state lists at given insn and
+ * pushed into state stack for future exploration.
+ * when the verifier reaches bpf_exit insn some of the verifer states
+ * stored in the state lists have their final liveness state already,
+ * but a lot of states will get revised from liveness point of view when
+ * the verifier explores other branches.
+ * Example:
+ * 1: r0 = 1
+ * 2: if r1 == 100 goto pc+1
+ * 3: r0 = 2
+ * 4: exit
+ * when the verifier reaches exit insn the register r0 in the state list of
+ * insn 2 will be seen as !REG_LIVE_READ. Then the verifier pops the other_branch
+ * of insn 2 and goes exploring further. At the insn 4 it will walk the
+ * parentage chain from insn 4 into insn 2 and will mark r0 as REG_LIVE_READ.
+ *
+ * Since the verifier pushes the branch states as it sees them while exploring
+ * the program the condition of walking the branch instruction for the second
+ * time means that all states below this branch were already explored and
+ * their final liveness markes are already propagated.
+ * Hence when the verifier completes the search of state list in is_state_visited()
+ * we can call this clean_live_states() function to mark all liveness states
+ * as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state'
+ * will not be used.
+ * This function also clears the registers and stack for states that !READ
+ * to simplify state merging.
+ *
+ * Important note here that walking the same branch instruction in the callee
+ * doesn't meant that the states are DONE. The verifier has to compare
+ * the callsites
+ */
+static void clean_live_states(struct bpf_verifier_env *env, int insn,
+ struct bpf_verifier_state *cur)
+{
+ struct bpf_verifier_state_list *sl;
+ int i;
+
+ sl = env->explored_states[insn];
+ if (!sl)
+ return;
+
+ while (sl != STATE_LIST_MARK) {
+ if (sl->state.curframe != cur->curframe)
+ goto next;
+ for (i = 0; i <= cur->curframe; i++)
+ if (sl->state.frame[i]->callsite != cur->frame[i]->callsite)
+ goto next;
+ clean_verifier_state(env, &sl->state);
+next:
+ sl = sl->next;
+ }
+}
+
/* Returns true if (rold safe implies rcur safe) */
static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
struct idpair *idmap)
@@ -4855,12 +5328,6 @@ static bool stacksafe(struct bpf_func_state *old,
{
int i, spi;
- /* if explored stack has more populated slots than current stack
- * such stacks are not equivalent
- */
- if (old->allocated_stack > cur->allocated_stack)
- return false;
-
/* walk slots of the explored stack and ignore any additional
* slots in the current stack, since explored(safe) state
* didn't use them
@@ -4868,12 +5335,21 @@ static bool stacksafe(struct bpf_func_state *old,
for (i = 0; i < old->allocated_stack; i++) {
spi = i / BPF_REG_SIZE;
- if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ))
+ if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) {
+ i += BPF_REG_SIZE - 1;
/* explored state didn't use this */
continue;
+ }
if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
continue;
+
+ /* explored stack has more populated slots than current stack
+ * and these slots were used
+ */
+ if (i >= cur->allocated_stack)
+ return false;
+
/* if old state was safe with misc data in the stack
* it will be safe with zero-initialized stack.
* The opposite is not true
@@ -5057,6 +5533,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
*/
return 0;
+ clean_live_states(env, insn_idx, cur);
+
while (sl != STATE_LIST_MARK) {
if (states_equal(env, &sl->state, cur)) {
/* reached equivalent register/stack state,
@@ -5176,6 +5654,8 @@ static int do_check(struct bpf_verifier_env *env)
int insn_processed = 0;
bool do_print_state = false;
+ env->prev_linfo = NULL;
+
state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
if (!state)
return -ENOMEM;
@@ -5249,6 +5729,7 @@ static int do_check(struct bpf_verifier_env *env)
.private_data = env,
};
+ verbose_linfo(env, insn_idx, "; ");
verbose(env, "%d: ", insn_idx);
print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
}
@@ -5789,10 +6270,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
int i, cnt, size, ctx_field_size, delta = 0;
const int insn_cnt = env->prog->len;
struct bpf_insn insn_buf[16], *insn;
+ u32 target_size, size_default, off;
struct bpf_prog *new_prog;
enum bpf_access_type type;
bool is_narrower_load;
- u32 target_size;
if (ops->gen_prologue || env->seen_direct_write) {
if (!ops->gen_prologue) {
@@ -5885,9 +6366,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
* we will apply proper mask to the result.
*/
is_narrower_load = size < ctx_field_size;
+ size_default = bpf_ctx_off_adjust_machine(ctx_field_size);
+ off = insn->off;
if (is_narrower_load) {
- u32 size_default = bpf_ctx_off_adjust_machine(ctx_field_size);
- u32 off = insn->off;
u8 size_code;
if (type == BPF_WRITE) {
@@ -5915,12 +6396,23 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
}
if (is_narrower_load && size < target_size) {
- if (ctx_field_size <= 4)
+ u8 shift = (off & (size_default - 1)) * 8;
+
+ if (ctx_field_size <= 4) {
+ if (shift)
+ insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
+ insn->dst_reg,
+ shift);
insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
(1 << size * 8) - 1);
- else
+ } else {
+ if (shift)
+ insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH,
+ insn->dst_reg,
+ shift);
insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
(1 << size * 8) - 1);
+ }
}
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
@@ -5943,7 +6435,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
int i, j, subprog_start, subprog_end = 0, len, subprog;
struct bpf_insn *insn;
void *old_bpf_func;
- int err = -ENOMEM;
+ int err;
if (env->subprog_cnt <= 1)
return 0;
@@ -5974,6 +6466,11 @@ static int jit_subprogs(struct bpf_verifier_env *env)
insn->imm = 1;
}
+ err = bpf_prog_alloc_jited_linfo(prog);
+ if (err)
+ goto out_undo_insn;
+
+ err = -ENOMEM;
func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL);
if (!func)
goto out_undo_insn;
@@ -5993,12 +6490,21 @@ static int jit_subprogs(struct bpf_verifier_env *env)
if (bpf_prog_calc_tag(func[i]))
goto out_free;
func[i]->is_func = 1;
+ func[i]->aux->func_idx = i;
+ /* the btf and func_info will be freed only at prog->aux */
+ func[i]->aux->btf = prog->aux->btf;
+ func[i]->aux->func_info = prog->aux->func_info;
+
/* Use bpf_prog_F_tag to indicate functions in stack traces.
* Long term would need debug info to populate names
*/
func[i]->aux->name[0] = 'F';
func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
func[i]->jit_requested = 1;
+ func[i]->aux->linfo = prog->aux->linfo;
+ func[i]->aux->nr_linfo = prog->aux->nr_linfo;
+ func[i]->aux->jited_linfo = prog->aux->jited_linfo;
+ func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx;
func[i] = bpf_int_jit_compile(func[i]);
if (!func[i]->jited) {
err = -ENOTSUPP;
@@ -6072,6 +6578,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
prog->bpf_func = func[0]->bpf_func;
prog->aux->func = func;
prog->aux->func_cnt = env->subprog_cnt;
+ bpf_prog_free_unused_jited_linfo(prog);
return 0;
out_free:
for (i = 0; i < env->subprog_cnt; i++)
@@ -6088,6 +6595,7 @@ out_undo_insn:
insn->off = 0;
insn->imm = env->insn_aux_data[i].call_imm;
}
+ bpf_prog_free_jited_linfo(prog);
return err;
}
@@ -6220,6 +6728,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
*/
prog->cb_access = 1;
env->prog->aux->stack_depth = MAX_BPF_STACK;
+ env->prog->aux->max_pkt_offset = MAX_PACKET_OFF;
/* mark bpf_tail_call as different opcode to avoid
* conditional branch in the interpeter for every normal
@@ -6384,7 +6893,8 @@ static void free_states(struct bpf_verifier_env *env)
kfree(env->explored_states);
}
-int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
+ union bpf_attr __user *uattr)
{
struct bpf_verifier_env *env;
struct bpf_verifier_log *log;
@@ -6432,13 +6942,15 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
env->strict_alignment = true;
+ if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
+ env->strict_alignment = false;
ret = replace_map_fd_with_map_ptr(env);
if (ret < 0)
goto skip_full_check;
if (bpf_prog_is_dev_bound(env->prog->aux)) {
- ret = bpf_prog_offload_verifier_prep(env);
+ ret = bpf_prog_offload_verifier_prep(env->prog);
if (ret)
goto skip_full_check;
}
@@ -6452,6 +6964,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
+ ret = check_subprogs(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = check_btf_info(env, attr, uattr);
+ if (ret < 0)
+ goto skip_full_check;
+
ret = check_cfg(env);
if (ret < 0)
goto skip_full_check;
@@ -6470,10 +6990,11 @@ skip_full_check:
free_states(env);
if (ret == 0)
- sanitize_dead_code(env);
+ ret = check_max_stack_depth(env);
+ /* instruction rewrites happen after this point */
if (ret == 0)
- ret = check_max_stack_depth(env);
+ sanitize_dead_code(env);
if (ret == 0)
/* program is valid, convert *(u32*)(ctx + off) accesses */
@@ -6513,6 +7034,9 @@ skip_full_check:
convert_pseudo_ld_imm64(env);
}
+ if (ret == 0)
+ adjust_btf_func(env);
+
err_release_maps:
if (!env->prog->aux->used_maps)
/* if we didn't copy map pointers into bpf_prog_info, release
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 75568fcf2180..c950864016e2 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -11,6 +11,8 @@
#define TRACE_CGROUP_PATH_LEN 1024
extern spinlock_t trace_cgroup_path_lock;
extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
+extern bool cgroup_debug;
+extern void __init enable_debug_cgroup(void);
/*
* cgroup_path() takes a spin lock. It is good practice not to take
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 51063e7a93c2..583b969b0c0e 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -27,6 +27,9 @@
/* Controllers blocked by the commandline in v1 */
static u16 cgroup_no_v1_mask;
+/* disable named v1 mounts */
+static bool cgroup_no_v1_named;
+
/*
* pidlist destructions need to be flushed on cgroup destruction. Use a
* separate workqueue as flush domain.
@@ -963,6 +966,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
}
if (!strncmp(token, "name=", 5)) {
const char *name = token + 5;
+
+ /* blocked by boot param? */
+ if (cgroup_no_v1_named)
+ return -ENOENT;
/* Can't specify an empty name */
if (!strlen(name))
return -EINVAL;
@@ -1292,7 +1299,12 @@ static int __init cgroup_no_v1(char *str)
if (!strcmp(token, "all")) {
cgroup_no_v1_mask = U16_MAX;
- break;
+ continue;
+ }
+
+ if (!strcmp(token, "named")) {
+ cgroup_no_v1_named = true;
+ continue;
}
for_each_subsys(ss, i) {
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 7a8429f8e280..f31bd61c9466 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -86,6 +86,7 @@ EXPORT_SYMBOL_GPL(css_set_lock);
DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
+bool cgroup_debug __read_mostly;
/*
* Protects cgroup_idr and css_idr so that IDs can be released without
@@ -493,7 +494,7 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
}
/**
- * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest (%NULL returns @cgrp->self)
*
@@ -502,8 +503,8 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
* enabled. If @ss is associated with the hierarchy @cgrp is on, this
* function is guaranteed to return non-NULL css.
*/
-static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
- struct cgroup_subsys *ss)
+static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
{
lockdep_assert_held(&cgroup_mutex);
@@ -524,6 +525,35 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
}
/**
+ * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get the effective css of @cgrp for @ss. The effective css is
+ * defined as the matching css of the nearest ancestor including self which
+ * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
+ * the root css is returned, so this function always returns a valid css.
+ *
+ * The returned css is not guaranteed to be online, and therefore it is the
+ * callers responsiblity to tryget a reference for it.
+ */
+struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ struct cgroup_subsys_state *css;
+
+ do {
+ css = cgroup_css(cgrp, ss);
+
+ if (css)
+ return css;
+ cgrp = cgroup_parent(cgrp);
+ } while (cgrp);
+
+ return init_css_set.subsys[ss->id];
+}
+
+/**
* cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest
@@ -605,10 +635,11 @@ EXPORT_SYMBOL_GPL(of_css);
*
* Should be called under cgroup_[tree_]mutex.
*/
-#define for_each_e_css(css, ssid, cgrp) \
- for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
- if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
- ; \
+#define for_each_e_css(css, ssid, cgrp) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
+ if (!((css) = cgroup_e_css_by_mask(cgrp, \
+ cgroup_subsys[(ssid)]))) \
+ ; \
else
/**
@@ -1007,7 +1038,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
* @ss is in this hierarchy, so we want the
* effective css from @cgrp.
*/
- template[i] = cgroup_e_css(cgrp, ss);
+ template[i] = cgroup_e_css_by_mask(cgrp, ss);
} else {
/*
* @ss is not in this hierarchy, so we don't want
@@ -1399,12 +1430,15 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
struct cgroup_subsys *ss = cft->ss;
if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
- !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
- snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
- cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
+ !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
+ const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";
+
+ snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
+ dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
cft->name);
- else
+ } else {
strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+ }
return buf;
}
@@ -1744,7 +1778,7 @@ static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
*root_flags = 0;
- if (!data)
+ if (!data || *data == '\0')
return 0;
while ((token = strsep(&data, ",")) != NULL) {
@@ -3024,7 +3058,7 @@ static int cgroup_apply_control(struct cgroup *cgrp)
return ret;
/*
- * At this point, cgroup_e_css() results reflect the new csses
+ * At this point, cgroup_e_css_by_mask() results reflect the new csses
* making the following cgroup_update_dfl_csses() properly update
* css associations of all tasks in the subtree.
*/
@@ -3639,7 +3673,8 @@ restart:
continue;
if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
continue;
-
+ if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
+ continue;
if (is_add) {
ret = cgroup_add_file(css, cgrp, cft);
if (ret) {
@@ -4202,20 +4237,25 @@ static void css_task_iter_advance(struct css_task_iter *it)
lockdep_assert_held(&css_set_lock);
repeat:
- /*
- * Advance iterator to find next entry. cset->tasks is consumed
- * first and then ->mg_tasks. After ->mg_tasks, we move onto the
- * next cset.
- */
- next = it->task_pos->next;
+ if (it->task_pos) {
+ /*
+ * Advance iterator to find next entry. cset->tasks is
+ * consumed first and then ->mg_tasks. After ->mg_tasks,
+ * we move onto the next cset.
+ */
+ next = it->task_pos->next;
- if (next == it->tasks_head)
- next = it->mg_tasks_head->next;
+ if (next == it->tasks_head)
+ next = it->mg_tasks_head->next;
- if (next == it->mg_tasks_head)
+ if (next == it->mg_tasks_head)
+ css_task_iter_advance_css_set(it);
+ else
+ it->task_pos = next;
+ } else {
+ /* called from start, proceed to the first cset */
css_task_iter_advance_css_set(it);
- else
- it->task_pos = next;
+ }
/* if PROCS, skip over tasks which aren't group leaders */
if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
@@ -4255,7 +4295,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
it->cset_head = it->cset_pos;
- css_task_iter_advance_css_set(it);
+ css_task_iter_advance(it);
spin_unlock_irq(&css_set_lock);
}
@@ -5743,6 +5783,16 @@ static int __init cgroup_disable(char *str)
}
__setup("cgroup_disable=", cgroup_disable);
+void __init __weak enable_debug_cgroup(void) { }
+
+static int __init enable_cgroup_debug(char *str)
+{
+ cgroup_debug = true;
+ enable_debug_cgroup();
+ return 1;
+}
+__setup("cgroup_debug", enable_cgroup_debug);
+
/**
* css_tryget_online_from_dir - get corresponding css from a cgroup dentry
* @dentry: directory dentry of interest
@@ -5978,10 +6028,8 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf,
ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
- if (unlikely(ret >= size)) {
- WARN_ON(1);
+ if (WARN_ON(ret >= size))
break;
- }
}
return ret;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 266f10cb7222..479743db6c37 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -110,6 +110,16 @@ struct cpuset {
nodemask_t effective_mems;
/*
+ * CPUs allocated to child sub-partitions (default hierarchy only)
+ * - CPUs granted by the parent = effective_cpus U subparts_cpus
+ * - effective_cpus and subparts_cpus are mutually exclusive.
+ *
+ * effective_cpus contains only onlined CPUs, but subparts_cpus
+ * may have offlined ones.
+ */
+ cpumask_var_t subparts_cpus;
+
+ /*
* This is old Memory Nodes tasks took on.
*
* - top_cpuset.old_mems_allowed is initialized to mems_allowed.
@@ -134,6 +144,47 @@ struct cpuset {
/* for custom sched domain */
int relax_domain_level;
+
+ /* number of CPUs in subparts_cpus */
+ int nr_subparts_cpus;
+
+ /* partition root state */
+ int partition_root_state;
+
+ /*
+ * Default hierarchy only:
+ * use_parent_ecpus - set if using parent's effective_cpus
+ * child_ecpus_count - # of children with use_parent_ecpus set
+ */
+ int use_parent_ecpus;
+ int child_ecpus_count;
+};
+
+/*
+ * Partition root states:
+ *
+ * 0 - not a partition root
+ *
+ * 1 - partition root
+ *
+ * -1 - invalid partition root
+ * None of the cpus in cpus_allowed can be put into the parent's
+ * subparts_cpus. In this case, the cpuset is not a real partition
+ * root anymore. However, the CPU_EXCLUSIVE bit will still be set
+ * and the cpuset can be restored back to a partition root if the
+ * parent cpuset can give more CPUs back to this child cpuset.
+ */
+#define PRS_DISABLED 0
+#define PRS_ENABLED 1
+#define PRS_ERROR -1
+
+/*
+ * Temporary cpumasks for working with partitions that are passed among
+ * functions to avoid memory allocation in inner functions.
+ */
+struct tmpmasks {
+ cpumask_var_t addmask, delmask; /* For partition root */
+ cpumask_var_t new_cpus; /* For update_cpumasks_hier() */
};
static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
@@ -218,9 +269,15 @@ static inline int is_spread_slab(const struct cpuset *cs)
return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
+static inline int is_partition_root(const struct cpuset *cs)
+{
+ return cs->partition_root_state > 0;
+}
+
static struct cpuset top_cpuset = {
.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
(1 << CS_MEM_EXCLUSIVE)),
+ .partition_root_state = PRS_ENABLED,
};
/**
@@ -419,6 +476,65 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
}
/**
+ * alloc_cpumasks - allocate three cpumasks for cpuset
+ * @cs: the cpuset that have cpumasks to be allocated.
+ * @tmp: the tmpmasks structure pointer
+ * Return: 0 if successful, -ENOMEM otherwise.
+ *
+ * Only one of the two input arguments should be non-NULL.
+ */
+static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
+{
+ cpumask_var_t *pmask1, *pmask2, *pmask3;
+
+ if (cs) {
+ pmask1 = &cs->cpus_allowed;
+ pmask2 = &cs->effective_cpus;
+ pmask3 = &cs->subparts_cpus;
+ } else {
+ pmask1 = &tmp->new_cpus;
+ pmask2 = &tmp->addmask;
+ pmask3 = &tmp->delmask;
+ }
+
+ if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
+ goto free_one;
+
+ if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
+ goto free_two;
+
+ return 0;
+
+free_two:
+ free_cpumask_var(*pmask2);
+free_one:
+ free_cpumask_var(*pmask1);
+ return -ENOMEM;
+}
+
+/**
+ * free_cpumasks - free cpumasks in a tmpmasks structure
+ * @cs: the cpuset that have cpumasks to be free.
+ * @tmp: the tmpmasks structure pointer
+ */
+static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
+{
+ if (cs) {
+ free_cpumask_var(cs->cpus_allowed);
+ free_cpumask_var(cs->effective_cpus);
+ free_cpumask_var(cs->subparts_cpus);
+ }
+ if (tmp) {
+ free_cpumask_var(tmp->new_cpus);
+ free_cpumask_var(tmp->addmask);
+ free_cpumask_var(tmp->delmask);
+ }
+}
+
+/**
* alloc_trial_cpuset - allocate a trial cpuset
* @cs: the cpuset that the trial cpuset duplicates
*/
@@ -430,31 +546,24 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
if (!trial)
return NULL;
- if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
- goto free_cs;
- if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
- goto free_cpus;
+ if (alloc_cpumasks(trial, NULL)) {
+ kfree(trial);
+ return NULL;
+ }
cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
cpumask_copy(trial->effective_cpus, cs->effective_cpus);
return trial;
-
-free_cpus:
- free_cpumask_var(trial->cpus_allowed);
-free_cs:
- kfree(trial);
- return NULL;
}
/**
- * free_trial_cpuset - free the trial cpuset
- * @trial: the trial cpuset to be freed
+ * free_cpuset - free the cpuset
+ * @cs: the cpuset to be freed
*/
-static void free_trial_cpuset(struct cpuset *trial)
+static inline void free_cpuset(struct cpuset *cs)
{
- free_cpumask_var(trial->effective_cpus);
- free_cpumask_var(trial->cpus_allowed);
- kfree(trial);
+ free_cpumasks(cs, NULL);
+ kfree(cs);
}
/*
@@ -660,13 +769,14 @@ static int generate_sched_domains(cpumask_var_t **domains,
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
struct cgroup_subsys_state *pos_css;
+ bool root_load_balance = is_sched_load_balance(&top_cpuset);
doms = NULL;
dattr = NULL;
csa = NULL;
/* Special case for the 99% of systems with one, full, sched domain */
- if (is_sched_load_balance(&top_cpuset)) {
+ if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
ndoms = 1;
doms = alloc_sched_domains(ndoms);
if (!doms)
@@ -689,6 +799,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
csn = 0;
rcu_read_lock();
+ if (root_load_balance)
+ csa[csn++] = &top_cpuset;
cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
if (cp == &top_cpuset)
continue;
@@ -699,6 +811,9 @@ static int generate_sched_domains(cpumask_var_t **domains,
* parent's cpus, so just skip them, and then we call
* update_domain_attr_tree() to calc relax_domain_level of
* the corresponding sched domain.
+ *
+ * If root is load-balancing, we can skip @cp if it
+ * is a subset of the root's effective_cpus.
*/
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
@@ -706,11 +821,16 @@ static int generate_sched_domains(cpumask_var_t **domains,
housekeeping_cpumask(HK_FLAG_DOMAIN))))
continue;
+ if (root_load_balance &&
+ cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
+ continue;
+
if (is_sched_load_balance(cp))
csa[csn++] = cp;
- /* skip @cp's subtree */
- pos_css = css_rightmost_descendant(pos_css);
+ /* skip @cp's subtree if not a partition root */
+ if (!is_partition_root(cp))
+ pos_css = css_rightmost_descendant(pos_css);
}
rcu_read_unlock();
@@ -838,7 +958,12 @@ static void rebuild_sched_domains_locked(void)
* passing doms with offlined cpu to partition_sched_domains().
* Anyways, hotplug work item will rebuild sched domains.
*/
- if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
+ if (!top_cpuset.nr_subparts_cpus &&
+ !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
+ goto out;
+
+ if (top_cpuset.nr_subparts_cpus &&
+ !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask))
goto out;
/* Generate domain masks and attrs */
@@ -881,10 +1006,248 @@ static void update_tasks_cpumask(struct cpuset *cs)
css_task_iter_end(&it);
}
+/**
+ * compute_effective_cpumask - Compute the effective cpumask of the cpuset
+ * @new_cpus: the temp variable for the new effective_cpus mask
+ * @cs: the cpuset the need to recompute the new effective_cpus mask
+ * @parent: the parent cpuset
+ *
+ * If the parent has subpartition CPUs, include them in the list of
+ * allowable CPUs in computing the new effective_cpus mask. Since offlined
+ * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
+ * to mask those out.
+ */
+static void compute_effective_cpumask(struct cpumask *new_cpus,
+ struct cpuset *cs, struct cpuset *parent)
+{
+ if (parent->nr_subparts_cpus) {
+ cpumask_or(new_cpus, parent->effective_cpus,
+ parent->subparts_cpus);
+ cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
+ cpumask_and(new_cpus, new_cpus, cpu_active_mask);
+ } else {
+ cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
+ }
+}
+
+/*
+ * Commands for update_parent_subparts_cpumask
+ */
+enum subparts_cmd {
+ partcmd_enable, /* Enable partition root */
+ partcmd_disable, /* Disable partition root */
+ partcmd_update, /* Update parent's subparts_cpus */
+};
+
+/**
+ * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
+ * @cpuset: The cpuset that requests change in partition root state
+ * @cmd: Partition root state change command
+ * @newmask: Optional new cpumask for partcmd_update
+ * @tmp: Temporary addmask and delmask
+ * Return: 0, 1 or an error code
+ *
+ * For partcmd_enable, the cpuset is being transformed from a non-partition
+ * root to a partition root. The cpus_allowed mask of the given cpuset will
+ * be put into parent's subparts_cpus and taken away from parent's
+ * effective_cpus. The function will return 0 if all the CPUs listed in
+ * cpus_allowed can be granted or an error code will be returned.
+ *
+ * For partcmd_disable, the cpuset is being transofrmed from a partition
+ * root back to a non-partition root. any CPUs in cpus_allowed that are in
+ * parent's subparts_cpus will be taken away from that cpumask and put back
+ * into parent's effective_cpus. 0 should always be returned.
+ *
+ * For partcmd_update, if the optional newmask is specified, the cpu
+ * list is to be changed from cpus_allowed to newmask. Otherwise,
+ * cpus_allowed is assumed to remain the same. The cpuset should either
+ * be a partition root or an invalid partition root. The partition root
+ * state may change if newmask is NULL and none of the requested CPUs can
+ * be granted by the parent. The function will return 1 if changes to
+ * parent's subparts_cpus and effective_cpus happen or 0 otherwise.
+ * Error code should only be returned when newmask is non-NULL.
+ *
+ * The partcmd_enable and partcmd_disable commands are used by
+ * update_prstate(). The partcmd_update command is used by
+ * update_cpumasks_hier() with newmask NULL and update_cpumask() with
+ * newmask set.
+ *
+ * The checking is more strict when enabling partition root than the
+ * other two commands.
+ *
+ * Because of the implicit cpu exclusive nature of a partition root,
+ * cpumask changes that violates the cpu exclusivity rule will not be
+ * permitted when checked by validate_change(). The validate_change()
+ * function will also prevent any changes to the cpu list if it is not
+ * a superset of children's cpu lists.
+ */
+static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
+ struct cpumask *newmask,
+ struct tmpmasks *tmp)
+{
+ struct cpuset *parent = parent_cs(cpuset);
+ int adding; /* Moving cpus from effective_cpus to subparts_cpus */
+ int deleting; /* Moving cpus from subparts_cpus to effective_cpus */
+ bool part_error = false; /* Partition error? */
+
+ lockdep_assert_held(&cpuset_mutex);
+
+ /*
+ * The parent must be a partition root.
+ * The new cpumask, if present, or the current cpus_allowed must
+ * not be empty.
+ */
+ if (!is_partition_root(parent) ||
+ (newmask && cpumask_empty(newmask)) ||
+ (!newmask && cpumask_empty(cpuset->cpus_allowed)))
+ return -EINVAL;
+
+ /*
+ * Enabling/disabling partition root is not allowed if there are
+ * online children.
+ */
+ if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
+ return -EBUSY;
+
+ /*
+ * Enabling partition root is not allowed if not all the CPUs
+ * can be granted from parent's effective_cpus or at least one
+ * CPU will be left after that.
+ */
+ if ((cmd == partcmd_enable) &&
+ (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
+ cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
+ return -EINVAL;
+
+ /*
+ * A cpumask update cannot make parent's effective_cpus become empty.
+ */
+ adding = deleting = false;
+ if (cmd == partcmd_enable) {
+ cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
+ adding = true;
+ } else if (cmd == partcmd_disable) {
+ deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
+ parent->subparts_cpus);
+ } else if (newmask) {
+ /*
+ * partcmd_update with newmask:
+ *
+ * delmask = cpus_allowed & ~newmask & parent->subparts_cpus
+ * addmask = newmask & parent->effective_cpus
+ * & ~parent->subparts_cpus
+ */
+ cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
+ deleting = cpumask_and(tmp->delmask, tmp->delmask,
+ parent->subparts_cpus);
+
+ cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
+ adding = cpumask_andnot(tmp->addmask, tmp->addmask,
+ parent->subparts_cpus);
+ /*
+ * Return error if the new effective_cpus could become empty.
+ */
+ if (adding &&
+ cpumask_equal(parent->effective_cpus, tmp->addmask)) {
+ if (!deleting)
+ return -EINVAL;
+ /*
+ * As some of the CPUs in subparts_cpus might have
+ * been offlined, we need to compute the real delmask
+ * to confirm that.
+ */
+ if (!cpumask_and(tmp->addmask, tmp->delmask,
+ cpu_active_mask))
+ return -EINVAL;
+ cpumask_copy(tmp->addmask, parent->effective_cpus);
+ }
+ } else {
+ /*
+ * partcmd_update w/o newmask:
+ *
+ * addmask = cpus_allowed & parent->effectiveb_cpus
+ *
+ * Note that parent's subparts_cpus may have been
+ * pre-shrunk in case there is a change in the cpu list.
+ * So no deletion is needed.
+ */
+ adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
+ parent->effective_cpus);
+ part_error = cpumask_equal(tmp->addmask,
+ parent->effective_cpus);
+ }
+
+ if (cmd == partcmd_update) {
+ int prev_prs = cpuset->partition_root_state;
+
+ /*
+ * Check for possible transition between PRS_ENABLED
+ * and PRS_ERROR.
+ */
+ switch (cpuset->partition_root_state) {
+ case PRS_ENABLED:
+ if (part_error)
+ cpuset->partition_root_state = PRS_ERROR;
+ break;
+ case PRS_ERROR:
+ if (!part_error)
+ cpuset->partition_root_state = PRS_ENABLED;
+ break;
+ }
+ /*
+ * Set part_error if previously in invalid state.
+ */
+ part_error = (prev_prs == PRS_ERROR);
+ }
+
+ if (!part_error && (cpuset->partition_root_state == PRS_ERROR))
+ return 0; /* Nothing need to be done */
+
+ if (cpuset->partition_root_state == PRS_ERROR) {
+ /*
+ * Remove all its cpus from parent's subparts_cpus.
+ */
+ adding = false;
+ deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
+ parent->subparts_cpus);
+ }
+
+ if (!adding && !deleting)
+ return 0;
+
+ /*
+ * Change the parent's subparts_cpus.
+ * Newly added CPUs will be removed from effective_cpus and
+ * newly deleted ones will be added back to effective_cpus.
+ */
+ spin_lock_irq(&callback_lock);
+ if (adding) {
+ cpumask_or(parent->subparts_cpus,
+ parent->subparts_cpus, tmp->addmask);
+ cpumask_andnot(parent->effective_cpus,
+ parent->effective_cpus, tmp->addmask);
+ }
+ if (deleting) {
+ cpumask_andnot(parent->subparts_cpus,
+ parent->subparts_cpus, tmp->delmask);
+ /*
+ * Some of the CPUs in subparts_cpus might have been offlined.
+ */
+ cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
+ cpumask_or(parent->effective_cpus,
+ parent->effective_cpus, tmp->delmask);
+ }
+
+ parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
+ spin_unlock_irq(&callback_lock);
+
+ return cmd == partcmd_update;
+}
+
/*
* update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
- * @cs: the cpuset to consider
- * @new_cpus: temp variable for calculating new effective_cpus
+ * @cs: the cpuset to consider
+ * @tmp: temp variables for calculating effective_cpus & partition setup
*
* When congifured cpumask is changed, the effective cpumasks of this cpuset
* and all its descendants need to be updated.
@@ -893,7 +1256,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
*
* Called with cpuset_mutex held
*/
-static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
+static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
@@ -903,27 +1266,115 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
struct cpuset *parent = parent_cs(cp);
- cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
+ compute_effective_cpumask(tmp->new_cpus, cp, parent);
/*
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some CPUs.
*/
- if (is_in_v2_mode() && cpumask_empty(new_cpus))
- cpumask_copy(new_cpus, parent->effective_cpus);
+ if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
+ cpumask_copy(tmp->new_cpus, parent->effective_cpus);
+ if (!cp->use_parent_ecpus) {
+ cp->use_parent_ecpus = true;
+ parent->child_ecpus_count++;
+ }
+ } else if (cp->use_parent_ecpus) {
+ cp->use_parent_ecpus = false;
+ WARN_ON_ONCE(!parent->child_ecpus_count);
+ parent->child_ecpus_count--;
+ }
- /* Skip the whole subtree if the cpumask remains the same. */
- if (cpumask_equal(new_cpus, cp->effective_cpus)) {
+ /*
+ * Skip the whole subtree if the cpumask remains the same
+ * and has no partition root state.
+ */
+ if (!cp->partition_root_state &&
+ cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
+ /*
+ * update_parent_subparts_cpumask() should have been called
+ * for cs already in update_cpumask(). We should also call
+ * update_tasks_cpumask() again for tasks in the parent
+ * cpuset if the parent's subparts_cpus changes.
+ */
+ if ((cp != cs) && cp->partition_root_state) {
+ switch (parent->partition_root_state) {
+ case PRS_DISABLED:
+ /*
+ * If parent is not a partition root or an
+ * invalid partition root, clear the state
+ * state and the CS_CPU_EXCLUSIVE flag.
+ */
+ WARN_ON_ONCE(cp->partition_root_state
+ != PRS_ERROR);
+ cp->partition_root_state = 0;
+
+ /*
+ * clear_bit() is an atomic operation and
+ * readers aren't interested in the state
+ * of CS_CPU_EXCLUSIVE anyway. So we can
+ * just update the flag without holding
+ * the callback_lock.
+ */
+ clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
+ break;
+
+ case PRS_ENABLED:
+ if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
+ update_tasks_cpumask(parent);
+ break;
+
+ case PRS_ERROR:
+ /*
+ * When parent is invalid, it has to be too.
+ */
+ cp->partition_root_state = PRS_ERROR;
+ if (cp->nr_subparts_cpus) {
+ cp->nr_subparts_cpus = 0;
+ cpumask_clear(cp->subparts_cpus);
+ }
+ break;
+ }
+ }
+
if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();
spin_lock_irq(&callback_lock);
- cpumask_copy(cp->effective_cpus, new_cpus);
+
+ cpumask_copy(cp->effective_cpus, tmp->new_cpus);
+ if (cp->nr_subparts_cpus &&
+ (cp->partition_root_state != PRS_ENABLED)) {
+ cp->nr_subparts_cpus = 0;
+ cpumask_clear(cp->subparts_cpus);
+ } else if (cp->nr_subparts_cpus) {
+ /*
+ * Make sure that effective_cpus & subparts_cpus
+ * are mutually exclusive.
+ *
+ * In the unlikely event that effective_cpus
+ * becomes empty. we clear cp->nr_subparts_cpus and
+ * let its child partition roots to compete for
+ * CPUs again.
+ */
+ cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
+ cp->subparts_cpus);
+ if (cpumask_empty(cp->effective_cpus)) {
+ cpumask_copy(cp->effective_cpus, tmp->new_cpus);
+ cpumask_clear(cp->subparts_cpus);
+ cp->nr_subparts_cpus = 0;
+ } else if (!cpumask_subset(cp->subparts_cpus,
+ tmp->new_cpus)) {
+ cpumask_andnot(cp->subparts_cpus,
+ cp->subparts_cpus, tmp->new_cpus);
+ cp->nr_subparts_cpus
+ = cpumask_weight(cp->subparts_cpus);
+ }
+ }
spin_unlock_irq(&callback_lock);
WARN_ON(!is_in_v2_mode() &&
@@ -932,11 +1383,15 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
update_tasks_cpumask(cp);
/*
- * If the effective cpumask of any non-empty cpuset is changed,
- * we need to rebuild sched domains.
+ * On legacy hierarchy, if the effective cpumask of any non-
+ * empty cpuset is changed, we need to rebuild sched domains.
+ * On default hierarchy, the cpuset needs to be a partition
+ * root as well.
*/
if (!cpumask_empty(cp->cpus_allowed) &&
- is_sched_load_balance(cp))
+ is_sched_load_balance(cp) &&
+ (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+ is_partition_root(cp)))
need_rebuild_sched_domains = true;
rcu_read_lock();
@@ -949,6 +1404,35 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
}
/**
+ * update_sibling_cpumasks - Update siblings cpumasks
+ * @parent: Parent cpuset
+ * @cs: Current cpuset
+ * @tmp: Temp variables
+ */
+static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
+ struct tmpmasks *tmp)
+{
+ struct cpuset *sibling;
+ struct cgroup_subsys_state *pos_css;
+
+ /*
+ * Check all its siblings and call update_cpumasks_hier()
+ * if their use_parent_ecpus flag is set in order for them
+ * to use the right effective_cpus value.
+ */
+ rcu_read_lock();
+ cpuset_for_each_child(sibling, pos_css, parent) {
+ if (sibling == cs)
+ continue;
+ if (!sibling->use_parent_ecpus)
+ continue;
+
+ update_cpumasks_hier(sibling, tmp);
+ }
+ rcu_read_unlock();
+}
+
+/**
* update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
* @cs: the cpuset to consider
* @trialcs: trial cpuset
@@ -958,6 +1442,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
int retval;
+ struct tmpmasks tmp;
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
@@ -989,12 +1474,50 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
return retval;
+#ifdef CONFIG_CPUMASK_OFFSTACK
+ /*
+ * Use the cpumasks in trialcs for tmpmasks when they are pointers
+ * to allocated cpumasks.
+ */
+ tmp.addmask = trialcs->subparts_cpus;
+ tmp.delmask = trialcs->effective_cpus;
+ tmp.new_cpus = trialcs->cpus_allowed;
+#endif
+
+ if (cs->partition_root_state) {
+ /* Cpumask of a partition root cannot be empty */
+ if (cpumask_empty(trialcs->cpus_allowed))
+ return -EINVAL;
+ if (update_parent_subparts_cpumask(cs, partcmd_update,
+ trialcs->cpus_allowed, &tmp) < 0)
+ return -EINVAL;
+ }
+
spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
+
+ /*
+ * Make sure that subparts_cpus is a subset of cpus_allowed.
+ */
+ if (cs->nr_subparts_cpus) {
+ cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus,
+ cs->cpus_allowed);
+ cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
+ }
spin_unlock_irq(&callback_lock);
- /* use trialcs->cpus_allowed as a temp variable */
- update_cpumasks_hier(cs, trialcs->cpus_allowed);
+ update_cpumasks_hier(cs, &tmp);
+
+ if (cs->partition_root_state) {
+ struct cpuset *parent = parent_cs(cs);
+
+ /*
+ * For partition root, update the cpumasks of sibling
+ * cpusets if they use parent's effective_cpus.
+ */
+ if (parent->child_ecpus_count)
+ update_sibling_cpumasks(parent, cs, &tmp);
+ }
return 0;
}
@@ -1348,7 +1871,95 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
if (spread_flag_changed)
update_tasks_flags(cs);
out:
- free_trial_cpuset(trialcs);
+ free_cpuset(trialcs);
+ return err;
+}
+
+/*
+ * update_prstate - update partititon_root_state
+ * cs: the cpuset to update
+ * val: 0 - disabled, 1 - enabled
+ *
+ * Call with cpuset_mutex held.
+ */
+static int update_prstate(struct cpuset *cs, int val)
+{
+ int err;
+ struct cpuset *parent = parent_cs(cs);
+ struct tmpmasks tmp;
+
+ if ((val != 0) && (val != 1))
+ return -EINVAL;
+ if (val == cs->partition_root_state)
+ return 0;
+
+ /*
+ * Cannot force a partial or invalid partition root to a full
+ * partition root.
+ */
+ if (val && cs->partition_root_state)
+ return -EINVAL;
+
+ if (alloc_cpumasks(NULL, &tmp))
+ return -ENOMEM;
+
+ err = -EINVAL;
+ if (!cs->partition_root_state) {
+ /*
+ * Turning on partition root requires setting the
+ * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
+ * cannot be NULL.
+ */
+ if (cpumask_empty(cs->cpus_allowed))
+ goto out;
+
+ err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
+ if (err)
+ goto out;
+
+ err = update_parent_subparts_cpumask(cs, partcmd_enable,
+ NULL, &tmp);
+ if (err) {
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ goto out;
+ }
+ cs->partition_root_state = PRS_ENABLED;
+ } else {
+ /*
+ * Turning off partition root will clear the
+ * CS_CPU_EXCLUSIVE bit.
+ */
+ if (cs->partition_root_state == PRS_ERROR) {
+ cs->partition_root_state = 0;
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ err = 0;
+ goto out;
+ }
+
+ err = update_parent_subparts_cpumask(cs, partcmd_disable,
+ NULL, &tmp);
+ if (err)
+ goto out;
+
+ cs->partition_root_state = 0;
+
+ /* Turning off CS_CPU_EXCLUSIVE will not return error */
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ }
+
+ /*
+ * Update cpumask of parent's tasks except when it is the top
+ * cpuset as some system daemons cannot be mapped to other CPUs.
+ */
+ if (parent != &top_cpuset)
+ update_tasks_cpumask(parent);
+
+ if (parent->child_ecpus_count)
+ update_sibling_cpumasks(parent, cs, &tmp);
+
+ rebuild_sched_domains_locked();
+out:
+ free_cpumasks(NULL, &tmp);
return err;
}
@@ -1498,10 +2109,8 @@ out_unlock:
static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
- struct cpuset *cs;
cgroup_taskset_first(tset, &css);
- cs = css_cs(css);
mutex_lock(&cpuset_mutex);
css_cs(css)->attach_in_progress--;
@@ -1593,10 +2202,12 @@ typedef enum {
FILE_MEMLIST,
FILE_EFFECTIVE_CPULIST,
FILE_EFFECTIVE_MEMLIST,
+ FILE_SUBPARTS_CPULIST,
FILE_CPU_EXCLUSIVE,
FILE_MEM_EXCLUSIVE,
FILE_MEM_HARDWALL,
FILE_SCHED_LOAD_BALANCE,
+ FILE_PARTITION_ROOT,
FILE_SCHED_RELAX_DOMAIN_LEVEL,
FILE_MEMORY_PRESSURE_ENABLED,
FILE_MEMORY_PRESSURE,
@@ -1732,7 +2343,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
break;
}
- free_trial_cpuset(trialcs);
+ free_cpuset(trialcs);
out_unlock:
mutex_unlock(&cpuset_mutex);
kernfs_unbreak_active_protection(of->kn);
@@ -1770,6 +2381,9 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
case FILE_EFFECTIVE_MEMLIST:
seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
break;
+ case FILE_SUBPARTS_CPULIST:
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
+ break;
default:
ret = -EINVAL;
}
@@ -1824,12 +2438,60 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
return 0;
}
+static int sched_partition_show(struct seq_file *seq, void *v)
+{
+ struct cpuset *cs = css_cs(seq_css(seq));
+
+ switch (cs->partition_root_state) {
+ case PRS_ENABLED:
+ seq_puts(seq, "root\n");
+ break;
+ case PRS_DISABLED:
+ seq_puts(seq, "member\n");
+ break;
+ case PRS_ERROR:
+ seq_puts(seq, "root invalid\n");
+ break;
+ }
+ return 0;
+}
+
+static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct cpuset *cs = css_cs(of_css(of));
+ int val;
+ int retval = -ENODEV;
+
+ buf = strstrip(buf);
+
+ /*
+ * Convert "root" to ENABLED, and convert "member" to DISABLED.
+ */
+ if (!strcmp(buf, "root"))
+ val = PRS_ENABLED;
+ else if (!strcmp(buf, "member"))
+ val = PRS_DISABLED;
+ else
+ return -EINVAL;
+
+ css_get(&cs->css);
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
+
+ retval = update_prstate(cs, val);
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ css_put(&cs->css);
+ return retval ?: nbytes;
+}
/*
* for the common functions, 'private' gives the type of file
*/
-static struct cftype files[] = {
+static struct cftype legacy_files[] = {
{
.name = "cpus",
.seq_show = cpuset_common_seq_show,
@@ -1932,6 +2594,60 @@ static struct cftype files[] = {
};
/*
+ * This is currently a minimal set for the default hierarchy. It can be
+ * expanded later on by migrating more features and control files from v1.
+ */
+static struct cftype dfl_files[] = {
+ {
+ .name = "cpus",
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * NR_CPUS),
+ .private = FILE_CPULIST,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+
+ {
+ .name = "mems",
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * MAX_NUMNODES),
+ .private = FILE_MEMLIST,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+
+ {
+ .name = "cpus.effective",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_CPULIST,
+ },
+
+ {
+ .name = "mems.effective",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_MEMLIST,
+ },
+
+ {
+ .name = "cpus.partition",
+ .seq_show = sched_partition_show,
+ .write = sched_partition_write,
+ .private = FILE_PARTITION_ROOT,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+
+ {
+ .name = "cpus.subpartitions",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_SUBPARTS_CPULIST,
+ .flags = CFTYPE_DEBUG,
+ },
+
+ { } /* terminate */
+};
+
+
+/*
* cpuset_css_alloc - allocate a cpuset css
* cgrp: control group that the new cpuset will be part of
*/
@@ -1947,26 +2663,19 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
cs = kzalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return ERR_PTR(-ENOMEM);
- if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
- goto free_cs;
- if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
- goto free_cpus;
+
+ if (alloc_cpumasks(cs, NULL)) {
+ kfree(cs);
+ return ERR_PTR(-ENOMEM);
+ }
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
- cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
- cpumask_clear(cs->effective_cpus);
nodes_clear(cs->effective_mems);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
return &cs->css;
-
-free_cpus:
- free_cpumask_var(cs->cpus_allowed);
-free_cs:
- kfree(cs);
- return ERR_PTR(-ENOMEM);
}
static int cpuset_css_online(struct cgroup_subsys_state *css)
@@ -1993,6 +2702,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
if (is_in_v2_mode()) {
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
cs->effective_mems = parent->effective_mems;
+ cs->use_parent_ecpus = true;
+ parent->child_ecpus_count++;
}
spin_unlock_irq(&callback_lock);
@@ -2035,7 +2746,12 @@ out_unlock:
/*
* If the cpuset being removed has its flag 'sched_load_balance'
* enabled, then simulate turning sched_load_balance off, which
- * will call rebuild_sched_domains_locked().
+ * will call rebuild_sched_domains_locked(). That is not needed
+ * in the default hierarchy where only changes in partition
+ * will cause repartitioning.
+ *
+ * If the cpuset has the 'sched.partition' flag enabled, simulate
+ * turning 'sched.partition" off.
*/
static void cpuset_css_offline(struct cgroup_subsys_state *css)
@@ -2044,9 +2760,20 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
mutex_lock(&cpuset_mutex);
- if (is_sched_load_balance(cs))
+ if (is_partition_root(cs))
+ update_prstate(cs, 0);
+
+ if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ is_sched_load_balance(cs))
update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+ if (cs->use_parent_ecpus) {
+ struct cpuset *parent = parent_cs(cs);
+
+ cs->use_parent_ecpus = false;
+ parent->child_ecpus_count--;
+ }
+
cpuset_dec();
clear_bit(CS_ONLINE, &cs->flags);
@@ -2057,9 +2784,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
- free_cpumask_var(cs->effective_cpus);
- free_cpumask_var(cs->cpus_allowed);
- kfree(cs);
+ free_cpuset(cs);
}
static void cpuset_bind(struct cgroup_subsys_state *root_css)
@@ -2105,8 +2830,10 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.post_attach = cpuset_post_attach,
.bind = cpuset_bind,
.fork = cpuset_fork,
- .legacy_cftypes = files,
+ .legacy_cftypes = legacy_files,
+ .dfl_cftypes = dfl_files,
.early_init = true,
+ .threaded = true,
};
/**
@@ -2121,6 +2848,7 @@ int __init cpuset_init(void)
BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
@@ -2227,20 +2955,29 @@ hotplug_update_tasks(struct cpuset *cs,
update_tasks_nodemask(cs);
}
+static bool force_rebuild;
+
+void cpuset_force_rebuild(void)
+{
+ force_rebuild = true;
+}
+
/**
* cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
* @cs: cpuset in interest
+ * @tmp: the tmpmasks structure pointer
*
* Compare @cs's cpu and mem masks against top_cpuset and if some have gone
* offline, update @cs accordingly. If @cs ends up with no CPU or memory,
* all its tasks are moved to the nearest ancestor with both resources.
*/
-static void cpuset_hotplug_update_tasks(struct cpuset *cs)
+static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
{
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated;
bool mems_updated;
+ struct cpuset *parent;
retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
@@ -2255,9 +2992,60 @@ retry:
goto retry;
}
- cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
- nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
+ parent = parent_cs(cs);
+ compute_effective_cpumask(&new_cpus, cs, parent);
+ nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
+
+ if (cs->nr_subparts_cpus)
+ /*
+ * Make sure that CPUs allocated to child partitions
+ * do not show up in effective_cpus.
+ */
+ cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);
+
+ if (!tmp || !cs->partition_root_state)
+ goto update_tasks;
+
+ /*
+ * In the unlikely event that a partition root has empty
+ * effective_cpus or its parent becomes erroneous, we have to
+ * transition it to the erroneous state.
+ */
+ if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
+ (parent->partition_root_state == PRS_ERROR))) {
+ if (cs->nr_subparts_cpus) {
+ cs->nr_subparts_cpus = 0;
+ cpumask_clear(cs->subparts_cpus);
+ compute_effective_cpumask(&new_cpus, cs, parent);
+ }
+
+ /*
+ * If the effective_cpus is empty because the child
+ * partitions take away all the CPUs, we can keep
+ * the current partition and let the child partitions
+ * fight for available CPUs.
+ */
+ if ((parent->partition_root_state == PRS_ERROR) ||
+ cpumask_empty(&new_cpus)) {
+ update_parent_subparts_cpumask(cs, partcmd_disable,
+ NULL, tmp);
+ cs->partition_root_state = PRS_ERROR;
+ }
+ cpuset_force_rebuild();
+ }
+
+ /*
+ * On the other hand, an erroneous partition root may be transitioned
+ * back to a regular one or a partition root with no CPU allocated
+ * from the parent may change to erroneous.
+ */
+ if (is_partition_root(parent) &&
+ ((cs->partition_root_state == PRS_ERROR) ||
+ !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
+ update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
+ cpuset_force_rebuild();
+update_tasks:
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
mems_updated = !nodes_equal(new_mems, cs->effective_mems);
@@ -2271,13 +3059,6 @@ retry:
mutex_unlock(&cpuset_mutex);
}
-static bool force_rebuild;
-
-void cpuset_force_rebuild(void)
-{
- force_rebuild = true;
-}
-
/**
* cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
*
@@ -2300,6 +3081,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
bool on_dfl = is_in_v2_mode();
+ struct tmpmasks tmp, *ptmp = NULL;
+
+ if (on_dfl && !alloc_cpumasks(NULL, &tmp))
+ ptmp = &tmp;
mutex_lock(&cpuset_mutex);
@@ -2307,6 +3092,11 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
cpumask_copy(&new_cpus, cpu_active_mask);
new_mems = node_states[N_MEMORY];
+ /*
+ * If subparts_cpus is populated, it is likely that the check below
+ * will produce a false positive on cpus_updated when the cpu list
+ * isn't changed. It is extra work, but it is better to be safe.
+ */
cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
@@ -2315,6 +3105,22 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
spin_lock_irq(&callback_lock);
if (!on_dfl)
cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+ /*
+ * Make sure that CPUs allocated to child partitions
+ * do not show up in effective_cpus. If no CPU is left,
+ * we clear the subparts_cpus & let the child partitions
+ * fight for the CPUs again.
+ */
+ if (top_cpuset.nr_subparts_cpus) {
+ if (cpumask_subset(&new_cpus,
+ top_cpuset.subparts_cpus)) {
+ top_cpuset.nr_subparts_cpus = 0;
+ cpumask_clear(top_cpuset.subparts_cpus);
+ } else {
+ cpumask_andnot(&new_cpus, &new_cpus,
+ top_cpuset.subparts_cpus);
+ }
+ }
cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
spin_unlock_irq(&callback_lock);
/* we don't mess with cpumasks of tasks in top_cpuset */
@@ -2343,7 +3149,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
continue;
rcu_read_unlock();
- cpuset_hotplug_update_tasks(cs);
+ cpuset_hotplug_update_tasks(cs, ptmp);
rcu_read_lock();
css_put(&cs->css);
@@ -2356,6 +3162,8 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
force_rebuild = false;
rebuild_sched_domains();
}
+
+ free_cpumasks(NULL, ptmp);
}
void cpuset_update_active_cpus(void)
@@ -2666,9 +3474,9 @@ void cpuset_print_current_mems_allowed(void)
rcu_read_lock();
cgrp = task_cs(current)->css.cgroup;
- pr_info("%s cpuset=", current->comm);
+ pr_cont(",cpuset=");
pr_cont_cgroup_name(cgrp);
- pr_cont(" mems_allowed=%*pbl\n",
+ pr_cont(",mems_allowed=%*pbl",
nodemask_pr_args(&current->mems_allowed));
rcu_read_unlock();
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 9caeda610249..5f1b87330bee 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -373,11 +373,9 @@ struct cgroup_subsys debug_cgrp_subsys = {
* On v2, debug is an implicit controller enabled by "cgroup_debug" boot
* parameter.
*/
-static int __init enable_cgroup_debug(char *str)
+void __init enable_debug_cgroup(void)
{
debug_cgrp_subsys.dfl_cftypes = debug_files;
debug_cgrp_subsys.implicit_on_dfl = true;
debug_cgrp_subsys.threaded = true;
- return 1;
}
-__setup("cgroup_debug", enable_cgroup_debug);
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 645c7a2ecde8..ca88b867e7fe 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -35,13 +35,8 @@ config ARCH_HAS_DMA_COHERENT_TO_PFN
config ARCH_HAS_DMA_MMAP_PGPROT
bool
-config DMA_DIRECT_OPS
- bool
- depends on HAS_DMA
-
config DMA_NONCOHERENT_CACHE_SYNC
bool
- depends on DMA_DIRECT_OPS
config DMA_VIRT_OPS
bool
@@ -49,5 +44,12 @@ config DMA_VIRT_OPS
config SWIOTLB
bool
- select DMA_DIRECT_OPS
select NEED_DMA_MAP_STATE
+
+config DMA_REMAP
+ depends on MMU
+ bool
+
+config DMA_DIRECT_REMAP
+ bool
+ select DMA_REMAP
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index 7d581e4eea4a..72ff6e46aa86 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -1,10 +1,9 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_HAS_DMA) += mapping.o
+obj-$(CONFIG_HAS_DMA) += mapping.o direct.o dummy.o
obj-$(CONFIG_DMA_CMA) += contiguous.o
obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += coherent.o
-obj-$(CONFIG_DMA_DIRECT_OPS) += direct.o
obj-$(CONFIG_DMA_VIRT_OPS) += virt.o
obj-$(CONFIG_DMA_API_DEBUG) += debug.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
-
+obj-$(CONFIG_DMA_REMAP) += remap.o
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 231ca4628062..164706da2a73 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -17,6 +17,8 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#define pr_fmt(fmt) "DMA-API: " fmt
+
#include <linux/sched/task_stack.h>
#include <linux/scatterlist.h>
#include <linux/dma-mapping.h>
@@ -41,10 +43,9 @@
#define HASH_FN_SHIFT 13
#define HASH_FN_MASK (HASH_SIZE - 1)
-/* allow architectures to override this if absolutely required */
-#ifndef PREALLOC_DMA_DEBUG_ENTRIES
#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
-#endif
+/* If the pool runs out, add this many new entries at once */
+#define DMA_DEBUG_DYNAMIC_ENTRIES (PAGE_SIZE / sizeof(struct dma_debug_entry))
enum {
dma_debug_single,
@@ -142,6 +143,7 @@ static struct dentry *show_all_errors_dent __read_mostly;
static struct dentry *show_num_errors_dent __read_mostly;
static struct dentry *num_free_entries_dent __read_mostly;
static struct dentry *min_free_entries_dent __read_mostly;
+static struct dentry *nr_total_entries_dent __read_mostly;
static struct dentry *filter_dent __read_mostly;
/* per-driver filter related state */
@@ -234,7 +236,7 @@ static bool driver_filter(struct device *dev)
error_count += 1; \
if (driver_filter(dev) && \
(show_all_errors || show_num_errors > 0)) { \
- WARN(1, "%s %s: " format, \
+ WARN(1, pr_fmt("%s %s: ") format, \
dev ? dev_driver_string(dev) : "NULL", \
dev ? dev_name(dev) : "NULL", ## arg); \
dump_entry_trace(entry); \
@@ -519,7 +521,7 @@ static void active_cacheline_inc_overlap(phys_addr_t cln)
* prematurely.
*/
WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP,
- "DMA-API: exceeded %d overlapping mappings of cacheline %pa\n",
+ pr_fmt("exceeded %d overlapping mappings of cacheline %pa\n"),
ACTIVE_CACHELINE_MAX_OVERLAP, &cln);
}
@@ -614,7 +616,7 @@ void debug_dma_assert_idle(struct page *page)
cln = to_cacheline_number(entry);
err_printk(entry->dev, entry,
- "DMA-API: cpu touching an active dma mapped cacheline [cln=%pa]\n",
+ "cpu touching an active dma mapped cacheline [cln=%pa]\n",
&cln);
}
@@ -634,7 +636,7 @@ static void add_dma_entry(struct dma_debug_entry *entry)
rc = active_cacheline_insert(entry);
if (rc == -ENOMEM) {
- pr_err("DMA-API: cacheline tracking ENOMEM, dma-debug disabled\n");
+ pr_err("cacheline tracking ENOMEM, dma-debug disabled\n");
global_disable = true;
}
@@ -643,6 +645,24 @@ static void add_dma_entry(struct dma_debug_entry *entry)
*/
}
+static int dma_debug_create_entries(gfp_t gfp)
+{
+ struct dma_debug_entry *entry;
+ int i;
+
+ entry = (void *)get_zeroed_page(gfp);
+ if (!entry)
+ return -ENOMEM;
+
+ for (i = 0; i < DMA_DEBUG_DYNAMIC_ENTRIES; i++)
+ list_add_tail(&entry[i].list, &free_entries);
+
+ num_free_entries += DMA_DEBUG_DYNAMIC_ENTRIES;
+ nr_total_entries += DMA_DEBUG_DYNAMIC_ENTRIES;
+
+ return 0;
+}
+
static struct dma_debug_entry *__dma_entry_alloc(void)
{
struct dma_debug_entry *entry;
@@ -658,6 +678,18 @@ static struct dma_debug_entry *__dma_entry_alloc(void)
return entry;
}
+void __dma_entry_alloc_check_leak(void)
+{
+ u32 tmp = nr_total_entries % nr_prealloc_entries;
+
+ /* Shout each time we tick over some multiple of the initial pool */
+ if (tmp < DMA_DEBUG_DYNAMIC_ENTRIES) {
+ pr_info("dma_debug_entry pool grown to %u (%u00%%)\n",
+ nr_total_entries,
+ (nr_total_entries / nr_prealloc_entries));
+ }
+}
+
/* struct dma_entry allocator
*
* The next two functions implement the allocator for
@@ -669,12 +701,14 @@ static struct dma_debug_entry *dma_entry_alloc(void)
unsigned long flags;
spin_lock_irqsave(&free_entries_lock, flags);
-
- if (list_empty(&free_entries)) {
- global_disable = true;
- spin_unlock_irqrestore(&free_entries_lock, flags);
- pr_err("DMA-API: debugging out of memory - disabling\n");
- return NULL;
+ if (num_free_entries == 0) {
+ if (dma_debug_create_entries(GFP_ATOMIC)) {
+ global_disable = true;
+ spin_unlock_irqrestore(&free_entries_lock, flags);
+ pr_err("debugging out of memory - disabling\n");
+ return NULL;
+ }
+ __dma_entry_alloc_check_leak();
}
entry = __dma_entry_alloc();
@@ -707,52 +741,6 @@ static void dma_entry_free(struct dma_debug_entry *entry)
spin_unlock_irqrestore(&free_entries_lock, flags);
}
-int dma_debug_resize_entries(u32 num_entries)
-{
- int i, delta, ret = 0;
- unsigned long flags;
- struct dma_debug_entry *entry;
- LIST_HEAD(tmp);
-
- spin_lock_irqsave(&free_entries_lock, flags);
-
- if (nr_total_entries < num_entries) {
- delta = num_entries - nr_total_entries;
-
- spin_unlock_irqrestore(&free_entries_lock, flags);
-
- for (i = 0; i < delta; i++) {
- entry = kzalloc(sizeof(*entry), GFP_KERNEL);
- if (!entry)
- break;
-
- list_add_tail(&entry->list, &tmp);
- }
-
- spin_lock_irqsave(&free_entries_lock, flags);
-
- list_splice(&tmp, &free_entries);
- nr_total_entries += i;
- num_free_entries += i;
- } else {
- delta = nr_total_entries - num_entries;
-
- for (i = 0; i < delta && !list_empty(&free_entries); i++) {
- entry = __dma_entry_alloc();
- kfree(entry);
- }
-
- nr_total_entries -= i;
- }
-
- if (nr_total_entries != num_entries)
- ret = 1;
-
- spin_unlock_irqrestore(&free_entries_lock, flags);
-
- return ret;
-}
-
/*
* DMA-API debugging init code
*
@@ -761,36 +749,6 @@ int dma_debug_resize_entries(u32 num_entries)
* 2. Preallocate a given number of dma_debug_entry structs
*/
-static int prealloc_memory(u32 num_entries)
-{
- struct dma_debug_entry *entry, *next_entry;
- int i;
-
- for (i = 0; i < num_entries; ++i) {
- entry = kzalloc(sizeof(*entry), GFP_KERNEL);
- if (!entry)
- goto out_err;
-
- list_add_tail(&entry->list, &free_entries);
- }
-
- num_free_entries = num_entries;
- min_free_entries = num_entries;
-
- pr_info("DMA-API: preallocated %d debug entries\n", num_entries);
-
- return 0;
-
-out_err:
-
- list_for_each_entry_safe(entry, next_entry, &free_entries, list) {
- list_del(&entry->list);
- kfree(entry);
- }
-
- return -ENOMEM;
-}
-
static ssize_t filter_read(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
@@ -850,7 +808,7 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf,
* switched off.
*/
if (current_driver_name[0])
- pr_info("DMA-API: switching off dma-debug driver filter\n");
+ pr_info("switching off dma-debug driver filter\n");
current_driver_name[0] = 0;
current_driver = NULL;
goto out_unlock;
@@ -868,7 +826,7 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf,
current_driver_name[i] = 0;
current_driver = NULL;
- pr_info("DMA-API: enable driver filter for driver [%s]\n",
+ pr_info("enable driver filter for driver [%s]\n",
current_driver_name);
out_unlock:
@@ -887,7 +845,7 @@ static int dma_debug_fs_init(void)
{
dma_debug_dent = debugfs_create_dir("dma-api", NULL);
if (!dma_debug_dent) {
- pr_err("DMA-API: can not create debugfs directory\n");
+ pr_err("can not create debugfs directory\n");
return -ENOMEM;
}
@@ -926,6 +884,12 @@ static int dma_debug_fs_init(void)
if (!min_free_entries_dent)
goto out_err;
+ nr_total_entries_dent = debugfs_create_u32("nr_total_entries", 0444,
+ dma_debug_dent,
+ &nr_total_entries);
+ if (!nr_total_entries_dent)
+ goto out_err;
+
filter_dent = debugfs_create_file("driver_filter", 0644,
dma_debug_dent, NULL, &filter_fops);
if (!filter_dent)
@@ -973,7 +937,7 @@ static int dma_debug_device_change(struct notifier_block *nb, unsigned long acti
count = device_dma_allocations(dev, &entry);
if (count == 0)
break;
- err_printk(dev, entry, "DMA-API: device driver has pending "
+ err_printk(dev, entry, "device driver has pending "
"DMA allocations while released from device "
"[count=%d]\n"
"One of leaked entries details: "
@@ -1009,7 +973,7 @@ void dma_debug_add_bus(struct bus_type *bus)
static int dma_debug_init(void)
{
- int i;
+ int i, nr_pages;
/* Do not use dma_debug_initialized here, since we really want to be
* called to set dma_debug_initialized
@@ -1023,24 +987,31 @@ static int dma_debug_init(void)
}
if (dma_debug_fs_init() != 0) {
- pr_err("DMA-API: error creating debugfs entries - disabling\n");
+ pr_err("error creating debugfs entries - disabling\n");
global_disable = true;
return 0;
}
- if (prealloc_memory(nr_prealloc_entries) != 0) {
- pr_err("DMA-API: debugging out of memory error - disabled\n");
+ nr_pages = DIV_ROUND_UP(nr_prealloc_entries, DMA_DEBUG_DYNAMIC_ENTRIES);
+ for (i = 0; i < nr_pages; ++i)
+ dma_debug_create_entries(GFP_KERNEL);
+ if (num_free_entries >= nr_prealloc_entries) {
+ pr_info("preallocated %d debug entries\n", nr_total_entries);
+ } else if (num_free_entries > 0) {
+ pr_warn("%d debug entries requested but only %d allocated\n",
+ nr_prealloc_entries, nr_total_entries);
+ } else {
+ pr_err("debugging out of memory error - disabled\n");
global_disable = true;
return 0;
}
-
- nr_total_entries = num_free_entries;
+ min_free_entries = num_free_entries;
dma_debug_initialized = true;
- pr_info("DMA-API: debugging enabled by kernel config\n");
+ pr_info("debugging enabled by kernel config\n");
return 0;
}
core_initcall(dma_debug_init);
@@ -1051,7 +1022,7 @@ static __init int dma_debug_cmdline(char *str)
return -EINVAL;
if (strncmp(str, "off", 3) == 0) {
- pr_info("DMA-API: debugging disabled on kernel command line\n");
+ pr_info("debugging disabled on kernel command line\n");
global_disable = true;
}
@@ -1085,11 +1056,11 @@ static void check_unmap(struct dma_debug_entry *ref)
if (dma_mapping_error(ref->dev, ref->dev_addr)) {
err_printk(ref->dev, NULL,
- "DMA-API: device driver tries to free an "
+ "device driver tries to free an "
"invalid DMA memory address\n");
} else {
err_printk(ref->dev, NULL,
- "DMA-API: device driver tries to free DMA "
+ "device driver tries to free DMA "
"memory it has not allocated [device "
"address=0x%016llx] [size=%llu bytes]\n",
ref->dev_addr, ref->size);
@@ -1098,7 +1069,7 @@ static void check_unmap(struct dma_debug_entry *ref)
}
if (ref->size != entry->size) {
- err_printk(ref->dev, entry, "DMA-API: device driver frees "
+ err_printk(ref->dev, entry, "device driver frees "
"DMA memory with different size "
"[device address=0x%016llx] [map size=%llu bytes] "
"[unmap size=%llu bytes]\n",
@@ -1106,7 +1077,7 @@ static void check_unmap(struct dma_debug_entry *ref)
}
if (ref->type != entry->type) {
- err_printk(ref->dev, entry, "DMA-API: device driver frees "
+ err_printk(ref->dev, entry, "device driver frees "
"DMA memory with wrong function "
"[device address=0x%016llx] [size=%llu bytes] "
"[mapped as %s] [unmapped as %s]\n",
@@ -1114,7 +1085,7 @@ static void check_unmap(struct dma_debug_entry *ref)
type2name[entry->type], type2name[ref->type]);
} else if ((entry->type == dma_debug_coherent) &&
(phys_addr(ref) != phys_addr(entry))) {
- err_printk(ref->dev, entry, "DMA-API: device driver frees "
+ err_printk(ref->dev, entry, "device driver frees "
"DMA memory with different CPU address "
"[device address=0x%016llx] [size=%llu bytes] "
"[cpu alloc address=0x%016llx] "
@@ -1126,7 +1097,7 @@ static void check_unmap(struct dma_debug_entry *ref)
if (ref->sg_call_ents && ref->type == dma_debug_sg &&
ref->sg_call_ents != entry->sg_call_ents) {
- err_printk(ref->dev, entry, "DMA-API: device driver frees "
+ err_printk(ref->dev, entry, "device driver frees "
"DMA sg list with different entry count "
"[map count=%d] [unmap count=%d]\n",
entry->sg_call_ents, ref->sg_call_ents);
@@ -1137,7 +1108,7 @@ static void check_unmap(struct dma_debug_entry *ref)
* DMA API don't handle this properly, so check for it here
*/
if (ref->direction != entry->direction) {
- err_printk(ref->dev, entry, "DMA-API: device driver frees "
+ err_printk(ref->dev, entry, "device driver frees "
"DMA memory with different direction "
"[device address=0x%016llx] [size=%llu bytes] "
"[mapped with %s] [unmapped with %s]\n",
@@ -1153,7 +1124,7 @@ static void check_unmap(struct dma_debug_entry *ref)
*/
if (entry->map_err_type == MAP_ERR_NOT_CHECKED) {
err_printk(ref->dev, entry,
- "DMA-API: device driver failed to check map error"
+ "device driver failed to check map error"
"[device address=0x%016llx] [size=%llu bytes] "
"[mapped as %s]",
ref->dev_addr, ref->size,
@@ -1178,7 +1149,7 @@ static void check_for_stack(struct device *dev,
return;
addr = page_address(page) + offset;
if (object_is_on_stack(addr))
- err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [addr=%p]\n", addr);
+ err_printk(dev, NULL, "device driver maps memory from stack [addr=%p]\n", addr);
} else {
/* Stack is vmalloced. */
int i;
@@ -1188,7 +1159,7 @@ static void check_for_stack(struct device *dev,
continue;
addr = (u8 *)current->stack + i * PAGE_SIZE + offset;
- err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [probable addr=%p]\n", addr);
+ err_printk(dev, NULL, "device driver maps memory from stack [probable addr=%p]\n", addr);
break;
}
}
@@ -1208,7 +1179,7 @@ static void check_for_illegal_area(struct device *dev, void *addr, unsigned long
{
if (overlap(addr, len, _stext, _etext) ||
overlap(addr, len, __start_rodata, __end_rodata))
- err_printk(dev, NULL, "DMA-API: device driver maps memory from kernel text or rodata [addr=%p] [len=%lu]\n", addr, len);
+ err_printk(dev, NULL, "device driver maps memory from kernel text or rodata [addr=%p] [len=%lu]\n", addr, len);
}
static void check_sync(struct device *dev,
@@ -1224,7 +1195,7 @@ static void check_sync(struct device *dev,
entry = bucket_find_contain(&bucket, ref, &flags);
if (!entry) {
- err_printk(dev, NULL, "DMA-API: device driver tries "
+ err_printk(dev, NULL, "device driver tries "
"to sync DMA memory it has not allocated "
"[device address=0x%016llx] [size=%llu bytes]\n",
(unsigned long long)ref->dev_addr, ref->size);
@@ -1232,7 +1203,7 @@ static void check_sync(struct device *dev,
}
if (ref->size > entry->size) {
- err_printk(dev, entry, "DMA-API: device driver syncs"
+ err_printk(dev, entry, "device driver syncs"
" DMA memory outside allocated range "
"[device address=0x%016llx] "
"[allocation size=%llu bytes] "
@@ -1245,7 +1216,7 @@ static void check_sync(struct device *dev,
goto out;
if (ref->direction != entry->direction) {
- err_printk(dev, entry, "DMA-API: device driver syncs "
+ err_printk(dev, entry, "device driver syncs "
"DMA memory with different direction "
"[device address=0x%016llx] [size=%llu bytes] "
"[mapped with %s] [synced with %s]\n",
@@ -1256,7 +1227,7 @@ static void check_sync(struct device *dev,
if (to_cpu && !(entry->direction == DMA_FROM_DEVICE) &&
!(ref->direction == DMA_TO_DEVICE))
- err_printk(dev, entry, "DMA-API: device driver syncs "
+ err_printk(dev, entry, "device driver syncs "
"device read-only DMA memory for cpu "
"[device address=0x%016llx] [size=%llu bytes] "
"[mapped with %s] [synced with %s]\n",
@@ -1266,7 +1237,7 @@ static void check_sync(struct device *dev,
if (!to_cpu && !(entry->direction == DMA_TO_DEVICE) &&
!(ref->direction == DMA_FROM_DEVICE))
- err_printk(dev, entry, "DMA-API: device driver syncs "
+ err_printk(dev, entry, "device driver syncs "
"device write-only DMA memory to device "
"[device address=0x%016llx] [size=%llu bytes] "
"[mapped with %s] [synced with %s]\n",
@@ -1276,7 +1247,7 @@ static void check_sync(struct device *dev,
if (ref->sg_call_ents && ref->type == dma_debug_sg &&
ref->sg_call_ents != entry->sg_call_ents) {
- err_printk(ref->dev, entry, "DMA-API: device driver syncs "
+ err_printk(ref->dev, entry, "device driver syncs "
"DMA sg list with different entry count "
"[map count=%d] [sync count=%d]\n",
entry->sg_call_ents, ref->sg_call_ents);
@@ -1297,7 +1268,7 @@ static void check_sg_segment(struct device *dev, struct scatterlist *sg)
* whoever generated the list forgot to check them.
*/
if (sg->length > max_seg)
- err_printk(dev, NULL, "DMA-API: mapping sg segment longer than device claims to support [len=%u] [max=%u]\n",
+ err_printk(dev, NULL, "mapping sg segment longer than device claims to support [len=%u] [max=%u]\n",
sg->length, max_seg);
/*
* In some cases this could potentially be the DMA API
@@ -1307,7 +1278,7 @@ static void check_sg_segment(struct device *dev, struct scatterlist *sg)
start = sg_dma_address(sg);
end = start + sg_dma_len(sg) - 1;
if ((start ^ end) & ~boundary)
- err_printk(dev, NULL, "DMA-API: mapping sg segment across boundary [start=0x%016llx] [end=0x%016llx] [boundary=0x%016llx]\n",
+ err_printk(dev, NULL, "mapping sg segment across boundary [start=0x%016llx] [end=0x%016llx] [boundary=0x%016llx]\n",
start, end, boundary);
#endif
}
@@ -1319,11 +1290,11 @@ void debug_dma_map_single(struct device *dev, const void *addr,
return;
if (!virt_addr_valid(addr))
- err_printk(dev, NULL, "DMA-API: device driver maps memory from invalid area [addr=%p] [len=%lu]\n",
+ err_printk(dev, NULL, "device driver maps memory from invalid area [addr=%p] [len=%lu]\n",
addr, len);
if (is_vmalloc_addr(addr))
- err_printk(dev, NULL, "DMA-API: device driver maps memory from vmalloc area [addr=%p] [len=%lu]\n",
+ err_printk(dev, NULL, "device driver maps memory from vmalloc area [addr=%p] [len=%lu]\n",
addr, len);
}
EXPORT_SYMBOL(debug_dma_map_single);
@@ -1662,48 +1633,6 @@ void debug_dma_sync_single_for_device(struct device *dev,
}
EXPORT_SYMBOL(debug_dma_sync_single_for_device);
-void debug_dma_sync_single_range_for_cpu(struct device *dev,
- dma_addr_t dma_handle,
- unsigned long offset, size_t size,
- int direction)
-{
- struct dma_debug_entry ref;
-
- if (unlikely(dma_debug_disabled()))
- return;
-
- ref.type = dma_debug_single;
- ref.dev = dev;
- ref.dev_addr = dma_handle;
- ref.size = offset + size;
- ref.direction = direction;
- ref.sg_call_ents = 0;
-
- check_sync(dev, &ref, true);
-}
-EXPORT_SYMBOL(debug_dma_sync_single_range_for_cpu);
-
-void debug_dma_sync_single_range_for_device(struct device *dev,
- dma_addr_t dma_handle,
- unsigned long offset,
- size_t size, int direction)
-{
- struct dma_debug_entry ref;
-
- if (unlikely(dma_debug_disabled()))
- return;
-
- ref.type = dma_debug_single;
- ref.dev = dev;
- ref.dev_addr = dma_handle;
- ref.size = offset + size;
- ref.direction = direction;
- ref.sg_call_ents = 0;
-
- check_sync(dev, &ref, false);
-}
-EXPORT_SYMBOL(debug_dma_sync_single_range_for_device);
-
void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
int nelems, int direction)
{
@@ -1780,7 +1709,7 @@ static int __init dma_debug_driver_setup(char *str)
}
if (current_driver_name[0])
- pr_info("DMA-API: enable driver filter for driver [%s]\n",
+ pr_info("enable driver filter for driver [%s]\n",
current_driver_name);
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 375c77e8d52f..355d16acee6d 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -13,6 +13,7 @@
#include <linux/dma-noncoherent.h>
#include <linux/pfn.h>
#include <linux/set_memory.h>
+#include <linux/swiotlb.h>
/*
* Most architectures use ZONE_DMA for the first 16 Megabytes, but
@@ -30,27 +31,16 @@ static inline bool force_dma_unencrypted(void)
return sev_active();
}
-static bool
-check_addr(struct device *dev, dma_addr_t dma_addr, size_t size,
- const char *caller)
+static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size)
{
- if (unlikely(dev && !dma_capable(dev, dma_addr, size))) {
- if (!dev->dma_mask) {
- dev_err(dev,
- "%s: call on device without dma_mask\n",
- caller);
- return false;
- }
-
- if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) {
- dev_err(dev,
- "%s: overflow %pad+%zu of device mask %llx bus mask %llx\n",
- caller, &dma_addr, size,
- *dev->dma_mask, dev->bus_dma_mask);
- }
- return false;
+ if (!dev->dma_mask) {
+ dev_err_once(dev, "DMA map on device without dma_mask\n");
+ } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) {
+ dev_err_once(dev,
+ "overflow %pad+%zu of DMA mask %llx bus mask %llx\n",
+ &dma_addr, size, *dev->dma_mask, dev->bus_dma_mask);
}
- return true;
+ WARN_ON_ONCE(1);
}
static inline dma_addr_t phys_to_dma_direct(struct device *dev,
@@ -103,14 +93,13 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
min_not_zero(dev->coherent_dma_mask, dev->bus_dma_mask);
}
-void *dma_direct_alloc_pages(struct device *dev, size_t size,
+struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
{
unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
int page_order = get_order(size);
struct page *page = NULL;
u64 phys_mask;
- void *ret;
if (attrs & DMA_ATTR_NO_WARN)
gfp |= __GFP_NOWARN;
@@ -150,11 +139,34 @@ again:
}
}
+ return page;
+}
+
+void *dma_direct_alloc_pages(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
+{
+ struct page *page;
+ void *ret;
+
+ page = __dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs);
if (!page)
return NULL;
+
+ if (PageHighMem(page)) {
+ /*
+ * Depending on the cma= arguments and per-arch setup
+ * dma_alloc_from_contiguous could return highmem pages.
+ * Without remapping there is no way to return them here,
+ * so log an error and fail.
+ */
+ dev_info(dev, "Rejecting highmem page from CMA.\n");
+ __dma_direct_free_pages(dev, size, page);
+ return NULL;
+ }
+
ret = page_address(page);
if (force_dma_unencrypted()) {
- set_memory_decrypted((unsigned long)ret, 1 << page_order);
+ set_memory_decrypted((unsigned long)ret, 1 << get_order(size));
*dma_handle = __phys_to_dma(dev, page_to_phys(page));
} else {
*dma_handle = phys_to_dma(dev, page_to_phys(page));
@@ -163,20 +175,22 @@ again:
return ret;
}
-/*
- * NOTE: this function must never look at the dma_addr argument, because we want
- * to be able to use it as a helper for iommu implementations as well.
- */
+void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page)
+{
+ unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+ if (!dma_release_from_contiguous(dev, page, count))
+ __free_pages(page, get_order(size));
+}
+
void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
dma_addr_t dma_addr, unsigned long attrs)
{
- unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
unsigned int page_order = get_order(size);
if (force_dma_unencrypted())
set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
- if (!dma_release_from_contiguous(dev, virt_to_page(cpu_addr), count))
- free_pages((unsigned long)cpu_addr, page_order);
+ __dma_direct_free_pages(dev, size, virt_to_page(cpu_addr));
}
void *dma_direct_alloc(struct device *dev, size_t size,
@@ -196,67 +210,111 @@ void dma_direct_free(struct device *dev, size_t size,
dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs);
}
-static void dma_direct_sync_single_for_device(struct device *dev,
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
+ defined(CONFIG_SWIOTLB)
+void dma_direct_sync_single_for_device(struct device *dev,
dma_addr_t addr, size_t size, enum dma_data_direction dir)
{
- if (dev_is_dma_coherent(dev))
- return;
- arch_sync_dma_for_device(dev, dma_to_phys(dev, addr), size, dir);
+ phys_addr_t paddr = dma_to_phys(dev, addr);
+
+ if (unlikely(is_swiotlb_buffer(paddr)))
+ swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE);
+
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_for_device(dev, paddr, size, dir);
}
+EXPORT_SYMBOL(dma_direct_sync_single_for_device);
-static void dma_direct_sync_sg_for_device(struct device *dev,
+void dma_direct_sync_sg_for_device(struct device *dev,
struct scatterlist *sgl, int nents, enum dma_data_direction dir)
{
struct scatterlist *sg;
int i;
- if (dev_is_dma_coherent(dev))
- return;
+ for_each_sg(sgl, sg, nents, i) {
+ if (unlikely(is_swiotlb_buffer(sg_phys(sg))))
+ swiotlb_tbl_sync_single(dev, sg_phys(sg), sg->length,
+ dir, SYNC_FOR_DEVICE);
- for_each_sg(sgl, sg, nents, i)
- arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir);
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_for_device(dev, sg_phys(sg), sg->length,
+ dir);
+ }
}
+EXPORT_SYMBOL(dma_direct_sync_sg_for_device);
+#endif
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
- defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
-static void dma_direct_sync_single_for_cpu(struct device *dev,
+ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
+ defined(CONFIG_SWIOTLB)
+void dma_direct_sync_single_for_cpu(struct device *dev,
dma_addr_t addr, size_t size, enum dma_data_direction dir)
{
- if (dev_is_dma_coherent(dev))
- return;
- arch_sync_dma_for_cpu(dev, dma_to_phys(dev, addr), size, dir);
- arch_sync_dma_for_cpu_all(dev);
+ phys_addr_t paddr = dma_to_phys(dev, addr);
+
+ if (!dev_is_dma_coherent(dev)) {
+ arch_sync_dma_for_cpu(dev, paddr, size, dir);
+ arch_sync_dma_for_cpu_all(dev);
+ }
+
+ if (unlikely(is_swiotlb_buffer(paddr)))
+ swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
}
+EXPORT_SYMBOL(dma_direct_sync_single_for_cpu);
-static void dma_direct_sync_sg_for_cpu(struct device *dev,
+void dma_direct_sync_sg_for_cpu(struct device *dev,
struct scatterlist *sgl, int nents, enum dma_data_direction dir)
{
struct scatterlist *sg;
int i;
- if (dev_is_dma_coherent(dev))
- return;
+ for_each_sg(sgl, sg, nents, i) {
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir);
+
+ if (unlikely(is_swiotlb_buffer(sg_phys(sg))))
+ swiotlb_tbl_sync_single(dev, sg_phys(sg), sg->length, dir,
+ SYNC_FOR_CPU);
+ }
- for_each_sg(sgl, sg, nents, i)
- arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir);
- arch_sync_dma_for_cpu_all(dev);
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_for_cpu_all(dev);
}
+EXPORT_SYMBOL(dma_direct_sync_sg_for_cpu);
-static void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
+void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
+ phys_addr_t phys = dma_to_phys(dev, addr);
+
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+
+ if (unlikely(is_swiotlb_buffer(phys)))
+ swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
}
+EXPORT_SYMBOL(dma_direct_unmap_page);
-static void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
+void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
int nents, enum dma_data_direction dir, unsigned long attrs)
{
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- dma_direct_sync_sg_for_cpu(dev, sgl, nents, dir);
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sgl, sg, nents, i)
+ dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
+ attrs);
}
+EXPORT_SYMBOL(dma_direct_unmap_sg);
#endif
+static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr,
+ size_t size)
+{
+ return swiotlb_force != SWIOTLB_FORCE &&
+ (!dev || dma_capable(dev, dma_addr, size));
+}
+
dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t size, enum dma_data_direction dir,
unsigned long attrs)
@@ -264,13 +322,17 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
phys_addr_t phys = page_to_phys(page) + offset;
dma_addr_t dma_addr = phys_to_dma(dev, phys);
- if (!check_addr(dev, dma_addr, size, __func__))
- return DIRECT_MAPPING_ERROR;
+ if (unlikely(!dma_direct_possible(dev, dma_addr, size)) &&
+ !swiotlb_map(dev, &phys, &dma_addr, size, dir, attrs)) {
+ report_addr(dev, dma_addr, size);
+ return DMA_MAPPING_ERROR;
+ }
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- dma_direct_sync_single_for_device(dev, dma_addr, size, dir);
+ if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ arch_sync_dma_for_device(dev, phys, size, dir);
return dma_addr;
}
+EXPORT_SYMBOL(dma_direct_map_page);
int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
enum dma_data_direction dir, unsigned long attrs)
@@ -279,18 +341,20 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
struct scatterlist *sg;
for_each_sg(sgl, sg, nents, i) {
- BUG_ON(!sg_page(sg));
-
- sg_dma_address(sg) = phys_to_dma(dev, sg_phys(sg));
- if (!check_addr(dev, sg_dma_address(sg), sg->length, __func__))
- return 0;
+ sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
+ sg->offset, sg->length, dir, attrs);
+ if (sg->dma_address == DMA_MAPPING_ERROR)
+ goto out_unmap;
sg_dma_len(sg) = sg->length;
}
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- dma_direct_sync_sg_for_device(dev, sgl, nents, dir);
return nents;
+
+out_unmap:
+ dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
+ return 0;
}
+EXPORT_SYMBOL(dma_direct_map_sg);
/*
* Because 32-bit DMA masks are so common we expect every architecture to be
@@ -316,31 +380,3 @@ int dma_direct_supported(struct device *dev, u64 mask)
*/
return mask >= __phys_to_dma(dev, min_mask);
}
-
-int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
- return dma_addr == DIRECT_MAPPING_ERROR;
-}
-
-const struct dma_map_ops dma_direct_ops = {
- .alloc = dma_direct_alloc,
- .free = dma_direct_free,
- .map_page = dma_direct_map_page,
- .map_sg = dma_direct_map_sg,
-#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE)
- .sync_single_for_device = dma_direct_sync_single_for_device,
- .sync_sg_for_device = dma_direct_sync_sg_for_device,
-#endif
-#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
- defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
- .sync_single_for_cpu = dma_direct_sync_single_for_cpu,
- .sync_sg_for_cpu = dma_direct_sync_sg_for_cpu,
- .unmap_page = dma_direct_unmap_page,
- .unmap_sg = dma_direct_unmap_sg,
-#endif
- .get_required_mask = dma_direct_get_required_mask,
- .dma_supported = dma_direct_supported,
- .mapping_error = dma_direct_mapping_error,
- .cache_sync = arch_dma_cache_sync,
-};
-EXPORT_SYMBOL(dma_direct_ops);
diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c
new file mode 100644
index 000000000000..05607642c888
--- /dev/null
+++ b/kernel/dma/dummy.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Dummy DMA ops that always fail.
+ */
+#include <linux/dma-mapping.h>
+
+static int dma_dummy_mmap(struct device *dev, struct vm_area_struct *vma,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
+{
+ return -ENXIO;
+}
+
+static dma_addr_t dma_dummy_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ return DMA_MAPPING_ERROR;
+}
+
+static int dma_dummy_map_sg(struct device *dev, struct scatterlist *sgl,
+ int nelems, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ return 0;
+}
+
+static int dma_dummy_supported(struct device *hwdev, u64 mask)
+{
+ return 0;
+}
+
+const struct dma_map_ops dma_dummy_ops = {
+ .mmap = dma_dummy_mmap,
+ .map_page = dma_dummy_map_page,
+ .map_sg = dma_dummy_map_sg,
+ .dma_supported = dma_dummy_supported,
+};
+EXPORT_SYMBOL(dma_dummy_ops);
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 58dec7a92b7b..d7c34d2d1ba5 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -5,8 +5,9 @@
* Copyright (c) 2006 SUSE Linux Products GmbH
* Copyright (c) 2006 Tejun Heo <teheo@suse.de>
*/
-
+#include <linux/memblock.h> /* for max_pfn */
#include <linux/acpi.h>
+#include <linux/dma-direct.h>
#include <linux/dma-noncoherent.h>
#include <linux/export.h>
#include <linux/gfp.h>
@@ -223,7 +224,20 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,
sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
return ret;
}
-EXPORT_SYMBOL(dma_common_get_sgtable);
+
+int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (!dma_is_direct(ops) && ops->get_sgtable)
+ return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size,
+ attrs);
+ return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, size,
+ attrs);
+}
+EXPORT_SYMBOL(dma_get_sgtable_attrs);
/*
* Create userspace mapping for the DMA-coherent memory.
@@ -261,88 +275,179 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
return -ENXIO;
#endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */
}
-EXPORT_SYMBOL(dma_common_mmap);
-#ifdef CONFIG_MMU
-static struct vm_struct *__dma_common_pages_remap(struct page **pages,
- size_t size, unsigned long vm_flags, pgprot_t prot,
- const void *caller)
+/**
+ * dma_mmap_attrs - map a coherent DMA allocation into user space
+ * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
+ * @vma: vm_area_struct describing requested user mapping
+ * @cpu_addr: kernel CPU-view address returned from dma_alloc_attrs
+ * @dma_addr: device-view address returned from dma_alloc_attrs
+ * @size: size of memory originally requested in dma_alloc_attrs
+ * @attrs: attributes of mapping properties requested in dma_alloc_attrs
+ *
+ * Map a coherent DMA buffer previously allocated by dma_alloc_attrs into user
+ * space. The coherent DMA buffer must not be freed by the driver until the
+ * user space mapping has been released.
+ */
+int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
{
- struct vm_struct *area;
+ const struct dma_map_ops *ops = get_dma_ops(dev);
- area = get_vm_area_caller(size, vm_flags, caller);
- if (!area)
- return NULL;
+ if (!dma_is_direct(ops) && ops->mmap)
+ return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
+ return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
+}
+EXPORT_SYMBOL(dma_mmap_attrs);
- if (map_vm_area(area, prot, pages)) {
- vunmap(area->addr);
- return NULL;
+#ifndef ARCH_HAS_DMA_GET_REQUIRED_MASK
+static u64 dma_default_get_required_mask(struct device *dev)
+{
+ u32 low_totalram = ((max_pfn - 1) << PAGE_SHIFT);
+ u32 high_totalram = ((max_pfn - 1) >> (32 - PAGE_SHIFT));
+ u64 mask;
+
+ if (!high_totalram) {
+ /* convert to mask just covering totalram */
+ low_totalram = (1 << (fls(low_totalram) - 1));
+ low_totalram += low_totalram - 1;
+ mask = low_totalram;
+ } else {
+ high_totalram = (1 << (fls(high_totalram) - 1));
+ high_totalram += high_totalram - 1;
+ mask = (((u64)high_totalram) << 32) + 0xffffffff;
}
+ return mask;
+}
- return area;
+u64 dma_get_required_mask(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (dma_is_direct(ops))
+ return dma_direct_get_required_mask(dev);
+ if (ops->get_required_mask)
+ return ops->get_required_mask(dev);
+ return dma_default_get_required_mask(dev);
}
+EXPORT_SYMBOL_GPL(dma_get_required_mask);
+#endif
-/*
- * remaps an array of PAGE_SIZE pages into another vm_area
- * Cannot be used in non-sleeping contexts
- */
-void *dma_common_pages_remap(struct page **pages, size_t size,
- unsigned long vm_flags, pgprot_t prot,
- const void *caller)
+#ifndef arch_dma_alloc_attrs
+#define arch_dma_alloc_attrs(dev) (true)
+#endif
+
+void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
+ gfp_t flag, unsigned long attrs)
{
- struct vm_struct *area;
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ void *cpu_addr;
+
+ WARN_ON_ONCE(dev && !dev->coherent_dma_mask);
- area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller);
- if (!area)
+ if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr))
+ return cpu_addr;
+
+ /* let the implementation decide on the zone to allocate from: */
+ flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
+
+ if (!arch_dma_alloc_attrs(&dev))
return NULL;
- area->pages = pages;
+ if (dma_is_direct(ops))
+ cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs);
+ else if (ops->alloc)
+ cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
+ else
+ return NULL;
- return area->addr;
+ debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
+ return cpu_addr;
}
+EXPORT_SYMBOL(dma_alloc_attrs);
-/*
- * remaps an allocated contiguous region into another vm_area.
- * Cannot be used in non-sleeping contexts
- */
-
-void *dma_common_contiguous_remap(struct page *page, size_t size,
- unsigned long vm_flags,
- pgprot_t prot, const void *caller)
+void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
+ dma_addr_t dma_handle, unsigned long attrs)
{
- int i;
- struct page **pages;
- struct vm_struct *area;
+ const struct dma_map_ops *ops = get_dma_ops(dev);
- pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL);
- if (!pages)
- return NULL;
+ if (dma_release_from_dev_coherent(dev, get_order(size), cpu_addr))
+ return;
+ /*
+ * On non-coherent platforms which implement DMA-coherent buffers via
+ * non-cacheable remaps, ops->free() may call vunmap(). Thus getting
+ * this far in IRQ context is a) at risk of a BUG_ON() or trying to
+ * sleep on some machines, and b) an indication that the driver is
+ * probably misusing the coherent API anyway.
+ */
+ WARN_ON(irqs_disabled());
+
+ if (!cpu_addr)
+ return;
- for (i = 0; i < (size >> PAGE_SHIFT); i++)
- pages[i] = nth_page(page, i);
+ debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
+ if (dma_is_direct(ops))
+ dma_direct_free(dev, size, cpu_addr, dma_handle, attrs);
+ else if (ops->free)
+ ops->free(dev, size, cpu_addr, dma_handle, attrs);
+}
+EXPORT_SYMBOL(dma_free_attrs);
- area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller);
+static inline void dma_check_mask(struct device *dev, u64 mask)
+{
+ if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1)))
+ dev_warn(dev, "SME is active, device will require DMA bounce buffers\n");
+}
- kfree(pages);
+int dma_supported(struct device *dev, u64 mask)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
- if (!area)
- return NULL;
- return area->addr;
+ if (dma_is_direct(ops))
+ return dma_direct_supported(dev, mask);
+ if (!ops->dma_supported)
+ return 1;
+ return ops->dma_supported(dev, mask);
}
+EXPORT_SYMBOL(dma_supported);
-/*
- * unmaps a range previously mapped by dma_common_*_remap
- */
-void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags)
+#ifndef HAVE_ARCH_DMA_SET_MASK
+int dma_set_mask(struct device *dev, u64 mask)
{
- struct vm_struct *area = find_vm_area(cpu_addr);
+ if (!dev->dma_mask || !dma_supported(dev, mask))
+ return -EIO;
- if (!area || (area->flags & vm_flags) != vm_flags) {
- WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr);
- return;
- }
+ dma_check_mask(dev, mask);
+ *dev->dma_mask = mask;
+ return 0;
+}
+EXPORT_SYMBOL(dma_set_mask);
+#endif
+
+#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
+int dma_set_coherent_mask(struct device *dev, u64 mask)
+{
+ if (!dma_supported(dev, mask))
+ return -EIO;
- unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size));
- vunmap(cpu_addr);
+ dma_check_mask(dev, mask);
+ dev->coherent_dma_mask = mask;
+ return 0;
}
+EXPORT_SYMBOL(dma_set_coherent_mask);
#endif
+
+void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
+ enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+
+ if (dma_is_direct(ops))
+ arch_dma_cache_sync(dev, vaddr, size, dir);
+ else if (ops->cache_sync)
+ ops->cache_sync(dev, vaddr, size, dir);
+}
+EXPORT_SYMBOL(dma_cache_sync);
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
new file mode 100644
index 000000000000..18cc09fc27b9
--- /dev/null
+++ b/kernel/dma/remap.c
@@ -0,0 +1,256 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2012 ARM Ltd.
+ * Copyright (c) 2014 The Linux Foundation
+ */
+#include <linux/dma-direct.h>
+#include <linux/dma-noncoherent.h>
+#include <linux/dma-contiguous.h>
+#include <linux/init.h>
+#include <linux/genalloc.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+static struct vm_struct *__dma_common_pages_remap(struct page **pages,
+ size_t size, unsigned long vm_flags, pgprot_t prot,
+ const void *caller)
+{
+ struct vm_struct *area;
+
+ area = get_vm_area_caller(size, vm_flags, caller);
+ if (!area)
+ return NULL;
+
+ if (map_vm_area(area, prot, pages)) {
+ vunmap(area->addr);
+ return NULL;
+ }
+
+ return area;
+}
+
+/*
+ * Remaps an array of PAGE_SIZE pages into another vm_area.
+ * Cannot be used in non-sleeping contexts
+ */
+void *dma_common_pages_remap(struct page **pages, size_t size,
+ unsigned long vm_flags, pgprot_t prot,
+ const void *caller)
+{
+ struct vm_struct *area;
+
+ area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller);
+ if (!area)
+ return NULL;
+
+ area->pages = pages;
+
+ return area->addr;
+}
+
+/*
+ * Remaps an allocated contiguous region into another vm_area.
+ * Cannot be used in non-sleeping contexts
+ */
+void *dma_common_contiguous_remap(struct page *page, size_t size,
+ unsigned long vm_flags,
+ pgprot_t prot, const void *caller)
+{
+ int i;
+ struct page **pages;
+ struct vm_struct *area;
+
+ pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL);
+ if (!pages)
+ return NULL;
+
+ for (i = 0; i < (size >> PAGE_SHIFT); i++)
+ pages[i] = nth_page(page, i);
+
+ area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller);
+
+ kfree(pages);
+
+ if (!area)
+ return NULL;
+ return area->addr;
+}
+
+/*
+ * Unmaps a range previously mapped by dma_common_*_remap
+ */
+void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags)
+{
+ struct vm_struct *area = find_vm_area(cpu_addr);
+
+ if (!area || (area->flags & vm_flags) != vm_flags) {
+ WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr);
+ return;
+ }
+
+ unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size));
+ vunmap(cpu_addr);
+}
+
+#ifdef CONFIG_DMA_DIRECT_REMAP
+static struct gen_pool *atomic_pool __ro_after_init;
+
+#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K
+static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE;
+
+static int __init early_coherent_pool(char *p)
+{
+ atomic_pool_size = memparse(p, &p);
+ return 0;
+}
+early_param("coherent_pool", early_coherent_pool);
+
+int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot)
+{
+ unsigned int pool_size_order = get_order(atomic_pool_size);
+ unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT;
+ struct page *page;
+ void *addr;
+ int ret;
+
+ if (dev_get_cma_area(NULL))
+ page = dma_alloc_from_contiguous(NULL, nr_pages,
+ pool_size_order, false);
+ else
+ page = alloc_pages(gfp, pool_size_order);
+ if (!page)
+ goto out;
+
+ arch_dma_prep_coherent(page, atomic_pool_size);
+
+ atomic_pool = gen_pool_create(PAGE_SHIFT, -1);
+ if (!atomic_pool)
+ goto free_page;
+
+ addr = dma_common_contiguous_remap(page, atomic_pool_size, VM_USERMAP,
+ prot, __builtin_return_address(0));
+ if (!addr)
+ goto destroy_genpool;
+
+ ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr,
+ page_to_phys(page), atomic_pool_size, -1);
+ if (ret)
+ goto remove_mapping;
+ gen_pool_set_algo(atomic_pool, gen_pool_first_fit_order_align, NULL);
+
+ pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n",
+ atomic_pool_size / 1024);
+ return 0;
+
+remove_mapping:
+ dma_common_free_remap(addr, atomic_pool_size, VM_USERMAP);
+destroy_genpool:
+ gen_pool_destroy(atomic_pool);
+ atomic_pool = NULL;
+free_page:
+ if (!dma_release_from_contiguous(NULL, page, nr_pages))
+ __free_pages(page, pool_size_order);
+out:
+ pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n",
+ atomic_pool_size / 1024);
+ return -ENOMEM;
+}
+
+bool dma_in_atomic_pool(void *start, size_t size)
+{
+ return addr_in_gen_pool(atomic_pool, (unsigned long)start, size);
+}
+
+void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags)
+{
+ unsigned long val;
+ void *ptr = NULL;
+
+ if (!atomic_pool) {
+ WARN(1, "coherent pool not initialised!\n");
+ return NULL;
+ }
+
+ val = gen_pool_alloc(atomic_pool, size);
+ if (val) {
+ phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val);
+
+ *ret_page = pfn_to_page(__phys_to_pfn(phys));
+ ptr = (void *)val;
+ memset(ptr, 0, size);
+ }
+
+ return ptr;
+}
+
+bool dma_free_from_pool(void *start, size_t size)
+{
+ if (!dma_in_atomic_pool(start, size))
+ return false;
+ gen_pool_free(atomic_pool, (unsigned long)start, size);
+ return true;
+}
+
+void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
+ gfp_t flags, unsigned long attrs)
+{
+ struct page *page = NULL;
+ void *ret;
+
+ size = PAGE_ALIGN(size);
+
+ if (!gfpflags_allow_blocking(flags) &&
+ !(attrs & DMA_ATTR_NO_KERNEL_MAPPING)) {
+ ret = dma_alloc_from_pool(size, &page, flags);
+ if (!ret)
+ return NULL;
+ *dma_handle = phys_to_dma(dev, page_to_phys(page));
+ return ret;
+ }
+
+ page = __dma_direct_alloc_pages(dev, size, dma_handle, flags, attrs);
+ if (!page)
+ return NULL;
+
+ /* remove any dirty cache lines on the kernel alias */
+ arch_dma_prep_coherent(page, size);
+
+ if (attrs & DMA_ATTR_NO_KERNEL_MAPPING)
+ return page; /* opaque cookie */
+
+ /* create a coherent mapping */
+ ret = dma_common_contiguous_remap(page, size, VM_USERMAP,
+ arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs),
+ __builtin_return_address(0));
+ if (!ret) {
+ __dma_direct_free_pages(dev, size, page);
+ return ret;
+ }
+
+ *dma_handle = phys_to_dma(dev, page_to_phys(page));
+ memset(ret, 0, size);
+
+ return ret;
+}
+
+void arch_dma_free(struct device *dev, size_t size, void *vaddr,
+ dma_addr_t dma_handle, unsigned long attrs)
+{
+ if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
+ /* vaddr is a struct page cookie, not a kernel address */
+ __dma_direct_free_pages(dev, size, vaddr);
+ } else if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) {
+ phys_addr_t phys = dma_to_phys(dev, dma_handle);
+ struct page *page = pfn_to_page(__phys_to_pfn(phys));
+
+ vunmap(vaddr);
+ __dma_direct_free_pages(dev, size, page);
+ }
+}
+
+long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr,
+ dma_addr_t dma_addr)
+{
+ return __phys_to_pfn(dma_to_phys(dev, dma_addr));
+}
+#endif /* CONFIG_DMA_DIRECT_REMAP */
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 045930e32c0e..d6361776dc5c 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -21,7 +21,6 @@
#include <linux/cache.h>
#include <linux/dma-direct.h>
-#include <linux/dma-noncoherent.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/spinlock.h>
@@ -65,7 +64,7 @@ enum swiotlb_force swiotlb_force;
* swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this
* API.
*/
-static phys_addr_t io_tlb_start, io_tlb_end;
+phys_addr_t io_tlb_start, io_tlb_end;
/*
* The number of IO TLB blocks (in groups of 64) between io_tlb_start and
@@ -383,11 +382,6 @@ void __init swiotlb_exit(void)
max_segment = 0;
}
-static int is_swiotlb_buffer(phys_addr_t paddr)
-{
- return paddr >= io_tlb_start && paddr < io_tlb_end;
-}
-
/*
* Bounce: copy the swiotlb buffer back to the original dma location
*/
@@ -526,7 +520,7 @@ not_found:
spin_unlock_irqrestore(&io_tlb_lock, flags);
if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit())
dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", size);
- return SWIOTLB_MAP_ERROR;
+ return DMA_MAPPING_ERROR;
found:
spin_unlock_irqrestore(&io_tlb_lock, flags);
@@ -623,237 +617,36 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr,
}
}
-static dma_addr_t swiotlb_bounce_page(struct device *dev, phys_addr_t *phys,
+/*
+ * Create a swiotlb mapping for the buffer at @phys, and in case of DMAing
+ * to the device copy the data into it as well.
+ */
+bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
- dma_addr_t dma_addr;
+ trace_swiotlb_bounced(dev, *dma_addr, size, swiotlb_force);
if (unlikely(swiotlb_force == SWIOTLB_NO_FORCE)) {
dev_warn_ratelimited(dev,
"Cannot do DMA to address %pa\n", phys);
- return DIRECT_MAPPING_ERROR;
+ return false;
}
/* Oh well, have to allocate and map a bounce buffer. */
*phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start),
*phys, size, dir, attrs);
- if (*phys == SWIOTLB_MAP_ERROR)
- return DIRECT_MAPPING_ERROR;
+ if (*phys == DMA_MAPPING_ERROR)
+ return false;
/* Ensure that the address returned is DMA'ble */
- dma_addr = __phys_to_dma(dev, *phys);
- if (unlikely(!dma_capable(dev, dma_addr, size))) {
+ *dma_addr = __phys_to_dma(dev, *phys);
+ if (unlikely(!dma_capable(dev, *dma_addr, size))) {
swiotlb_tbl_unmap_single(dev, *phys, size, dir,
attrs | DMA_ATTR_SKIP_CPU_SYNC);
- return DIRECT_MAPPING_ERROR;
- }
-
- return dma_addr;
-}
-
-/*
- * Map a single buffer of the indicated size for DMA in streaming mode. The
- * physical address to use is returned.
- *
- * Once the device is given the dma address, the device owns this memory until
- * either swiotlb_unmap_page or swiotlb_dma_sync_single is performed.
- */
-dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction dir,
- unsigned long attrs)
-{
- phys_addr_t phys = page_to_phys(page) + offset;
- dma_addr_t dev_addr = phys_to_dma(dev, phys);
-
- BUG_ON(dir == DMA_NONE);
- /*
- * If the address happens to be in the device's DMA window,
- * we can safely return the device addr and not worry about bounce
- * buffering it.
- */
- if (!dma_capable(dev, dev_addr, size) ||
- swiotlb_force == SWIOTLB_FORCE) {
- trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force);
- dev_addr = swiotlb_bounce_page(dev, &phys, size, dir, attrs);
- }
-
- if (!dev_is_dma_coherent(dev) &&
- (attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0 &&
- dev_addr != DIRECT_MAPPING_ERROR)
- arch_sync_dma_for_device(dev, phys, size, dir);
-
- return dev_addr;
-}
-
-/*
- * Unmap a single streaming mode DMA translation. The dma_addr and size must
- * match what was provided for in a previous swiotlb_map_page call. All
- * other usages are undefined.
- *
- * After this call, reads by the cpu to the buffer are guaranteed to see
- * whatever the device wrote there.
- */
-void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
- size_t size, enum dma_data_direction dir,
- unsigned long attrs)
-{
- phys_addr_t paddr = dma_to_phys(hwdev, dev_addr);
-
- BUG_ON(dir == DMA_NONE);
-
- if (!dev_is_dma_coherent(hwdev) &&
- (attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0)
- arch_sync_dma_for_cpu(hwdev, paddr, size, dir);
-
- if (is_swiotlb_buffer(paddr)) {
- swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs);
- return;
+ return false;
}
- if (dir != DMA_FROM_DEVICE)
- return;
-
- /*
- * phys_to_virt doesn't work with hihgmem page but we could
- * call dma_mark_clean() with hihgmem page here. However, we
- * are fine since dma_mark_clean() is null on POWERPC. We can
- * make dma_mark_clean() take a physical address if necessary.
- */
- dma_mark_clean(phys_to_virt(paddr), size);
-}
-
-/*
- * Make physical memory consistent for a single streaming mode DMA translation
- * after a transfer.
- *
- * If you perform a swiotlb_map_page() but wish to interrogate the buffer
- * using the cpu, yet do not wish to teardown the dma mapping, you must
- * call this function before doing so. At the next point you give the dma
- * address back to the card, you must first perform a
- * swiotlb_dma_sync_for_device, and then the device again owns the buffer
- */
-static void
-swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
- size_t size, enum dma_data_direction dir,
- enum dma_sync_target target)
-{
- phys_addr_t paddr = dma_to_phys(hwdev, dev_addr);
-
- BUG_ON(dir == DMA_NONE);
-
- if (!dev_is_dma_coherent(hwdev) && target == SYNC_FOR_CPU)
- arch_sync_dma_for_cpu(hwdev, paddr, size, dir);
-
- if (is_swiotlb_buffer(paddr))
- swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target);
-
- if (!dev_is_dma_coherent(hwdev) && target == SYNC_FOR_DEVICE)
- arch_sync_dma_for_device(hwdev, paddr, size, dir);
-
- if (!is_swiotlb_buffer(paddr) && dir == DMA_FROM_DEVICE)
- dma_mark_clean(phys_to_virt(paddr), size);
-}
-
-void
-swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
- size_t size, enum dma_data_direction dir)
-{
- swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU);
-}
-
-void
-swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
- size_t size, enum dma_data_direction dir)
-{
- swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE);
-}
-
-/*
- * Map a set of buffers described by scatterlist in streaming mode for DMA.
- * This is the scatter-gather version of the above swiotlb_map_page
- * interface. Here the scatter gather list elements are each tagged with the
- * appropriate dma address and length. They are obtained via
- * sg_dma_{address,length}(SG).
- *
- * Device ownership issues as mentioned above for swiotlb_map_page are the
- * same here.
- */
-int
-swiotlb_map_sg_attrs(struct device *dev, struct scatterlist *sgl, int nelems,
- enum dma_data_direction dir, unsigned long attrs)
-{
- struct scatterlist *sg;
- int i;
-
- for_each_sg(sgl, sg, nelems, i) {
- sg->dma_address = swiotlb_map_page(dev, sg_page(sg), sg->offset,
- sg->length, dir, attrs);
- if (sg->dma_address == DIRECT_MAPPING_ERROR)
- goto out_error;
- sg_dma_len(sg) = sg->length;
- }
-
- return nelems;
-
-out_error:
- swiotlb_unmap_sg_attrs(dev, sgl, i, dir,
- attrs | DMA_ATTR_SKIP_CPU_SYNC);
- sg_dma_len(sgl) = 0;
- return 0;
-}
-
-/*
- * Unmap a set of streaming mode DMA translations. Again, cpu read rules
- * concerning calls here are the same as for swiotlb_unmap_page() above.
- */
-void
-swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
- int nelems, enum dma_data_direction dir,
- unsigned long attrs)
-{
- struct scatterlist *sg;
- int i;
-
- BUG_ON(dir == DMA_NONE);
-
- for_each_sg(sgl, sg, nelems, i)
- swiotlb_unmap_page(hwdev, sg->dma_address, sg_dma_len(sg), dir,
- attrs);
-}
-
-/*
- * Make physical memory consistent for a set of streaming mode DMA translations
- * after a transfer.
- *
- * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules
- * and usage.
- */
-static void
-swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
- int nelems, enum dma_data_direction dir,
- enum dma_sync_target target)
-{
- struct scatterlist *sg;
- int i;
-
- for_each_sg(sgl, sg, nelems, i)
- swiotlb_sync_single(hwdev, sg->dma_address,
- sg_dma_len(sg), dir, target);
-}
-
-void
-swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
- int nelems, enum dma_data_direction dir)
-{
- swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU);
-}
-
-void
-swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
- int nelems, enum dma_data_direction dir)
-{
- swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE);
+ return true;
}
/*
@@ -867,19 +660,3 @@ swiotlb_dma_supported(struct device *hwdev, u64 mask)
{
return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
}
-
-const struct dma_map_ops swiotlb_dma_ops = {
- .mapping_error = dma_direct_mapping_error,
- .alloc = dma_direct_alloc,
- .free = dma_direct_free,
- .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
- .sync_single_for_device = swiotlb_sync_single_for_device,
- .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
- .sync_sg_for_device = swiotlb_sync_sg_for_device,
- .map_sg = swiotlb_map_sg_attrs,
- .unmap_sg = swiotlb_unmap_sg_attrs,
- .map_page = swiotlb_map_page,
- .unmap_page = swiotlb_unmap_page,
- .dma_supported = dma_direct_supported,
-};
-EXPORT_SYMBOL(swiotlb_dma_ops);
diff --git a/kernel/dma/virt.c b/kernel/dma/virt.c
index 631ddec4b60a..ebe128833af7 100644
--- a/kernel/dma/virt.c
+++ b/kernel/dma/virt.c
@@ -13,7 +13,7 @@ static void *dma_virt_alloc(struct device *dev, size_t size,
{
void *ret;
- ret = (void *)__get_free_pages(gfp, get_order(size));
+ ret = (void *)__get_free_pages(gfp | __GFP_ZERO, get_order(size));
if (ret)
*dma_handle = (uintptr_t)ret;
return ret;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index abbd8da9ac21..8aef47ee7bfa 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -171,11 +171,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
.address = addr,
};
int err;
- /* For mmu_notifiers */
- const unsigned long mmun_start = addr;
- const unsigned long mmun_end = addr + PAGE_SIZE;
+ struct mmu_notifier_range range;
struct mem_cgroup *memcg;
+ mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE);
+
VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page);
err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg,
@@ -186,7 +186,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
/* For try_to_free_swap() and munlock_vma_page() below */
lock_page(old_page);
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_start(&range);
err = -EAGAIN;
if (!page_vma_mapped_walk(&pvmw)) {
mem_cgroup_cancel_charge(new_page, memcg, false);
@@ -220,7 +220,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
err = 0;
unlock:
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
unlock_page(old_page);
return err;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index e2a5156bc9c3..d439c48ecf18 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -744,15 +744,16 @@ void __init __weak arch_task_cache_init(void) { }
static void set_max_threads(unsigned int max_threads_suggested)
{
u64 threads;
+ unsigned long nr_pages = totalram_pages();
/*
* The number of threads shall be limited such that the thread
* structures may only consume a small part of the available memory.
*/
- if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64)
+ if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64)
threads = MAX_THREADS;
else
- threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
+ threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
(u64) THREAD_SIZE * 8UL);
if (threads > max_threads_suggested)
@@ -840,7 +841,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
unsigned long *stack;
- struct vm_struct *stack_vm_area;
+ struct vm_struct *stack_vm_area __maybe_unused;
int err;
if (node == NUMA_NO_NODE)
diff --git a/kernel/futex.c b/kernel/futex.c
index 5cc8083a4c89..054105854e0e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -44,6 +44,7 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/fs.h>
@@ -173,8 +174,10 @@
* double_lock_hb() and double_unlock_hb(), respectively.
*/
-#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
-int __read_mostly futex_cmpxchg_enabled;
+#ifdef CONFIG_HAVE_FUTEX_CMPXCHG
+#define futex_cmpxchg_enabled 1
+#else
+static int __read_mostly futex_cmpxchg_enabled;
#endif
/*
@@ -3417,7 +3420,7 @@ err_unlock:
* Process a futex-list entry, check whether it's owned by the
* dying task, and do notification if so:
*/
-int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
+static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
{
u32 uval, uninitialized_var(nval), mval;
@@ -3612,10 +3615,10 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
- struct timespec __user *, utime, u32 __user *, uaddr2,
+ struct __kernel_timespec __user *, utime, u32 __user *, uaddr2,
u32, val3)
{
- struct timespec ts;
+ struct timespec64 ts;
ktime_t t, *tp = NULL;
u32 val2 = 0;
int cmd = op & FUTEX_CMD_MASK;
@@ -3625,12 +3628,12 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
cmd == FUTEX_WAIT_REQUEUE_PI)) {
if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
return -EFAULT;
- if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
+ if (get_timespec64(&ts, utime))
return -EFAULT;
- if (!timespec_valid(&ts))
+ if (!timespec64_valid(&ts))
return -EINVAL;
- t = timespec_to_ktime(ts);
+ t = timespec64_to_ktime(ts);
if (cmd == FUTEX_WAIT)
t = ktime_add_safe(ktime_get(), t);
tp = &t;
@@ -3646,6 +3649,194 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}
+#ifdef CONFIG_COMPAT
+/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int
+compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
+ compat_uptr_t __user *head, unsigned int *pi)
+{
+ if (get_user(*uentry, head))
+ return -EFAULT;
+
+ *entry = compat_ptr((*uentry) & ~1);
+ *pi = (unsigned int)(*uentry) & 1;
+
+ return 0;
+}
+
+static void __user *futex_uaddr(struct robust_list __user *entry,
+ compat_long_t futex_offset)
+{
+ compat_uptr_t base = ptr_to_compat(entry);
+ void __user *uaddr = compat_ptr(base + futex_offset);
+
+ return uaddr;
+}
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+void compat_exit_robust_list(struct task_struct *curr)
+{
+ struct compat_robust_list_head __user *head = curr->compat_robust_list;
+ struct robust_list __user *entry, *next_entry, *pending;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+ unsigned int uninitialized_var(next_pi);
+ compat_uptr_t uentry, next_uentry, upending;
+ compat_long_t futex_offset;
+ int rc;
+
+ if (!futex_cmpxchg_enabled)
+ return;
+
+ /*
+ * Fetch the list head (which was registered earlier, via
+ * sys_set_robust_list()):
+ */
+ if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
+ return;
+ /*
+ * Fetch the relative futex offset:
+ */
+ if (get_user(futex_offset, &head->futex_offset))
+ return;
+ /*
+ * Fetch any possibly pending lock-add first, and handle it
+ * if it exists:
+ */
+ if (compat_fetch_robust_entry(&upending, &pending,
+ &head->list_op_pending, &pip))
+ return;
+
+ next_entry = NULL; /* avoid warning with gcc */
+ while (entry != (struct robust_list __user *) &head->list) {
+ /*
+ * Fetch the next entry in the list before calling
+ * handle_futex_death:
+ */
+ rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
+ (compat_uptr_t __user *)&entry->next, &next_pi);
+ /*
+ * A pending lock might already be on the list, so
+ * dont process it twice:
+ */
+ if (entry != pending) {
+ void __user *uaddr = futex_uaddr(entry, futex_offset);
+
+ if (handle_futex_death(uaddr, curr, pi))
+ return;
+ }
+ if (rc)
+ return;
+ uentry = next_uentry;
+ entry = next_entry;
+ pi = next_pi;
+ /*
+ * Avoid excessively long or circular lists:
+ */
+ if (!--limit)
+ break;
+
+ cond_resched();
+ }
+ if (pending) {
+ void __user *uaddr = futex_uaddr(pending, futex_offset);
+
+ handle_futex_death(uaddr, curr, pip);
+ }
+}
+
+COMPAT_SYSCALL_DEFINE2(set_robust_list,
+ struct compat_robust_list_head __user *, head,
+ compat_size_t, len)
+{
+ if (!futex_cmpxchg_enabled)
+ return -ENOSYS;
+
+ if (unlikely(len != sizeof(*head)))
+ return -EINVAL;
+
+ current->compat_robust_list = head;
+
+ return 0;
+}
+
+COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
+ compat_uptr_t __user *, head_ptr,
+ compat_size_t __user *, len_ptr)
+{
+ struct compat_robust_list_head __user *head;
+ unsigned long ret;
+ struct task_struct *p;
+
+ if (!futex_cmpxchg_enabled)
+ return -ENOSYS;
+
+ rcu_read_lock();
+
+ ret = -ESRCH;
+ if (!pid)
+ p = current;
+ else {
+ p = find_task_by_vpid(pid);
+ if (!p)
+ goto err_unlock;
+ }
+
+ ret = -EPERM;
+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
+ goto err_unlock;
+
+ head = p->compat_robust_list;
+ rcu_read_unlock();
+
+ if (put_user(sizeof(*head), len_ptr))
+ return -EFAULT;
+ return put_user(ptr_to_compat(head), head_ptr);
+
+err_unlock:
+ rcu_read_unlock();
+
+ return ret;
+}
+#endif /* CONFIG_COMPAT */
+
+#ifdef CONFIG_COMPAT_32BIT_TIME
+COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
+ struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
+ u32, val3)
+{
+ struct timespec64 ts;
+ ktime_t t, *tp = NULL;
+ int val2 = 0;
+ int cmd = op & FUTEX_CMD_MASK;
+
+ if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
+ cmd == FUTEX_WAIT_BITSET ||
+ cmd == FUTEX_WAIT_REQUEUE_PI)) {
+ if (get_old_timespec32(&ts, utime))
+ return -EFAULT;
+ if (!timespec64_valid(&ts))
+ return -EINVAL;
+
+ t = timespec64_to_ktime(ts);
+ if (cmd == FUTEX_WAIT)
+ t = ktime_add_safe(ktime_get(), t);
+ tp = &t;
+ }
+ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
+ cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
+ val2 = (int) (unsigned long) utime;
+
+ return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
+}
+#endif /* CONFIG_COMPAT_32BIT_TIME */
+
static void __init futex_detect_cmpxchg(void)
{
#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
deleted file mode 100644
index 410a77a8f6e2..000000000000
--- a/kernel/futex_compat.c
+++ /dev/null
@@ -1,202 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/kernel/futex_compat.c
- *
- * Futex compatibililty routines.
- *
- * Copyright 2006, Red Hat, Inc., Ingo Molnar
- */
-
-#include <linux/linkage.h>
-#include <linux/compat.h>
-#include <linux/nsproxy.h>
-#include <linux/futex.h>
-#include <linux/ptrace.h>
-#include <linux/syscalls.h>
-
-#include <linux/uaccess.h>
-
-
-/*
- * Fetch a robust-list pointer. Bit 0 signals PI futexes:
- */
-static inline int
-fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
- compat_uptr_t __user *head, unsigned int *pi)
-{
- if (get_user(*uentry, head))
- return -EFAULT;
-
- *entry = compat_ptr((*uentry) & ~1);
- *pi = (unsigned int)(*uentry) & 1;
-
- return 0;
-}
-
-static void __user *futex_uaddr(struct robust_list __user *entry,
- compat_long_t futex_offset)
-{
- compat_uptr_t base = ptr_to_compat(entry);
- void __user *uaddr = compat_ptr(base + futex_offset);
-
- return uaddr;
-}
-
-/*
- * Walk curr->robust_list (very carefully, it's a userspace list!)
- * and mark any locks found there dead, and notify any waiters.
- *
- * We silently return on any sign of list-walking problem.
- */
-void compat_exit_robust_list(struct task_struct *curr)
-{
- struct compat_robust_list_head __user *head = curr->compat_robust_list;
- struct robust_list __user *entry, *next_entry, *pending;
- unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
- unsigned int uninitialized_var(next_pi);
- compat_uptr_t uentry, next_uentry, upending;
- compat_long_t futex_offset;
- int rc;
-
- if (!futex_cmpxchg_enabled)
- return;
-
- /*
- * Fetch the list head (which was registered earlier, via
- * sys_set_robust_list()):
- */
- if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
- return;
- /*
- * Fetch the relative futex offset:
- */
- if (get_user(futex_offset, &head->futex_offset))
- return;
- /*
- * Fetch any possibly pending lock-add first, and handle it
- * if it exists:
- */
- if (fetch_robust_entry(&upending, &pending,
- &head->list_op_pending, &pip))
- return;
-
- next_entry = NULL; /* avoid warning with gcc */
- while (entry != (struct robust_list __user *) &head->list) {
- /*
- * Fetch the next entry in the list before calling
- * handle_futex_death:
- */
- rc = fetch_robust_entry(&next_uentry, &next_entry,
- (compat_uptr_t __user *)&entry->next, &next_pi);
- /*
- * A pending lock might already be on the list, so
- * dont process it twice:
- */
- if (entry != pending) {
- void __user *uaddr = futex_uaddr(entry, futex_offset);
-
- if (handle_futex_death(uaddr, curr, pi))
- return;
- }
- if (rc)
- return;
- uentry = next_uentry;
- entry = next_entry;
- pi = next_pi;
- /*
- * Avoid excessively long or circular lists:
- */
- if (!--limit)
- break;
-
- cond_resched();
- }
- if (pending) {
- void __user *uaddr = futex_uaddr(pending, futex_offset);
-
- handle_futex_death(uaddr, curr, pip);
- }
-}
-
-COMPAT_SYSCALL_DEFINE2(set_robust_list,
- struct compat_robust_list_head __user *, head,
- compat_size_t, len)
-{
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
-
- if (unlikely(len != sizeof(*head)))
- return -EINVAL;
-
- current->compat_robust_list = head;
-
- return 0;
-}
-
-COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
- compat_uptr_t __user *, head_ptr,
- compat_size_t __user *, len_ptr)
-{
- struct compat_robust_list_head __user *head;
- unsigned long ret;
- struct task_struct *p;
-
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
-
- rcu_read_lock();
-
- ret = -ESRCH;
- if (!pid)
- p = current;
- else {
- p = find_task_by_vpid(pid);
- if (!p)
- goto err_unlock;
- }
-
- ret = -EPERM;
- if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
- goto err_unlock;
-
- head = p->compat_robust_list;
- rcu_read_unlock();
-
- if (put_user(sizeof(*head), len_ptr))
- return -EFAULT;
- return put_user(ptr_to_compat(head), head_ptr);
-
-err_unlock:
- rcu_read_unlock();
-
- return ret;
-}
-
-COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
- struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
- u32, val3)
-{
- struct timespec ts;
- ktime_t t, *tp = NULL;
- int val2 = 0;
- int cmd = op & FUTEX_CMD_MASK;
-
- if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
- cmd == FUTEX_WAIT_BITSET ||
- cmd == FUTEX_WAIT_REQUEUE_PI)) {
- if (compat_get_timespec(&ts, utime))
- return -EFAULT;
- if (!timespec_valid(&ts))
- return -EINVAL;
-
- t = timespec_to_ktime(ts);
- if (cmd == FUTEX_WAIT)
- t = ktime_add_safe(ktime_get(), t);
- tp = &t;
- }
- if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
- cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
- val2 = (int) (unsigned long) utime;
-
- return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
-}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 86ef06d3dbe3..d7140447be75 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -152,6 +152,7 @@ int sanity_check_segment_list(struct kimage *image)
int i;
unsigned long nr_segments = image->nr_segments;
unsigned long total_pages = 0;
+ unsigned long nr_pages = totalram_pages();
/*
* Verify we have good destination addresses. The caller is
@@ -217,13 +218,13 @@ int sanity_check_segment_list(struct kimage *image)
* wasted allocating pages, which can cause a soft lockup.
*/
for (i = 0; i < nr_segments; i++) {
- if (PAGE_COUNT(image->segment[i].memsz) > totalram_pages / 2)
+ if (PAGE_COUNT(image->segment[i].memsz) > nr_pages / 2)
return -EINVAL;
total_pages += PAGE_COUNT(image->segment[i].memsz);
}
- if (total_pages > totalram_pages / 2)
+ if (total_pages > nr_pages / 2)
return -EINVAL;
/*
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 9eced2cc9f94..a856cb5ff192 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -11,6 +11,7 @@
#include <linux/types.h>
#include <linux/wait_bit.h>
#include <linux/xarray.h>
+#include <linux/hmm.h>
static DEFINE_XARRAY(pgmap_array);
#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
@@ -24,6 +25,9 @@ vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
pmd_t *pmdp)
{
struct page *page = device_private_entry_to_page(entry);
+ struct hmm_devmem *devmem;
+
+ devmem = container_of(page->pgmap, typeof(*devmem), pagemap);
/*
* The page_fault() callback must migrate page back to system memory
@@ -39,7 +43,7 @@ vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
* There is a more in-depth description of what that callback can and
* cannot do, in include/linux/memremap.h
*/
- return page->pgmap->page_fault(vma, addr, page, flags, pmdp);
+ return devmem->page_fault(vma, addr, page, flags, pmdp);
}
EXPORT_SYMBOL(device_private_entry_fault);
#endif /* CONFIG_DEVICE_PRIVATE */
@@ -87,24 +91,29 @@ static void devm_memremap_pages_release(void *data)
struct resource *res = &pgmap->res;
resource_size_t align_start, align_size;
unsigned long pfn;
+ int nid;
+ pgmap->kill(pgmap->ref);
for_each_device_pfn(pfn, pgmap)
put_page(pfn_to_page(pfn));
- if (percpu_ref_tryget_live(pgmap->ref)) {
- dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
- percpu_ref_put(pgmap->ref);
- }
-
/* pages are dead and unused, undo the arch mapping */
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
- align_start;
+ nid = page_to_nid(pfn_to_page(align_start >> PAGE_SHIFT));
+
mem_hotplug_begin();
- arch_remove_memory(align_start, align_size, pgmap->altmap_valid ?
- &pgmap->altmap : NULL);
- kasan_remove_zero_shadow(__va(align_start), align_size);
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+ pfn = align_start >> PAGE_SHIFT;
+ __remove_pages(page_zone(pfn_to_page(pfn)), pfn,
+ align_size >> PAGE_SHIFT, NULL);
+ } else {
+ arch_remove_memory(nid, align_start, align_size,
+ pgmap->altmap_valid ? &pgmap->altmap : NULL);
+ kasan_remove_zero_shadow(__va(align_start), align_size);
+ }
mem_hotplug_done();
untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
@@ -116,7 +125,7 @@ static void devm_memremap_pages_release(void *data)
/**
* devm_memremap_pages - remap and provide memmap backing for the given resource
* @dev: hosting device for @res
- * @pgmap: pointer to a struct dev_pgmap
+ * @pgmap: pointer to a struct dev_pagemap
*
* Notes:
* 1/ At a minimum the res, ref and type members of @pgmap must be initialized
@@ -125,11 +134,8 @@ static void devm_memremap_pages_release(void *data)
* 2/ The altmap field may optionally be initialized, in which case altmap_valid
* must be set to true
*
- * 3/ pgmap.ref must be 'live' on entry and 'dead' before devm_memunmap_pages()
- * time (or devm release event). The expected order of events is that ref has
- * been through percpu_ref_kill() before devm_memremap_pages_release(). The
- * wait for the completion of all references being dropped and
- * percpu_ref_exit() must occur after devm_memremap_pages_release().
+ * 3/ pgmap->ref must be 'live' on entry and will be killed at
+ * devm_memremap_pages_release() time, or if this routine fails.
*
* 4/ res is expected to be a host memory range that could feasibly be
* treated as a "System RAM" range, i.e. not a device mmio range, but
@@ -145,6 +151,9 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
pgprot_t pgprot = PAGE_KERNEL;
int error, nid, is_ram;
+ if (!pgmap->ref || !pgmap->kill)
+ return ERR_PTR(-EINVAL);
+
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
- align_start;
@@ -167,18 +176,13 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
is_ram = region_intersects(align_start, align_size,
IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
- if (is_ram == REGION_MIXED) {
- WARN_ONCE(1, "%s attempted on mixed region %pr\n",
- __func__, res);
- return ERR_PTR(-ENXIO);
+ if (is_ram != REGION_DISJOINT) {
+ WARN_ONCE(1, "%s attempted on %s region %pr\n", __func__,
+ is_ram == REGION_MIXED ? "mixed" : "ram", res);
+ error = -ENXIO;
+ goto err_array;
}
- if (is_ram == REGION_INTERSECTS)
- return __va(res->start);
-
- if (!pgmap->ref)
- return ERR_PTR(-EINVAL);
-
pgmap->dev = dev;
error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start),
@@ -196,17 +200,40 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
goto err_pfn_remap;
mem_hotplug_begin();
- error = kasan_add_zero_shadow(__va(align_start), align_size);
- if (error) {
- mem_hotplug_done();
- goto err_kasan;
+
+ /*
+ * For device private memory we call add_pages() as we only need to
+ * allocate and initialize struct page for the device memory. More-
+ * over the device memory is un-accessible thus we do not want to
+ * create a linear mapping for the memory like arch_add_memory()
+ * would do.
+ *
+ * For all other device memory types, which are accessible by
+ * the CPU, we do want the linear mapping and thus use
+ * arch_add_memory().
+ */
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+ error = add_pages(nid, align_start >> PAGE_SHIFT,
+ align_size >> PAGE_SHIFT, NULL, false);
+ } else {
+ error = kasan_add_zero_shadow(__va(align_start), align_size);
+ if (error) {
+ mem_hotplug_done();
+ goto err_kasan;
+ }
+
+ error = arch_add_memory(nid, align_start, align_size, altmap,
+ false);
+ }
+
+ if (!error) {
+ struct zone *zone;
+
+ zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
+ move_pfn_range_to_zone(zone, align_start >> PAGE_SHIFT,
+ align_size >> PAGE_SHIFT, altmap);
}
- error = arch_add_memory(nid, align_start, align_size, altmap, false);
- if (!error)
- move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
- align_start >> PAGE_SHIFT,
- align_size >> PAGE_SHIFT, altmap);
mem_hotplug_done();
if (error)
goto err_add_memory;
@@ -220,7 +247,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
align_size >> PAGE_SHIFT, pgmap);
percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap));
- devm_add_action(dev, devm_memremap_pages_release, pgmap);
+ error = devm_add_action_or_reset(dev, devm_memremap_pages_release,
+ pgmap);
+ if (error)
+ return ERR_PTR(error);
return __va(res->start);
@@ -231,9 +261,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
err_pfn_remap:
pgmap_array_delete(res);
err_array:
+ pgmap->kill(pgmap->ref);
return ERR_PTR(error);
}
-EXPORT_SYMBOL(devm_memremap_pages);
+EXPORT_SYMBOL_GPL(devm_memremap_pages);
unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
{
diff --git a/kernel/module.c b/kernel/module.c
index 99b46c32d579..fcbc0128810b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -495,9 +495,9 @@ struct find_symbol_arg {
const struct kernel_symbol *sym;
};
-static bool check_symbol(const struct symsearch *syms,
- struct module *owner,
- unsigned int symnum, void *data)
+static bool check_exported_symbol(const struct symsearch *syms,
+ struct module *owner,
+ unsigned int symnum, void *data)
{
struct find_symbol_arg *fsa = data;
@@ -555,9 +555,9 @@ static int cmp_name(const void *va, const void *vb)
return strcmp(a, kernel_symbol_name(b));
}
-static bool find_symbol_in_section(const struct symsearch *syms,
- struct module *owner,
- void *data)
+static bool find_exported_symbol_in_section(const struct symsearch *syms,
+ struct module *owner,
+ void *data)
{
struct find_symbol_arg *fsa = data;
struct kernel_symbol *sym;
@@ -565,13 +565,14 @@ static bool find_symbol_in_section(const struct symsearch *syms,
sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
sizeof(struct kernel_symbol), cmp_name);
- if (sym != NULL && check_symbol(syms, owner, sym - syms->start, data))
+ if (sym != NULL && check_exported_symbol(syms, owner,
+ sym - syms->start, data))
return true;
return false;
}
-/* Find a symbol and return it, along with, (optional) crc and
+/* Find an exported symbol and return it, along with, (optional) crc and
* (optional) module which owns it. Needs preempt disabled or module_mutex. */
const struct kernel_symbol *find_symbol(const char *name,
struct module **owner,
@@ -585,7 +586,7 @@ const struct kernel_symbol *find_symbol(const char *name,
fsa.gplok = gplok;
fsa.warn = warn;
- if (each_symbol_section(find_symbol_in_section, &fsa)) {
+ if (each_symbol_section(find_exported_symbol_in_section, &fsa)) {
if (owner)
*owner = fsa.owner;
if (crc)
@@ -1207,8 +1208,10 @@ static ssize_t store_uevent(struct module_attribute *mattr,
struct module_kobject *mk,
const char *buffer, size_t count)
{
- kobject_synth_uevent(&mk->kobj, buffer, count);
- return count;
+ int rc;
+
+ rc = kobject_synth_uevent(&mk->kobj, buffer, count);
+ return rc ? rc : count;
}
struct module_attribute module_uevent =
@@ -2198,7 +2201,7 @@ EXPORT_SYMBOL_GPL(__symbol_get);
*
* You must hold the module_mutex.
*/
-static int verify_export_symbols(struct module *mod)
+static int verify_exported_symbols(struct module *mod)
{
unsigned int i;
struct module *owner;
@@ -2519,10 +2522,10 @@ static void free_modinfo(struct module *mod)
#ifdef CONFIG_KALLSYMS
-/* lookup symbol in given range of kernel_symbols */
-static const struct kernel_symbol *lookup_symbol(const char *name,
- const struct kernel_symbol *start,
- const struct kernel_symbol *stop)
+/* Lookup exported symbol in given range of kernel_symbols */
+static const struct kernel_symbol *lookup_exported_symbol(const char *name,
+ const struct kernel_symbol *start,
+ const struct kernel_symbol *stop)
{
return bsearch(name, start, stop - start,
sizeof(struct kernel_symbol), cmp_name);
@@ -2533,9 +2536,10 @@ static int is_exported(const char *name, unsigned long value,
{
const struct kernel_symbol *ks;
if (!mod)
- ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab);
+ ks = lookup_exported_symbol(name, __start___ksymtab, __stop___ksymtab);
else
- ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms);
+ ks = lookup_exported_symbol(name, mod->syms, mod->syms + mod->num_syms);
+
return ks != NULL && kernel_symbol_value(ks) == value;
}
@@ -2682,7 +2686,7 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
/* Set types up while we still have access to sections. */
for (i = 0; i < mod->kallsyms->num_symtab; i++)
- mod->kallsyms->symtab[i].st_info
+ mod->kallsyms->symtab[i].st_size
= elf_type(&mod->kallsyms->symtab[i], info);
/* Now populate the cut down core kallsyms for after init. */
@@ -3093,6 +3097,11 @@ static int find_module_sections(struct module *mod, struct load_info *info)
sizeof(*mod->tracepoints_ptrs),
&mod->num_tracepoints);
#endif
+#ifdef CONFIG_BPF_EVENTS
+ mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map",
+ sizeof(*mod->bpf_raw_events),
+ &mod->num_bpf_raw_events);
+#endif
#ifdef HAVE_JUMP_LABEL
mod->jump_entries = section_objs(info, "__jump_table",
sizeof(*mod->jump_entries),
@@ -3592,7 +3601,7 @@ static int complete_formation(struct module *mod, struct load_info *info)
mutex_lock(&module_mutex);
/* Find duplicate symbols (must be called under lock). */
- err = verify_export_symbols(mod);
+ err = verify_exported_symbols(mod);
if (err < 0)
goto out;
@@ -3911,18 +3920,22 @@ static inline int is_arm_mapping_symbol(const char *str)
&& (str[2] == '\0' || str[2] == '.');
}
-static const char *symname(struct mod_kallsyms *kallsyms, unsigned int symnum)
+static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum)
{
return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
}
-static const char *get_ksymbol(struct module *mod,
- unsigned long addr,
- unsigned long *size,
- unsigned long *offset)
+/*
+ * Given a module and address, find the corresponding symbol and return its name
+ * while providing its size and offset if needed.
+ */
+static const char *find_kallsyms_symbol(struct module *mod,
+ unsigned long addr,
+ unsigned long *size,
+ unsigned long *offset)
{
unsigned int i, best = 0;
- unsigned long nextval;
+ unsigned long nextval, bestval;
struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
/* At worse, next value is at end of module */
@@ -3931,34 +3944,40 @@ static const char *get_ksymbol(struct module *mod,
else
nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size;
+ bestval = kallsyms_symbol_value(&kallsyms->symtab[best]);
+
/* Scan for closest preceding symbol, and next symbol. (ELF
starts real symbols at 1). */
for (i = 1; i < kallsyms->num_symtab; i++) {
- if (kallsyms->symtab[i].st_shndx == SHN_UNDEF)
+ const Elf_Sym *sym = &kallsyms->symtab[i];
+ unsigned long thisval = kallsyms_symbol_value(sym);
+
+ if (sym->st_shndx == SHN_UNDEF)
continue;
/* We ignore unnamed symbols: they're uninformative
* and inserted at a whim. */
- if (*symname(kallsyms, i) == '\0'
- || is_arm_mapping_symbol(symname(kallsyms, i)))
+ if (*kallsyms_symbol_name(kallsyms, i) == '\0'
+ || is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i)))
continue;
- if (kallsyms->symtab[i].st_value <= addr
- && kallsyms->symtab[i].st_value > kallsyms->symtab[best].st_value)
+ if (thisval <= addr && thisval > bestval) {
best = i;
- if (kallsyms->symtab[i].st_value > addr
- && kallsyms->symtab[i].st_value < nextval)
- nextval = kallsyms->symtab[i].st_value;
+ bestval = thisval;
+ }
+ if (thisval > addr && thisval < nextval)
+ nextval = thisval;
}
if (!best)
return NULL;
if (size)
- *size = nextval - kallsyms->symtab[best].st_value;
+ *size = nextval - bestval;
if (offset)
- *offset = addr - kallsyms->symtab[best].st_value;
- return symname(kallsyms, best);
+ *offset = addr - bestval;
+
+ return kallsyms_symbol_name(kallsyms, best);
}
void * __weak dereference_module_function_descriptor(struct module *mod,
@@ -3983,7 +4002,8 @@ const char *module_address_lookup(unsigned long addr,
if (mod) {
if (modname)
*modname = mod->name;
- ret = get_ksymbol(mod, addr, size, offset);
+
+ ret = find_kallsyms_symbol(mod, addr, size, offset);
}
/* Make a copy in here where it's safe */
if (ret) {
@@ -4006,9 +4026,10 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
if (within_module(addr, mod)) {
const char *sym;
- sym = get_ksymbol(mod, addr, NULL, NULL);
+ sym = find_kallsyms_symbol(mod, addr, NULL, NULL);
if (!sym)
goto out;
+
strlcpy(symname, sym, KSYM_NAME_LEN);
preempt_enable();
return 0;
@@ -4031,7 +4052,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
if (within_module(addr, mod)) {
const char *sym;
- sym = get_ksymbol(mod, addr, size, offset);
+ sym = find_kallsyms_symbol(mod, addr, size, offset);
if (!sym)
goto out;
if (modname)
@@ -4060,9 +4081,11 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
continue;
kallsyms = rcu_dereference_sched(mod->kallsyms);
if (symnum < kallsyms->num_symtab) {
- *value = kallsyms->symtab[symnum].st_value;
- *type = kallsyms->symtab[symnum].st_info;
- strlcpy(name, symname(kallsyms, symnum), KSYM_NAME_LEN);
+ const Elf_Sym *sym = &kallsyms->symtab[symnum];
+
+ *value = kallsyms_symbol_value(sym);
+ *type = sym->st_size;
+ strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN);
strlcpy(module_name, mod->name, MODULE_NAME_LEN);
*exported = is_exported(name, *value, mod);
preempt_enable();
@@ -4074,15 +4097,19 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
return -ERANGE;
}
-static unsigned long mod_find_symname(struct module *mod, const char *name)
+/* Given a module and name of symbol, find and return the symbol's value */
+static unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
{
unsigned int i;
struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
- for (i = 0; i < kallsyms->num_symtab; i++)
- if (strcmp(name, symname(kallsyms, i)) == 0 &&
- kallsyms->symtab[i].st_shndx != SHN_UNDEF)
- return kallsyms->symtab[i].st_value;
+ for (i = 0; i < kallsyms->num_symtab; i++) {
+ const Elf_Sym *sym = &kallsyms->symtab[i];
+
+ if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 &&
+ sym->st_shndx != SHN_UNDEF)
+ return kallsyms_symbol_value(sym);
+ }
return 0;
}
@@ -4097,12 +4124,12 @@ unsigned long module_kallsyms_lookup_name(const char *name)
preempt_disable();
if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) {
if ((mod = find_module_all(name, colon - name, false)) != NULL)
- ret = mod_find_symname(mod, colon+1);
+ ret = find_kallsyms_symbol_value(mod, colon+1);
} else {
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if ((ret = mod_find_symname(mod, name)) != 0)
+ if ((ret = find_kallsyms_symbol_value(mod, name)) != 0)
break;
}
}
@@ -4127,12 +4154,13 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
if (mod->state == MODULE_STATE_UNFORMED)
continue;
for (i = 0; i < kallsyms->num_symtab; i++) {
+ const Elf_Sym *sym = &kallsyms->symtab[i];
- if (kallsyms->symtab[i].st_shndx == SHN_UNDEF)
+ if (sym->st_shndx == SHN_UNDEF)
continue;
- ret = fn(data, symname(kallsyms, i),
- mod, kallsyms->symtab[i].st_value);
+ ret = fn(data, kallsyms_symbol_name(kallsyms, i),
+ mod, kallsyms_symbol_value(sym));
if (ret != 0)
return ret;
}
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index f2075ce8e4b3..6b9a926fd86b 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -83,6 +83,7 @@ int mod_verify_sig(const void *mod, struct load_info *info)
}
return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
- NULL, VERIFYING_MODULE_SIGNATURE,
+ VERIFY_USE_SECONDARY_KEYRING,
+ VERIFYING_MODULE_SIGNATURE,
NULL, NULL);
}
diff --git a/kernel/padata.c b/kernel/padata.c
index d568cc56405f..3e2633ae3bca 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -720,7 +720,7 @@ int padata_start(struct padata_instance *pinst)
if (pinst->flags & PADATA_INVALID)
err = -EINVAL;
- __padata_start(pinst);
+ __padata_start(pinst);
mutex_unlock(&pinst->lock);
diff --git a/kernel/panic.c b/kernel/panic.c
index f6d549a29a5c..d10c340c43b0 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -14,6 +14,7 @@
#include <linux/kmsg_dump.h>
#include <linux/kallsyms.h>
#include <linux/notifier.h>
+#include <linux/vt_kern.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/ftrace.h>
@@ -237,7 +238,10 @@ void panic(const char *fmt, ...)
if (_crash_kexec_post_notifiers)
__crash_kexec(NULL);
- bust_spinlocks(0);
+#ifdef CONFIG_VT
+ unblank_screen();
+#endif
+ console_unblank();
/*
* We may have ended up stopping the CPU holding the lock (in
diff --git a/kernel/pid.c b/kernel/pid.c
index b2f6c506035d..20881598bdfa 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -233,8 +233,10 @@ out_unlock:
out_free:
spin_lock_irq(&pidmap_lock);
- while (++i <= ns->level)
- idr_remove(&ns->idr, (pid->numbers + i)->nr);
+ while (++i <= ns->level) {
+ upid = pid->numbers + i;
+ idr_remove(&upid->ns->idr, upid->nr);
+ }
/* On failure to allocate the first pid, reset the state */
if (ns->pid_allocated == PIDNS_ADDING)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index b0308a2c6000..640b2034edd6 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -105,7 +105,7 @@ unsigned long image_size;
void __init hibernate_image_size_init(void)
{
- image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
+ image_size = ((totalram_pages() * 2) / 5) * PAGE_SIZE;
}
/*
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 1b2a029360b7..1306fe0c1dc6 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -403,6 +403,7 @@ DECLARE_WAIT_QUEUE_HEAD(log_wait);
static u64 syslog_seq;
static u32 syslog_idx;
static size_t syslog_partial;
+static bool syslog_time;
/* index and sequence number of the first record stored in the buffer */
static u64 log_first_seq;
@@ -752,6 +753,19 @@ struct devkmsg_user {
char buf[CONSOLE_EXT_LOG_MAX];
};
+static __printf(3, 4) __cold
+int devkmsg_emit(int facility, int level, const char *fmt, ...)
+{
+ va_list args;
+ int r;
+
+ va_start(args, fmt);
+ r = vprintk_emit(facility, level, NULL, 0, fmt, args);
+ va_end(args);
+
+ return r;
+}
+
static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
{
char *buf, *line;
@@ -810,7 +824,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
}
}
- printk_emit(facility, level, NULL, 0, "%s", line);
+ devkmsg_emit(facility, level, "%s", line);
kfree(buf);
return ret;
}
@@ -1213,50 +1227,39 @@ static inline void boot_delay_msec(int level)
static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME);
module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
-static size_t print_time(u64 ts, char *buf)
+static size_t print_syslog(unsigned int level, char *buf)
{
- unsigned long rem_nsec;
-
- if (!printk_time)
- return 0;
-
- rem_nsec = do_div(ts, 1000000000);
+ return sprintf(buf, "<%u>", level);
+}
- if (!buf)
- return snprintf(NULL, 0, "[%5lu.000000] ", (unsigned long)ts);
+static size_t print_time(u64 ts, char *buf)
+{
+ unsigned long rem_nsec = do_div(ts, 1000000000);
return sprintf(buf, "[%5lu.%06lu] ",
(unsigned long)ts, rem_nsec / 1000);
}
-static size_t print_prefix(const struct printk_log *msg, bool syslog, char *buf)
+static size_t print_prefix(const struct printk_log *msg, bool syslog,
+ bool time, char *buf)
{
size_t len = 0;
- unsigned int prefix = (msg->facility << 3) | msg->level;
-
- if (syslog) {
- if (buf) {
- len += sprintf(buf, "<%u>", prefix);
- } else {
- len += 3;
- if (prefix > 999)
- len += 3;
- else if (prefix > 99)
- len += 2;
- else if (prefix > 9)
- len++;
- }
- }
- len += print_time(msg->ts_nsec, buf ? buf + len : NULL);
+ if (syslog)
+ len = print_syslog((msg->facility << 3) | msg->level, buf);
+ if (time)
+ len += print_time(msg->ts_nsec, buf + len);
return len;
}
-static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size)
+static size_t msg_print_text(const struct printk_log *msg, bool syslog,
+ bool time, char *buf, size_t size)
{
const char *text = log_text(msg);
size_t text_size = msg->text_len;
size_t len = 0;
+ char prefix[PREFIX_MAX];
+ const size_t prefix_len = print_prefix(msg, syslog, time, prefix);
do {
const char *next = memchr(text, '\n', text_size);
@@ -1271,19 +1274,17 @@ static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *bu
}
if (buf) {
- if (print_prefix(msg, syslog, NULL) +
- text_len + 1 >= size - len)
+ if (prefix_len + text_len + 1 >= size - len)
break;
- len += print_prefix(msg, syslog, buf + len);
+ memcpy(buf + len, prefix, prefix_len);
+ len += prefix_len;
memcpy(buf + len, text, text_len);
len += text_len;
buf[len++] = '\n';
} else {
/* SYSLOG_ACTION_* buffer size only calculation */
- len += print_prefix(msg, syslog, NULL);
- len += text_len;
- len++;
+ len += prefix_len + text_len + 1;
}
text = next;
@@ -1318,9 +1319,17 @@ static int syslog_print(char __user *buf, int size)
break;
}
+ /*
+ * To keep reading/counting partial line consistent,
+ * use printk_time value as of the beginning of a line.
+ */
+ if (!syslog_partial)
+ syslog_time = printk_time;
+
skip = syslog_partial;
msg = log_from_idx(syslog_idx);
- n = msg_print_text(msg, true, text, LOG_LINE_MAX + PREFIX_MAX);
+ n = msg_print_text(msg, true, syslog_time, text,
+ LOG_LINE_MAX + PREFIX_MAX);
if (n - syslog_partial <= size) {
/* message fits into buffer, move forward */
syslog_idx = log_next(syslog_idx);
@@ -1360,11 +1369,13 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
u64 next_seq;
u64 seq;
u32 idx;
+ bool time;
text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
+ time = printk_time;
logbuf_lock_irq();
/*
* Find first record that fits, including all following records,
@@ -1375,7 +1386,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
while (seq < log_next_seq) {
struct printk_log *msg = log_from_idx(idx);
- len += msg_print_text(msg, true, NULL, 0);
+ len += msg_print_text(msg, true, time, NULL, 0);
idx = log_next(idx);
seq++;
}
@@ -1386,7 +1397,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
while (len > size && seq < log_next_seq) {
struct printk_log *msg = log_from_idx(idx);
- len -= msg_print_text(msg, true, NULL, 0);
+ len -= msg_print_text(msg, true, time, NULL, 0);
idx = log_next(idx);
seq++;
}
@@ -1397,14 +1408,9 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
len = 0;
while (len >= 0 && seq < next_seq) {
struct printk_log *msg = log_from_idx(idx);
- int textlen;
+ int textlen = msg_print_text(msg, true, time, text,
+ LOG_LINE_MAX + PREFIX_MAX);
- textlen = msg_print_text(msg, true, text,
- LOG_LINE_MAX + PREFIX_MAX);
- if (textlen < 0) {
- len = textlen;
- break;
- }
idx = log_next(idx);
seq++;
@@ -1528,11 +1534,14 @@ int do_syslog(int type, char __user *buf, int len, int source)
} else {
u64 seq = syslog_seq;
u32 idx = syslog_idx;
+ bool time = syslog_partial ? syslog_time : printk_time;
while (seq < log_next_seq) {
struct printk_log *msg = log_from_idx(idx);
- error += msg_print_text(msg, true, NULL, 0);
+ error += msg_print_text(msg, true, time, NULL,
+ 0);
+ time = printk_time;
idx = log_next(idx);
seq++;
}
@@ -1935,21 +1944,6 @@ asmlinkage int vprintk(const char *fmt, va_list args)
}
EXPORT_SYMBOL(vprintk);
-asmlinkage int printk_emit(int facility, int level,
- const char *dict, size_t dictlen,
- const char *fmt, ...)
-{
- va_list args;
- int r;
-
- va_start(args, fmt);
- r = vprintk_emit(facility, level, dict, dictlen, fmt, args);
- va_end(args);
-
- return r;
-}
-EXPORT_SYMBOL(printk_emit);
-
int vprintk_default(const char *fmt, va_list args)
{
int r;
@@ -2005,6 +1999,7 @@ EXPORT_SYMBOL(printk);
#define LOG_LINE_MAX 0
#define PREFIX_MAX 0
+#define printk_time false
static u64 syslog_seq;
static u32 syslog_idx;
@@ -2028,8 +2023,8 @@ static void console_lock_spinning_enable(void) { }
static int console_lock_spinning_disable_and_check(void) { return 0; }
static void call_console_drivers(const char *ext_text, size_t ext_len,
const char *text, size_t len) {}
-static size_t msg_print_text(const struct printk_log *msg,
- bool syslog, char *buf, size_t size) { return 0; }
+static size_t msg_print_text(const struct printk_log *msg, bool syslog,
+ bool time, char *buf, size_t size) { return 0; }
static bool suppress_message_printing(int level) { return false; }
#endif /* CONFIG_PRINTK */
@@ -2387,8 +2382,7 @@ skip:
len += msg_print_text(msg,
console_msg_format & MSG_FORMAT_SYSLOG,
- text + len,
- sizeof(text) - len);
+ printk_time, text + len, sizeof(text) - len);
if (nr_ext_console_drivers) {
ext_len = msg_print_ext_header(ext_text,
sizeof(ext_text),
@@ -3112,7 +3106,7 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
goto out;
msg = log_from_idx(dumper->cur_idx);
- l = msg_print_text(msg, syslog, line, size);
+ l = msg_print_text(msg, syslog, printk_time, line, size);
dumper->cur_idx = log_next(dumper->cur_idx);
dumper->cur_seq++;
@@ -3183,6 +3177,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
u32 next_idx;
size_t l = 0;
bool ret = false;
+ bool time = printk_time;
if (!dumper->active)
goto out;
@@ -3206,7 +3201,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
while (seq < dumper->next_seq) {
struct printk_log *msg = log_from_idx(idx);
- l += msg_print_text(msg, true, NULL, 0);
+ l += msg_print_text(msg, true, time, NULL, 0);
idx = log_next(idx);
seq++;
}
@@ -3217,7 +3212,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
while (l > size && seq < dumper->next_seq) {
struct printk_log *msg = log_from_idx(idx);
- l -= msg_print_text(msg, true, NULL, 0);
+ l -= msg_print_text(msg, true, time, NULL, 0);
idx = log_next(idx);
seq++;
}
@@ -3230,7 +3225,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
while (seq < dumper->next_seq) {
struct printk_log *msg = log_from_idx(idx);
- l += msg_print_text(msg, syslog, buf + l, size - l);
+ l += msg_print_text(msg, syslog, time, buf + l, size - l);
idx = log_next(idx);
seq++;
}
diff --git a/kernel/resource.c b/kernel/resource.c
index b0fbf685c77a..915c02e8e5dd 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1256,6 +1256,21 @@ int release_mem_region_adjustable(struct resource *parent,
continue;
}
+ /*
+ * All memory regions added from memory-hotplug path have the
+ * flag IORESOURCE_SYSTEM_RAM. If the resource does not have
+ * this flag, we know that we are dealing with a resource coming
+ * from HMM/devm. HMM/devm use another mechanism to add/release
+ * a resource. This goes via devm_request_mem_region and
+ * devm_release_mem_region.
+ * HMM/devm take care to release their resources when they want,
+ * so if we are dealing with them, let us just back off here.
+ */
+ if (!(res->flags & IORESOURCE_SYSRAM)) {
+ ret = 0;
+ break;
+ }
+
if (!(res->flags & IORESOURCE_MEM))
break;
diff --git a/kernel/signal.c b/kernel/signal.c
index 9a32bc2088c9..53e07d97ffe0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2735,6 +2735,84 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
}
EXPORT_SYMBOL(sigprocmask);
+/*
+ * The api helps set app-provided sigmasks.
+ *
+ * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
+ * epoll_pwait where a new sigmask is passed from userland for the syscalls.
+ */
+int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
+ sigset_t *oldset, size_t sigsetsize)
+{
+ if (!usigmask)
+ return 0;
+
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+ if (copy_from_user(set, usigmask, sizeof(sigset_t)))
+ return -EFAULT;
+
+ *oldset = current->blocked;
+ set_current_blocked(set);
+
+ return 0;
+}
+EXPORT_SYMBOL(set_user_sigmask);
+
+#ifdef CONFIG_COMPAT
+int set_compat_user_sigmask(const compat_sigset_t __user *usigmask,
+ sigset_t *set, sigset_t *oldset,
+ size_t sigsetsize)
+{
+ if (!usigmask)
+ return 0;
+
+ if (sigsetsize != sizeof(compat_sigset_t))
+ return -EINVAL;
+ if (get_compat_sigset(set, usigmask))
+ return -EFAULT;
+
+ *oldset = current->blocked;
+ set_current_blocked(set);
+
+ return 0;
+}
+EXPORT_SYMBOL(set_compat_user_sigmask);
+#endif
+
+/*
+ * restore_user_sigmask:
+ * usigmask: sigmask passed in from userland.
+ * sigsaved: saved sigmask when the syscall started and changed the sigmask to
+ * usigmask.
+ *
+ * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
+ * epoll_pwait where a new sigmask is passed in from userland for the syscalls.
+ */
+void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
+{
+
+ if (!usigmask)
+ return;
+ /*
+ * When signals are pending, do not restore them here.
+ * Restoring sigmask here can lead to delivering signals that the above
+ * syscalls are intended to block because of the sigmask passed in.
+ */
+ if (signal_pending(current)) {
+ current->saved_sigmask = *sigsaved;
+ set_restore_sigmask();
+ return;
+ }
+
+ /*
+ * This is needed because the fast syscall return path does not restore
+ * saved_sigmask when signals are not pending.
+ */
+ set_current_blocked(sigsaved);
+}
+EXPORT_SYMBOL(restore_user_sigmask);
+
/**
* sys_rt_sigprocmask - change the list of currently blocked signals
* @how: whether to add, remove, or set signals
@@ -3254,7 +3332,71 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
return ret;
}
+#ifdef CONFIG_COMPAT_32BIT_TIME
+SYSCALL_DEFINE4(rt_sigtimedwait_time32, const sigset_t __user *, uthese,
+ siginfo_t __user *, uinfo,
+ const struct old_timespec32 __user *, uts,
+ size_t, sigsetsize)
+{
+ sigset_t these;
+ struct timespec64 ts;
+ kernel_siginfo_t info;
+ int ret;
+
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+
+ if (copy_from_user(&these, uthese, sizeof(these)))
+ return -EFAULT;
+
+ if (uts) {
+ if (get_old_timespec32(&ts, uts))
+ return -EFAULT;
+ }
+
+ ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL);
+
+ if (ret > 0 && uinfo) {
+ if (copy_siginfo_to_user(uinfo, &info))
+ ret = -EFAULT;
+ }
+
+ return ret;
+}
+#endif
+
#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time64, compat_sigset_t __user *, uthese,
+ struct compat_siginfo __user *, uinfo,
+ struct __kernel_timespec __user *, uts, compat_size_t, sigsetsize)
+{
+ sigset_t s;
+ struct timespec64 t;
+ kernel_siginfo_t info;
+ long ret;
+
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+
+ if (get_compat_sigset(&s, uthese))
+ return -EFAULT;
+
+ if (uts) {
+ if (get_timespec64(&t, uts))
+ return -EFAULT;
+ }
+
+ ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
+
+ if (ret > 0 && uinfo) {
+ if (copy_siginfo_to_user32(uinfo, &info))
+ ret = -EFAULT;
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT_32BIT_TIME
COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
struct compat_siginfo __user *, uinfo,
struct old_timespec32 __user *, uts, compat_size_t, sigsetsize)
@@ -3285,6 +3427,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
return ret;
}
#endif
+#endif
/**
* sys_kill - send a signal to a process
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index df556175be50..ab9d0e3c6d50 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -284,7 +284,9 @@ COND_SYSCALL_COMPAT(move_pages);
COND_SYSCALL(perf_event_open);
COND_SYSCALL(accept4);
COND_SYSCALL(recvmmsg);
+COND_SYSCALL(recvmmsg_time32);
COND_SYSCALL_COMPAT(recvmmsg);
+COND_SYSCALL_COMPAT(recvmmsg_time64);
/*
* Architecture specific syscalls: see further below
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5fc724e4e454..1825f712e73b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1463,6 +1463,14 @@ static struct ctl_table vm_table[] = {
.extra1 = &zero,
},
{
+ .procname = "watermark_boost_factor",
+ .data = &watermark_boost_factor,
+ .maxlen = sizeof(watermark_boost_factor),
+ .mode = 0644,
+ .proc_handler = watermark_boost_factor_sysctl_handler,
+ .extra1 = &zero,
+ },
+ {
.procname = "watermark_scale_factor",
.data = &watermark_scale_factor,
.maxlen = sizeof(watermark_scale_factor),
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index bc3a3c37ec9c..36a2bef00125 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -554,17 +554,9 @@ static void sync_rtc_clock(void)
}
#ifdef CONFIG_GENERIC_CMOS_UPDATE
-int __weak update_persistent_clock(struct timespec now)
-{
- return -ENODEV;
-}
-
int __weak update_persistent_clock64(struct timespec64 now64)
{
- struct timespec now;
-
- now = timespec64_to_timespec(now64);
- return update_persistent_clock(now);
+ return -ENODEV;
}
#endif
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 5aa0a156e331..2edb5088a70b 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -384,42 +384,6 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0,
EXPORT_SYMBOL(mktime64);
/**
- * set_normalized_timespec - set timespec sec and nsec parts and normalize
- *
- * @ts: pointer to timespec variable to be set
- * @sec: seconds to set
- * @nsec: nanoseconds to set
- *
- * Set seconds and nanoseconds field of a timespec variable and
- * normalize to the timespec storage format
- *
- * Note: The tv_nsec part is always in the range of
- * 0 <= tv_nsec < NSEC_PER_SEC
- * For negative values only the tv_sec field is negative !
- */
-void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec)
-{
- while (nsec >= NSEC_PER_SEC) {
- /*
- * The following asm() prevents the compiler from
- * optimising this loop into a modulo operation. See
- * also __iter_div_u64_rem() in include/linux/time.h
- */
- asm("" : "+rm"(nsec));
- nsec -= NSEC_PER_SEC;
- ++sec;
- }
- while (nsec < 0) {
- asm("" : "+rm"(nsec));
- nsec += NSEC_PER_SEC;
- --sec;
- }
- ts->tv_sec = sec;
- ts->tv_nsec = nsec;
-}
-EXPORT_SYMBOL(set_normalized_timespec);
-
-/**
* ns_to_timespec - Convert nanoseconds to timespec
* @nsec: the nanoseconds value to be converted
*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index c801e25875a3..ac5dbf2cd4a2 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1464,7 +1464,7 @@ u64 timekeeping_max_deferment(void)
}
/**
- * read_persistent_clock - Return time from the persistent clock.
+ * read_persistent_clock64 - Return time from the persistent clock.
*
* Weak dummy function for arches that do not yet support it.
* Reads the time from the battery backed persistent clock.
@@ -1472,20 +1472,12 @@ u64 timekeeping_max_deferment(void)
*
* XXX - Do be sure to remove it once all arches implement it.
*/
-void __weak read_persistent_clock(struct timespec *ts)
+void __weak read_persistent_clock64(struct timespec64 *ts)
{
ts->tv_sec = 0;
ts->tv_nsec = 0;
}
-void __weak read_persistent_clock64(struct timespec64 *ts64)
-{
- struct timespec ts;
-
- read_persistent_clock(&ts);
- *ts64 = timespec_to_timespec64(ts);
-}
-
/**
* read_persistent_wall_and_boot_offset - Read persistent clock, and also offset
* from the boot.
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 2868d85f1fb1..fac0ddf8a8e2 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -764,9 +764,9 @@ blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
return NULL;
- if (!bio->bi_css)
+ if (!bio->bi_blkg)
return NULL;
- return cgroup_get_kernfs_id(bio->bi_css->cgroup);
+ return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup);
}
#else
static union kernfs_node_id *
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 9864a35c8bb5..9ddb6fddb4e0 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -17,6 +17,43 @@
#include "trace_probe.h"
#include "trace.h"
+#ifdef CONFIG_MODULES
+struct bpf_trace_module {
+ struct module *module;
+ struct list_head list;
+};
+
+static LIST_HEAD(bpf_trace_modules);
+static DEFINE_MUTEX(bpf_module_mutex);
+
+static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name)
+{
+ struct bpf_raw_event_map *btp, *ret = NULL;
+ struct bpf_trace_module *btm;
+ unsigned int i;
+
+ mutex_lock(&bpf_module_mutex);
+ list_for_each_entry(btm, &bpf_trace_modules, list) {
+ for (i = 0; i < btm->module->num_bpf_raw_events; ++i) {
+ btp = &btm->module->bpf_raw_events[i];
+ if (!strcmp(btp->tp->name, name)) {
+ if (try_module_get(btm->module))
+ ret = btp;
+ goto out;
+ }
+ }
+ }
+out:
+ mutex_unlock(&bpf_module_mutex);
+ return ret;
+}
+#else
+static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name)
+{
+ return NULL;
+}
+#endif /* CONFIG_MODULES */
+
u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
@@ -1076,7 +1113,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
extern struct bpf_raw_event_map __start__bpf_raw_tp[];
extern struct bpf_raw_event_map __stop__bpf_raw_tp[];
-struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name)
+struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name)
{
struct bpf_raw_event_map *btp = __start__bpf_raw_tp;
@@ -1084,7 +1121,16 @@ struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name)
if (!strcmp(btp->tp->name, name))
return btp;
}
- return NULL;
+
+ return bpf_get_raw_tracepoint_module(name);
+}
+
+void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
+{
+ struct module *mod = __module_address((unsigned long)btp);
+
+ if (mod)
+ module_put(mod);
}
static __always_inline
@@ -1222,3 +1268,52 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
return err;
}
+
+#ifdef CONFIG_MODULES
+int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module)
+{
+ struct bpf_trace_module *btm, *tmp;
+ struct module *mod = module;
+
+ if (mod->num_bpf_raw_events == 0 ||
+ (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING))
+ return 0;
+
+ mutex_lock(&bpf_module_mutex);
+
+ switch (op) {
+ case MODULE_STATE_COMING:
+ btm = kzalloc(sizeof(*btm), GFP_KERNEL);
+ if (btm) {
+ btm->module = module;
+ list_add(&btm->list, &bpf_trace_modules);
+ }
+ break;
+ case MODULE_STATE_GOING:
+ list_for_each_entry_safe(btm, tmp, &bpf_trace_modules, list) {
+ if (btm->module == module) {
+ list_del(&btm->list);
+ kfree(btm);
+ break;
+ }
+ }
+ break;
+ }
+
+ mutex_unlock(&bpf_module_mutex);
+
+ return 0;
+}
+
+static struct notifier_block bpf_module_nb = {
+ .notifier_call = bpf_event_notify,
+};
+
+int __init bpf_event_init(void)
+{
+ register_module_notifier(&bpf_module_nb);
+ return 0;
+}
+
+fs_initcall(bpf_event_init);
+#endif /* CONFIG_MODULES */
OpenPOWER on IntegriCloud