summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/async.c1
-rw-r--r--kernel/audit.c19
-rw-r--r--kernel/audit.h2
-rw-r--r--kernel/auditsc.c6
-rw-r--r--kernel/cgroup.c81
-rw-r--r--kernel/cgroup_freezer.c2
-rw-r--r--kernel/cgroup_pids.c6
-rw-r--r--kernel/cpuset.c12
-rw-r--r--kernel/cred.c4
-rw-r--r--kernel/debug/kdb/kdb_main.c4
-rw-r--r--kernel/delayacct.c2
-rw-r--r--kernel/events/uprobes.c13
-rw-r--r--kernel/fork.c34
-rw-r--r--kernel/futex.c65
-rw-r--r--kernel/gcov/base.c7
-rw-r--r--kernel/livepatch/core.c176
-rw-r--r--kernel/memremap.c219
-rw-r--r--kernel/module.c349
-rw-r--r--kernel/panic.c3
-rw-r--r--kernel/pid.c2
-rw-r--r--kernel/power/main.c17
-rw-r--r--kernel/power/power.h9
-rw-r--r--kernel/printk/printk.c67
-rw-r--r--kernel/resource.c11
-rw-r--r--kernel/sched/core.c2
-rw-r--r--kernel/sched/idle.c1
-rw-r--r--kernel/stop_machine.c4
-rw-r--r--kernel/sysctl.c22
-rw-r--r--kernel/trace/bpf_trace.c2
-rw-r--r--kernel/trace/ftrace.c451
-rw-r--r--kernel/trace/ring_buffer.c57
-rw-r--r--kernel/trace/trace.h6
-rw-r--r--kernel/trace/trace_event_perf.c2
-rw-r--r--kernel/trace/trace_events_trigger.c10
34 files changed, 952 insertions, 716 deletions
diff --git a/kernel/async.c b/kernel/async.c
index 4c3773c0bf63..d2edd6efec56 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -326,3 +326,4 @@ bool current_is_async(void)
return worker && worker->current_func == async_run_entry_fn;
}
+EXPORT_SYMBOL_GPL(current_is_async);
diff --git a/kernel/audit.c b/kernel/audit.c
index 5ffcbd354a52..3a3e5deeda8d 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -110,7 +110,6 @@ static u32 audit_backlog_limit = 64;
#define AUDIT_BACKLOG_WAIT_TIME (60 * HZ)
static u32 audit_backlog_wait_time_master = AUDIT_BACKLOG_WAIT_TIME;
static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
-static u32 audit_backlog_wait_overflow = 0;
/* The identity of the user shutting down the audit system. */
kuid_t audit_sig_uid = INVALID_UID;
@@ -509,8 +508,7 @@ static void flush_hold_queue(void)
* if auditd just disappeared but we
* dequeued an skb we need to drop ref
*/
- if (skb)
- consume_skb(skb);
+ consume_skb(skb);
}
static int kauditd_thread(void *dummy)
@@ -524,7 +522,8 @@ static int kauditd_thread(void *dummy)
skb = skb_dequeue(&audit_skb_queue);
if (skb) {
- if (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit)
+ if (!audit_backlog_limit ||
+ (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit))
wake_up(&audit_backlog_wait);
if (audit_pid)
kauditd_send_skb(skb);
@@ -1232,9 +1231,7 @@ static void audit_buffer_free(struct audit_buffer *ab)
if (!ab)
return;
- if (ab->skb)
- kfree_skb(ab->skb);
-
+ kfree_skb(ab->skb);
spin_lock_irqsave(&audit_freelist_lock, flags);
if (audit_freelist_count > AUDIT_MAXFREE)
kfree(ab);
@@ -1372,7 +1369,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
return NULL;
if (gfp_mask & __GFP_DIRECT_RECLAIM) {
- if (audit_pid && audit_pid == current->pid)
+ if (audit_pid && audit_pid == current->tgid)
gfp_mask &= ~__GFP_DIRECT_RECLAIM;
else
reserve = 0;
@@ -1395,12 +1392,12 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
skb_queue_len(&audit_skb_queue),
audit_backlog_limit);
audit_log_lost("backlog limit exceeded");
- audit_backlog_wait_time = audit_backlog_wait_overflow;
+ audit_backlog_wait_time = 0;
wake_up(&audit_backlog_wait);
return NULL;
}
- if (!reserve)
+ if (!reserve && !audit_backlog_wait_time)
audit_backlog_wait_time = audit_backlog_wait_time_master;
ab = audit_buffer_alloc(ctx, gfp_mask, type);
@@ -1722,7 +1719,7 @@ static inline int audit_copy_fcaps(struct audit_names *name,
/* Copy inode data into an audit_names. */
void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
- const struct inode *inode)
+ struct inode *inode)
{
name->ino = inode->i_ino;
name->dev = inode->i_sb->s_dev;
diff --git a/kernel/audit.h b/kernel/audit.h
index de6cbb7cf547..cbbe6bb6496e 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -207,7 +207,7 @@ extern u32 audit_ever_enabled;
extern void audit_copy_inode(struct audit_names *name,
const struct dentry *dentry,
- const struct inode *inode);
+ struct inode *inode);
extern void audit_log_cap(struct audit_buffer *ab, char *prefix,
kernel_cap_t *cap);
extern void audit_log_name(struct audit_context *context,
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b86cc04959de..195ffaee50b9 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1754,7 +1754,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
unsigned int flags)
{
struct audit_context *context = current->audit_context;
- const struct inode *inode = d_backing_inode(dentry);
+ struct inode *inode = d_backing_inode(dentry);
struct audit_names *n;
bool parent = flags & AUDIT_INODE_PARENT;
@@ -1848,12 +1848,12 @@ void __audit_file(const struct file *file)
* must be hooked prior, in order to capture the target inode during
* unsuccessful attempts.
*/
-void __audit_inode_child(const struct inode *parent,
+void __audit_inode_child(struct inode *parent,
const struct dentry *dentry,
const unsigned char type)
{
struct audit_context *context = current->audit_context;
- const struct inode *inode = d_backing_inode(dentry);
+ struct inode *inode = d_backing_inode(dentry);
const char *dname = dentry->d_name.name;
struct audit_names *n, *found_parent = NULL, *found_child = NULL;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index fe95970b1f79..c03a640ef6da 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -211,6 +211,7 @@ static unsigned long have_free_callback __read_mostly;
/* Ditto for the can_fork callback. */
static unsigned long have_canfork_callback __read_mostly;
+static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];
@@ -1623,10 +1624,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
all_ss = true;
continue;
}
- if (!strcmp(token, "__DEVEL__sane_behavior")) {
- opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
- continue;
- }
if (!strcmp(token, "noprefix")) {
opts->flags |= CGRP_ROOT_NOPREFIX;
continue;
@@ -1693,15 +1690,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
return -ENOENT;
}
- if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
- pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
- if (nr_opts != 1) {
- pr_err("sane_behavior: no other mount options allowed\n");
- return -EINVAL;
- }
- return 0;
- }
-
/*
* If the 'all' option was specified select all the subsystems,
* otherwise if 'none', 'name=' and a subsystem name options were
@@ -1981,6 +1969,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
void *data)
{
+ bool is_v2 = fs_type == &cgroup2_fs_type;
struct super_block *pinned_sb = NULL;
struct cgroup_subsys *ss;
struct cgroup_root *root;
@@ -1997,6 +1986,17 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
if (!use_task_css_set_links)
cgroup_enable_task_cg_lists();
+ if (is_v2) {
+ if (data) {
+ pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
+ return ERR_PTR(-EINVAL);
+ }
+ cgrp_dfl_root_visible = true;
+ root = &cgrp_dfl_root;
+ cgroup_get(&root->cgrp);
+ goto out_mount;
+ }
+
mutex_lock(&cgroup_mutex);
/* First find the desired set of subsystems */
@@ -2004,15 +2004,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
if (ret)
goto out_unlock;
- /* look for a matching existing root */
- if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
- cgrp_dfl_root_visible = true;
- root = &cgrp_dfl_root;
- cgroup_get(&root->cgrp);
- ret = 0;
- goto out_unlock;
- }
-
/*
* Destruction of cgroup root is asynchronous, so subsystems may
* still be dying after the previous unmount. Let's drain the
@@ -2123,9 +2114,10 @@ out_free:
if (ret)
return ERR_PTR(ret);
-
+out_mount:
dentry = kernfs_mount(fs_type, flags, root->kf_root,
- CGROUP_SUPER_MAGIC, &new_sb);
+ is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
+ &new_sb);
if (IS_ERR(dentry) || !new_sb)
cgroup_put(&root->cgrp);
@@ -2168,6 +2160,12 @@ static struct file_system_type cgroup_fs_type = {
.kill_sb = cgroup_kill_sb,
};
+static struct file_system_type cgroup2_fs_type = {
+ .name = "cgroup2",
+ .mount = cgroup_mount,
+ .kill_sb = cgroup_kill_sb,
+};
+
/**
* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
* @task: target task
@@ -4039,7 +4037,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
goto out_err;
/*
- * Migrate tasks one-by-one until @form is empty. This fails iff
+ * Migrate tasks one-by-one until @from is empty. This fails iff
* ->can_attach() fails.
*/
do {
@@ -5171,7 +5169,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
{
struct cgroup_subsys_state *css;
- printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
+ pr_debug("Initializing cgroup subsys %s\n", ss->name);
mutex_lock(&cgroup_mutex);
@@ -5329,6 +5327,7 @@ int __init cgroup_init(void)
WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
WARN_ON(register_filesystem(&cgroup_fs_type));
+ WARN_ON(register_filesystem(&cgroup2_fs_type));
WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
return 0;
@@ -5472,19 +5471,6 @@ static const struct file_operations proc_cgroupstats_operations = {
.release = single_release,
};
-static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
-{
- if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END)
- return &ss_priv[i - CGROUP_CANFORK_START];
- return NULL;
-}
-
-static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
-{
- void **private = subsys_canfork_priv_p(ss_priv, i);
- return private ? *private : NULL;
-}
-
/**
* cgroup_fork - initialize cgroup related fields during copy_process()
* @child: pointer to task_struct of forking parent process.
@@ -5507,14 +5493,13 @@ void cgroup_fork(struct task_struct *child)
* returns an error, the fork aborts with that error code. This allows for
* a cgroup subsystem to conditionally allow or deny new forks.
*/
-int cgroup_can_fork(struct task_struct *child,
- void *ss_priv[CGROUP_CANFORK_COUNT])
+int cgroup_can_fork(struct task_struct *child)
{
struct cgroup_subsys *ss;
int i, j, ret;
for_each_subsys_which(ss, i, &have_canfork_callback) {
- ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i));
+ ret = ss->can_fork(child);
if (ret)
goto out_revert;
}
@@ -5526,7 +5511,7 @@ out_revert:
if (j >= i)
break;
if (ss->cancel_fork)
- ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j));
+ ss->cancel_fork(child);
}
return ret;
@@ -5539,15 +5524,14 @@ out_revert:
* This calls the cancel_fork() callbacks if a fork failed *after*
* cgroup_can_fork() succeded.
*/
-void cgroup_cancel_fork(struct task_struct *child,
- void *ss_priv[CGROUP_CANFORK_COUNT])
+void cgroup_cancel_fork(struct task_struct *child)
{
struct cgroup_subsys *ss;
int i;
for_each_subsys(ss, i)
if (ss->cancel_fork)
- ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i));
+ ss->cancel_fork(child);
}
/**
@@ -5560,8 +5544,7 @@ void cgroup_cancel_fork(struct task_struct *child,
* cgroup_task_iter_start() - to guarantee that the new task ends up on its
* list.
*/
-void cgroup_post_fork(struct task_struct *child,
- void *old_ss_priv[CGROUP_CANFORK_COUNT])
+void cgroup_post_fork(struct task_struct *child)
{
struct cgroup_subsys *ss;
int i;
@@ -5605,7 +5588,7 @@ void cgroup_post_fork(struct task_struct *child,
* and addition to css_set.
*/
for_each_subsys_which(ss, i, &have_fork_callback)
- ss->fork(child, subsys_canfork_priv(old_ss_priv, i));
+ ss->fork(child);
}
/**
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 2d3df82c54f2..1b72d56edce5 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -200,7 +200,7 @@ static void freezer_attach(struct cgroup_taskset *tset)
* to do anything as freezer_attach() will put @task into the appropriate
* state.
*/
-static void freezer_fork(struct task_struct *task, void *private)
+static void freezer_fork(struct task_struct *task)
{
struct freezer *freezer;
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index b50d5a167fda..303097b37429 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -134,7 +134,7 @@ static void pids_charge(struct pids_cgroup *pids, int num)
*
* This function follows the set limit. It will fail if the charge would cause
* the new value to exceed the hierarchical limit. Returns 0 if the charge
- * succeded, otherwise -EAGAIN.
+ * succeeded, otherwise -EAGAIN.
*/
static int pids_try_charge(struct pids_cgroup *pids, int num)
{
@@ -209,7 +209,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
* task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
* on threadgroup_change_begin() held by the copy_process().
*/
-static int pids_can_fork(struct task_struct *task, void **priv_p)
+static int pids_can_fork(struct task_struct *task)
{
struct cgroup_subsys_state *css;
struct pids_cgroup *pids;
@@ -219,7 +219,7 @@ static int pids_can_fork(struct task_struct *task, void **priv_p)
return pids_try_charge(pids, 1);
}
-static void pids_cancel_fork(struct task_struct *task, void *priv)
+static void pids_cancel_fork(struct task_struct *task)
{
struct cgroup_subsys_state *css;
struct pids_cgroup *pids;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 02a8ea5c9963..3e945fcd8179 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -51,6 +51,7 @@
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
+#include <linux/time64.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
@@ -68,7 +69,7 @@ struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
struct fmeter {
int cnt; /* unprocessed events count */
int val; /* most recent output value */
- time_t time; /* clock (secs) when val computed */
+ time64_t time; /* clock (secs) when val computed */
spinlock_t lock; /* guards read or write of above */
};
@@ -1374,7 +1375,7 @@ out:
*/
#define FM_COEF 933 /* coefficient for half-life of 10 secs */
-#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
+#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */
#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
#define FM_SCALE 1000 /* faux fixed point scale */
@@ -1390,8 +1391,11 @@ static void fmeter_init(struct fmeter *fmp)
/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
- time_t now = get_seconds();
- time_t ticks = now - fmp->time;
+ time64_t now;
+ u32 ticks;
+
+ now = ktime_get_seconds();
+ ticks = now - fmp->time;
if (ticks == 0)
return;
diff --git a/kernel/cred.c b/kernel/cred.c
index 71179a09c1d6..0c0cd8a62285 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -569,8 +569,8 @@ EXPORT_SYMBOL(revert_creds);
void __init cred_init(void)
{
/* allocate a slab in which we can store credentials */
- cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred),
- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
}
/**
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 4121345498e0..2a20c0dfdafc 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2021,7 +2021,7 @@ static int kdb_lsmod(int argc, const char **argv)
continue;
kdb_printf("%-20s%8u 0x%p ", mod->name,
- mod->core_size, (void *)mod);
+ mod->core_layout.size, (void *)mod);
#ifdef CONFIG_MODULE_UNLOAD
kdb_printf("%4d ", module_refcount(mod));
#endif
@@ -2031,7 +2031,7 @@ static int kdb_lsmod(int argc, const char **argv)
kdb_printf(" (Loading)");
else
kdb_printf(" (Live)");
- kdb_printf(" 0x%p", mod->module_core);
+ kdb_printf(" 0x%p", mod->core_layout.base);
#ifdef CONFIG_MODULE_UNLOAD
{
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index ef90b04d783f..435c14a45118 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -34,7 +34,7 @@ __setup("nodelayacct", delayacct_setup_disable);
void delayacct_init(void)
{
- delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC);
+ delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT);
delayacct_tsk_init(&init_task);
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 7dad84913abf..0167679182c0 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
const unsigned long mmun_end = addr + PAGE_SIZE;
struct mem_cgroup *memcg;
- err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg);
+ err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg,
+ false);
if (err)
return err;
@@ -175,12 +176,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
goto unlock;
get_page(kpage);
- page_add_new_anon_rmap(kpage, vma, addr);
- mem_cgroup_commit_charge(kpage, memcg, false);
+ page_add_new_anon_rmap(kpage, vma, addr, false);
+ mem_cgroup_commit_charge(kpage, memcg, false, false);
lru_cache_add_active_or_unevictable(kpage, vma);
if (!PageAnon(page)) {
- dec_mm_counter(mm, MM_FILEPAGES);
+ dec_mm_counter(mm, mm_counter_file(page));
inc_mm_counter(mm, MM_ANONPAGES);
}
@@ -188,7 +189,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
ptep_clear_flush_notify(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
- page_remove_rmap(page);
+ page_remove_rmap(page, false);
if (!page_mapped(page))
try_to_free_swap(page);
pte_unmap_unlock(ptep, ptl);
@@ -199,7 +200,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
err = 0;
unlock:
- mem_cgroup_cancel_charge(kpage, memcg);
+ mem_cgroup_cancel_charge(kpage, memcg, false);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
unlock_page(page);
return err;
diff --git a/kernel/fork.c b/kernel/fork.c
index 291b08cc817b..2e391c754ae7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -300,9 +300,9 @@ void __init fork_init(void)
#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
#endif
/* create a slab on which task_structs can be allocated */
- task_struct_cachep =
- kmem_cache_create("task_struct", arch_task_struct_size,
- ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
+ task_struct_cachep = kmem_cache_create("task_struct",
+ arch_task_struct_size, ARCH_MIN_TASKALIGN,
+ SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL);
#endif
/* do the arch specific task caches init */
@@ -414,7 +414,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
mm->total_vm = oldmm->total_vm;
- mm->shared_vm = oldmm->shared_vm;
+ mm->data_vm = oldmm->data_vm;
mm->exec_vm = oldmm->exec_vm;
mm->stack_vm = oldmm->stack_vm;
@@ -433,8 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
struct file *file;
if (mpnt->vm_flags & VM_DONTCOPY) {
- vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
- -vma_pages(mpnt));
+ vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
continue;
}
charge = 0;
@@ -1250,7 +1249,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
{
int retval;
struct task_struct *p;
- void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
@@ -1527,7 +1525,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
* between here and cgroup_post_fork() if an organisation operation is in
* progress.
*/
- retval = cgroup_can_fork(p, cgrp_ss_priv);
+ retval = cgroup_can_fork(p);
if (retval)
goto bad_fork_free_pid;
@@ -1609,7 +1607,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
- cgroup_post_fork(p, cgrp_ss_priv);
+ cgroup_post_fork(p);
threadgroup_change_end(current);
perf_event_fork(p);
@@ -1619,7 +1617,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
return p;
bad_fork_cancel_cgroup:
- cgroup_cancel_fork(p, cgrp_ss_priv);
+ cgroup_cancel_fork(p);
bad_fork_free_pid:
if (pid != &init_struct_pid)
free_pid(pid);
@@ -1849,16 +1847,19 @@ void __init proc_caches_init(void)
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
- SLAB_NOTRACK, sighand_ctor);
+ SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor);
signal_cachep = kmem_cache_create("signal_cache",
sizeof(struct signal_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ NULL);
files_cachep = kmem_cache_create("files_cache",
sizeof(struct files_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ NULL);
fs_cachep = kmem_cache_create("fs_cache",
sizeof(struct fs_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ NULL);
/*
* FIXME! The "sizeof(struct mm_struct)" currently includes the
* whole struct cpumask for the OFFSTACK case. We could change
@@ -1868,8 +1869,9 @@ void __init proc_caches_init(void)
*/
mm_cachep = kmem_cache_create("mm_struct",
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
- vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ NULL);
+ vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
mmap_init();
nsproxy_cache_init();
}
diff --git a/kernel/futex.c b/kernel/futex.c
index 8a310e240cda..c6f514573b28 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -469,7 +469,8 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
- struct page *page, *page_head;
+ struct page *page;
+ struct address_space *mapping;
int err, ro = 0;
/*
@@ -519,46 +520,9 @@ again:
else
err = 0;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- page_head = page;
- if (unlikely(PageTail(page))) {
- put_page(page);
- /* serialize against __split_huge_page_splitting() */
- local_irq_disable();
- if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
- page_head = compound_head(page);
- /*
- * page_head is valid pointer but we must pin
- * it before taking the PG_lock and/or
- * PG_compound_lock. The moment we re-enable
- * irqs __split_huge_page_splitting() can
- * return and the head page can be freed from
- * under us. We can't take the PG_lock and/or
- * PG_compound_lock on a page that could be
- * freed from under us.
- */
- if (page != page_head) {
- get_page(page_head);
- put_page(page);
- }
- local_irq_enable();
- } else {
- local_irq_enable();
- goto again;
- }
- }
-#else
- page_head = compound_head(page);
- if (page != page_head) {
- get_page(page_head);
- put_page(page);
- }
-#endif
-
- lock_page(page_head);
-
+ lock_page(page);
/*
- * If page_head->mapping is NULL, then it cannot be a PageAnon
+ * If page->mapping is NULL, then it cannot be a PageAnon
* page; but it might be the ZERO_PAGE or in the gate area or
* in a special mapping (all cases which we are happy to fail);
* or it may have been a good file page when get_user_pages_fast
@@ -570,12 +534,13 @@ again:
*
* The case we do have to guard against is when memory pressure made
* shmem_writepage move it from filecache to swapcache beneath us:
- * an unlikely race, but we do need to retry for page_head->mapping.
+ * an unlikely race, but we do need to retry for page->mapping.
*/
- if (!page_head->mapping) {
- int shmem_swizzled = PageSwapCache(page_head);
- unlock_page(page_head);
- put_page(page_head);
+ mapping = compound_head(page)->mapping;
+ if (!mapping) {
+ int shmem_swizzled = PageSwapCache(page);
+ unlock_page(page);
+ put_page(page);
if (shmem_swizzled)
goto again;
return -EFAULT;
@@ -588,7 +553,7 @@ again:
* it's a read-only handle, it's expected that futexes attach to
* the object not the particular process.
*/
- if (PageAnon(page_head)) {
+ if (PageAnon(page)) {
/*
* A RO anonymous page will never change and thus doesn't make
* sense for futex operations.
@@ -603,15 +568,15 @@ again:
key->private.address = address;
} else {
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
- key->shared.inode = page_head->mapping->host;
+ key->shared.inode = mapping->host;
key->shared.pgoff = basepage_index(page);
}
get_futex_key_refs(key); /* implies MB (B) */
out:
- unlock_page(page_head);
- put_page(page_head);
+ unlock_page(page);
+ put_page(page);
return err;
}
@@ -639,7 +604,7 @@ static int fault_in_user_writeable(u32 __user *uaddr)
down_read(&mm->mmap_sem);
ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
- FAULT_FLAG_WRITE);
+ FAULT_FLAG_WRITE, NULL);
up_read(&mm->mmap_sem);
return ret < 0 ? ret : 0;
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index 7080ae1eb6c1..2f9df37940a0 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -123,11 +123,6 @@ void gcov_enable_events(void)
}
#ifdef CONFIG_MODULES
-static inline int within(void *addr, void *start, unsigned long size)
-{
- return ((addr >= start) && (addr < start + size));
-}
-
/* Update list and generate events when modules are unloaded. */
static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
void *data)
@@ -142,7 +137,7 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
/* Remove entries located in module from linked list. */
while ((info = gcov_info_next(info))) {
- if (within(info, mod->module_core, mod->core_size)) {
+ if (within_module((unsigned long)info, mod)) {
gcov_info_unlink(prev, info);
if (gcov_events_enabled)
gcov_event(GCOV_REMOVE, info);
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index db545cbcdb89..bc2c85c064c1 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -28,6 +28,7 @@
#include <linux/list.h>
#include <linux/kallsyms.h>
#include <linux/livepatch.h>
+#include <asm/cacheflush.h>
/**
* struct klp_ops - structure for tracking registered ftrace ops structs
@@ -135,13 +136,8 @@ struct klp_find_arg {
const char *objname;
const char *name;
unsigned long addr;
- /*
- * If count == 0, the symbol was not found. If count == 1, a unique
- * match was found and addr is set. If count > 1, there is
- * unresolvable ambiguity among "count" number of symbols with the same
- * name in the same object.
- */
unsigned long count;
+ unsigned long pos;
};
static int klp_find_callback(void *data, const char *name,
@@ -158,37 +154,48 @@ static int klp_find_callback(void *data, const char *name,
if (args->objname && strcmp(args->objname, mod->name))
return 0;
- /*
- * args->addr might be overwritten if another match is found
- * but klp_find_object_symbol() handles this and only returns the
- * addr if count == 1.
- */
args->addr = addr;
args->count++;
+ /*
+ * Finish the search when the symbol is found for the desired position
+ * or the position is not defined for a non-unique symbol.
+ */
+ if ((args->pos && (args->count == args->pos)) ||
+ (!args->pos && (args->count > 1)))
+ return 1;
+
return 0;
}
static int klp_find_object_symbol(const char *objname, const char *name,
- unsigned long *addr)
+ unsigned long sympos, unsigned long *addr)
{
struct klp_find_arg args = {
.objname = objname,
.name = name,
.addr = 0,
- .count = 0
+ .count = 0,
+ .pos = sympos,
};
mutex_lock(&module_mutex);
kallsyms_on_each_symbol(klp_find_callback, &args);
mutex_unlock(&module_mutex);
- if (args.count == 0)
+ /*
+ * Ensure an address was found. If sympos is 0, ensure symbol is unique;
+ * otherwise ensure the symbol position count matches sympos.
+ */
+ if (args.addr == 0)
pr_err("symbol '%s' not found in symbol table\n", name);
- else if (args.count > 1)
+ else if (args.count > 1 && sympos == 0) {
pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n",
args.count, name, objname);
- else {
+ } else if (sympos != args.count && sympos > 0) {
+ pr_err("symbol position %lu for symbol '%s' in object '%s' not found\n",
+ sympos, name, objname ? objname : "vmlinux");
+ } else {
*addr = args.addr;
return 0;
}
@@ -197,66 +204,6 @@ static int klp_find_object_symbol(const char *objname, const char *name,
return -EINVAL;
}
-struct klp_verify_args {
- const char *name;
- const unsigned long addr;
-};
-
-static int klp_verify_callback(void *data, const char *name,
- struct module *mod, unsigned long addr)
-{
- struct klp_verify_args *args = data;
-
- if (!mod &&
- !strcmp(args->name, name) &&
- args->addr == addr)
- return 1;
-
- return 0;
-}
-
-static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr)
-{
- struct klp_verify_args args = {
- .name = name,
- .addr = addr,
- };
- int ret;
-
- mutex_lock(&module_mutex);
- ret = kallsyms_on_each_symbol(klp_verify_callback, &args);
- mutex_unlock(&module_mutex);
-
- if (!ret) {
- pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?\n",
- name, addr);
- return -EINVAL;
- }
-
- return 0;
-}
-
-static int klp_find_verify_func_addr(struct klp_object *obj,
- struct klp_func *func)
-{
- int ret;
-
-#if defined(CONFIG_RANDOMIZE_BASE)
- /* If KASLR has been enabled, adjust old_addr accordingly */
- if (kaslr_enabled() && func->old_addr)
- func->old_addr += kaslr_offset();
-#endif
-
- if (!func->old_addr || klp_is_module(obj))
- ret = klp_find_object_symbol(obj->name, func->old_name,
- &func->old_addr);
- else
- ret = klp_verify_vmlinux_symbol(func->old_name,
- func->old_addr);
-
- return ret;
-}
-
/*
* external symbols are located outside the parent object (where the parent
* object is either vmlinux or the kmod being patched).
@@ -276,14 +223,18 @@ static int klp_find_external_symbol(struct module *pmod, const char *name,
}
preempt_enable();
- /* otherwise check if it's in another .o within the patch module */
- return klp_find_object_symbol(pmod->name, name, addr);
+ /*
+ * Check if it's in another .o within the patch module. This also
+ * checks that the external symbol is unique.
+ */
+ return klp_find_object_symbol(pmod->name, name, 0, addr);
}
static int klp_write_object_relocations(struct module *pmod,
struct klp_object *obj)
{
- int ret;
+ int ret = 0;
+ unsigned long val;
struct klp_reloc *reloc;
if (WARN_ON(!klp_is_object_loaded(obj)))
@@ -292,41 +243,38 @@ static int klp_write_object_relocations(struct module *pmod,
if (WARN_ON(!obj->relocs))
return -EINVAL;
+ module_disable_ro(pmod);
+
for (reloc = obj->relocs; reloc->name; reloc++) {
- if (!klp_is_module(obj)) {
-
-#if defined(CONFIG_RANDOMIZE_BASE)
- /* If KASLR has been enabled, adjust old value accordingly */
- if (kaslr_enabled())
- reloc->val += kaslr_offset();
-#endif
- ret = klp_verify_vmlinux_symbol(reloc->name,
- reloc->val);
- if (ret)
- return ret;
- } else {
- /* module, reloc->val needs to be discovered */
- if (reloc->external)
- ret = klp_find_external_symbol(pmod,
- reloc->name,
- &reloc->val);
- else
- ret = klp_find_object_symbol(obj->mod->name,
- reloc->name,
- &reloc->val);
- if (ret)
- return ret;
- }
+ /* discover the address of the referenced symbol */
+ if (reloc->external) {
+ if (reloc->sympos > 0) {
+ pr_err("non-zero sympos for external reloc symbol '%s' is not supported\n",
+ reloc->name);
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = klp_find_external_symbol(pmod, reloc->name, &val);
+ } else
+ ret = klp_find_object_symbol(obj->name,
+ reloc->name,
+ reloc->sympos,
+ &val);
+ if (ret)
+ goto out;
+
ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc,
- reloc->val + reloc->addend);
+ val + reloc->addend);
if (ret) {
pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n",
- reloc->name, reloc->val, ret);
- return ret;
+ reloc->name, val, ret);
+ goto out;
}
}
- return 0;
+out:
+ module_enable_ro(pmod);
+ return ret;
}
static void notrace klp_ftrace_handler(unsigned long ip,
@@ -593,7 +541,7 @@ EXPORT_SYMBOL_GPL(klp_enable_patch);
* /sys/kernel/livepatch/<patch>
* /sys/kernel/livepatch/<patch>/enabled
* /sys/kernel/livepatch/<patch>/<object>
- * /sys/kernel/livepatch/<patch>/<object>/<func>
+ * /sys/kernel/livepatch/<patch>/<object>/<function,sympos>
*/
static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -738,8 +686,14 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
INIT_LIST_HEAD(&func->stack_node);
func->state = KLP_DISABLED;
+ /* The format for the sysfs directory is <function,sympos> where sympos
+ * is the nth occurrence of this symbol in kallsyms for the patched
+ * object. If the user selects 0 for old_sympos, then 1 will be used
+ * since a unique symbol will be the first occurrence.
+ */
return kobject_init_and_add(&func->kobj, &klp_ktype_func,
- &obj->kobj, "%s", func->old_name);
+ &obj->kobj, "%s,%lu", func->old_name,
+ func->old_sympos ? func->old_sympos : 1);
}
/* parts of the initialization that is done only when the object is loaded */
@@ -756,7 +710,9 @@ static int klp_init_object_loaded(struct klp_patch *patch,
}
klp_for_each_func(obj, func) {
- ret = klp_find_verify_func_addr(obj, func);
+ ret = klp_find_object_symbol(obj->name, func->old_name,
+ func->old_sympos,
+ &func->old_addr);
if (ret)
return ret;
}
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 7658d32c5c78..e517a16cb426 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -10,8 +10,11 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
+#include <linux/radix-tree.h>
+#include <linux/memremap.h>
#include <linux/device.h>
#include <linux/types.h>
+#include <linux/pfn_t.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/memory_hotplug.h>
@@ -147,24 +150,127 @@ void devm_memunmap(struct device *dev, void *addr)
}
EXPORT_SYMBOL(devm_memunmap);
+pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags)
+{
+ return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags);
+}
+EXPORT_SYMBOL(phys_to_pfn_t);
+
#ifdef CONFIG_ZONE_DEVICE
+static DEFINE_MUTEX(pgmap_lock);
+static RADIX_TREE(pgmap_radix, GFP_KERNEL);
+#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
+#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
+
struct page_map {
struct resource res;
+ struct percpu_ref *ref;
+ struct dev_pagemap pgmap;
+ struct vmem_altmap altmap;
};
-static void devm_memremap_pages_release(struct device *dev, void *res)
+void get_zone_device_page(struct page *page)
+{
+ percpu_ref_get(page->pgmap->ref);
+}
+EXPORT_SYMBOL(get_zone_device_page);
+
+void put_zone_device_page(struct page *page)
+{
+ put_dev_pagemap(page->pgmap);
+}
+EXPORT_SYMBOL(put_zone_device_page);
+
+static void pgmap_radix_release(struct resource *res)
+{
+ resource_size_t key;
+
+ mutex_lock(&pgmap_lock);
+ for (key = res->start; key <= res->end; key += SECTION_SIZE)
+ radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT);
+ mutex_unlock(&pgmap_lock);
+}
+
+static unsigned long pfn_first(struct page_map *page_map)
+{
+ struct dev_pagemap *pgmap = &page_map->pgmap;
+ const struct resource *res = &page_map->res;
+ struct vmem_altmap *altmap = pgmap->altmap;
+ unsigned long pfn;
+
+ pfn = res->start >> PAGE_SHIFT;
+ if (altmap)
+ pfn += vmem_altmap_offset(altmap);
+ return pfn;
+}
+
+static unsigned long pfn_end(struct page_map *page_map)
+{
+ const struct resource *res = &page_map->res;
+
+ return (res->start + resource_size(res)) >> PAGE_SHIFT;
+}
+
+#define for_each_device_pfn(pfn, map) \
+ for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++)
+
+static void devm_memremap_pages_release(struct device *dev, void *data)
{
- struct page_map *page_map = res;
+ struct page_map *page_map = data;
+ struct resource *res = &page_map->res;
+ resource_size_t align_start, align_size;
+ struct dev_pagemap *pgmap = &page_map->pgmap;
+
+ if (percpu_ref_tryget_live(pgmap->ref)) {
+ dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
+ percpu_ref_put(pgmap->ref);
+ }
+
+ pgmap_radix_release(res);
/* pages are dead and unused, undo the arch mapping */
- arch_remove_memory(page_map->res.start, resource_size(&page_map->res));
+ align_start = res->start & ~(SECTION_SIZE - 1);
+ align_size = ALIGN(resource_size(res), SECTION_SIZE);
+ arch_remove_memory(align_start, align_size);
+ dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc,
+ "%s: failed to free all reserved pages\n", __func__);
+}
+
+/* assumes rcu_read_lock() held at entry */
+struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
+{
+ struct page_map *page_map;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT);
+ return page_map ? &page_map->pgmap : NULL;
}
-void *devm_memremap_pages(struct device *dev, struct resource *res)
+/**
+ * devm_memremap_pages - remap and provide memmap backing for the given resource
+ * @dev: hosting device for @res
+ * @res: "host memory" address range
+ * @ref: a live per-cpu reference count
+ * @altmap: optional descriptor for allocating the memmap from @res
+ *
+ * Notes:
+ * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time
+ * (or devm release event).
+ *
+ * 2/ @res is expected to be a host memory range that could feasibly be
+ * treated as a "System RAM" range, i.e. not a device mmio range, but
+ * this is not enforced.
+ */
+void *devm_memremap_pages(struct device *dev, struct resource *res,
+ struct percpu_ref *ref, struct vmem_altmap *altmap)
{
int is_ram = region_intersects(res->start, resource_size(res),
"System RAM");
+ resource_size_t key, align_start, align_size;
+ struct dev_pagemap *pgmap;
struct page_map *page_map;
+ unsigned long pfn;
int error, nid;
if (is_ram == REGION_MIXED) {
@@ -176,25 +282,120 @@ void *devm_memremap_pages(struct device *dev, struct resource *res)
if (is_ram == REGION_INTERSECTS)
return __va(res->start);
+ if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) {
+ dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n",
+ __func__);
+ return ERR_PTR(-ENXIO);
+ }
+
+ if (!ref)
+ return ERR_PTR(-EINVAL);
+
page_map = devres_alloc_node(devm_memremap_pages_release,
sizeof(*page_map), GFP_KERNEL, dev_to_node(dev));
if (!page_map)
return ERR_PTR(-ENOMEM);
+ pgmap = &page_map->pgmap;
memcpy(&page_map->res, res, sizeof(*res));
+ pgmap->dev = dev;
+ if (altmap) {
+ memcpy(&page_map->altmap, altmap, sizeof(*altmap));
+ pgmap->altmap = &page_map->altmap;
+ }
+ pgmap->ref = ref;
+ pgmap->res = &page_map->res;
+
+ mutex_lock(&pgmap_lock);
+ error = 0;
+ for (key = res->start; key <= res->end; key += SECTION_SIZE) {
+ struct dev_pagemap *dup;
+
+ rcu_read_lock();
+ dup = find_dev_pagemap(key);
+ rcu_read_unlock();
+ if (dup) {
+ dev_err(dev, "%s: %pr collides with mapping for %s\n",
+ __func__, res, dev_name(dup->dev));
+ error = -EBUSY;
+ break;
+ }
+ error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT,
+ page_map);
+ if (error) {
+ dev_err(dev, "%s: failed: %d\n", __func__, error);
+ break;
+ }
+ }
+ mutex_unlock(&pgmap_lock);
+ if (error)
+ goto err_radix;
+
nid = dev_to_node(dev);
if (nid < 0)
nid = numa_mem_id();
- error = arch_add_memory(nid, res->start, resource_size(res), true);
- if (error) {
- devres_free(page_map);
- return ERR_PTR(error);
- }
+ align_start = res->start & ~(SECTION_SIZE - 1);
+ align_size = ALIGN(resource_size(res), SECTION_SIZE);
+ error = arch_add_memory(nid, align_start, align_size, true);
+ if (error)
+ goto err_add_memory;
+ for_each_device_pfn(pfn, page_map) {
+ struct page *page = pfn_to_page(pfn);
+
+ /* ZONE_DEVICE pages must never appear on a slab lru */
+ list_force_poison(&page->lru);
+ page->pgmap = pgmap;
+ }
devres_add(dev, page_map);
return __va(res->start);
+
+ err_add_memory:
+ err_radix:
+ pgmap_radix_release(res);
+ devres_free(page_map);
+ return ERR_PTR(error);
}
EXPORT_SYMBOL(devm_memremap_pages);
+
+unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
+{
+ /* number of pfns from base where pfn_to_page() is valid */
+ return altmap->reserve + altmap->free;
+}
+
+void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
+{
+ altmap->alloc -= nr_pfns;
+}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
+{
+ /*
+ * 'memmap_start' is the virtual address for the first "struct
+ * page" in this range of the vmemmap array. In the case of
+ * CONFIG_SPARSE_VMEMMAP a page_to_pfn conversion is simple
+ * pointer arithmetic, so we can perform this to_vmem_altmap()
+ * conversion without concern for the initialization state of
+ * the struct page fields.
+ */
+ struct page *page = (struct page *) memmap_start;
+ struct dev_pagemap *pgmap;
+
+ /*
+ * Uncoditionally retrieve a dev_pagemap associated with the
+ * given physical address, this is only for use in the
+ * arch_{add|remove}_memory() for setting up and tearing down
+ * the memmap.
+ */
+ rcu_read_lock();
+ pgmap = find_dev_pagemap(__pfn_to_phys(page_to_pfn(page)));
+ rcu_read_unlock();
+
+ return pgmap ? pgmap->altmap : NULL;
+}
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
#endif /* CONFIG_ZONE_DEVICE */
diff --git a/kernel/module.c b/kernel/module.c
index 38c7bd5583ff..8358f4697c0c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -80,15 +80,6 @@
# define debug_align(X) (X)
#endif
-/*
- * Given BASE and SIZE this macro calculates the number of pages the
- * memory regions occupies
- */
-#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \
- (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \
- PFN_DOWN((unsigned long)BASE) + 1) \
- : (0UL))
-
/* If this is set, the section belongs in the init part of the module */
#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
@@ -108,13 +99,6 @@ static LIST_HEAD(modules);
* Use a latched RB-tree for __module_address(); this allows us to use
* RCU-sched lookups of the address from any context.
*
- * Because modules have two address ranges: init and core, we need two
- * latch_tree_nodes entries. Therefore we need the back-pointer from
- * mod_tree_node.
- *
- * Because init ranges are short lived we mark them unlikely and have placed
- * them outside the critical cacheline in struct module.
- *
* This is conditional on PERF_EVENTS || TRACING because those can really hit
* __module_address() hard by doing a lot of stack unwinding; potentially from
* NMI context.
@@ -122,24 +106,16 @@ static LIST_HEAD(modules);
static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
{
- struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node);
- struct module *mod = mtn->mod;
+ struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
- if (unlikely(mtn == &mod->mtn_init))
- return (unsigned long)mod->module_init;
-
- return (unsigned long)mod->module_core;
+ return (unsigned long)layout->base;
}
static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n)
{
- struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node);
- struct module *mod = mtn->mod;
-
- if (unlikely(mtn == &mod->mtn_init))
- return (unsigned long)mod->init_size;
+ struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
- return (unsigned long)mod->core_size;
+ return (unsigned long)layout->size;
}
static __always_inline bool
@@ -197,23 +173,23 @@ static void __mod_tree_remove(struct mod_tree_node *node)
*/
static void mod_tree_insert(struct module *mod)
{
- mod->mtn_core.mod = mod;
- mod->mtn_init.mod = mod;
+ mod->core_layout.mtn.mod = mod;
+ mod->init_layout.mtn.mod = mod;
- __mod_tree_insert(&mod->mtn_core);
- if (mod->init_size)
- __mod_tree_insert(&mod->mtn_init);
+ __mod_tree_insert(&mod->core_layout.mtn);
+ if (mod->init_layout.size)
+ __mod_tree_insert(&mod->init_layout.mtn);
}
static void mod_tree_remove_init(struct module *mod)
{
- if (mod->init_size)
- __mod_tree_remove(&mod->mtn_init);
+ if (mod->init_layout.size)
+ __mod_tree_remove(&mod->init_layout.mtn);
}
static void mod_tree_remove(struct module *mod)
{
- __mod_tree_remove(&mod->mtn_core);
+ __mod_tree_remove(&mod->core_layout.mtn);
mod_tree_remove_init(mod);
}
@@ -267,9 +243,9 @@ static void __mod_update_bounds(void *base, unsigned int size)
static void mod_update_bounds(struct module *mod)
{
- __mod_update_bounds(mod->module_core, mod->core_size);
- if (mod->init_size)
- __mod_update_bounds(mod->module_init, mod->init_size);
+ __mod_update_bounds(mod->core_layout.base, mod->core_layout.size);
+ if (mod->init_layout.size)
+ __mod_update_bounds(mod->init_layout.base, mod->init_layout.size);
}
#ifdef CONFIG_KGDB_KDB
@@ -1214,7 +1190,7 @@ struct module_attribute module_uevent =
static ssize_t show_coresize(struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
- return sprintf(buffer, "%u\n", mk->mod->core_size);
+ return sprintf(buffer, "%u\n", mk->mod->core_layout.size);
}
static struct module_attribute modinfo_coresize =
@@ -1223,7 +1199,7 @@ static struct module_attribute modinfo_coresize =
static ssize_t show_initsize(struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
- return sprintf(buffer, "%u\n", mk->mod->init_size);
+ return sprintf(buffer, "%u\n", mk->mod->init_layout.size);
}
static struct module_attribute modinfo_initsize =
@@ -1873,64 +1849,75 @@ static void mod_sysfs_teardown(struct module *mod)
/*
* LKM RO/NX protection: protect module's text/ro-data
* from modification and any data from execution.
+ *
+ * General layout of module is:
+ * [text] [read-only-data] [writable data]
+ * text_size -----^ ^ ^
+ * ro_size ------------------------| |
+ * size -------------------------------------------|
+ *
+ * These values are always page-aligned (as is base)
*/
-void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages))
+static void frob_text(const struct module_layout *layout,
+ int (*set_memory)(unsigned long start, int num_pages))
{
- unsigned long begin_pfn = PFN_DOWN((unsigned long)start);
- unsigned long end_pfn = PFN_DOWN((unsigned long)end);
+ BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1));
+ set_memory((unsigned long)layout->base,
+ layout->text_size >> PAGE_SHIFT);
+}
- if (end_pfn > begin_pfn)
- set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
+static void frob_rodata(const struct module_layout *layout,
+ int (*set_memory)(unsigned long start, int num_pages))
+{
+ BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
+ set_memory((unsigned long)layout->base + layout->text_size,
+ (layout->ro_size - layout->text_size) >> PAGE_SHIFT);
}
-static void set_section_ro_nx(void *base,
- unsigned long text_size,
- unsigned long ro_size,
- unsigned long total_size)
+static void frob_writable_data(const struct module_layout *layout,
+ int (*set_memory)(unsigned long start, int num_pages))
{
- /* begin and end PFNs of the current subsection */
- unsigned long begin_pfn;
- unsigned long end_pfn;
+ BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1));
+ set_memory((unsigned long)layout->base + layout->ro_size,
+ (layout->size - layout->ro_size) >> PAGE_SHIFT);
+}
- /*
- * Set RO for module text and RO-data:
- * - Always protect first page.
- * - Do not protect last partial page.
- */
- if (ro_size > 0)
- set_page_attributes(base, base + ro_size, set_memory_ro);
+/* livepatching wants to disable read-only so it can frob module. */
+void module_disable_ro(const struct module *mod)
+{
+ frob_text(&mod->core_layout, set_memory_rw);
+ frob_rodata(&mod->core_layout, set_memory_rw);
+ frob_text(&mod->init_layout, set_memory_rw);
+ frob_rodata(&mod->init_layout, set_memory_rw);
+}
- /*
- * Set NX permissions for module data:
- * - Do not protect first partial page.
- * - Always protect last page.
- */
- if (total_size > text_size) {
- begin_pfn = PFN_UP((unsigned long)base + text_size);
- end_pfn = PFN_UP((unsigned long)base + total_size);
- if (end_pfn > begin_pfn)
- set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
- }
+void module_enable_ro(const struct module *mod)
+{
+ frob_text(&mod->core_layout, set_memory_ro);
+ frob_rodata(&mod->core_layout, set_memory_ro);
+ frob_text(&mod->init_layout, set_memory_ro);
+ frob_rodata(&mod->init_layout, set_memory_ro);
}
-static void unset_module_core_ro_nx(struct module *mod)
+static void module_enable_nx(const struct module *mod)
{
- set_page_attributes(mod->module_core + mod->core_text_size,
- mod->module_core + mod->core_size,
- set_memory_x);
- set_page_attributes(mod->module_core,
- mod->module_core + mod->core_ro_size,
- set_memory_rw);
+ frob_rodata(&mod->core_layout, set_memory_nx);
+ frob_writable_data(&mod->core_layout, set_memory_nx);
+ frob_rodata(&mod->init_layout, set_memory_nx);
+ frob_writable_data(&mod->init_layout, set_memory_nx);
}
-static void unset_module_init_ro_nx(struct module *mod)
+static void module_disable_nx(const struct module *mod)
{
- set_page_attributes(mod->module_init + mod->init_text_size,
- mod->module_init + mod->init_size,
- set_memory_x);
- set_page_attributes(mod->module_init,
- mod->module_init + mod->init_ro_size,
- set_memory_rw);
+ frob_rodata(&mod->core_layout, set_memory_x);
+ frob_writable_data(&mod->core_layout, set_memory_x);
+ frob_rodata(&mod->init_layout, set_memory_x);
+ frob_writable_data(&mod->init_layout, set_memory_x);
}
/* Iterate through all modules and set each module's text as RW */
@@ -1942,16 +1929,9 @@ void set_all_modules_text_rw(void)
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if ((mod->module_core) && (mod->core_text_size)) {
- set_page_attributes(mod->module_core,
- mod->module_core + mod->core_text_size,
- set_memory_rw);
- }
- if ((mod->module_init) && (mod->init_text_size)) {
- set_page_attributes(mod->module_init,
- mod->module_init + mod->init_text_size,
- set_memory_rw);
- }
+
+ frob_text(&mod->core_layout, set_memory_rw);
+ frob_text(&mod->init_layout, set_memory_rw);
}
mutex_unlock(&module_mutex);
}
@@ -1965,23 +1945,25 @@ void set_all_modules_text_ro(void)
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if ((mod->module_core) && (mod->core_text_size)) {
- set_page_attributes(mod->module_core,
- mod->module_core + mod->core_text_size,
- set_memory_ro);
- }
- if ((mod->module_init) && (mod->init_text_size)) {
- set_page_attributes(mod->module_init,
- mod->module_init + mod->init_text_size,
- set_memory_ro);
- }
+
+ frob_text(&mod->core_layout, set_memory_ro);
+ frob_text(&mod->init_layout, set_memory_ro);
}
mutex_unlock(&module_mutex);
}
+
+static void disable_ro_nx(const struct module_layout *layout)
+{
+ frob_text(layout, set_memory_rw);
+ frob_rodata(layout, set_memory_rw);
+ frob_rodata(layout, set_memory_x);
+ frob_writable_data(layout, set_memory_x);
+}
+
#else
-static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { }
-static void unset_module_core_ro_nx(struct module *mod) { }
-static void unset_module_init_ro_nx(struct module *mod) { }
+static void disable_ro_nx(const struct module_layout *layout) { }
+static void module_enable_nx(const struct module *mod) { }
+static void module_disable_nx(const struct module *mod) { }
#endif
void __weak module_memfree(void *module_region)
@@ -2033,19 +2015,19 @@ static void free_module(struct module *mod)
synchronize_sched();
mutex_unlock(&module_mutex);
- /* This may be NULL, but that's OK */
- unset_module_init_ro_nx(mod);
+ /* This may be empty, but that's OK */
+ disable_ro_nx(&mod->init_layout);
module_arch_freeing_init(mod);
- module_memfree(mod->module_init);
+ module_memfree(mod->init_layout.base);
kfree(mod->args);
percpu_modfree(mod);
/* Free lock-classes; relies on the preceding sync_rcu(). */
- lockdep_free_key_range(mod->module_core, mod->core_size);
+ lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
/* Finally, free the core (containing the module structure) */
- unset_module_core_ro_nx(mod);
- module_memfree(mod->module_core);
+ disable_ro_nx(&mod->core_layout);
+ module_memfree(mod->core_layout.base);
#ifdef CONFIG_MPU
update_protections(current->mm);
@@ -2248,20 +2230,20 @@ static void layout_sections(struct module *mod, struct load_info *info)
|| s->sh_entsize != ~0UL
|| strstarts(sname, ".init"))
continue;
- s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
+ s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i);
pr_debug("\t%s\n", sname);
}
switch (m) {
case 0: /* executable */
- mod->core_size = debug_align(mod->core_size);
- mod->core_text_size = mod->core_size;
+ mod->core_layout.size = debug_align(mod->core_layout.size);
+ mod->core_layout.text_size = mod->core_layout.size;
break;
case 1: /* RO: text and ro-data */
- mod->core_size = debug_align(mod->core_size);
- mod->core_ro_size = mod->core_size;
+ mod->core_layout.size = debug_align(mod->core_layout.size);
+ mod->core_layout.ro_size = mod->core_layout.size;
break;
case 3: /* whole core */
- mod->core_size = debug_align(mod->core_size);
+ mod->core_layout.size = debug_align(mod->core_layout.size);
break;
}
}
@@ -2277,21 +2259,21 @@ static void layout_sections(struct module *mod, struct load_info *info)
|| s->sh_entsize != ~0UL
|| !strstarts(sname, ".init"))
continue;
- s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
+ s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i)
| INIT_OFFSET_MASK);
pr_debug("\t%s\n", sname);
}
switch (m) {
case 0: /* executable */
- mod->init_size = debug_align(mod->init_size);
- mod->init_text_size = mod->init_size;
+ mod->init_layout.size = debug_align(mod->init_layout.size);
+ mod->init_layout.text_size = mod->init_layout.size;
break;
case 1: /* RO: text and ro-data */
- mod->init_size = debug_align(mod->init_size);
- mod->init_ro_size = mod->init_size;
+ mod->init_layout.size = debug_align(mod->init_layout.size);
+ mod->init_layout.ro_size = mod->init_layout.size;
break;
case 3: /* whole init */
- mod->init_size = debug_align(mod->init_size);
+ mod->init_layout.size = debug_align(mod->init_layout.size);
break;
}
}
@@ -2401,7 +2383,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info)
}
if (sym->st_shndx == SHN_UNDEF)
return 'U';
- if (sym->st_shndx == SHN_ABS)
+ if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu)
return 'a';
if (sym->st_shndx >= SHN_LORESERVE)
return '?';
@@ -2430,7 +2412,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info)
}
static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
- unsigned int shnum)
+ unsigned int shnum, unsigned int pcpundx)
{
const Elf_Shdr *sec;
@@ -2439,6 +2421,11 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
|| !src->st_name)
return false;
+#ifdef CONFIG_KALLSYMS_ALL
+ if (src->st_shndx == pcpundx)
+ return true;
+#endif
+
sec = sechdrs + src->st_shndx;
if (!(sec->sh_flags & SHF_ALLOC)
#ifndef CONFIG_KALLSYMS_ALL
@@ -2466,7 +2453,7 @@ static void layout_symtab(struct module *mod, struct load_info *info)
/* Put symbol section at end of init part of module. */
symsect->sh_flags |= SHF_ALLOC;
- symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
+ symsect->sh_entsize = get_offset(mod, &mod->init_layout.size, symsect,
info->index.sym) | INIT_OFFSET_MASK;
pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
@@ -2476,23 +2463,24 @@ static void layout_symtab(struct module *mod, struct load_info *info)
/* Compute total space required for the core symbols' strtab. */
for (ndst = i = 0; i < nsrc; i++) {
if (i == 0 ||
- is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
+ is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
+ info->index.pcpu)) {
strtab_size += strlen(&info->strtab[src[i].st_name])+1;
ndst++;
}
}
/* Append room for core symbols at end of core part. */
- info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
- info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
- mod->core_size += strtab_size;
- mod->core_size = debug_align(mod->core_size);
+ info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1);
+ info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym);
+ mod->core_layout.size += strtab_size;
+ mod->core_layout.size = debug_align(mod->core_layout.size);
/* Put string table section at end of init part of module. */
strsect->sh_flags |= SHF_ALLOC;
- strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
+ strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect,
info->index.str) | INIT_OFFSET_MASK;
- mod->init_size = debug_align(mod->init_size);
+ mod->init_layout.size = debug_align(mod->init_layout.size);
pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
}
@@ -2513,12 +2501,13 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
for (i = 0; i < mod->num_symtab; i++)
mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
- mod->core_symtab = dst = mod->module_core + info->symoffs;
- mod->core_strtab = s = mod->module_core + info->stroffs;
+ mod->core_symtab = dst = mod->core_layout.base + info->symoffs;
+ mod->core_strtab = s = mod->core_layout.base + info->stroffs;
src = mod->symtab;
for (ndst = i = 0; i < mod->num_symtab; i++) {
if (i == 0 ||
- is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
+ is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
+ info->index.pcpu)) {
dst[ndst] = src[i];
dst[ndst++].st_name = s - mod->core_strtab;
s += strlcpy(s, &mod->strtab[src[i].st_name],
@@ -2964,7 +2953,7 @@ static int move_module(struct module *mod, struct load_info *info)
void *ptr;
/* Do the allocs. */
- ptr = module_alloc(mod->core_size);
+ ptr = module_alloc(mod->core_layout.size);
/*
* The pointer to this block is stored in the module structure
* which is inside the block. Just mark it as not being a
@@ -2974,11 +2963,11 @@ static int move_module(struct module *mod, struct load_info *info)
if (!ptr)
return -ENOMEM;
- memset(ptr, 0, mod->core_size);
- mod->module_core = ptr;
+ memset(ptr, 0, mod->core_layout.size);
+ mod->core_layout.base = ptr;
- if (mod->init_size) {
- ptr = module_alloc(mod->init_size);
+ if (mod->init_layout.size) {
+ ptr = module_alloc(mod->init_layout.size);
/*
* The pointer to this block is stored in the module structure
* which is inside the block. This block doesn't need to be
@@ -2987,13 +2976,13 @@ static int move_module(struct module *mod, struct load_info *info)
*/
kmemleak_ignore(ptr);
if (!ptr) {
- module_memfree(mod->module_core);
+ module_memfree(mod->core_layout.base);
return -ENOMEM;
}
- memset(ptr, 0, mod->init_size);
- mod->module_init = ptr;
+ memset(ptr, 0, mod->init_layout.size);
+ mod->init_layout.base = ptr;
} else
- mod->module_init = NULL;
+ mod->init_layout.base = NULL;
/* Transfer each section which specifies SHF_ALLOC */
pr_debug("final section addresses:\n");
@@ -3005,10 +2994,10 @@ static int move_module(struct module *mod, struct load_info *info)
continue;
if (shdr->sh_entsize & INIT_OFFSET_MASK)
- dest = mod->module_init
+ dest = mod->init_layout.base
+ (shdr->sh_entsize & ~INIT_OFFSET_MASK);
else
- dest = mod->module_core + shdr->sh_entsize;
+ dest = mod->core_layout.base + shdr->sh_entsize;
if (shdr->sh_type != SHT_NOBITS)
memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
@@ -3070,12 +3059,12 @@ static void flush_module_icache(const struct module *mod)
* Do it before processing of module parameters, so the module
* can provide parameter accessor functions of its own.
*/
- if (mod->module_init)
- flush_icache_range((unsigned long)mod->module_init,
- (unsigned long)mod->module_init
- + mod->init_size);
- flush_icache_range((unsigned long)mod->module_core,
- (unsigned long)mod->module_core + mod->core_size);
+ if (mod->init_layout.base)
+ flush_icache_range((unsigned long)mod->init_layout.base,
+ (unsigned long)mod->init_layout.base
+ + mod->init_layout.size);
+ flush_icache_range((unsigned long)mod->core_layout.base,
+ (unsigned long)mod->core_layout.base + mod->core_layout.size);
set_fs(old_fs);
}
@@ -3133,8 +3122,8 @@ static void module_deallocate(struct module *mod, struct load_info *info)
{
percpu_modfree(mod);
module_arch_freeing_init(mod);
- module_memfree(mod->module_init);
- module_memfree(mod->module_core);
+ module_memfree(mod->init_layout.base);
+ module_memfree(mod->core_layout.base);
}
int __weak module_finalize(const Elf_Ehdr *hdr,
@@ -3221,7 +3210,7 @@ static noinline int do_init_module(struct module *mod)
ret = -ENOMEM;
goto fail;
}
- freeinit->module_init = mod->module_init;
+ freeinit->module_init = mod->init_layout.base;
/*
* We want to find out whether @mod uses async during init. Clear
@@ -3279,12 +3268,12 @@ static noinline int do_init_module(struct module *mod)
mod->strtab = mod->core_strtab;
#endif
mod_tree_remove_init(mod);
- unset_module_init_ro_nx(mod);
+ disable_ro_nx(&mod->init_layout);
module_arch_freeing_init(mod);
- mod->module_init = NULL;
- mod->init_size = 0;
- mod->init_ro_size = 0;
- mod->init_text_size = 0;
+ mod->init_layout.base = NULL;
+ mod->init_layout.size = 0;
+ mod->init_layout.ro_size = 0;
+ mod->init_layout.text_size = 0;
/*
* We want to free module_init, but be aware that kallsyms may be
* walking this with preempt disabled. In all the failure paths, we
@@ -3373,17 +3362,9 @@ static int complete_formation(struct module *mod, struct load_info *info)
/* This relies on module_mutex for list integrity. */
module_bug_finalize(info->hdr, info->sechdrs, mod);
- /* Set RO and NX regions for core */
- set_section_ro_nx(mod->module_core,
- mod->core_text_size,
- mod->core_ro_size,
- mod->core_size);
-
- /* Set RO and NX regions for init */
- set_section_ro_nx(mod->module_init,
- mod->init_text_size,
- mod->init_ro_size,
- mod->init_size);
+ /* Set RO and NX regions */
+ module_enable_ro(mod);
+ module_enable_nx(mod);
/* Mark state as coming so strong_try_module_get() ignores us,
* but kallsyms etc. can see us. */
@@ -3548,8 +3529,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
MODULE_STATE_GOING, mod);
/* we can't deallocate the module until we clear memory protection */
- unset_module_init_ro_nx(mod);
- unset_module_core_ro_nx(mod);
+ module_disable_ro(mod);
+ module_disable_nx(mod);
ddebug_cleanup:
dynamic_debug_remove(info->debug);
@@ -3578,7 +3559,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
*/
ftrace_release_mod(mod);
/* Free lock-classes; relies on the preceding sync_rcu() */
- lockdep_free_key_range(mod->module_core, mod->core_size);
+ lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
module_deallocate(mod, info);
free_copy:
@@ -3656,9 +3637,9 @@ static const char *get_ksymbol(struct module *mod,
/* At worse, next value is at end of module */
if (within_module_init(addr, mod))
- nextval = (unsigned long)mod->module_init+mod->init_text_size;
+ nextval = (unsigned long)mod->init_layout.base+mod->init_layout.text_size;
else
- nextval = (unsigned long)mod->module_core+mod->core_text_size;
+ nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size;
/* Scan for closest preceding symbol, and next symbol. (ELF
starts real symbols at 1). */
@@ -3905,7 +3886,7 @@ static int m_show(struct seq_file *m, void *p)
return 0;
seq_printf(m, "%s %u",
- mod->name, mod->init_size + mod->core_size);
+ mod->name, mod->init_layout.size + mod->core_layout.size);
print_unload_info(m, mod);
/* Informative for users. */
@@ -3914,7 +3895,7 @@ static int m_show(struct seq_file *m, void *p)
mod->state == MODULE_STATE_COMING ? "Loading" :
"Live");
/* Used by oprofile and other similar tools. */
- seq_printf(m, " 0x%pK", mod->module_core);
+ seq_printf(m, " 0x%pK", mod->core_layout.base);
/* Taints info */
if (mod->taints)
@@ -4057,8 +4038,8 @@ struct module *__module_text_address(unsigned long addr)
struct module *mod = __module_address(addr);
if (mod) {
/* Make sure it's within the text section. */
- if (!within(addr, mod->module_init, mod->init_text_size)
- && !within(addr, mod->module_core, mod->core_text_size))
+ if (!within(addr, mod->init_layout.base, mod->init_layout.text_size)
+ && !within(addr, mod->core_layout.base, mod->core_layout.text_size))
mod = NULL;
}
return mod;
diff --git a/kernel/panic.c b/kernel/panic.c
index b333380c6bb2..d96469de72dc 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -180,8 +180,7 @@ void panic(const char *fmt, ...)
* panic() is not being callled from OOPS.
*/
debug_locks_off();
- console_trylock();
- console_unlock();
+ console_flush_on_panic();
if (!panic_blink)
panic_blink = no_blink;
diff --git a/kernel/pid.c b/kernel/pid.c
index 78b3d9f80d44..f4ad91b746f1 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -604,5 +604,5 @@ void __init pidmap_init(void)
atomic_dec(&init_pid_ns.pidmap[0].nr_free);
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
- SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b2dd4d999900..27946975eff0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -280,13 +280,7 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj,
return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA;
}
-static ssize_t pm_wakeup_irq_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t n)
-{
- return -EINVAL;
-}
-power_attr(pm_wakeup_irq);
+power_attr_ro(pm_wakeup_irq);
#else /* !CONFIG_PM_SLEEP_DEBUG */
static inline void pm_print_times_init(void) {}
@@ -564,14 +558,7 @@ static ssize_t pm_trace_dev_match_show(struct kobject *kobj,
return show_trace_dev_match(buf, PAGE_SIZE);
}
-static ssize_t
-pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr,
- const char *buf, size_t n)
-{
- return -EINVAL;
-}
-
-power_attr(pm_trace_dev_match);
+power_attr_ro(pm_trace_dev_match);
#endif /* CONFIG_PM_TRACE */
diff --git a/kernel/power/power.h b/kernel/power/power.h
index caadb566e82b..efe1b3b17c88 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -77,6 +77,15 @@ static struct kobj_attribute _name##_attr = { \
.store = _name##_store, \
}
+#define power_attr_ro(_name) \
+static struct kobj_attribute _name##_attr = { \
+ .attr = { \
+ .name = __stringify(_name), \
+ .mode = S_IRUGO, \
+ }, \
+ .show = _name##_show, \
+}
+
/* Preferred image size in bytes (default 500 MB) */
extern unsigned long image_size;
/* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 2ce8826f1053..e79439134978 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -48,6 +48,7 @@
#include <linux/uio.h>
#include <asm/uaccess.h>
+#include <asm-generic/sections.h>
#define CREATE_TRACE_POINTS
#include <trace/events/printk.h>
@@ -1660,7 +1661,7 @@ asmlinkage int vprintk_emit(int facility, int level,
const char *dict, size_t dictlen,
const char *fmt, va_list args)
{
- static int recursion_bug;
+ static bool recursion_bug;
static char textbuf[LOG_LINE_MAX];
char *text = textbuf;
size_t text_len = 0;
@@ -1696,7 +1697,7 @@ asmlinkage int vprintk_emit(int facility, int level,
* it can be printed at the next appropriate moment:
*/
if (!oops_in_progress && !lockdep_recursing(current)) {
- recursion_bug = 1;
+ recursion_bug = true;
local_irq_restore(flags);
return 0;
}
@@ -1711,7 +1712,7 @@ asmlinkage int vprintk_emit(int facility, int level,
static const char recursion_msg[] =
"BUG: recent printk recursion!";
- recursion_bug = 0;
+ recursion_bug = false;
/* emit KERN_CRIT message */
printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
NULL, 0, recursion_msg,
@@ -2233,13 +2234,24 @@ void console_unlock(void)
static u64 seen_seq;
unsigned long flags;
bool wake_klogd = false;
- bool retry;
+ bool do_cond_resched, retry;
if (console_suspended) {
up_console_sem();
return;
}
+ /*
+ * Console drivers are called under logbuf_lock, so
+ * @console_may_schedule should be cleared before; however, we may
+ * end up dumping a lot of lines, for example, if called from
+ * console registration path, and should invoke cond_resched()
+ * between lines if allowable. Not doing so can cause a very long
+ * scheduling stall on a slow console leading to RCU stall and
+ * softlockup warnings which exacerbate the issue with more
+ * messages practically incapacitating the system.
+ */
+ do_cond_resched = console_may_schedule;
console_may_schedule = 0;
/* flush buffered message fragment immediately to console */
@@ -2311,6 +2323,9 @@ skip:
call_console_drivers(level, ext_text, ext_len, text, len);
start_critical_timings();
local_irq_restore(flags);
+
+ if (do_cond_resched)
+ cond_resched();
}
console_locked = 0;
@@ -2378,6 +2393,25 @@ void console_unblank(void)
console_unlock();
}
+/**
+ * console_flush_on_panic - flush console content on panic
+ *
+ * Immediately output all pending messages no matter what.
+ */
+void console_flush_on_panic(void)
+{
+ /*
+ * If someone else is holding the console lock, trylock will fail
+ * and may_schedule may be set. Ignore and proceed to unlock so
+ * that messages are flushed out. As this can be called from any
+ * context and we don't want to get preempted while flushing,
+ * ensure may_schedule is cleared.
+ */
+ console_trylock();
+ console_may_schedule = 0;
+ console_unlock();
+}
+
/*
* Return the console tty driver structure and its associated index
*/
@@ -2658,13 +2692,36 @@ int unregister_console(struct console *console)
}
EXPORT_SYMBOL(unregister_console);
+/*
+ * Some boot consoles access data that is in the init section and which will
+ * be discarded after the initcalls have been run. To make sure that no code
+ * will access this data, unregister the boot consoles in a late initcall.
+ *
+ * If for some reason, such as deferred probe or the driver being a loadable
+ * module, the real console hasn't registered yet at this point, there will
+ * be a brief interval in which no messages are logged to the console, which
+ * makes it difficult to diagnose problems that occur during this time.
+ *
+ * To mitigate this problem somewhat, only unregister consoles whose memory
+ * intersects with the init section. Note that code exists elsewhere to get
+ * rid of the boot console as soon as the proper console shows up, so there
+ * won't be side-effects from postponing the removal.
+ */
static int __init printk_late_init(void)
{
struct console *con;
for_each_console(con) {
if (!keep_bootcon && con->flags & CON_BOOT) {
- unregister_console(con);
+ /*
+ * Make sure to unregister boot consoles whose data
+ * resides in the init section before the init section
+ * is discarded. Boot consoles whose data will stick
+ * around will automatically be unregistered when the
+ * proper console replaces them.
+ */
+ if (init_section_intersects(con, sizeof(*con)))
+ unregister_console(con);
}
}
hotcpu_notifier(console_cpu_notify, 0);
diff --git a/kernel/resource.c b/kernel/resource.c
index f150dbbe6f62..09c0597840b0 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1498,8 +1498,15 @@ int iomem_is_exclusive(u64 addr)
break;
if (p->end < addr)
continue;
- if (p->flags & IORESOURCE_BUSY &&
- p->flags & IORESOURCE_EXCLUSIVE) {
+ /*
+ * A resource is exclusive if IORESOURCE_EXCLUSIVE is set
+ * or CONFIG_IO_STRICT_DEVMEM is enabled and the
+ * resource is busy.
+ */
+ if ((p->flags & IORESOURCE_BUSY) == 0)
+ continue;
+ if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM)
+ || p->flags & IORESOURCE_EXCLUSIVE) {
err = 1;
break;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 77d97a6fc715..44253adb3c36 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8342,7 +8342,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
sched_offline_group(tg);
}
-static void cpu_cgroup_fork(struct task_struct *task, void *private)
+static void cpu_cgroup_fork(struct task_struct *task)
{
sched_move_task(task);
}
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 4a2ef5a02fd3..2489140a7c51 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -219,6 +219,7 @@ static void cpu_idle_loop(void)
*/
__current_set_polling();
+ quiet_vmstat();
tick_nohz_idle_enter();
while (!need_resched()) {
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index edb6de4f5908..a467e6c28a3b 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -529,8 +529,6 @@ static int __init cpu_stop_init(void)
}
early_initcall(cpu_stop_init);
-#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU)
-
static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
{
struct multi_stop_data msdata = {
@@ -628,5 +626,3 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
mutex_unlock(&stop_cpus_mutex);
return ret ?: done.ret;
}
-
-#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5faf89ac9ec0..c810f8afdb7f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1568,6 +1568,28 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
+ {
+ .procname = "mmap_rnd_bits",
+ .data = &mmap_rnd_bits,
+ .maxlen = sizeof(mmap_rnd_bits),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&mmap_rnd_bits_min,
+ .extra2 = (void *)&mmap_rnd_bits_max,
+ },
+#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+ {
+ .procname = "mmap_rnd_compat_bits",
+ .data = &mmap_rnd_compat_bits,
+ .maxlen = sizeof(mmap_rnd_compat_bits),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&mmap_rnd_compat_bits_min,
+ .extra2 = (void *)&mmap_rnd_compat_bits_max,
+ },
+#endif
{ }
};
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 4228fd3682c3..45dd798bcd37 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -316,7 +316,7 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type
return true;
}
-static struct bpf_verifier_ops kprobe_prog_ops = {
+static const struct bpf_verifier_ops kprobe_prog_ops = {
.get_func_proto = kprobe_prog_func_proto,
.is_valid_access = kprobe_prog_is_valid_access,
};
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3f743b147247..eca592f977b2 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -62,8 +62,6 @@
#define FTRACE_HASH_DEFAULT_BITS 10
#define FTRACE_HASH_MAX_BITS 12
-#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL)
-
#ifdef CONFIG_DYNAMIC_FTRACE
#define INIT_OPS_HASH(opsname) \
.func_hash = &opsname.local_hash, \
@@ -113,14 +111,9 @@ static int ftrace_disabled __read_mostly;
static DEFINE_MUTEX(ftrace_lock);
-static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
static struct ftrace_ops global_ops;
-static struct ftrace_ops control_ops;
-
-static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *regs);
#if ARCH_SUPPORTS_FTRACE_OPS
static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
@@ -203,7 +196,7 @@ void clear_ftrace_function(void)
ftrace_trace_function = ftrace_stub;
}
-static void control_ops_disable_all(struct ftrace_ops *ops)
+static void per_cpu_ops_disable_all(struct ftrace_ops *ops)
{
int cpu;
@@ -211,16 +204,19 @@ static void control_ops_disable_all(struct ftrace_ops *ops)
*per_cpu_ptr(ops->disabled, cpu) = 1;
}
-static int control_ops_alloc(struct ftrace_ops *ops)
+static int per_cpu_ops_alloc(struct ftrace_ops *ops)
{
int __percpu *disabled;
+ if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU)))
+ return -EINVAL;
+
disabled = alloc_percpu(int);
if (!disabled)
return -ENOMEM;
ops->disabled = disabled;
- control_ops_disable_all(ops);
+ per_cpu_ops_disable_all(ops);
return 0;
}
@@ -256,10 +252,11 @@ static inline void update_function_graph_func(void) { }
static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops)
{
/*
- * If this is a dynamic ops or we force list func,
+ * If this is a dynamic, RCU, or per CPU ops, or we force list func,
* then it needs to call the list anyway.
*/
- if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC)
+ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU |
+ FTRACE_OPS_FL_RCU) || FTRACE_FORCE_LIST_FUNC)
return ftrace_ops_list_func;
return ftrace_ops_get_func(ops);
@@ -383,26 +380,6 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
return 0;
}
-static void add_ftrace_list_ops(struct ftrace_ops **list,
- struct ftrace_ops *main_ops,
- struct ftrace_ops *ops)
-{
- int first = *list == &ftrace_list_end;
- add_ftrace_ops(list, ops);
- if (first)
- add_ftrace_ops(&ftrace_ops_list, main_ops);
-}
-
-static int remove_ftrace_list_ops(struct ftrace_ops **list,
- struct ftrace_ops *main_ops,
- struct ftrace_ops *ops)
-{
- int ret = remove_ftrace_ops(list, ops);
- if (!ret && *list == &ftrace_list_end)
- ret = remove_ftrace_ops(&ftrace_ops_list, main_ops);
- return ret;
-}
-
static void ftrace_update_trampoline(struct ftrace_ops *ops);
static int __register_ftrace_function(struct ftrace_ops *ops)
@@ -430,14 +407,12 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
if (!core_kernel_data((unsigned long)ops))
ops->flags |= FTRACE_OPS_FL_DYNAMIC;
- if (ops->flags & FTRACE_OPS_FL_CONTROL) {
- if (control_ops_alloc(ops))
+ if (ops->flags & FTRACE_OPS_FL_PER_CPU) {
+ if (per_cpu_ops_alloc(ops))
return -ENOMEM;
- add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
- /* The control_ops needs the trampoline update */
- ops = &control_ops;
- } else
- add_ftrace_ops(&ftrace_ops_list, ops);
+ }
+
+ add_ftrace_ops(&ftrace_ops_list, ops);
/* Always save the function, and reset at unregistering */
ops->saved_func = ops->func;
@@ -460,11 +435,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
return -EBUSY;
- if (ops->flags & FTRACE_OPS_FL_CONTROL) {
- ret = remove_ftrace_list_ops(&ftrace_control_list,
- &control_ops, ops);
- } else
- ret = remove_ftrace_ops(&ftrace_ops_list, ops);
+ ret = remove_ftrace_ops(&ftrace_ops_list, ops);
if (ret < 0)
return ret;
@@ -1687,6 +1658,9 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
int in_hash = 0;
int match = 0;
+ if (rec->flags & FTRACE_FL_DISABLED)
+ continue;
+
if (all) {
/*
* Only the filter_hash affects all records.
@@ -1940,7 +1914,7 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash);
}
-static void print_ip_ins(const char *fmt, unsigned char *p)
+static void print_ip_ins(const char *fmt, const unsigned char *p)
{
int i;
@@ -1952,6 +1926,31 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
static struct ftrace_ops *
ftrace_find_tramp_ops_any(struct dyn_ftrace *rec);
+static struct ftrace_ops *
+ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops);
+
+enum ftrace_bug_type ftrace_bug_type;
+const void *ftrace_expected;
+
+static void print_bug_type(void)
+{
+ switch (ftrace_bug_type) {
+ case FTRACE_BUG_UNKNOWN:
+ break;
+ case FTRACE_BUG_INIT:
+ pr_info("Initializing ftrace call sites\n");
+ break;
+ case FTRACE_BUG_NOP:
+ pr_info("Setting ftrace call site to NOP\n");
+ break;
+ case FTRACE_BUG_CALL:
+ pr_info("Setting ftrace call site to call ftrace function\n");
+ break;
+ case FTRACE_BUG_UPDATE:
+ pr_info("Updating ftrace call site to call a different ftrace function\n");
+ break;
+ }
+}
/**
* ftrace_bug - report and shutdown function tracer
@@ -1979,8 +1978,12 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
FTRACE_WARN_ON_ONCE(1);
pr_info("ftrace failed to modify ");
print_ip_sym(ip);
- print_ip_ins(" actual: ", (unsigned char *)ip);
+ print_ip_ins(" actual: ", (unsigned char *)ip);
pr_cont("\n");
+ if (ftrace_expected) {
+ print_ip_ins(" expected: ", ftrace_expected);
+ pr_cont("\n");
+ }
break;
case -EPERM:
FTRACE_WARN_ON_ONCE(1);
@@ -1992,6 +1995,7 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
pr_info("ftrace faulted on unknown error ");
print_ip_sym(ip);
}
+ print_bug_type();
if (rec) {
struct ftrace_ops *ops = NULL;
@@ -2000,15 +2004,19 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
rec->flags & FTRACE_FL_REGS ? " R" : " ");
if (rec->flags & FTRACE_FL_TRAMP_EN) {
ops = ftrace_find_tramp_ops_any(rec);
- if (ops)
- pr_cont("\ttramp: %pS",
- (void *)ops->trampoline);
- else
+ if (ops) {
+ do {
+ pr_cont("\ttramp: %pS (%pS)",
+ (void *)ops->trampoline,
+ (void *)ops->func);
+ ops = ftrace_find_tramp_ops_next(rec, ops);
+ } while (ops);
+ } else
pr_cont("\ttramp: ERROR!");
}
ip = ftrace_get_addr_curr(rec);
- pr_cont(" expected tramp: %lx\n", ip);
+ pr_cont("\n expected tramp: %lx\n", ip);
}
}
@@ -2016,6 +2024,11 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
{
unsigned long flag = 0UL;
+ ftrace_bug_type = FTRACE_BUG_UNKNOWN;
+
+ if (rec->flags & FTRACE_FL_DISABLED)
+ return FTRACE_UPDATE_IGNORE;
+
/*
* If we are updating calls:
*
@@ -2077,9 +2090,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
* from the save regs, to a non-save regs function or
* vice versa, or from a trampoline call.
*/
- if (flag & FTRACE_FL_ENABLED)
+ if (flag & FTRACE_FL_ENABLED) {
+ ftrace_bug_type = FTRACE_BUG_CALL;
return FTRACE_UPDATE_MAKE_CALL;
+ }
+ ftrace_bug_type = FTRACE_BUG_UPDATE;
return FTRACE_UPDATE_MODIFY_CALL;
}
@@ -2096,6 +2112,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
FTRACE_FL_REGS_EN);
}
+ ftrace_bug_type = FTRACE_BUG_NOP;
return FTRACE_UPDATE_MAKE_NOP;
}
@@ -2145,6 +2162,24 @@ ftrace_find_tramp_ops_any(struct dyn_ftrace *rec)
}
static struct ftrace_ops *
+ftrace_find_tramp_ops_next(struct dyn_ftrace *rec,
+ struct ftrace_ops *op)
+{
+ unsigned long ip = rec->ip;
+
+ while_for_each_ftrace_op(op) {
+
+ if (!op->trampoline)
+ continue;
+
+ if (hash_contains_ip(ip, op->func_hash))
+ return op;
+ }
+
+ return NULL;
+}
+
+static struct ftrace_ops *
ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec)
{
struct ftrace_ops *op;
@@ -2307,17 +2342,22 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
ret = ftrace_update_record(rec, enable);
+ ftrace_bug_type = FTRACE_BUG_UNKNOWN;
+
switch (ret) {
case FTRACE_UPDATE_IGNORE:
return 0;
case FTRACE_UPDATE_MAKE_CALL:
+ ftrace_bug_type = FTRACE_BUG_CALL;
return ftrace_make_call(rec, ftrace_addr);
case FTRACE_UPDATE_MAKE_NOP:
+ ftrace_bug_type = FTRACE_BUG_NOP;
return ftrace_make_nop(NULL, rec, ftrace_old_addr);
case FTRACE_UPDATE_MODIFY_CALL:
+ ftrace_bug_type = FTRACE_BUG_UPDATE;
return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
}
@@ -2425,6 +2465,7 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
if (ret) {
+ ftrace_bug_type = FTRACE_BUG_INIT;
ftrace_bug(ret, rec);
return 0;
}
@@ -2566,7 +2607,7 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
{
}
-static void control_ops_free(struct ftrace_ops *ops)
+static void per_cpu_ops_free(struct ftrace_ops *ops)
{
free_percpu(ops->disabled);
}
@@ -2667,13 +2708,13 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
if (!command || !ftrace_enabled) {
/*
- * If these are control ops, they still need their
+ * If these are per_cpu ops, they still need their
* per_cpu field freed. Since, function tracing is
* not currently active, we can just free them
* without synchronizing all CPUs.
*/
- if (ops->flags & FTRACE_OPS_FL_CONTROL)
- control_ops_free(ops);
+ if (ops->flags & FTRACE_OPS_FL_PER_CPU)
+ per_cpu_ops_free(ops);
return 0;
}
@@ -2714,7 +2755,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
/*
* Dynamic ops may be freed, we must make sure that all
* callers are done before leaving this function.
- * The same goes for freeing the per_cpu data of the control
+ * The same goes for freeing the per_cpu data of the per_cpu
* ops.
*
* Again, normal synchronize_sched() is not good enough.
@@ -2725,13 +2766,13 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
* infrastructure to do the synchronization, thus we must do it
* ourselves.
*/
- if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) {
+ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) {
schedule_on_each_cpu(ftrace_sync);
arch_ftrace_trampoline_free(ops);
- if (ops->flags & FTRACE_OPS_FL_CONTROL)
- control_ops_free(ops);
+ if (ops->flags & FTRACE_OPS_FL_PER_CPU)
+ per_cpu_ops_free(ops);
}
return 0;
@@ -2798,9 +2839,9 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
return 0;
- /* If ops traces all mods, we already accounted for it */
+ /* If ops traces all then it includes this function */
if (ops_traces_mod(ops))
- return 0;
+ return 1;
/* The function must be in the filter */
if (!ftrace_hash_empty(ops->func_hash->filter_hash) &&
@@ -2814,64 +2855,41 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
return 1;
}
-static int referenced_filters(struct dyn_ftrace *rec)
-{
- struct ftrace_ops *ops;
- int cnt = 0;
-
- for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
- if (ops_references_rec(ops, rec))
- cnt++;
- }
-
- return cnt;
-}
-
static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
{
struct ftrace_page *pg;
struct dyn_ftrace *p;
cycle_t start, stop;
unsigned long update_cnt = 0;
- unsigned long ref = 0;
- bool test = false;
+ unsigned long rec_flags = 0;
int i;
+ start = ftrace_now(raw_smp_processor_id());
+
/*
- * When adding a module, we need to check if tracers are
- * currently enabled and if they are set to trace all functions.
- * If they are, we need to enable the module functions as well
- * as update the reference counts for those function records.
+ * When a module is loaded, this function is called to convert
+ * the calls to mcount in its text to nops, and also to create
+ * an entry in the ftrace data. Now, if ftrace is activated
+ * after this call, but before the module sets its text to
+ * read-only, the modification of enabling ftrace can fail if
+ * the read-only is done while ftrace is converting the calls.
+ * To prevent this, the module's records are set as disabled
+ * and will be enabled after the call to set the module's text
+ * to read-only.
*/
- if (mod) {
- struct ftrace_ops *ops;
-
- for (ops = ftrace_ops_list;
- ops != &ftrace_list_end; ops = ops->next) {
- if (ops->flags & FTRACE_OPS_FL_ENABLED) {
- if (ops_traces_mod(ops))
- ref++;
- else
- test = true;
- }
- }
- }
-
- start = ftrace_now(raw_smp_processor_id());
+ if (mod)
+ rec_flags |= FTRACE_FL_DISABLED;
for (pg = new_pgs; pg; pg = pg->next) {
for (i = 0; i < pg->index; i++) {
- int cnt = ref;
/* If something went wrong, bail without enabling anything */
if (unlikely(ftrace_disabled))
return -1;
p = &pg->records[i];
- if (test)
- cnt += referenced_filters(p);
- p->flags = cnt;
+ p->flags = rec_flags;
/*
* Do the initial record conversion from mcount jump
@@ -2881,21 +2899,6 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
break;
update_cnt++;
-
- /*
- * If the tracing is enabled, go ahead and enable the record.
- *
- * The reason not to enable the record immediatelly is the
- * inherent check of ftrace_make_nop/ftrace_make_call for
- * correct previous instructions. Making first the NOP
- * conversion puts the module to the correct state, thus
- * passing the ftrace_make_call check.
- */
- if (ftrace_start_up && cnt) {
- int failed = __ftrace_replace_code(p, 1);
- if (failed)
- ftrace_bug(failed, p);
- }
}
}
@@ -3258,7 +3261,7 @@ static int t_show(struct seq_file *m, void *v)
seq_printf(m, "%ps", (void *)rec->ip);
if (iter->flags & FTRACE_ITER_ENABLED) {
- struct ftrace_ops *ops = NULL;
+ struct ftrace_ops *ops;
seq_printf(m, " (%ld)%s%s",
ftrace_rec_count(rec),
@@ -3266,14 +3269,19 @@ static int t_show(struct seq_file *m, void *v)
rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ");
if (rec->flags & FTRACE_FL_TRAMP_EN) {
ops = ftrace_find_tramp_ops_any(rec);
- if (ops)
- seq_printf(m, "\ttramp: %pS",
- (void *)ops->trampoline);
- else
+ if (ops) {
+ do {
+ seq_printf(m, "\ttramp: %pS (%pS)",
+ (void *)ops->trampoline,
+ (void *)ops->func);
+ add_trampoline_func(m, ops, rec);
+ ops = ftrace_find_tramp_ops_next(rec, ops);
+ } while (ops);
+ } else
seq_puts(m, "\ttramp: ERROR!");
-
+ } else {
+ add_trampoline_func(m, NULL, rec);
}
- add_trampoline_func(m, ops, rec);
}
seq_putc(m, '\n');
@@ -4898,6 +4906,19 @@ static int ftrace_process_locs(struct module *mod,
#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
+static int referenced_filters(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *ops;
+ int cnt = 0;
+
+ for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
+ if (ops_references_rec(ops, rec))
+ cnt++;
+ }
+
+ return cnt;
+}
+
void ftrace_release_mod(struct module *mod)
{
struct dyn_ftrace *rec;
@@ -4940,41 +4961,112 @@ void ftrace_release_mod(struct module *mod)
mutex_unlock(&ftrace_lock);
}
-static void ftrace_init_module(struct module *mod,
- unsigned long *start, unsigned long *end)
+static void ftrace_module_enable(struct module *mod)
{
- if (ftrace_disabled || start == end)
- return;
- ftrace_process_locs(mod, start, end);
+ struct dyn_ftrace *rec;
+ struct ftrace_page *pg;
+
+ mutex_lock(&ftrace_lock);
+
+ if (ftrace_disabled)
+ goto out_unlock;
+
+ /*
+ * If the tracing is enabled, go ahead and enable the record.
+ *
+ * The reason not to enable the record immediatelly is the
+ * inherent check of ftrace_make_nop/ftrace_make_call for
+ * correct previous instructions. Making first the NOP
+ * conversion puts the module to the correct state, thus
+ * passing the ftrace_make_call check.
+ *
+ * We also delay this to after the module code already set the
+ * text to read-only, as we now need to set it back to read-write
+ * so that we can modify the text.
+ */
+ if (ftrace_start_up)
+ ftrace_arch_code_modify_prepare();
+
+ do_for_each_ftrace_rec(pg, rec) {
+ int cnt;
+ /*
+ * do_for_each_ftrace_rec() is a double loop.
+ * module text shares the pg. If a record is
+ * not part of this module, then skip this pg,
+ * which the "break" will do.
+ */
+ if (!within_module_core(rec->ip, mod))
+ break;
+
+ cnt = 0;
+
+ /*
+ * When adding a module, we need to check if tracers are
+ * currently enabled and if they are, and can trace this record,
+ * we need to enable the module functions as well as update the
+ * reference counts for those function records.
+ */
+ if (ftrace_start_up)
+ cnt += referenced_filters(rec);
+
+ /* This clears FTRACE_FL_DISABLED */
+ rec->flags = cnt;
+
+ if (ftrace_start_up && cnt) {
+ int failed = __ftrace_replace_code(rec, 1);
+ if (failed) {
+ ftrace_bug(failed, rec);
+ goto out_loop;
+ }
+ }
+
+ } while_for_each_ftrace_rec();
+
+ out_loop:
+ if (ftrace_start_up)
+ ftrace_arch_code_modify_post_process();
+
+ out_unlock:
+ mutex_unlock(&ftrace_lock);
}
void ftrace_module_init(struct module *mod)
{
- ftrace_init_module(mod, mod->ftrace_callsites,
- mod->ftrace_callsites +
- mod->num_ftrace_callsites);
+ if (ftrace_disabled || !mod->num_ftrace_callsites)
+ return;
+
+ ftrace_process_locs(mod, mod->ftrace_callsites,
+ mod->ftrace_callsites + mod->num_ftrace_callsites);
}
-static int ftrace_module_notify_exit(struct notifier_block *self,
- unsigned long val, void *data)
+static int ftrace_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
{
struct module *mod = data;
- if (val == MODULE_STATE_GOING)
+ switch (val) {
+ case MODULE_STATE_COMING:
+ ftrace_module_enable(mod);
+ break;
+ case MODULE_STATE_GOING:
ftrace_release_mod(mod);
+ break;
+ default:
+ break;
+ }
return 0;
}
#else
-static int ftrace_module_notify_exit(struct notifier_block *self,
- unsigned long val, void *data)
+static int ftrace_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
{
return 0;
}
#endif /* CONFIG_MODULES */
-struct notifier_block ftrace_module_exit_nb = {
- .notifier_call = ftrace_module_notify_exit,
+struct notifier_block ftrace_module_nb = {
+ .notifier_call = ftrace_module_notify,
.priority = INT_MIN, /* Run after anything that can remove kprobes */
};
@@ -5006,7 +5098,7 @@ void __init ftrace_init(void)
__start_mcount_loc,
__stop_mcount_loc);
- ret = register_module_notifier(&ftrace_module_exit_nb);
+ ret = register_module_notifier(&ftrace_module_nb);
if (ret)
pr_warning("Failed to register trace ftrace module exit notifier\n");
@@ -5116,44 +5208,6 @@ void ftrace_reset_array_ops(struct trace_array *tr)
tr->ops->func = ftrace_stub;
}
-static void
-ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *regs)
-{
- if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT)))
- return;
-
- /*
- * Some of the ops may be dynamically allocated,
- * they must be freed after a synchronize_sched().
- */
- preempt_disable_notrace();
- trace_recursion_set(TRACE_CONTROL_BIT);
-
- /*
- * Control funcs (perf) uses RCU. Only trace if
- * RCU is currently active.
- */
- if (!rcu_is_watching())
- goto out;
-
- do_for_each_ftrace_op(op, ftrace_control_list) {
- if (!(op->flags & FTRACE_OPS_FL_STUB) &&
- !ftrace_function_local_disabled(op) &&
- ftrace_ops_test(op, ip, regs))
- op->func(ip, parent_ip, op, regs);
- } while_for_each_ftrace_op(op);
- out:
- trace_recursion_clear(TRACE_CONTROL_BIT);
- preempt_enable_notrace();
-}
-
-static struct ftrace_ops control_ops = {
- .func = ftrace_ops_control_func,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
- INIT_OPS_HASH(control_ops)
-};
-
static inline void
__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ignored, struct pt_regs *regs)
@@ -5170,8 +5224,22 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
* they must be freed after a synchronize_sched().
*/
preempt_disable_notrace();
+
do_for_each_ftrace_op(op, ftrace_ops_list) {
- if (ftrace_ops_test(op, ip, regs)) {
+ /*
+ * Check the following for each ops before calling their func:
+ * if RCU flag is set, then rcu_is_watching() must be true
+ * if PER_CPU is set, then ftrace_function_local_disable()
+ * must be false
+ * Otherwise test if the ip matches the ops filter
+ *
+ * If any of the above fails then the op->func() is not executed.
+ */
+ if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) &&
+ (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
+ !ftrace_function_local_disabled(op)) &&
+ ftrace_ops_test(op, ip, regs)) {
+
if (FTRACE_WARN_ON(!op->func)) {
pr_warn("op=%p %pS\n", op, op);
goto out;
@@ -5195,7 +5263,7 @@ out:
* being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS.
* Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved.
* An architecture can pass partial regs with ftrace_ops and still
- * set the ARCH_SUPPORT_FTARCE_OPS.
+ * set the ARCH_SUPPORTS_FTRACE_OPS.
*/
#if ARCH_SUPPORTS_FTRACE_OPS
static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
@@ -5212,20 +5280,29 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
/*
* If there's only one function registered but it does not support
- * recursion, this function will be called by the mcount trampoline.
- * This function will handle recursion protection.
+ * recursion, needs RCU protection and/or requires per cpu handling, then
+ * this function will be called by the mcount trampoline.
*/
-static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
+static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs)
{
int bit;
+ if ((op->flags & FTRACE_OPS_FL_RCU) && !rcu_is_watching())
+ return;
+
bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
if (bit < 0)
return;
- op->func(ip, parent_ip, op, regs);
+ preempt_disable_notrace();
+ if (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
+ !ftrace_function_local_disabled(op)) {
+ op->func(ip, parent_ip, op, regs);
+ }
+
+ preempt_enable_notrace();
trace_clear_recursion(bit);
}
@@ -5243,12 +5320,12 @@ static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
{
/*
- * If the func handles its own recursion, call it directly.
- * Otherwise call the recursion protected function that
- * will call the ftrace ops function.
+ * If the function does not handle recursion, needs to be RCU safe,
+ * or does per cpu logic, then we need to call the assist handler.
*/
- if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE))
- return ftrace_ops_recurs_func;
+ if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) ||
+ ops->flags & (FTRACE_OPS_FL_RCU | FTRACE_OPS_FL_PER_CPU))
+ return ftrace_ops_assist_func;
return ops->func;
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9c6045a27ba3..95181e36891a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1001,17 +1001,13 @@ static int rb_head_page_replace(struct buffer_page *old,
/*
* rb_tail_page_update - move the tail page forward
- *
- * Returns 1 if moved tail page, 0 if someone else did.
*/
-static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
+static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *tail_page,
struct buffer_page *next_page)
{
- struct buffer_page *old_tail;
unsigned long old_entries;
unsigned long old_write;
- int ret = 0;
/*
* The tail page now needs to be moved forward.
@@ -1036,7 +1032,7 @@ static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
* it is, then it is up to us to update the tail
* pointer.
*/
- if (tail_page == cpu_buffer->tail_page) {
+ if (tail_page == READ_ONCE(cpu_buffer->tail_page)) {
/* Zero the write counter */
unsigned long val = old_write & ~RB_WRITE_MASK;
unsigned long eval = old_entries & ~RB_WRITE_MASK;
@@ -1061,14 +1057,9 @@ static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
*/
local_set(&next_page->page->commit, 0);
- old_tail = cmpxchg(&cpu_buffer->tail_page,
- tail_page, next_page);
-
- if (old_tail == tail_page)
- ret = 1;
+ /* Again, either we update tail_page or an interrupt does */
+ (void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page);
}
-
- return ret;
}
static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2036,12 +2027,15 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
* the tail page would have moved.
*/
if (ret == RB_PAGE_NORMAL) {
+ struct buffer_page *buffer_tail_page;
+
+ buffer_tail_page = READ_ONCE(cpu_buffer->tail_page);
/*
* If the tail had moved passed next, then we need
* to reset the pointer.
*/
- if (cpu_buffer->tail_page != tail_page &&
- cpu_buffer->tail_page != next_page)
+ if (buffer_tail_page != tail_page &&
+ buffer_tail_page != next_page)
rb_head_page_set_normal(cpu_buffer, new_head,
next_page,
RB_PAGE_HEAD);
@@ -2135,6 +2129,8 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
local_sub(length, &tail_page->write);
}
+static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer);
+
/*
* This is the slow path, force gcc not to inline it.
*/
@@ -2147,7 +2143,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer *buffer = cpu_buffer->buffer;
struct buffer_page *next_page;
int ret;
- u64 ts;
next_page = tail_page;
@@ -2221,20 +2216,17 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
}
}
- ret = rb_tail_page_update(cpu_buffer, tail_page, next_page);
- if (ret) {
- /*
- * Nested commits always have zero deltas, so
- * just reread the time stamp
- */
- ts = rb_time_stamp(buffer);
- next_page->page->time_stamp = ts;
- }
+ rb_tail_page_update(cpu_buffer, tail_page, next_page);
out_again:
rb_reset_tail(cpu_buffer, tail, info);
+ /* Commit what we have for now. */
+ rb_end_commit(cpu_buffer);
+ /* rb_end_commit() decs committing */
+ local_inc(&cpu_buffer->committing);
+
/* fail and let the caller try again */
return ERR_PTR(-EAGAIN);
@@ -2362,7 +2354,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
addr = (unsigned long)event;
addr &= PAGE_MASK;
- bpage = cpu_buffer->tail_page;
+ bpage = READ_ONCE(cpu_buffer->tail_page);
if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
unsigned long write_mask =
@@ -2410,7 +2402,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
again:
max_count = cpu_buffer->nr_pages * 100;
- while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
+ while (cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page)) {
if (RB_WARN_ON(cpu_buffer, !(--max_count)))
return;
if (RB_WARN_ON(cpu_buffer,
@@ -2419,8 +2411,10 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
- cpu_buffer->write_stamp =
- cpu_buffer->commit_page->page->time_stamp;
+ /* Only update the write stamp if the page has an event */
+ if (rb_page_write(cpu_buffer->commit_page))
+ cpu_buffer->write_stamp =
+ cpu_buffer->commit_page->page->time_stamp;
/* add barrier to keep gcc from optimizing too much */
barrier();
}
@@ -2443,7 +2437,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
* and pushed the tail page forward, we will be left with
* a dangling commit that will never go forward.
*/
- if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
+ if (unlikely(cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page)))
goto again;
}
@@ -2699,7 +2693,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
if (unlikely(info->add_timestamp))
info->length += RB_LEN_TIME_EXTEND;
- tail_page = info->tail_page = cpu_buffer->tail_page;
+ /* Don't let the compiler play games with cpu_buffer->tail_page */
+ tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page);
write = local_add_return(info->length, &tail_page->write);
/* set write to only the index of the write */
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 919d9d07686f..8414fa40bf27 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -363,8 +363,8 @@ struct trace_option_dentry {
* @name: the name chosen to select it on the available_tracers file
* @init: called when one switches to this tracer (echo name > current_tracer)
* @reset: called when one switches to another tracer
- * @start: called when tracing is unpaused (echo 1 > tracing_enabled)
- * @stop: called when tracing is paused (echo 0 > tracing_enabled)
+ * @start: called when tracing is unpaused (echo 1 > tracing_on)
+ * @stop: called when tracing is paused (echo 0 > tracing_on)
* @update_thresh: called when tracing_thresh is updated
* @open: called when the trace file is opened
* @pipe_open: called when the trace_pipe file is opened
@@ -467,8 +467,6 @@ enum {
TRACE_INTERNAL_IRQ_BIT,
TRACE_INTERNAL_SIRQ_BIT,
- TRACE_CONTROL_BIT,
-
TRACE_BRANCH_BIT,
/*
* Abuse of the trace_recursion.
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index cc9f7a9319be..00df25fd86ef 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -334,7 +334,7 @@ static int perf_ftrace_function_register(struct perf_event *event)
{
struct ftrace_ops *ops = &event->ftrace_ops;
- ops->flags |= FTRACE_OPS_FL_CONTROL;
+ ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU;
ops->func = perf_ftrace_function_call;
return register_ftrace_function(ops);
}
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 4b5e8ed68d77..b38f617b6181 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -538,11 +538,12 @@ static int register_trigger(char *glob, struct event_trigger_ops *ops,
list_add_rcu(&data->list, &file->triggers);
ret++;
+ update_cond_flag(file);
if (trace_event_trigger_enable_disable(file, 1) < 0) {
list_del_rcu(&data->list);
+ update_cond_flag(file);
ret--;
}
- update_cond_flag(file);
out:
return ret;
}
@@ -570,8 +571,8 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) {
unregistered = true;
list_del_rcu(&data->list);
- update_cond_flag(file);
trace_event_trigger_enable_disable(file, 0);
+ update_cond_flag(file);
break;
}
}
@@ -1314,11 +1315,12 @@ static int event_enable_register_trigger(char *glob,
list_add_rcu(&data->list, &file->triggers);
ret++;
+ update_cond_flag(file);
if (trace_event_trigger_enable_disable(file, 1) < 0) {
list_del_rcu(&data->list);
+ update_cond_flag(file);
ret--;
}
- update_cond_flag(file);
out:
return ret;
}
@@ -1339,8 +1341,8 @@ static void event_enable_unregister_trigger(char *glob,
(enable_data->file == test_enable_data->file)) {
unregistered = true;
list_del_rcu(&data->list);
- update_cond_flag(file);
trace_event_trigger_enable_disable(file, 0);
+ update_cond_flag(file);
break;
}
}
OpenPOWER on IntegriCloud