summary | refs | log | tree | commit | diff | stats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/Makefile 3
-rw-r--r-- kernel/acct.c 2
-rw-r--r-- kernel/audit.c 4
-rw-r--r-- kernel/audit.h 2
-rw-r--r-- kernel/auditsc.c 32
-rw-r--r-- kernel/bpf/devmap.c 2
-rw-r--r-- kernel/bpf/sockmap.c 2
-rw-r--r-- kernel/bpf/syscall.c 27
-rw-r--r-- kernel/bpf/verifier.c 16
-rw-r--r-- kernel/cgroup/cgroup.c 15
-rw-r--r-- kernel/cgroup/cpuset.c 16
-rw-r--r-- kernel/compat.c 23
-rw-r--r-- kernel/exit.c 4
-rw-r--r-- kernel/fork.c 10
-rw-r--r-- kernel/irq/irqdesc.c 24
-rw-r--r-- kernel/irq/msi.c 5
-rw-r--r-- kernel/kcov.c 1
-rw-r--r-- kernel/kmod.c 563
-rw-r--r-- kernel/locking/rtmutex-debug.c 2
-rw-r--r-- kernel/locking/rtmutex.c 35
-rw-r--r-- kernel/locking/rtmutex_common.h 12
-rw-r--r-- kernel/locking/test-ww_mutex.c 2
-rw-r--r-- kernel/memremap.c 60
-rw-r--r-- kernel/module.c 12
-rw-r--r-- kernel/pid_namespace.c 4
-rw-r--r-- kernel/power/process.c 5
-rw-r--r-- kernel/power/swap.c 5
-rw-r--r-- kernel/printk/printk.c 72
-rw-r--r-- kernel/ptrace.c 6
-rw-r--r-- kernel/rcu/tree.c 2
-rw-r--r-- kernel/rcu/tree_plugin.h 2
-rw-r--r-- kernel/sched/core.c 11
-rw-r--r-- kernel/sched/deadline.c 50
-rw-r--r-- kernel/sched/debug.c 7
-rw-r--r-- kernel/sched/fair.c 56
-rw-r--r-- kernel/sched/sched.h 11
-rw-r--r-- kernel/sched/topology.c 6
-rw-r--r-- kernel/sched/wait.c 85
-rw-r--r-- kernel/signal.c 72
-rw-r--r-- kernel/smp.c 2
-rw-r--r-- kernel/sys.c 8
-rw-r--r-- kernel/sysctl_binary.c 21
-rw-r--r-- kernel/time/timekeeping.c 2
-rw-r--r-- kernel/trace/blktrace.c 261
-rw-r--r-- kernel/trace/ftrace.c 68
-rw-r--r-- kernel/trace/trace.c 13
-rw-r--r-- kernel/trace/trace.h 1
-rw-r--r-- kernel/trace/trace_events.c 17
-rw-r--r-- kernel/trace/trace_events_filter.c 2
-rw-r--r-- kernel/trace/trace_functions_graph.c 2
-rw-r--r-- kernel/trace/trace_selftest.c 2
-rw-r--r-- kernel/umh.c 568
-rw-r--r-- kernel/user_namespace.c 20
53 files changed, 1293 insertions, 962 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 9c323a6daa46..ed470aac53da 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -5,12 +5,13 @@
obj-y = fork.o exec_domain.o panic.o \
cpu.o exit.o softirq.o resource.o \
sysctl.o sysctl_binary.o capability.o ptrace.o user.o \
- signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
+ signal.o sys.o umh.o workqueue.o pid.o task_work.o \
extable.o params.o \
kthread.o sys_ni.o nsproxy.o \
notifier.o ksysfs.o cred.o reboot.o \
async.o range.o smpboot.o ucount.o
+obj-$(CONFIG_MODULES) += kmod.o
obj-$(CONFIG_MULTIUSER) += groups.o
ifdef CONFIG_FUNCTION_TRACER
diff --git a/kernel/acct.c b/kernel/acct.c
index 5b1284370367..5e72af29ab73 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -516,7 +516,7 @@ static void do_acct_process(struct bsd_acct_struct *acct)
if (file_start_write_trylock(file)) {
/* it's been opened O_APPEND, so position is irrelevant */
loff_t pos = 0;
- __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos);
+ __kernel_write(file, &ac, sizeof(acct_t), &pos);
file_end_write(file);
}
out:
diff --git a/kernel/audit.c b/kernel/audit.c
index 6dd556931739..be1c28fd4d57 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1662,7 +1662,7 @@ static inline void audit_get_stamp(struct audit_context *ctx,
struct timespec64 *t, unsigned int *serial)
{
if (!ctx || !auditsc_get_stamp(ctx, t, serial)) {
- ktime_get_real_ts64(t);
+ *t = current_kernel_time64();
*serial = audit_serial();
}
}
@@ -1833,7 +1833,7 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
}
/**
- * audit_log_hex - convert a buffer to hex and append it to the audit skb
+ * audit_log_n_hex - convert a buffer to hex and append it to the audit skb
* @ab: the audit_buffer
* @buf: buffer to convert to hex
* @len: length of @buf to be converted
diff --git a/kernel/audit.h b/kernel/audit.h
index b331d9b83f63..9b110ae17ee3 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -182,7 +182,7 @@ struct audit_context {
mqd_t mqdes;
size_t msg_len;
unsigned int msg_prio;
- struct timespec abs_timeout;
+ struct timespec64 abs_timeout;
} mq_sendrecv;
struct {
int oflag;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3260ba2312a9..ecc23e25c9eb 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1235,11 +1235,11 @@ static void show_special(struct audit_context *context, int *call_panic)
case AUDIT_MQ_SENDRECV:
audit_log_format(ab,
"mqdes=%d msg_len=%zd msg_prio=%u "
- "abs_timeout_sec=%ld abs_timeout_nsec=%ld",
+ "abs_timeout_sec=%lld abs_timeout_nsec=%ld",
context->mq_sendrecv.mqdes,
context->mq_sendrecv.msg_len,
context->mq_sendrecv.msg_prio,
- context->mq_sendrecv.abs_timeout.tv_sec,
+ (long long) context->mq_sendrecv.abs_timeout.tv_sec,
context->mq_sendrecv.abs_timeout.tv_nsec);
break;
case AUDIT_MQ_NOTIFY:
@@ -1462,7 +1462,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
}
/**
- * audit_free - free a per-task audit context
+ * __audit_free - free a per-task audit context
* @tsk: task whose audit context block to free
*
* Called from copy_process and do_exit
@@ -1489,7 +1489,7 @@ void __audit_free(struct task_struct *tsk)
}
/**
- * audit_syscall_entry - fill in an audit record at syscall entry
+ * __audit_syscall_entry - fill in an audit record at syscall entry
* @major: major syscall type (function)
* @a1: additional syscall register 1
* @a2: additional syscall register 2
@@ -1536,14 +1536,14 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
return;
context->serial = 0;
- ktime_get_real_ts64(&context->ctime);
+ context->ctime = current_kernel_time64();
context->in_syscall = 1;
context->current_state = state;
context->ppid = 0;
}
/**
- * audit_syscall_exit - deallocate audit context after a system call
+ * __audit_syscall_exit - deallocate audit context after a system call
* @success: success value of the syscall
* @return_code: return value of the syscall
*
@@ -1705,7 +1705,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
}
/**
- * audit_reusename - fill out filename with info from existing entry
+ * __audit_reusename - fill out filename with info from existing entry
* @uptr: userland ptr to pathname
*
* Search the audit_names list for the current audit context. If there is an
@@ -1730,7 +1730,7 @@ __audit_reusename(const __user char *uptr)
}
/**
- * audit_getname - add a name to the list
+ * __audit_getname - add a name to the list
* @name: name to add
*
* Add a name to the list of audit names for this context.
@@ -2083,15 +2083,15 @@ void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
*
*/
void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
- const struct timespec *abs_timeout)
+ const struct timespec64 *abs_timeout)
{
struct audit_context *context = current->audit_context;
- struct timespec *p = &context->mq_sendrecv.abs_timeout;
+ struct timespec64 *p = &context->mq_sendrecv.abs_timeout;
if (abs_timeout)
- memcpy(p, abs_timeout, sizeof(struct timespec));
+ memcpy(p, abs_timeout, sizeof(*p));
else
- memset(p, 0, sizeof(struct timespec));
+ memset(p, 0, sizeof(*p));
context->mq_sendrecv.mqdes = mqdes;
context->mq_sendrecv.msg_len = msg_len;
@@ -2135,7 +2135,7 @@ void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
}
/**
- * audit_ipc_obj - record audit data for ipc object
+ * __audit_ipc_obj - record audit data for ipc object
* @ipcp: ipc permissions
*
*/
@@ -2151,7 +2151,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
}
/**
- * audit_ipc_set_perm - record audit data for new ipc permissions
+ * __audit_ipc_set_perm - record audit data for new ipc permissions
* @qbytes: msgq bytes
* @uid: msgq user id
* @gid: msgq group id
@@ -2180,7 +2180,7 @@ void __audit_bprm(struct linux_binprm *bprm)
/**
- * audit_socketcall - record audit data for sys_socketcall
+ * __audit_socketcall - record audit data for sys_socketcall
* @nargs: number of args, which should not be more than AUDITSC_ARGS.
* @args: args array
*
@@ -2211,7 +2211,7 @@ void __audit_fd_pair(int fd1, int fd2)
}
/**
- * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto
+ * __audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto
* @len: data length in user space
* @a: data address in kernel space
*
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index ecf9f99ecc57..959c9a07f318 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -159,7 +159,7 @@ static void dev_map_free(struct bpf_map *map)
unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);
while (!bitmap_empty(bitmap, dtab->map.max_entries))
- cpu_relax();
+ cond_resched();
}
for (i = 0; i < dtab->map.max_entries; i++) {
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index f6ffde9c6a68..6424ce0e4969 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -792,7 +792,7 @@ out_progs:
return err;
}
-int sock_map_attach_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
+int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
{
struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
struct bpf_prog *orig;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 70ad8e220343..cb17e1cd1d43 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1096,10 +1096,10 @@ static int bpf_obj_get(const union bpf_attr *attr)
#define BPF_PROG_ATTACH_LAST_FIELD attach_flags
-static int sockmap_get_from_fd(const union bpf_attr *attr)
+static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach)
{
+ struct bpf_prog *prog = NULL;
int ufd = attr->target_fd;
- struct bpf_prog *prog;
struct bpf_map *map;
struct fd f;
int err;
@@ -1109,16 +1109,20 @@ static int sockmap_get_from_fd(const union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- prog = bpf_prog_get_type(attr->attach_bpf_fd, BPF_PROG_TYPE_SK_SKB);
- if (IS_ERR(prog)) {
- fdput(f);
- return PTR_ERR(prog);
+ if (attach) {
+ prog = bpf_prog_get_type(attr->attach_bpf_fd,
+ BPF_PROG_TYPE_SK_SKB);
+ if (IS_ERR(prog)) {
+ fdput(f);
+ return PTR_ERR(prog);
+ }
}
- err = sock_map_attach_prog(map, prog, attr->attach_type);
+ err = sock_map_prog(map, prog, attr->attach_type);
if (err) {
fdput(f);
- bpf_prog_put(prog);
+ if (prog)
+ bpf_prog_put(prog);
return err;
}
@@ -1155,7 +1159,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
break;
case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT:
- return sockmap_get_from_fd(attr);
+ return sockmap_get_from_fd(attr, true);
default:
return -EINVAL;
}
@@ -1204,7 +1208,10 @@ static int bpf_prog_detach(const union bpf_attr *attr)
ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false);
cgroup_put(cgrp);
break;
-
+ case BPF_SK_SKB_STREAM_PARSER:
+ case BPF_SK_SKB_STREAM_VERDICT:
+ ret = sockmap_get_from_fd(attr, false);
+ break;
default:
return -EINVAL;
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d690c7dd1f1a..477b6932c3c1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4203,6 +4203,22 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
continue;
}
+ if (insn->imm == BPF_FUNC_redirect_map) {
+ u64 addr = (unsigned long)prog;
+ struct bpf_insn r4_ld[] = {
+ BPF_LD_IMM64(BPF_REG_4, addr),
+ *insn,
+ };
+ cnt = ARRAY_SIZE(r4_ld);
+
+ new_prog = bpf_patch_insn_data(env, i + delta, r4_ld, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ }
patch_call_imm:
fn = prog->aux->ops->get_func_proto(insn->imm);
/* all functions that have prototype and verifier allowed
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 4f2196a00953..d6551cd45238 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1879,7 +1879,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
&cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
root->kf_root = kernfs_create_root(kf_sops,
- KERNFS_ROOT_CREATE_DEACTIVATED,
+ KERNFS_ROOT_CREATE_DEACTIVATED |
+ KERNFS_ROOT_SUPPORT_EXPORTOP,
root_cgrp);
if (IS_ERR(root->kf_root)) {
ret = PTR_ERR(root->kf_root);
@@ -5256,6 +5257,18 @@ static int __init cgroup_wq_init(void)
}
core_initcall(cgroup_wq_init);
+void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
+ char *buf, size_t buflen)
+{
+ struct kernfs_node *kn;
+
+ kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id);
+ if (!kn)
+ return;
+ kernfs_path(kn, buf, buflen);
+ kernfs_put(kn);
+}
+
/*
* proc_cgroup_show()
* - Print task's cgroup paths into seq_file, one line for each hierarchy
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 67230ecf2ce1..4657e2924ecb 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2275,6 +2275,13 @@ retry:
mutex_unlock(&cpuset_mutex);
}
+static bool force_rebuild;
+
+void cpuset_force_rebuild(void)
+{
+ force_rebuild = true;
+}
+
/**
* cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
*
@@ -2349,8 +2356,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
}
/* rebuild sched domains if cpus_allowed has changed */
- if (cpus_updated)
+ if (cpus_updated || force_rebuild) {
+ force_rebuild = false;
rebuild_sched_domains();
+ }
}
void cpuset_update_active_cpus(void)
@@ -2363,6 +2372,11 @@ void cpuset_update_active_cpus(void)
schedule_work(&cpuset_hotplug_work);
}
+void cpuset_wait_for_hotplug(void)
+{
+ flush_work(&cpuset_hotplug_work);
+}
+
/*
* Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
* Call this routine anytime after node_states[N_MEMORY] changes.
diff --git a/kernel/compat.c b/kernel/compat.c
index 6f0a0e723a06..772e038d04d9 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -200,29 +200,6 @@ int compat_put_timespec(const struct timespec *ts, void __user *uts)
}
EXPORT_SYMBOL_GPL(compat_put_timespec);
-int compat_convert_timespec(struct timespec __user **kts,
- const void __user *cts)
-{
- struct timespec ts;
- struct timespec __user *uts;
-
- if (!cts || COMPAT_USE_64BIT_TIME) {
- *kts = (struct timespec __user *)cts;
- return 0;
- }
-
- uts = compat_alloc_user_space(sizeof(ts));
- if (!uts)
- return -EFAULT;
- if (compat_get_timespec(&ts, cts))
- return -EFAULT;
- if (copy_to_user(uts, &ts, sizeof(ts)))
- return -EFAULT;
-
- *kts = uts;
- return 0;
-}
-
int get_compat_itimerval(struct itimerval *o, const struct compat_itimerval __user *i)
{
struct compat_itimerval v32;
diff --git a/kernel/exit.c b/kernel/exit.c
index a35d8a17e01f..3481ababd06a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1615,7 +1615,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
user_access_begin();
unsafe_put_user(signo, &infop->si_signo, Efault);
unsafe_put_user(0, &infop->si_errno, Efault);
- unsafe_put_user((short)info.cause, &infop->si_code, Efault);
+ unsafe_put_user(info.cause, &infop->si_code, Efault);
unsafe_put_user(info.pid, &infop->si_pid, Efault);
unsafe_put_user(info.uid, &infop->si_uid, Efault);
unsafe_put_user(info.status, &infop->si_status, Efault);
@@ -1741,7 +1741,7 @@ COMPAT_SYSCALL_DEFINE5(waitid,
user_access_begin();
unsafe_put_user(signo, &infop->si_signo, Efault);
unsafe_put_user(0, &infop->si_errno, Efault);
- unsafe_put_user((short)info.cause, &infop->si_code, Efault);
+ unsafe_put_user(info.cause, &infop->si_code, Efault);
unsafe_put_user(info.pid, &infop->si_pid, Efault);
unsafe_put_user(info.uid, &infop->si_uid, Efault);
unsafe_put_user(info.status, &infop->si_status, Efault);
diff --git a/kernel/fork.c b/kernel/fork.c
index 24a4c0be80d5..10646182440f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -37,6 +37,7 @@
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
+#include <linux/hmm.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
@@ -824,6 +825,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_owner(mm, p);
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_mm_init(mm);
+ hmm_mm_init(mm);
init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
@@ -903,6 +905,7 @@ void __mmdrop(struct mm_struct *mm)
BUG_ON(mm == &init_mm);
mm_free_pgd(mm);
destroy_context(mm);
+ hmm_mm_destroy(mm);
mmu_notifier_mm_destroy(mm);
check_mm(mm);
put_user_ns(mm->user_ns);
@@ -1459,8 +1462,7 @@ static void rt_mutex_init_task(struct task_struct *p)
{
raw_spin_lock_init(&p->pi_lock);
#ifdef CONFIG_RT_MUTEXES
- p->pi_waiters = RB_ROOT;
- p->pi_waiters_leftmost = NULL;
+ p->pi_waiters = RB_ROOT_CACHED;
p->pi_top_task = NULL;
p->pi_blocked_on = NULL;
#endif
@@ -1567,10 +1569,6 @@ static __latent_entropy struct task_struct *copy_process(
return ERR_PTR(-EINVAL);
}
- retval = security_task_create(clone_flags);
- if (retval)
- goto fork_out;
-
retval = -ENOMEM;
p = dup_task_struct(current, node);
if (!p)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 73be2b3909bd..82afb7ed369f 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -421,10 +421,8 @@ static void free_desc(unsigned int irq)
* The sysfs entry must be serialized against a concurrent
* irq_sysfs_init() as well.
*/
- mutex_lock(&sparse_irq_lock);
kobject_del(&desc->kobj);
delete_irq_desc(irq);
- mutex_unlock(&sparse_irq_lock);
/*
* We free the descriptor, masks and stat fields via RCU. That
@@ -462,20 +460,15 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
desc = alloc_desc(start + i, node, flags, mask, owner);
if (!desc)
goto err;
- mutex_lock(&sparse_irq_lock);
irq_insert_desc(start + i, desc);
irq_sysfs_add(start + i, desc);
- mutex_unlock(&sparse_irq_lock);
}
+ bitmap_set(allocated_irqs, start, cnt);
return start;
err:
for (i--; i >= 0; i--)
free_desc(start + i);
-
- mutex_lock(&sparse_irq_lock);
- bitmap_clear(allocated_irqs, start, cnt);
- mutex_unlock(&sparse_irq_lock);
return -ENOMEM;
}
@@ -575,6 +568,7 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
desc->owner = owner;
}
+ bitmap_set(allocated_irqs, start, cnt);
return start;
}
@@ -670,10 +664,10 @@ void irq_free_descs(unsigned int from, unsigned int cnt)
if (from >= nr_irqs || (from + cnt) > nr_irqs)
return;
+ mutex_lock(&sparse_irq_lock);
for (i = 0; i < cnt; i++)
free_desc(from + i);
- mutex_lock(&sparse_irq_lock);
bitmap_clear(allocated_irqs, from, cnt);
mutex_unlock(&sparse_irq_lock);
}
@@ -720,19 +714,15 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
from, cnt, 0);
ret = -EEXIST;
if (irq >=0 && start != irq)
- goto err;
+ goto unlock;
if (start + cnt > nr_irqs) {
ret = irq_expand_nr_irqs(start + cnt);
if (ret)
- goto err;
+ goto unlock;
}
-
- bitmap_set(allocated_irqs, start, cnt);
- mutex_unlock(&sparse_irq_lock);
- return alloc_descs(start, cnt, node, affinity, owner);
-
-err:
+ ret = alloc_descs(start, cnt, node, affinity, owner);
+unlock:
mutex_unlock(&sparse_irq_lock);
return ret;
}
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 48eadf416c24..3fa4bd59f569 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -315,11 +315,12 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
ops->set_desc(arg, desc);
/* Assumes the domain mutex is held! */
- ret = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);
+ ret = irq_domain_alloc_irqs_hierarchy(domain, desc->irq, 1,
+ arg);
if (ret)
break;
- irq_set_msi_desc_off(virq, 0, desc);
+ irq_set_msi_desc_off(desc->irq, 0, desc);
}
if (ret) {
diff --git a/kernel/kcov.c b/kernel/kcov.c
index cd771993f96f..3f693a0f6f3e 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -270,6 +270,7 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
static const struct file_operations kcov_fops = {
.open = kcov_open,
.unlocked_ioctl = kcov_ioctl,
+ .compat_ioctl = kcov_ioctl,
.mmap = kcov_mmap,
.release = kcov_close,
};
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 2f37acde640b..bc6addd9152b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -1,23 +1,6 @@
/*
- kmod, the new module loader (replaces kerneld)
- Kirk Petersen
-
- Reorganized not to be a daemon by Adam Richter, with guidance
- from Greg Zornetzer.
-
- Modified to avoid chroot and file sharing problems.
- Mikael Pettersson
-
- Limit the concurrent number of kmod modprobes to catch loops from
- "modprobe needs a service that is in a module".
- Keith Owens <kaos@ocs.com.au> December 1999
-
- Unblock all signals when we exec a usermode process.
- Shuu Yamaguchi <shuu@wondernetworkresources.com> December 2000
-
- call_usermodehelper wait flag, and remove exec_usermodehelper.
- Rusty Russell <rusty@rustcorp.com.au> Jan 2003
-*/
+ * kmod - the kernel module loader
+ */
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
@@ -45,15 +28,6 @@
#include <trace/events/module.h>
-#define CAP_BSET (void *)1
-#define CAP_PI (void *)2
-
-static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
-static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
-static DEFINE_SPINLOCK(umh_sysctl_lock);
-static DECLARE_RWSEM(umhelper_sem);
-
-#ifdef CONFIG_MODULES
/*
* Assuming:
*
@@ -202,536 +176,3 @@ int __request_module(bool wait, const char *fmt, ...)
return ret;
}
EXPORT_SYMBOL(__request_module);
-
-#endif /* CONFIG_MODULES */
-
-static void call_usermodehelper_freeinfo(struct subprocess_info *info)
-{
- if (info->cleanup)
- (*info->cleanup)(info);
- kfree(info);
-}
-
-static void umh_complete(struct subprocess_info *sub_info)
-{
- struct completion *comp = xchg(&sub_info->complete, NULL);
- /*
- * See call_usermodehelper_exec(). If xchg() returns NULL
- * we own sub_info, the UMH_KILLABLE caller has gone away
- * or the caller used UMH_NO_WAIT.
- */
- if (comp)
- complete(comp);
- else
- call_usermodehelper_freeinfo(sub_info);
-}
-
-/*
- * This is the task which runs the usermode application
- */
-static int call_usermodehelper_exec_async(void *data)
-{
- struct subprocess_info *sub_info = data;
- struct cred *new;
- int retval;
-
- spin_lock_irq(&current->sighand->siglock);
- flush_signal_handlers(current, 1);
- spin_unlock_irq(&current->sighand->siglock);
-
- /*
- * Our parent (unbound workqueue) runs with elevated scheduling
- * priority. Avoid propagating that into the userspace child.
- */
- set_user_nice(current, 0);
-
- retval = -ENOMEM;
- new = prepare_kernel_cred(current);
- if (!new)
- goto out;
-
- spin_lock(&umh_sysctl_lock);
- new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
- new->cap_inheritable = cap_intersect(usermodehelper_inheritable,
- new->cap_inheritable);
- spin_unlock(&umh_sysctl_lock);
-
- if (sub_info->init) {
- retval = sub_info->init(sub_info, new);
- if (retval) {
- abort_creds(new);
- goto out;
- }
- }
-
- commit_creds(new);
-
- retval = do_execve(getname_kernel(sub_info->path),
- (const char __user *const __user *)sub_info->argv,
- (const char __user *const __user *)sub_info->envp);
-out:
- sub_info->retval = retval;
- /*
- * call_usermodehelper_exec_sync() will call umh_complete
- * if UHM_WAIT_PROC.
- */
- if (!(sub_info->wait & UMH_WAIT_PROC))
- umh_complete(sub_info);
- if (!retval)
- return 0;
- do_exit(0);
-}
-
-/* Handles UMH_WAIT_PROC. */
-static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
-{
- pid_t pid;
-
- /* If SIGCLD is ignored sys_wait4 won't populate the status. */
- kernel_sigaction(SIGCHLD, SIG_DFL);
- pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
- if (pid < 0) {
- sub_info->retval = pid;
- } else {
- int ret = -ECHILD;
- /*
- * Normally it is bogus to call wait4() from in-kernel because
- * wait4() wants to write the exit code to a userspace address.
- * But call_usermodehelper_exec_sync() always runs as kernel
- * thread (workqueue) and put_user() to a kernel address works
- * OK for kernel threads, due to their having an mm_segment_t
- * which spans the entire address space.
- *
- * Thus the __user pointer cast is valid here.
- */
- sys_wait4(pid, (int __user *)&ret, 0, NULL);
-
- /*
- * If ret is 0, either call_usermodehelper_exec_async failed and
- * the real error code is already in sub_info->retval or
- * sub_info->retval is 0 anyway, so don't mess with it then.
- */
- if (ret)
- sub_info->retval = ret;
- }
-
- /* Restore default kernel sig handler */
- kernel_sigaction(SIGCHLD, SIG_IGN);
-
- umh_complete(sub_info);
-}
-
-/*
- * We need to create the usermodehelper kernel thread from a task that is affine
- * to an optimized set of CPUs (or nohz housekeeping ones) such that they
- * inherit a widest affinity irrespective of call_usermodehelper() callers with
- * possibly reduced affinity (eg: per-cpu workqueues). We don't want
- * usermodehelper targets to contend a busy CPU.
- *
- * Unbound workqueues provide such wide affinity and allow to block on
- * UMH_WAIT_PROC requests without blocking pending request (up to some limit).
- *
- * Besides, workqueues provide the privilege level that caller might not have
- * to perform the usermodehelper request.
- *
- */
-static void call_usermodehelper_exec_work(struct work_struct *work)
-{
- struct subprocess_info *sub_info =
- container_of(work, struct subprocess_info, work);
-
- if (sub_info->wait & UMH_WAIT_PROC) {
- call_usermodehelper_exec_sync(sub_info);
- } else {
- pid_t pid;
- /*
- * Use CLONE_PARENT to reparent it to kthreadd; we do not
- * want to pollute current->children, and we need a parent
- * that always ignores SIGCHLD to ensure auto-reaping.
- */
- pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
- CLONE_PARENT | SIGCHLD);
- if (pid < 0) {
- sub_info->retval = pid;
- umh_complete(sub_info);
- }
- }
-}
-
-/*
- * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
- * (used for preventing user land processes from being created after the user
- * land has been frozen during a system-wide hibernation or suspend operation).
- * Should always be manipulated under umhelper_sem acquired for write.
- */
-static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED;
-
-/* Number of helpers running */
-static atomic_t running_helpers = ATOMIC_INIT(0);
-
-/*
- * Wait queue head used by usermodehelper_disable() to wait for all running
- * helpers to finish.
- */
-static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
-
-/*
- * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled
- * to become 'false'.
- */
-static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq);
-
-/*
- * Time to wait for running_helpers to become zero before the setting of
- * usermodehelper_disabled in usermodehelper_disable() fails
- */
-#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
-
-int usermodehelper_read_trylock(void)
-{
- DEFINE_WAIT(wait);
- int ret = 0;
-
- down_read(&umhelper_sem);
- for (;;) {
- prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
- TASK_INTERRUPTIBLE);
- if (!usermodehelper_disabled)
- break;
-
- if (usermodehelper_disabled == UMH_DISABLED)
- ret = -EAGAIN;
-
- up_read(&umhelper_sem);
-
- if (ret)
- break;
-
- schedule();
- try_to_freeze();
-
- down_read(&umhelper_sem);
- }
- finish_wait(&usermodehelper_disabled_waitq, &wait);
- return ret;
-}
-EXPORT_SYMBOL_GPL(usermodehelper_read_trylock);
-
-long usermodehelper_read_lock_wait(long timeout)
-{
- DEFINE_WAIT(wait);
-
- if (timeout < 0)
- return -EINVAL;
-
- down_read(&umhelper_sem);
- for (;;) {
- prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
- TASK_UNINTERRUPTIBLE);
- if (!usermodehelper_disabled)
- break;
-
- up_read(&umhelper_sem);
-
- timeout = schedule_timeout(timeout);
- if (!timeout)
- break;
-
- down_read(&umhelper_sem);
- }
- finish_wait(&usermodehelper_disabled_waitq, &wait);
- return timeout;
-}
-EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait);
-
-void usermodehelper_read_unlock(void)
-{
- up_read(&umhelper_sem);
-}
-EXPORT_SYMBOL_GPL(usermodehelper_read_unlock);
-
-/**
- * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled.
- * @depth: New value to assign to usermodehelper_disabled.
- *
- * Change the value of usermodehelper_disabled (under umhelper_sem locked for
- * writing) and wakeup tasks waiting for it to change.
- */
-void __usermodehelper_set_disable_depth(enum umh_disable_depth depth)
-{
- down_write(&umhelper_sem);
- usermodehelper_disabled = depth;
- wake_up(&usermodehelper_disabled_waitq);
- up_write(&umhelper_sem);
-}
-
-/**
- * __usermodehelper_disable - Prevent new helpers from being started.
- * @depth: New value to assign to usermodehelper_disabled.
- *
- * Set usermodehelper_disabled to @depth and wait for running helpers to exit.
- */
-int __usermodehelper_disable(enum umh_disable_depth depth)
-{
- long retval;
-
- if (!depth)
- return -EINVAL;
-
- down_write(&umhelper_sem);
- usermodehelper_disabled = depth;
- up_write(&umhelper_sem);
-
- /*
- * From now on call_usermodehelper_exec() won't start any new
- * helpers, so it is sufficient if running_helpers turns out to
- * be zero at one point (it may be increased later, but that
- * doesn't matter).
- */
- retval = wait_event_timeout(running_helpers_waitq,
- atomic_read(&running_helpers) == 0,
- RUNNING_HELPERS_TIMEOUT);
- if (retval)
- return 0;
-
- __usermodehelper_set_disable_depth(UMH_ENABLED);
- return -EAGAIN;
-}
-
-static void helper_lock(void)
-{
- atomic_inc(&running_helpers);
- smp_mb__after_atomic();
-}
-
-static void helper_unlock(void)
-{
- if (atomic_dec_and_test(&running_helpers))
- wake_up(&running_helpers_waitq);
-}
-
-/**
- * call_usermodehelper_setup - prepare to call a usermode helper
- * @path: path to usermode executable
- * @argv: arg vector for process
- * @envp: environment for process
- * @gfp_mask: gfp mask for memory allocation
- * @cleanup: a cleanup function
- * @init: an init function
- * @data: arbitrary context sensitive data
- *
- * Returns either %NULL on allocation failure, or a subprocess_info
- * structure. This should be passed to call_usermodehelper_exec to
- * exec the process and free the structure.
- *
- * The init function is used to customize the helper process prior to
- * exec. A non-zero return code causes the process to error out, exit,
- * and return the failure to the calling process
- *
- * The cleanup function is just before ethe subprocess_info is about to
- * be freed. This can be used for freeing the argv and envp. The
- * Function must be runnable in either a process context or the
- * context in which call_usermodehelper_exec is called.
- */
-struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
- char **envp, gfp_t gfp_mask,
- int (*init)(struct subprocess_info *info, struct cred *new),
- void (*cleanup)(struct subprocess_info *info),
- void *data)
-{
- struct subprocess_info *sub_info;
- sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
- if (!sub_info)
- goto out;
-
- INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
-
-#ifdef CONFIG_STATIC_USERMODEHELPER
- sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH;
-#else
- sub_info->path = path;
-#endif
- sub_info->argv = argv;
- sub_info->envp = envp;
-
- sub_info->cleanup = cleanup;
- sub_info->init = init;
- sub_info->data = data;
- out:
- return sub_info;
-}
-EXPORT_SYMBOL(call_usermodehelper_setup);
-
-/**
- * call_usermodehelper_exec - start a usermode application
- * @sub_info: information about the subprocessa
- * @wait: wait for the application to finish and return status.
- * when UMH_NO_WAIT don't wait at all, but you get no useful error back
- * when the program couldn't be exec'ed. This makes it safe to call
- * from interrupt context.
- *
- * Runs a user-space application. The application is started
- * asynchronously if wait is not set, and runs as a child of system workqueues.
- * (ie. it runs with full root capabilities and optimized affinity).
- */
-int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
-{
- DECLARE_COMPLETION_ONSTACK(done);
- int retval = 0;
-
- if (!sub_info->path) {
- call_usermodehelper_freeinfo(sub_info);
- return -EINVAL;
- }
- helper_lock();
- if (usermodehelper_disabled) {
- retval = -EBUSY;
- goto out;
- }
-
- /*
- * If there is no binary for us to call, then just return and get out of
- * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and
- * disable all call_usermodehelper() calls.
- */
- if (strlen(sub_info->path) == 0)
- goto out;
-
- /*
- * Set the completion pointer only if there is a waiter.
- * This makes it possible to use umh_complete to free
- * the data structure in case of UMH_NO_WAIT.
- */
- sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
- sub_info->wait = wait;
-
- queue_work(system_unbound_wq, &sub_info->work);
- if (wait == UMH_NO_WAIT) /* task has freed sub_info */
- goto unlock;
-
- if (wait & UMH_KILLABLE) {
- retval = wait_for_completion_killable(&done);
- if (!retval)
- goto wait_done;
-
- /* umh_complete() will see NULL and free sub_info */
- if (xchg(&sub_info->complete, NULL))
- goto unlock;
- /* fallthrough, umh_complete() was already called */
- }
-
- wait_for_completion(&done);
-wait_done:
- retval = sub_info->retval;
-out:
- call_usermodehelper_freeinfo(sub_info);
-unlock:
- helper_unlock();
- return retval;
-}
-EXPORT_SYMBOL(call_usermodehelper_exec);
-
-/**
- * call_usermodehelper() - prepare and start a usermode application
- * @path: path to usermode executable
- * @argv: arg vector for process
- * @envp: environment for process
- * @wait: wait for the application to finish and return status.
- * when UMH_NO_WAIT don't wait at all, but you get no useful error back
- * when the program couldn't be exec'ed. This makes it safe to call
- * from interrupt context.
- *
- * This function is the equivalent to use call_usermodehelper_setup() and
- * call_usermodehelper_exec().
- */
-int call_usermodehelper(const char *path, char **argv, char **envp, int wait)
-{
- struct subprocess_info *info;
- gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
-
- info = call_usermodehelper_setup(path, argv, envp, gfp_mask,
- NULL, NULL, NULL);
- if (info == NULL)
- return -ENOMEM;
-
- return call_usermodehelper_exec(info, wait);
-}
-EXPORT_SYMBOL(call_usermodehelper);
-
-static int proc_cap_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- struct ctl_table t;
- unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
- kernel_cap_t new_cap;
- int err, i;
-
- if (write && (!capable(CAP_SETPCAP) ||
- !capable(CAP_SYS_MODULE)))
- return -EPERM;
-
- /*
- * convert from the global kernel_cap_t to the ulong array to print to
- * userspace if this is a read.
- */
- spin_lock(&umh_sysctl_lock);
- for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) {
- if (table->data == CAP_BSET)
- cap_array[i] = usermodehelper_bset.cap[i];
- else if (table->data == CAP_PI)
- cap_array[i] = usermodehelper_inheritable.cap[i];
- else
- BUG();
- }
- spin_unlock(&umh_sysctl_lock);
-
- t = *table;
- t.data = &cap_array;
-
- /*
- * actually read or write and array of ulongs from userspace. Remember
- * these are least significant 32 bits first
- */
- err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
- if (err < 0)
- return err;
-
- /*
- * convert from the sysctl array of ulongs to the kernel_cap_t
- * internal representation
- */
- for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
- new_cap.cap[i] = cap_array[i];
-
- /*
- * Drop everything not in the new_cap (but don't add things)
- */
- spin_lock(&umh_sysctl_lock);
- if (write) {
- if (table->data == CAP_BSET)
- usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
- if (table->data == CAP_PI)
- usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
- }
- spin_unlock(&umh_sysctl_lock);
-
- return 0;
-}
-
-struct ctl_table usermodehelper_table[] = {
- {
- .procname = "bset",
- .data = CAP_BSET,
- .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
- .mode = 0600,
- .proc_handler = proc_cap_handler,
- },
- {
- .procname = "inheritable",
- .data = CAP_PI,
- .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
- .mode = 0600,
- .proc_handler = proc_cap_handler,
- },
- { }
-};
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index ac35e648b0e5..f4a74e78d467 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -58,7 +58,7 @@ static void printk_lock(struct rt_mutex *lock, int print_owner)
void rt_mutex_debug_task_free(struct task_struct *task)
{
- DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters));
+ DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root));
DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 649dc9d3951a..6f3dba6e4e9e 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -271,10 +271,10 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
static void
rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
{
- struct rb_node **link = &lock->waiters.rb_node;
+ struct rb_node **link = &lock->waiters.rb_root.rb_node;
struct rb_node *parent = NULL;
struct rt_mutex_waiter *entry;
- int leftmost = 1;
+ bool leftmost = true;
while (*link) {
parent = *link;
@@ -283,15 +283,12 @@ rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
link = &parent->rb_left;
} else {
link = &parent->rb_right;
- leftmost = 0;
+ leftmost = false;
}
}
- if (leftmost)
- lock->waiters_leftmost = &waiter->tree_entry;
-
rb_link_node(&waiter->tree_entry, parent, link);
- rb_insert_color(&waiter->tree_entry, &lock->waiters);
+ rb_insert_color_cached(&waiter->tree_entry, &lock->waiters, leftmost);
}
static void
@@ -300,20 +297,17 @@ rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
if (RB_EMPTY_NODE(&waiter->tree_entry))
return;
- if (lock->waiters_leftmost == &waiter->tree_entry)
- lock->waiters_leftmost = rb_next(&waiter->tree_entry);
-
- rb_erase(&waiter->tree_entry, &lock->waiters);
+ rb_erase_cached(&waiter->tree_entry, &lock->waiters);
RB_CLEAR_NODE(&waiter->tree_entry);
}
static void
rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
{
- struct rb_node **link = &task->pi_waiters.rb_node;
+ struct rb_node **link = &task->pi_waiters.rb_root.rb_node;
struct rb_node *parent = NULL;
struct rt_mutex_waiter *entry;
- int leftmost = 1;
+ bool leftmost = true;
while (*link) {
parent = *link;
@@ -322,15 +316,12 @@ rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
link = &parent->rb_left;
} else {
link = &parent->rb_right;
- leftmost = 0;
+ leftmost = false;
}
}
- if (leftmost)
- task->pi_waiters_leftmost = &waiter->pi_tree_entry;
-
rb_link_node(&waiter->pi_tree_entry, parent, link);
- rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters);
+ rb_insert_color_cached(&waiter->pi_tree_entry, &task->pi_waiters, leftmost);
}
static void
@@ -339,10 +330,7 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
return;
- if (task->pi_waiters_leftmost == &waiter->pi_tree_entry)
- task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry);
-
- rb_erase(&waiter->pi_tree_entry, &task->pi_waiters);
+ rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters);
RB_CLEAR_NODE(&waiter->pi_tree_entry);
}
@@ -1657,8 +1645,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name,
{
lock->owner = NULL;
raw_spin_lock_init(&lock->wait_lock);
- lock->waiters = RB_ROOT;
- lock->waiters_leftmost = NULL;
+ lock->waiters = RB_ROOT_CACHED;
if (name && key)
debug_rt_mutex_init(lock, name, key);
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 8d039b928d61..7453be0485a5 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -45,7 +45,7 @@ struct rt_mutex_waiter {
static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
{
- return !RB_EMPTY_ROOT(&lock->waiters);
+ return !RB_EMPTY_ROOT(&lock->waiters.rb_root);
}
static inline struct rt_mutex_waiter *
@@ -53,8 +53,8 @@ rt_mutex_top_waiter(struct rt_mutex *lock)
{
struct rt_mutex_waiter *w;
- w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter,
- tree_entry);
+ w = rb_entry(lock->waiters.rb_leftmost,
+ struct rt_mutex_waiter, tree_entry);
BUG_ON(w->lock != lock);
return w;
@@ -62,14 +62,14 @@ rt_mutex_top_waiter(struct rt_mutex *lock)
static inline int task_has_pi_waiters(struct task_struct *p)
{
- return !RB_EMPTY_ROOT(&p->pi_waiters);
+ return !RB_EMPTY_ROOT(&p->pi_waiters.rb_root);
}
static inline struct rt_mutex_waiter *
task_top_pi_waiter(struct task_struct *p)
{
- return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter,
- pi_tree_entry);
+ return rb_entry(p->pi_waiters.rb_leftmost,
+ struct rt_mutex_waiter, pi_tree_entry);
}
#else
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 39f56c870051..0e4cd64ad2c0 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -362,7 +362,7 @@ static int *get_random_order(int count)
int *order;
int n, r, tmp;
- order = kmalloc_array(count, sizeof(*order), GFP_TEMPORARY);
+ order = kmalloc_array(count, sizeof(*order), GFP_KERNEL);
if (!order)
return order;
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 066e73c2fcc9..6bcbfbf1a8fd 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -11,13 +11,14 @@
* General Public License for more details.
*/
#include <linux/radix-tree.h>
-#include <linux/memremap.h>
#include <linux/device.h>
#include <linux/types.h>
#include <linux/pfn_t.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/memory_hotplug.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
#ifndef ioremap_cache
/* temporary while we convert existing ioremap_cache users to memremap */
@@ -219,6 +220,34 @@ static unsigned long order_at(struct resource *res, unsigned long pgoff)
for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \
pgoff += 1UL << order, order = order_at((res), pgoff))
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
+int device_private_entry_fault(struct vm_area_struct *vma,
+ unsigned long addr,
+ swp_entry_t entry,
+ unsigned int flags,
+ pmd_t *pmdp)
+{
+ struct page *page = device_private_entry_to_page(entry);
+
+ /*
+ * The page_fault() callback must migrate page back to system memory
+ * so that CPU can access it. This might fail for various reasons
+ * (device issue, device was unsafely unplugged, ...). When such
+ * error conditions happen, the callback must return VM_FAULT_SIGBUS.
+ *
+ * Note that because memory cgroup charges are accounted to the device
+ * memory, this should never fail because of memory restrictions (but
+ * allocation of regular system page might still fail because we are
+ * out of memory).
+ *
+ * There is a more in-depth description of what that callback can and
+ * cannot do, in include/linux/memremap.h
+ */
+ return page->pgmap->page_fault(vma, addr, page, flags, pmdp);
+}
+EXPORT_SYMBOL(device_private_entry_fault);
+#endif /* CONFIG_DEVICE_PRIVATE */
+
static void pgmap_radix_release(struct resource *res)
{
unsigned long pgoff, order;
@@ -356,6 +385,10 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
}
pgmap->ref = ref;
pgmap->res = &page_map->res;
+ pgmap->type = MEMORY_DEVICE_HOST;
+ pgmap->page_fault = NULL;
+ pgmap->page_free = NULL;
+ pgmap->data = NULL;
mutex_lock(&pgmap_lock);
error = 0;
@@ -466,3 +499,28 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
return pgmap ? pgmap->altmap : NULL;
}
#endif /* CONFIG_ZONE_DEVICE */
+
+
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
+void put_zone_device_private_or_public_page(struct page *page)
+{
+ int count = page_ref_dec_return(page);
+
+ /*
+ * If refcount is 1 then page is freed and refcount is stable as nobody
+ * holds a reference on the page.
+ */
+ if (count == 1) {
+ /* Clear Active bit in case of parallel mark_page_accessed */
+ __ClearPageActive(page);
+ __ClearPageWaiters(page);
+
+ page->mapping = NULL;
+ mem_cgroup_uncharge(page);
+
+ page->pgmap->page_free(page, page->pgmap->data);
+ } else if (!count)
+ __put_page(page);
+}
+EXPORT_SYMBOL(put_zone_device_private_or_public_page);
+#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
diff --git a/kernel/module.c b/kernel/module.c
index 40f983cbea81..de66ec825992 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2707,21 +2707,21 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
}
#endif /* CONFIG_KALLSYMS */
-static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
+static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsigned int num)
{
if (!debug)
return;
#ifdef CONFIG_DYNAMIC_DEBUG
- if (ddebug_add_module(debug, num, debug->modname))
+ if (ddebug_add_module(debug, num, mod->name))
pr_err("dynamic debug error adding module: %s\n",
debug->modname);
#endif
}
-static void dynamic_debug_remove(struct _ddebug *debug)
+static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
{
if (debug)
- ddebug_remove_module(debug->modname);
+ ddebug_remove_module(mod->name);
}
void * __weak module_alloc(unsigned long size)
@@ -3715,7 +3715,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
goto free_arch_cleanup;
}
- dynamic_debug_setup(info->debug, info->num_debug);
+ dynamic_debug_setup(mod, info->debug, info->num_debug);
/* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
ftrace_module_init(mod);
@@ -3779,7 +3779,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
module_disable_nx(mod);
ddebug_cleanup:
- dynamic_debug_remove(info->debug);
+ dynamic_debug_remove(mod, info->debug);
synchronize_sched();
kfree(mod->args);
free_arch_cleanup:
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 74a5a7255b4d..4918314893bc 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -101,6 +101,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
int i;
int err;
+ err = -EINVAL;
+ if (!in_userns(parent_pid_ns->user_ns, user_ns))
+ goto out;
+
err = -ENOSPC;
if (level > MAX_PID_NS_LEVEL)
goto out;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 78672d324a6e..50f25cb370c6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -20,8 +20,9 @@
#include <linux/workqueue.h>
#include <linux/kmod.h>
#include <trace/events/power.h>
+#include <linux/cpuset.h>
-/*
+/*
* Timeout for stopping processes
*/
unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
@@ -202,6 +203,8 @@ void thaw_processes(void)
__usermodehelper_set_disable_depth(UMH_FREEZING);
thaw_workqueues();
+ cpuset_wait_for_hotplug();
+
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
/* No other threads should have PF_SUSPEND_TASK set */
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 57d22571f306..d7cdc426ee38 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -242,8 +242,7 @@ static void hib_end_io(struct bio *bio)
if (bio->bi_status) {
printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
- imajor(bio->bi_bdev->bd_inode),
- iminor(bio->bi_bdev->bd_inode),
+ MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
(unsigned long long)bio->bi_iter.bi_sector);
}
@@ -270,7 +269,7 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);
bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
- bio->bi_bdev = hib_resume_bdev;
+ bio_set_dev(bio, hib_resume_bdev);
bio_set_op_attrs(bio, op, op_flags);
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index fc47863f629c..512f7c2baedd 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -649,7 +649,7 @@ static int syslog_action_restricted(int type)
type != SYSLOG_ACTION_SIZE_BUFFER;
}
-int check_syslog_permissions(int type, int source)
+static int check_syslog_permissions(int type, int source)
{
/*
* If this is from /proc/kmsg and we've already opened it, then we've
@@ -677,7 +677,6 @@ int check_syslog_permissions(int type, int source)
ok:
return security_syslog(type);
}
-EXPORT_SYMBOL_GPL(check_syslog_permissions);
static void append_char(char **pp, char *e, char c)
{
@@ -1435,7 +1434,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
error = check_syslog_permissions(type, source);
if (error)
- goto out;
+ return error;
switch (type) {
case SYSLOG_ACTION_CLOSE: /* Close log */
@@ -1443,20 +1442,16 @@ int do_syslog(int type, char __user *buf, int len, int source)
case SYSLOG_ACTION_OPEN: /* Open log */
break;
case SYSLOG_ACTION_READ: /* Read from log */
- error = -EINVAL;
if (!buf || len < 0)
- goto out;
- error = 0;
+ return -EINVAL;
if (!len)
- goto out;
- if (!access_ok(VERIFY_WRITE, buf, len)) {
- error = -EFAULT;
- goto out;
- }
+ return 0;
+ if (!access_ok(VERIFY_WRITE, buf, len))
+ return -EFAULT;
error = wait_event_interruptible(log_wait,
syslog_seq != log_next_seq);
if (error)
- goto out;
+ return error;
error = syslog_print(buf, len);
break;
/* Read/clear last kernel messages */
@@ -1465,16 +1460,12 @@ int do_syslog(int type, char __user *buf, int len, int source)
/* FALL THRU */
/* Read last kernel messages */
case SYSLOG_ACTION_READ_ALL:
- error = -EINVAL;
if (!buf || len < 0)
- goto out;
- error = 0;
+ return -EINVAL;
if (!len)
- goto out;
- if (!access_ok(VERIFY_WRITE, buf, len)) {
- error = -EFAULT;
- goto out;
- }
+ return 0;
+ if (!access_ok(VERIFY_WRITE, buf, len))
+ return -EFAULT;
error = syslog_print_all(buf, len, clear);
break;
/* Clear ring buffer */
@@ -1496,15 +1487,13 @@ int do_syslog(int type, char __user *buf, int len, int source)
break;
/* Set level of messages printed to console */
case SYSLOG_ACTION_CONSOLE_LEVEL:
- error = -EINVAL;
if (len < 1 || len > 8)
- goto out;
+ return -EINVAL;
if (len < minimum_console_loglevel)
len = minimum_console_loglevel;
console_loglevel = len;
/* Implicitly re-enable logging to console */
saved_console_loglevel = LOGLEVEL_DEFAULT;
- error = 0;
break;
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
@@ -1526,7 +1515,6 @@ int do_syslog(int type, char __user *buf, int len, int source)
u64 seq = syslog_seq;
u32 idx = syslog_idx;
- error = 0;
while (seq < log_next_seq) {
struct printk_log *msg = log_from_idx(idx);
@@ -1546,7 +1534,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
error = -EINVAL;
break;
}
-out:
+
return error;
}
@@ -1698,10 +1686,10 @@ asmlinkage int vprintk_emit(int facility, int level,
{
static char textbuf[LOG_LINE_MAX];
char *text = textbuf;
- size_t text_len = 0;
+ size_t text_len;
enum log_flags lflags = 0;
unsigned long flags;
- int printed_len = 0;
+ int printed_len;
bool in_sched = false;
if (level == LOGLEVEL_SCHED) {
@@ -1754,7 +1742,7 @@ asmlinkage int vprintk_emit(int facility, int level,
if (dict)
lflags |= LOG_PREFIX|LOG_NEWLINE;
- printed_len += log_output(facility, level, lflags, dict, dictlen, text, text_len);
+ printed_len = log_output(facility, level, lflags, dict, dictlen, text, text_len);
logbuf_unlock_irqrestore(flags);
@@ -2650,9 +2638,8 @@ void __init console_init(void)
* makes it difficult to diagnose problems that occur during this time.
*
* To mitigate this problem somewhat, only unregister consoles whose memory
- * intersects with the init section. Note that code exists elsewhere to get
- * rid of the boot console as soon as the proper console shows up, so there
- * won't be side-effects from postponing the removal.
+ * intersects with the init section. Note that all other boot consoles will
+ * get unregistred when the real preferred console is registered.
*/
static int __init printk_late_init(void)
{
@@ -2660,16 +2647,23 @@ static int __init printk_late_init(void)
int ret;
for_each_console(con) {
- if (!keep_bootcon && con->flags & CON_BOOT) {
+ if (!(con->flags & CON_BOOT))
+ continue;
+
+ /* Check addresses that might be used for enabled consoles. */
+ if (init_section_intersects(con, sizeof(*con)) ||
+ init_section_contains(con->write, 0) ||
+ init_section_contains(con->read, 0) ||
+ init_section_contains(con->device, 0) ||
+ init_section_contains(con->unblank, 0) ||
+ init_section_contains(con->data, 0)) {
/*
- * Make sure to unregister boot consoles whose data
- * resides in the init section before the init section
- * is discarded. Boot consoles whose data will stick
- * around will automatically be unregistered when the
- * proper console replaces them.
+ * Please, consider moving the reported consoles out
+ * of the init section.
*/
- if (init_section_intersects(con, sizeof(*con)))
- unregister_console(con);
+ pr_warn("bootconsole [%s%d] uses init memory and must be disabled even before the real one is ready\n",
+ con->name, con->index);
+ unregister_console(con);
}
}
ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL,
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 60f356d91060..84b1367935e4 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -728,8 +728,7 @@ static int ptrace_peek_siginfo(struct task_struct *child,
if (unlikely(in_compat_syscall())) {
compat_siginfo_t __user *uinfo = compat_ptr(data);
- if (copy_siginfo_to_user32(uinfo, &info) ||
- __put_user(info.si_code, &uinfo->si_code)) {
+ if (copy_siginfo_to_user32(uinfo, &info)) {
ret = -EFAULT;
break;
}
@@ -739,8 +738,7 @@ static int ptrace_peek_siginfo(struct task_struct *child,
{
siginfo_t __user *uinfo = (siginfo_t __user *) data;
- if (copy_siginfo_to_user(uinfo, &info) ||
- __put_user(info.si_code, &uinfo->si_code)) {
+ if (copy_siginfo_to_user(uinfo, &info)) {
ret = -EFAULT;
break;
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 84fe96641b2e..1250e4bd4b85 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4091,7 +4091,7 @@ static void __init rcu_init_geometry(void)
if (rcu_fanout_leaf == RCU_FANOUT_LEAF &&
nr_cpu_ids == NR_CPUS)
return;
- pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n",
+ pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n",
rcu_fanout_leaf, nr_cpu_ids);
/*
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 55bde94b9572..e012b9be777e 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -89,7 +89,7 @@ static void __init rcu_bootup_announce_oddness(void)
if (rcu_fanout_leaf != RCU_FANOUT_LEAF)
pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
if (nr_cpu_ids != NR_CPUS)
- pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
+ pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.\n", NR_CPUS, nr_cpu_ids);
#ifdef CONFIG_RCU_BOOST
pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY);
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6d2c7ff9ba98..18a6966567da 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1173,6 +1173,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
lockdep_is_held(&task_rq(p)->lock)));
#endif
+ /*
+ * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
+ */
+ WARN_ON_ONCE(!cpu_online(new_cpu));
#endif
trace_sched_migrate_task(p, new_cpu);
@@ -5556,16 +5560,15 @@ static void cpuset_cpu_active(void)
* operation in the resume sequence, just build a single sched
* domain, ignoring cpusets.
*/
- num_cpus_frozen--;
- if (likely(num_cpus_frozen)) {
- partition_sched_domains(1, NULL, NULL);
+ partition_sched_domains(1, NULL, NULL);
+ if (--num_cpus_frozen)
return;
- }
/*
* This is the last CPU online operation. So fall through and
* restore the original sched domains by considering the
* cpuset configurations.
*/
+ cpuset_force_rebuild();
}
cpuset_update_active_cpus();
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 9e38df7649f4..0191ec7667c3 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -296,7 +296,7 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
{
struct sched_dl_entity *dl_se = &p->dl;
- return dl_rq->rb_leftmost == &dl_se->rb_node;
+ return dl_rq->root.rb_leftmost == &dl_se->rb_node;
}
void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
@@ -320,7 +320,7 @@ void init_dl_bw(struct dl_bw *dl_b)
void init_dl_rq(struct dl_rq *dl_rq)
{
- dl_rq->rb_root = RB_ROOT;
+ dl_rq->root = RB_ROOT_CACHED;
#ifdef CONFIG_SMP
/* zero means no -deadline tasks */
@@ -328,7 +328,7 @@ void init_dl_rq(struct dl_rq *dl_rq)
dl_rq->dl_nr_migratory = 0;
dl_rq->overloaded = 0;
- dl_rq->pushable_dl_tasks_root = RB_ROOT;
+ dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED;
#else
init_dl_bw(&dl_rq->dl_bw);
#endif
@@ -410,10 +410,10 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
{
struct dl_rq *dl_rq = &rq->dl;
- struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node;
+ struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_root.rb_node;
struct rb_node *parent = NULL;
struct task_struct *entry;
- int leftmost = 1;
+ bool leftmost = true;
BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
@@ -425,17 +425,16 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
link = &parent->rb_left;
else {
link = &parent->rb_right;
- leftmost = 0;
+ leftmost = false;
}
}
- if (leftmost) {
- dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks;
+ if (leftmost)
dl_rq->earliest_dl.next = p->dl.deadline;
- }
rb_link_node(&p->pushable_dl_tasks, parent, link);
- rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
+ rb_insert_color_cached(&p->pushable_dl_tasks,
+ &dl_rq->pushable_dl_tasks_root, leftmost);
}
static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
@@ -445,24 +444,23 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
if (RB_EMPTY_NODE(&p->pushable_dl_tasks))
return;
- if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) {
+ if (dl_rq->pushable_dl_tasks_root.rb_leftmost == &p->pushable_dl_tasks) {
struct rb_node *next_node;
next_node = rb_next(&p->pushable_dl_tasks);
- dl_rq->pushable_dl_tasks_leftmost = next_node;
if (next_node) {
dl_rq->earliest_dl.next = rb_entry(next_node,
struct task_struct, pushable_dl_tasks)->dl.deadline;
}
}
- rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
+ rb_erase_cached(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
}
static inline int has_pushable_dl_tasks(struct rq *rq)
{
- return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root);
+ return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root);
}
static int push_dl_task(struct rq *rq);
@@ -1266,7 +1264,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
dl_rq->earliest_dl.next = 0;
cpudl_clear(&rq->rd->cpudl, rq->cpu);
} else {
- struct rb_node *leftmost = dl_rq->rb_leftmost;
+ struct rb_node *leftmost = dl_rq->root.rb_leftmost;
struct sched_dl_entity *entry;
entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
@@ -1313,7 +1311,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
- struct rb_node **link = &dl_rq->rb_root.rb_node;
+ struct rb_node **link = &dl_rq->root.rb_root.rb_node;
struct rb_node *parent = NULL;
struct sched_dl_entity *entry;
int leftmost = 1;
@@ -1331,11 +1329,8 @@ static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
}
}
- if (leftmost)
- dl_rq->rb_leftmost = &dl_se->rb_node;
-
rb_link_node(&dl_se->rb_node, parent, link);
- rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root);
+ rb_insert_color_cached(&dl_se->rb_node, &dl_rq->root, leftmost);
inc_dl_tasks(dl_se, dl_rq);
}
@@ -1347,14 +1342,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
if (RB_EMPTY_NODE(&dl_se->rb_node))
return;
- if (dl_rq->rb_leftmost == &dl_se->rb_node) {
- struct rb_node *next_node;
-
- next_node = rb_next(&dl_se->rb_node);
- dl_rq->rb_leftmost = next_node;
- }
-
- rb_erase(&dl_se->rb_node, &dl_rq->rb_root);
+ rb_erase_cached(&dl_se->rb_node, &dl_rq->root);
RB_CLEAR_NODE(&dl_se->rb_node);
dec_dl_tasks(dl_se, dl_rq);
@@ -1647,7 +1635,7 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
struct dl_rq *dl_rq)
{
- struct rb_node *left = dl_rq->rb_leftmost;
+ struct rb_node *left = rb_first_cached(&dl_rq->root);
if (!left)
return NULL;
@@ -1771,7 +1759,7 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
*/
static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
{
- struct rb_node *next_node = rq->dl.pushable_dl_tasks_leftmost;
+ struct rb_node *next_node = rq->dl.pushable_dl_tasks_root.rb_leftmost;
struct task_struct *p = NULL;
if (!has_pushable_dl_tasks(rq))
@@ -1945,7 +1933,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
if (!has_pushable_dl_tasks(rq))
return NULL;
- p = rb_entry(rq->dl.pushable_dl_tasks_leftmost,
+ p = rb_entry(rq->dl.pushable_dl_tasks_root.rb_leftmost,
struct task_struct, pushable_dl_tasks);
BUG_ON(rq->cpu != task_cpu(p));
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4a23bbc3111b..01217fb5a5de 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -181,11 +181,16 @@ static const struct file_operations sched_feat_fops = {
.release = single_release,
};
+__read_mostly bool sched_debug_enabled;
+
static __init int sched_init_debug(void)
{
debugfs_create_file("sched_features", 0644, NULL, NULL,
&sched_feat_fops);
+ debugfs_create_bool("sched_debug", 0644, NULL,
+ &sched_debug_enabled);
+
return 0;
}
late_initcall(sched_init_debug);
@@ -530,7 +535,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(cfs_rq->exec_clock));
raw_spin_lock_irqsave(&rq->lock, flags);
- if (cfs_rq->rb_leftmost)
+ if (rb_first_cached(&cfs_rq->tasks_timeline))
MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
last = __pick_last_entity(cfs_rq);
if (last)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8bc0a883d190..70ba32e08a23 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -513,6 +513,7 @@ static inline int entity_before(struct sched_entity *a,
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
+ struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
u64 vruntime = cfs_rq->min_vruntime;
@@ -523,10 +524,9 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
curr = NULL;
}
- if (cfs_rq->rb_leftmost) {
- struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
- struct sched_entity,
- run_node);
+ if (leftmost) { /* non-empty tree */
+ struct sched_entity *se;
+ se = rb_entry(leftmost, struct sched_entity, run_node);
if (!curr)
vruntime = se->vruntime;
@@ -547,10 +547,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
+ struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
struct rb_node *parent = NULL;
struct sched_entity *entry;
- int leftmost = 1;
+ bool leftmost = true;
/*
* Find the right place in the rbtree:
@@ -566,36 +566,23 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
link = &parent->rb_left;
} else {
link = &parent->rb_right;
- leftmost = 0;
+ leftmost = false;
}
}
- /*
- * Maintain a cache of leftmost tree entries (it is frequently
- * used):
- */
- if (leftmost)
- cfs_rq->rb_leftmost = &se->run_node;
-
rb_link_node(&se->run_node, parent, link);
- rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
+ rb_insert_color_cached(&se->run_node,
+ &cfs_rq->tasks_timeline, leftmost);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (cfs_rq->rb_leftmost == &se->run_node) {
- struct rb_node *next_node;
-
- next_node = rb_next(&se->run_node);
- cfs_rq->rb_leftmost = next_node;
- }
-
- rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
+ rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
}
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
- struct rb_node *left = cfs_rq->rb_leftmost;
+ struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
if (!left)
return NULL;
@@ -616,7 +603,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se)
#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
- struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
+ struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
if (!last)
return NULL;
@@ -5437,7 +5424,7 @@ wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
return false;
/* if this cache has capacity, come here */
- if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1)
+ if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
return true;
/*
@@ -7721,7 +7708,7 @@ next_group:
* number.
*
* Return: 1 when packing is required and a task should be moved to
- * this CPU. The amount of the imbalance is returned in *imbalance.
+ * this CPU. The amount of the imbalance is returned in env->imbalance.
*
* @env: The load balancing environment.
* @sds: Statistics of the sched_domain which is to be packed
@@ -8450,6 +8437,12 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
this_rq->idle_stamp = rq_clock(this_rq);
/*
+ * Do not pull tasks towards !active CPUs...
+ */
+ if (!cpu_active(this_cpu))
+ return 0;
+
+ /*
* This is OK, because current is on_cpu, which avoids it being picked
* for load-balance and preemption/IRQs are still disabled avoiding
* further scheduler activity on it and we're being very careful to
@@ -8556,6 +8549,13 @@ static int active_load_balance_cpu_stop(void *data)
struct rq_flags rf;
rq_lock_irq(busiest_rq, &rf);
+ /*
+ * Between queueing the stop-work and running it is a hole in which
+ * CPUs can become inactive. We should not move tasks from or to
+ * inactive CPUs.
+ */
+ if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
+ goto out_unlock;
/* make sure the requested cpu hasn't gone down in the meantime */
if (unlikely(busiest_cpu != smp_processor_id() ||
@@ -9312,7 +9312,7 @@ static void set_curr_task_fair(struct rq *rq)
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
- cfs_rq->tasks_timeline = RB_ROOT;
+ cfs_rq->tasks_timeline = RB_ROOT_CACHED;
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
#ifndef CONFIG_64BIT
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6ed7962dc896..14db76cd496f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -426,8 +426,7 @@ struct cfs_rq {
u64 min_vruntime_copy;
#endif
- struct rb_root tasks_timeline;
- struct rb_node *rb_leftmost;
+ struct rb_root_cached tasks_timeline;
/*
* 'curr' points to currently running entity on this cfs_rq.
@@ -550,8 +549,7 @@ struct rt_rq {
/* Deadline class' related fields in a runqueue */
struct dl_rq {
/* runqueue is an rbtree, ordered by deadline */
- struct rb_root rb_root;
- struct rb_node *rb_leftmost;
+ struct rb_root_cached root;
unsigned long dl_nr_running;
@@ -575,8 +573,7 @@ struct dl_rq {
* an rb-tree, ordered by tasks' deadlines, with caching
* of the leftmost (earliest deadline) element.
*/
- struct rb_root pushable_dl_tasks_root;
- struct rb_node *pushable_dl_tasks_leftmost;
+ struct rb_root_cached pushable_dl_tasks_root;
#else
struct dl_bw dl_bw;
#endif
@@ -1954,6 +1951,8 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
#ifdef CONFIG_SCHED_DEBUG
+extern bool sched_debug_enabled;
+
extern void print_cfs_stats(struct seq_file *m, int cpu);
extern void print_rt_stats(struct seq_file *m, int cpu);
extern void print_dl_stats(struct seq_file *m, int cpu);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 6f7b43982f73..f1cf4f306a82 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -14,11 +14,9 @@ cpumask_var_t sched_domains_tmpmask2;
#ifdef CONFIG_SCHED_DEBUG
-static __read_mostly int sched_debug_enabled;
-
static int __init sched_debug_setup(char *str)
{
- sched_debug_enabled = 1;
+ sched_debug_enabled = true;
return 0;
}
@@ -473,7 +471,7 @@ static int __init isolated_cpu_setup(char *str)
alloc_bootmem_cpumask_var(&cpu_isolated_map);
ret = cpulist_parse(str, cpu_isolated_map);
if (ret) {
- pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
+ pr_err("sched: Error, all isolcpus= values must be between 0 and %u\n", nr_cpu_ids);
return 0;
}
return 1;
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index d6afed6d0752..98feab7933c7 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -53,6 +53,12 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry
}
EXPORT_SYMBOL(remove_wait_queue);
+/*
+ * Scan threshold to break wait queue walk.
+ * This allows a waker to take a break from holding the
+ * wait queue lock during the wait queue walk.
+ */
+#define WAITQUEUE_WALK_BREAK_CNT 64
/*
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
@@ -63,18 +69,67 @@ EXPORT_SYMBOL(remove_wait_queue);
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
* zero in this (rare) case, and we handle it by continuing to scan the queue.
*/
-static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
- int nr_exclusive, int wake_flags, void *key)
+static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
+ int nr_exclusive, int wake_flags, void *key,
+ wait_queue_entry_t *bookmark)
{
wait_queue_entry_t *curr, *next;
+ int cnt = 0;
+
+ if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
+ curr = list_next_entry(bookmark, entry);
+
+ list_del(&bookmark->entry);
+ bookmark->flags = 0;
+ } else
+ curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
- list_for_each_entry_safe(curr, next, &wq_head->head, entry) {
+ if (&curr->entry == &wq_head->head)
+ return nr_exclusive;
+
+ list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) {
unsigned flags = curr->flags;
- int ret = curr->func(curr, mode, wake_flags, key);
+ int ret;
+
+ if (flags & WQ_FLAG_BOOKMARK)
+ continue;
+
+ ret = curr->func(curr, mode, wake_flags, key);
if (ret < 0)
break;
if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
+
+ if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
+ (&next->entry != &wq_head->head)) {
+ bookmark->flags = WQ_FLAG_BOOKMARK;
+ list_add_tail(&bookmark->entry, &next->entry);
+ break;
+ }
+ }
+ return nr_exclusive;
+}
+
+/*
+ * Wake waiters on @wq_head, taking and releasing the wait queue lock here.
+ *
+ * The queue is walked in batches.  Whenever __wake_up_common() stops early
+ * it leaves @bookmark flagged with WQ_FLAG_BOOKMARK; the lock is dropped,
+ * re-taken, and the walk is resumed until no bookmark is left behind.
+ * The remaining exclusive-wakeup budget is carried from batch to batch.
+ */
+static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
+			int nr_exclusive, int wake_flags, void *key)
+{
+	wait_queue_entry_t bookmark;
+	unsigned long flags;
+
+	/* Start from an unlinked bookmark that carries no flags. */
+	INIT_LIST_HEAD(&bookmark.entry);
+	bookmark.func = NULL;
+	bookmark.private = NULL;
+	bookmark.flags = 0;
+
+	do {
+		spin_lock_irqsave(&wq_head->lock, flags);
+		nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
+						wake_flags, key, &bookmark);
+		spin_unlock_irqrestore(&wq_head->lock, flags);
+	} while (bookmark.flags & WQ_FLAG_BOOKMARK);
}
@@ -91,11 +146,7 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
int nr_exclusive, void *key)
{
- unsigned long flags;
-
- spin_lock_irqsave(&wq_head->lock, flags);
- __wake_up_common(wq_head, mode, nr_exclusive, 0, key);
- spin_unlock_irqrestore(&wq_head->lock, flags);
+ __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
}
EXPORT_SYMBOL(__wake_up);
@@ -104,16 +155,23 @@ EXPORT_SYMBOL(__wake_up);
*/
void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr)
{
- __wake_up_common(wq_head, mode, nr, 0, NULL);
+ __wake_up_common(wq_head, mode, nr, 0, NULL, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_locked);
void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key)
{
- __wake_up_common(wq_head, mode, 1, 0, key);
+ __wake_up_common(wq_head, mode, 1, 0, key, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key);
+/*
+ * Same as __wake_up_locked_key(), except that @bookmark is handed on to
+ * __wake_up_common() instead of NULL.  No lock is taken here.
+ */
+void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
+		unsigned int mode, void *key, wait_queue_entry_t *bookmark)
+{
+	const int nr_exclusive = 1;	/* stop after one exclusive waiter is woken */
+	const int wake_flags = 0;
+
+	__wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, bookmark);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
+
/**
* __wake_up_sync_key - wake up threads blocked on a waitqueue.
* @wq_head: the waitqueue
@@ -134,7 +192,6 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key);
void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
int nr_exclusive, void *key)
{
- unsigned long flags;
int wake_flags = 1; /* XXX WF_SYNC */
if (unlikely(!wq_head))
@@ -143,9 +200,7 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
if (unlikely(nr_exclusive != 1))
wake_flags = 0;
- spin_lock_irqsave(&wq_head->lock, flags);
- __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key);
- spin_unlock_irqrestore(&wq_head->lock, flags);
+ __wake_up_common_lock(wq_head, mode, nr_exclusive, wake_flags, key);
}
EXPORT_SYMBOL_GPL(__wake_up_sync_key);
diff --git a/kernel/signal.c b/kernel/signal.c
index ed804a470dcd..800a18f77732 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2686,6 +2686,51 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
}
#endif
+/**
+ * siginfo_layout - pick the siginfo layout for a signal and si_code
+ * @sig: signal number
+ * @si_code: si_code value of the siginfo being classified
+ *
+ * The result defaults to SIL_KILL.  For si_code values strictly between
+ * SI_USER and SI_KERNEL a per-signal table is consulted: when si_code does
+ * not exceed the limit recorded for @sig the table's layout is used,
+ * otherwise any si_code up to NSIGPOLL is treated as SIL_POLL.  Outside
+ * that range SI_TIMER maps to SIL_TIMER, SI_SIGIO to SIL_POLL and any
+ * other negative si_code to SIL_RT.
+ *
+ * Return: the layout that copy_siginfo_to_user() switches on to decide
+ * which siginfo fields to copy.
+ */
+enum siginfo_layout siginfo_layout(int sig, int si_code)
+{
+	enum siginfo_layout layout = SIL_KILL;
+	if ((si_code > SI_USER) && (si_code < SI_KERNEL)) {
+		/* Indexed by signal number; signals not listed get limit 0. */
+		static const struct {
+			unsigned char limit, layout;
+		} filter[] = {
+			[SIGILL]  = { NSIGILL,  SIL_FAULT },
+			[SIGFPE]  = { NSIGFPE,  SIL_FAULT },
+			[SIGSEGV] = { NSIGSEGV, SIL_FAULT },
+			[SIGBUS]  = { NSIGBUS,  SIL_FAULT },
+			[SIGTRAP] = { NSIGTRAP, SIL_FAULT },
+			/* SIGEMT and NSIGEMT are not defined everywhere, hence the guard. */
+#if defined(SIGEMT) && defined(NSIGEMT)
+			[SIGEMT]  = { NSIGEMT,  SIL_FAULT },
+#endif
+			[SIGCHLD] = { NSIGCHLD, SIL_CHLD },
+			[SIGPOLL] = { NSIGPOLL, SIL_POLL },
+#ifdef __ARCH_SIGSYS
+			[SIGSYS]  = { NSIGSYS,  SIL_SYS },
+#endif
+		};
+		/* A negative sig converts to a huge unsigned value and fails the bound. */
+		if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit))
+			layout = filter[sig].layout;
+		else if (si_code <= NSIGPOLL)
+			layout = SIL_POLL;
+	} else {
+		if (si_code == SI_TIMER)
+			layout = SIL_TIMER;
+		else if (si_code == SI_SIGIO)
+			layout = SIL_POLL;
+		else if (si_code < 0)
+			layout = SIL_RT;
+		/* Tests to support buggy kernel ABIs */
+#ifdef TRAP_FIXME
+		if ((sig == SIGTRAP) && (si_code == TRAP_FIXME))
+			layout = SIL_FAULT;
+#endif
+#ifdef FPE_FIXME
+		if ((sig == SIGFPE) && (si_code == FPE_FIXME))
+			layout = SIL_FAULT;
+#endif
+	}
+	return layout;
+}
+
#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
@@ -2708,22 +2753,20 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
*/
err = __put_user(from->si_signo, &to->si_signo);
err |= __put_user(from->si_errno, &to->si_errno);
- err |= __put_user((short)from->si_code, &to->si_code);
- switch (from->si_code & __SI_MASK) {
- case __SI_KILL:
+ err |= __put_user(from->si_code, &to->si_code);
+ switch (siginfo_layout(from->si_signo, from->si_code)) {
+ case SIL_KILL:
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
break;
- case __SI_TIMER:
- err |= __put_user(from->si_tid, &to->si_tid);
- err |= __put_user(from->si_overrun, &to->si_overrun);
- err |= __put_user(from->si_ptr, &to->si_ptr);
+ case SIL_TIMER:
+ /* Unreached SI_TIMER is negative */
break;
- case __SI_POLL:
+ case SIL_POLL:
err |= __put_user(from->si_band, &to->si_band);
err |= __put_user(from->si_fd, &to->si_fd);
break;
- case __SI_FAULT:
+ case SIL_FAULT:
err |= __put_user(from->si_addr, &to->si_addr);
#ifdef __ARCH_SI_TRAPNO
err |= __put_user(from->si_trapno, &to->si_trapno);
@@ -2748,30 +2791,25 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
err |= __put_user(from->si_pkey, &to->si_pkey);
#endif
break;
- case __SI_CHLD:
+ case SIL_CHLD:
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_status, &to->si_status);
err |= __put_user(from->si_utime, &to->si_utime);
err |= __put_user(from->si_stime, &to->si_stime);
break;
- case __SI_RT: /* This is not generated by the kernel as of now. */
- case __SI_MESGQ: /* But this is */
+ case SIL_RT:
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_ptr, &to->si_ptr);
break;
#ifdef __ARCH_SIGSYS
- case __SI_SYS:
+ case SIL_SYS:
err |= __put_user(from->si_call_addr, &to->si_call_addr);
err |= __put_user(from->si_syscall, &to->si_syscall);
err |= __put_user(from->si_arch, &to->si_arch);
break;
#endif
- default: /* this is just in case for now ... */
- err |= __put_user(from->si_pid, &to->si_pid);
- err |= __put_user(from->si_uid, &to->si_uid);
- break;
}
return err;
}
diff --git a/kernel/smp.c b/kernel/smp.c
index 81cfca9b4cc3..c94dd85c8d41 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -550,7 +550,7 @@ static int __init maxcpus(char *str)
early_param("maxcpus", maxcpus);
/* Setup number of possible processor ids */
-int nr_cpu_ids __read_mostly = NR_CPUS;
+unsigned int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);
/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
diff --git a/kernel/sys.c b/kernel/sys.c
index 2855ee73acd0..9aebc2935013 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1896,15 +1896,11 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map)
/*
* Finally, make sure the caller has the rights to
- * change /proc/pid/exe link: only local root should
+ * change /proc/pid/exe link: only local sys admin should
* be allowed to.
*/
if (prctl_map->exe_fd != (u32)-1) {
- struct user_namespace *ns = current_user_ns();
- const struct cred *cred = current_cred();
-
- if (!uid_eq(cred->uid, make_kuid(ns, 0)) ||
- !gid_eq(cred->gid, make_kgid(ns, 0)))
+ if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
goto out;
}
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 02e1859f2ca8..58ea8c03662e 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -986,8 +986,9 @@ static ssize_t bin_intvec(struct file *file,
size_t length = oldlen / sizeof(*vec);
char *str, *end;
int i;
+ loff_t pos = 0;
- result = kernel_read(file, 0, buffer, BUFSZ - 1);
+ result = kernel_read(file, buffer, BUFSZ - 1, &pos);
if (result < 0)
goto out_kfree;
@@ -1016,6 +1017,7 @@ static ssize_t bin_intvec(struct file *file,
size_t length = newlen / sizeof(*vec);
char *str, *end;
int i;
+ loff_t pos = 0;
str = buffer;
end = str + BUFSZ;
@@ -1029,7 +1031,7 @@ static ssize_t bin_intvec(struct file *file,
str += scnprintf(str, end - str, "%lu\t", value);
}
- result = kernel_write(file, buffer, str - buffer, 0);
+ result = kernel_write(file, buffer, str - buffer, &pos);
if (result < 0)
goto out_kfree;
}
@@ -1057,8 +1059,9 @@ static ssize_t bin_ulongvec(struct file *file,
size_t length = oldlen / sizeof(*vec);
char *str, *end;
int i;
+ loff_t pos = 0;
- result = kernel_read(file, 0, buffer, BUFSZ - 1);
+ result = kernel_read(file, buffer, BUFSZ - 1, &pos);
if (result < 0)
goto out_kfree;
@@ -1087,6 +1090,7 @@ static ssize_t bin_ulongvec(struct file *file,
size_t length = newlen / sizeof(*vec);
char *str, *end;
int i;
+ loff_t pos = 0;
str = buffer;
end = str + BUFSZ;
@@ -1100,7 +1104,7 @@ static ssize_t bin_ulongvec(struct file *file,
str += scnprintf(str, end - str, "%lu\t", value);
}
- result = kernel_write(file, buffer, str - buffer, 0);
+ result = kernel_write(file, buffer, str - buffer, &pos);
if (result < 0)
goto out_kfree;
}
@@ -1120,8 +1124,9 @@ static ssize_t bin_uuid(struct file *file,
if (oldval && oldlen) {
char buf[UUID_STRING_LEN + 1];
uuid_t uuid;
+ loff_t pos = 0;
- result = kernel_read(file, 0, buf, sizeof(buf) - 1);
+ result = kernel_read(file, buf, sizeof(buf) - 1, &pos);
if (result < 0)
goto out;
@@ -1154,8 +1159,9 @@ static ssize_t bin_dn_node_address(struct file *file,
char buf[15], *nodep;
unsigned long area, node;
__le16 dnaddr;
+ loff_t pos = 0;
- result = kernel_read(file, 0, buf, sizeof(buf) - 1);
+ result = kernel_read(file, buf, sizeof(buf) - 1, &pos);
if (result < 0)
goto out;
@@ -1188,6 +1194,7 @@ static ssize_t bin_dn_node_address(struct file *file,
__le16 dnaddr;
char buf[15];
int len;
+ loff_t pos = 0;
result = -EINVAL;
if (newlen != sizeof(dnaddr))
@@ -1201,7 +1208,7 @@ static ssize_t bin_dn_node_address(struct file *file,
le16_to_cpu(dnaddr) >> 10,
le16_to_cpu(dnaddr) & 0x3ff);
- result = kernel_write(file, buf, len, 0);
+ result = kernel_write(file, buf, len, &pos);
if (result < 0)
goto out;
}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 8ea4fb315719..2cafb49aa65e 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2316,7 +2316,7 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}
EXPORT_SYMBOL(hardpps);
-#endif
+#endif /* CONFIG_NTP_PPS */
/**
* xtime_update() - advances the timekeeping infrastructure
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index bc364f86100a..2a685b45b73b 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -27,6 +27,7 @@
#include <linux/time.h>
#include <linux/uaccess.h>
#include <linux/list.h>
+#include <linux/blk-cgroup.h>
#include "../../block/blk.h"
@@ -46,10 +47,16 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock);
/* Select an alternative, minimalistic output than the original one */
#define TRACE_BLK_OPT_CLASSIC 0x1
+#define TRACE_BLK_OPT_CGROUP 0x2
+#define TRACE_BLK_OPT_CGNAME 0x4
static struct tracer_opt blk_tracer_opts[] = {
/* Default disable the minimalistic output */
{ TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) },
+#ifdef CONFIG_BLK_CGROUP
+ { TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) },
+ { TRACER_OPT(blk_cgname, TRACE_BLK_OPT_CGNAME) },
+#endif
{ }
};
@@ -68,7 +75,8 @@ static void blk_unregister_tracepoints(void);
* Send out a notify message.
*/
static void trace_note(struct blk_trace *bt, pid_t pid, int action,
- const void *data, size_t len)
+ const void *data, size_t len,
+ union kernfs_node_id *cgid)
{
struct blk_io_trace *t;
struct ring_buffer_event *event = NULL;
@@ -76,12 +84,13 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
int pc = 0;
int cpu = smp_processor_id();
bool blk_tracer = blk_tracer_enabled;
+ ssize_t cgid_len = cgid ? sizeof(*cgid) : 0;
if (blk_tracer) {
buffer = blk_tr->trace_buffer.buffer;
pc = preempt_count();
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
- sizeof(*t) + len,
+ sizeof(*t) + len + cgid_len,
0, pc);
if (!event)
return;
@@ -92,17 +101,19 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
if (!bt->rchan)
return;
- t = relay_reserve(bt->rchan, sizeof(*t) + len);
+ t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len);
if (t) {
t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
t->time = ktime_to_ns(ktime_get());
record_it:
t->device = bt->dev;
- t->action = action;
+ t->action = action | (cgid ? __BLK_TN_CGROUP : 0);
t->pid = pid;
t->cpu = cpu;
- t->pdu_len = len;
- memcpy((void *) t + sizeof(*t), data, len);
+ t->pdu_len = len + cgid_len;
+ if (cgid)
+ memcpy((void *)t + sizeof(*t), cgid, cgid_len);
+ memcpy((void *) t + sizeof(*t) + cgid_len, data, len);
if (blk_tracer)
trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
@@ -122,7 +133,7 @@ static void trace_note_tsk(struct task_struct *tsk)
spin_lock_irqsave(&running_trace_lock, flags);
list_for_each_entry(bt, &running_trace_list, running_list) {
trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm,
- sizeof(tsk->comm));
+ sizeof(tsk->comm), NULL);
}
spin_unlock_irqrestore(&running_trace_lock, flags);
}
@@ -139,11 +150,12 @@ static void trace_note_time(struct blk_trace *bt)
words[1] = now.tv_nsec;
local_irq_save(flags);
- trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words));
+ trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), NULL);
local_irq_restore(flags);
}
-void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
+void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg,
+ const char *fmt, ...)
{
int n;
va_list args;
@@ -167,7 +179,14 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
va_end(args);
- trace_note(bt, 0, BLK_TN_MESSAGE, buf, n);
+ if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
+ blkcg = NULL;
+#ifdef CONFIG_BLK_CGROUP
+ trace_note(bt, 0, BLK_TN_MESSAGE, buf, n,
+ blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : NULL);
+#else
+ trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL);
+#endif
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(__trace_note_message);
@@ -204,7 +223,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
*/
static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
int op, int op_flags, u32 what, int error, int pdu_len,
- void *pdu_data)
+ void *pdu_data, union kernfs_node_id *cgid)
{
struct task_struct *tsk = current;
struct ring_buffer_event *event = NULL;
@@ -215,6 +234,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
pid_t pid;
int cpu, pc = 0;
bool blk_tracer = blk_tracer_enabled;
+ ssize_t cgid_len = cgid ? sizeof(*cgid) : 0;
if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
return;
@@ -229,6 +249,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
what |= BLK_TC_ACT(BLK_TC_DISCARD);
if (op == REQ_OP_FLUSH)
what |= BLK_TC_ACT(BLK_TC_FLUSH);
+ if (cgid)
+ what |= __BLK_TA_CGROUP;
pid = tsk->pid;
if (act_log_check(bt, what, sector, pid))
@@ -241,7 +263,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
buffer = blk_tr->trace_buffer.buffer;
pc = preempt_count();
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
- sizeof(*t) + pdu_len,
+ sizeof(*t) + pdu_len + cgid_len,
0, pc);
if (!event)
return;
@@ -258,7 +280,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
* from coming in and stepping on our toes.
*/
local_irq_save(flags);
- t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
+ t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len);
if (t) {
sequence = per_cpu_ptr(bt->sequence, cpu);
@@ -280,10 +302,12 @@ record_it:
t->action = what;
t->device = bt->dev;
t->error = error;
- t->pdu_len = pdu_len;
+ t->pdu_len = pdu_len + cgid_len;
+ if (cgid_len)
+ memcpy((void *)t + sizeof(*t), cgid, cgid_len);
if (pdu_len)
- memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
+ memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);
if (blk_tracer) {
trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
@@ -359,7 +383,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
return PTR_ERR(msg);
bt = filp->private_data;
- __trace_note_message(bt, "%s", msg);
+ __trace_note_message(bt, NULL, "%s", msg);
kfree(msg);
return count;
@@ -684,6 +708,36 @@ void blk_trace_shutdown(struct request_queue *q)
}
}
+#ifdef CONFIG_BLK_CGROUP
+/*
+ * Fetch the kernfs node id of the cgroup attached to @bio via its css.
+ * Yields NULL when @q has no blk_trace, when the blk_cgroup tracer option
+ * is off, or when the bio has no css.
+ */
+static union kernfs_node_id *
+blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
+{
+	if (!q->blk_trace)
+		return NULL;
+	if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
+		return NULL;
+
+	return bio->bi_css ? cgroup_get_kernfs_id(bio->bi_css->cgroup) : NULL;
+}
+#else
+/* Without CONFIG_BLK_CGROUP there is never a cgroup id to report. */
+static union kernfs_node_id *
+blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
+{
+	return NULL;
+}
+#endif
+
+/*
+ * Request flavour of blk_trace_bio_get_cgid(): the id is taken from the
+ * request's first bio, and is NULL when the request carries no bio.
+ */
+static union kernfs_node_id *
+blk_trace_request_get_cgid(struct request_queue *q, struct request *rq)
+{
+	struct bio *first = rq->bio;
+
+	return first ? blk_trace_bio_get_cgid(q, first) : NULL;
+}
+
/*
* blktrace probes
*/
@@ -694,13 +748,15 @@ void blk_trace_shutdown(struct request_queue *q)
* @error: return status to log
* @nr_bytes: number of completed bytes
* @what: the action
+ * @cgid: the cgroup info
*
* Description:
* Records an action against a request. Will log the bio offset + size.
*
**/
static void blk_add_trace_rq(struct request *rq, int error,
- unsigned int nr_bytes, u32 what)
+ unsigned int nr_bytes, u32 what,
+ union kernfs_node_id *cgid)
{
struct blk_trace *bt = rq->q->blk_trace;
@@ -713,32 +769,36 @@ static void blk_add_trace_rq(struct request *rq, int error,
what |= BLK_TC_ACT(BLK_TC_FS);
__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
- rq->cmd_flags, what, error, 0, NULL);
+ rq->cmd_flags, what, error, 0, NULL, cgid);
}
static void blk_add_trace_rq_insert(void *ignore,
struct request_queue *q, struct request *rq)
{
- blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT,
+ blk_trace_request_get_cgid(q, rq));
}
static void blk_add_trace_rq_issue(void *ignore,
struct request_queue *q, struct request *rq)
{
- blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE,
+ blk_trace_request_get_cgid(q, rq));
}
static void blk_add_trace_rq_requeue(void *ignore,
struct request_queue *q,
struct request *rq)
{
- blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE,
+ blk_trace_request_get_cgid(q, rq));
}
static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
int error, unsigned int nr_bytes)
{
- blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE);
+ blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE,
+ blk_trace_request_get_cgid(rq->q, rq));
}
/**
@@ -753,7 +813,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
*
**/
static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
- u32 what, int error)
+ u32 what, int error, union kernfs_node_id *cgid)
{
struct blk_trace *bt = q->blk_trace;
@@ -761,20 +821,22 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
return;
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio_op(bio), bio->bi_opf, what, error, 0, NULL);
+ bio_op(bio), bio->bi_opf, what, error, 0, NULL, cgid);
}
static void blk_add_trace_bio_bounce(void *ignore,
struct request_queue *q, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_bio_complete(void *ignore,
struct request_queue *q, struct bio *bio,
int error)
{
- blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
+ blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_bio_backmerge(void *ignore,
@@ -782,7 +844,8 @@ static void blk_add_trace_bio_backmerge(void *ignore,
struct request *rq,
struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_bio_frontmerge(void *ignore,
@@ -790,13 +853,15 @@ static void blk_add_trace_bio_frontmerge(void *ignore,
struct request *rq,
struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_bio_queue(void *ignore,
struct request_queue *q, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_getrq(void *ignore,
@@ -804,13 +869,14 @@ static void blk_add_trace_getrq(void *ignore,
struct bio *bio, int rw)
{
if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0,
+ blk_trace_bio_get_cgid(q, bio));
else {
struct blk_trace *bt = q->blk_trace;
if (bt)
__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
- NULL);
+ NULL, NULL);
}
}
@@ -820,13 +886,14 @@ static void blk_add_trace_sleeprq(void *ignore,
struct bio *bio, int rw)
{
if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
+ blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0,
+ blk_trace_bio_get_cgid(q, bio));
else {
struct blk_trace *bt = q->blk_trace;
if (bt)
__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
- 0, 0, NULL);
+ 0, 0, NULL, NULL);
}
}
@@ -835,7 +902,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
struct blk_trace *bt = q->blk_trace;
if (bt)
- __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
+ __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL);
}
static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
@@ -852,7 +919,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
else
what = BLK_TA_UNPLUG_TIMER;
- __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+ __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL);
}
}
@@ -868,7 +935,7 @@ static void blk_add_trace_split(void *ignore,
__blk_add_trace(bt, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu),
- &rpdu);
+ &rpdu, blk_trace_bio_get_cgid(q, bio));
}
}
@@ -896,12 +963,12 @@ static void blk_add_trace_bio_remap(void *ignore,
return;
r.device_from = cpu_to_be32(dev);
- r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev);
+ r.device_to = cpu_to_be32(bio_dev(bio));
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status,
- sizeof(r), &r);
+ sizeof(r), &r, blk_trace_bio_get_cgid(q, bio));
}
/**
@@ -934,7 +1001,7 @@ static void blk_add_trace_rq_remap(void *ignore,
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
- sizeof(r), &r);
+ sizeof(r), &r, blk_trace_request_get_cgid(q, rq));
}
/**
@@ -958,7 +1025,8 @@ void blk_add_driver_data(struct request_queue *q,
return;
__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
- BLK_TA_DRV_DATA, 0, len, data);
+ BLK_TA_DRV_DATA, 0, len, data,
+ blk_trace_request_get_cgid(q, rq));
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -1031,7 +1099,7 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
int i = 0;
int tc = t->action >> BLK_TC_SHIFT;
- if (t->action == BLK_TN_MESSAGE) {
+ if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) {
rwbs[i++] = 'N';
goto out;
}
@@ -1066,9 +1134,21 @@ const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
return (const struct blk_io_trace *)ent;
}
-static inline const void *pdu_start(const struct trace_entry *ent)
+static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg)
+{
+ return (void *)(te_blk_io_trace(ent) + 1) +
+ (has_cg ? sizeof(union kernfs_node_id) : 0);
+}
+
+/* Address of the bytes that directly follow the blk_io_trace header of @ent. */
+static inline const void *cgid_start(const struct trace_entry *ent)
+{
+	const struct blk_io_trace *hdr = te_blk_io_trace(ent);
+
+	return (void *)(hdr + 1);
+}
+
+static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg)
{
- return te_blk_io_trace(ent) + 1;
+ return te_blk_io_trace(ent)->pdu_len -
+ (has_cg ? sizeof(union kernfs_node_id) : 0);
}
static inline u32 t_action(const struct trace_entry *ent)
@@ -1096,16 +1176,16 @@ static inline __u16 t_error(const struct trace_entry *ent)
return te_blk_io_trace(ent)->error;
}
-static __u64 get_pdu_int(const struct trace_entry *ent)
+static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg)
{
- const __u64 *val = pdu_start(ent);
+ const __u64 *val = pdu_start(ent, has_cg);
return be64_to_cpu(*val);
}
static void get_pdu_remap(const struct trace_entry *ent,
- struct blk_io_trace_remap *r)
+ struct blk_io_trace_remap *r, bool has_cg)
{
- const struct blk_io_trace_remap *__r = pdu_start(ent);
+ const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg);
__u64 sector_from = __r->sector_from;
r->device_from = be32_to_cpu(__r->device_from);
@@ -1113,9 +1193,11 @@ static void get_pdu_remap(const struct trace_entry *ent,
r->sector_from = be64_to_cpu(sector_from);
}
-typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act);
+typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act,
+ bool has_cg);
-static void blk_log_action_classic(struct trace_iterator *iter, const char *act)
+static void blk_log_action_classic(struct trace_iterator *iter, const char *act,
+ bool has_cg)
{
char rwbs[RWBS_LEN];
unsigned long long ts = iter->ts;
@@ -1131,24 +1213,43 @@ static void blk_log_action_classic(struct trace_iterator *iter, const char *act)
secs, nsec_rem, iter->ent->pid, act, rwbs);
}
-static void blk_log_action(struct trace_iterator *iter, const char *act)
+static void blk_log_action(struct trace_iterator *iter, const char *act,
+ bool has_cg)
{
char rwbs[RWBS_LEN];
const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
fill_rwbs(rwbs, t);
- trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
- MAJOR(t->device), MINOR(t->device), act, rwbs);
-}
-
-static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
+ if (has_cg) {
+ const union kernfs_node_id *id = cgid_start(iter->ent);
+
+ if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) {
+ char blkcg_name_buf[NAME_MAX + 1] = "<...>";
+
+ cgroup_path_from_kernfs_id(id, blkcg_name_buf,
+ sizeof(blkcg_name_buf));
+ trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ",
+ MAJOR(t->device), MINOR(t->device),
+ blkcg_name_buf, act, rwbs);
+ } else
+ trace_seq_printf(&iter->seq,
+ "%3d,%-3d %x,%-x %2s %3s ",
+ MAJOR(t->device), MINOR(t->device),
+ id->ino, id->generation, act, rwbs);
+ } else
+ trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
+ MAJOR(t->device), MINOR(t->device), act, rwbs);
+}
+
+static void blk_log_dump_pdu(struct trace_seq *s,
+ const struct trace_entry *ent, bool has_cg)
{
const unsigned char *pdu_buf;
int pdu_len;
int i, end;
- pdu_buf = pdu_start(ent);
- pdu_len = te_blk_io_trace(ent)->pdu_len;
+ pdu_buf = pdu_start(ent, has_cg);
+ pdu_len = pdu_real_len(ent, has_cg);
if (!pdu_len)
return;
@@ -1179,7 +1280,7 @@ static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
trace_seq_puts(s, ") ");
}
-static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
char cmd[TASK_COMM_LEN];
@@ -1187,7 +1288,7 @@ static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
trace_seq_printf(s, "%u ", t_bytes(ent));
- blk_log_dump_pdu(s, ent);
+ blk_log_dump_pdu(s, ent, has_cg);
trace_seq_printf(s, "[%s]\n", cmd);
} else {
if (t_sec(ent))
@@ -1199,10 +1300,10 @@ static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
}
static void blk_log_with_error(struct trace_seq *s,
- const struct trace_entry *ent)
+ const struct trace_entry *ent, bool has_cg)
{
if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
- blk_log_dump_pdu(s, ent);
+ blk_log_dump_pdu(s, ent, has_cg);
trace_seq_printf(s, "[%d]\n", t_error(ent));
} else {
if (t_sec(ent))
@@ -1215,18 +1316,18 @@ static void blk_log_with_error(struct trace_seq *s,
}
}
-static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
struct blk_io_trace_remap r = { .device_from = 0, };
- get_pdu_remap(ent, &r);
+ get_pdu_remap(ent, &r, has_cg);
trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
t_sector(ent), t_sec(ent),
MAJOR(r.device_from), MINOR(r.device_from),
(unsigned long long)r.sector_from);
}
-static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
char cmd[TASK_COMM_LEN];
@@ -1235,30 +1336,31 @@ static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
trace_seq_printf(s, "[%s]\n", cmd);
}
-static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
- trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
+ trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent, has_cg));
}
-static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
- get_pdu_int(ent), cmd);
+ get_pdu_int(ent, has_cg), cmd);
}
-static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent,
+ bool has_cg)
{
- const struct blk_io_trace *t = te_blk_io_trace(ent);
- trace_seq_putmem(s, t + 1, t->pdu_len);
+ trace_seq_putmem(s, pdu_start(ent, has_cg),
+ pdu_real_len(ent, has_cg));
trace_seq_putc(s, '\n');
}
@@ -1298,7 +1400,8 @@ static void blk_tracer_reset(struct trace_array *tr)
static const struct {
const char *act[2];
- void (*print)(struct trace_seq *s, const struct trace_entry *ent);
+ void (*print)(struct trace_seq *s, const struct trace_entry *ent,
+ bool has_cg);
} what2act[] = {
[__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic },
[__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic },
@@ -1326,23 +1429,25 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
u16 what;
bool long_act;
blk_log_action_t *log_action;
+ bool has_cg;
t = te_blk_io_trace(iter->ent);
- what = t->action & ((1 << BLK_TC_SHIFT) - 1);
+ what = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP;
long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE);
log_action = classic ? &blk_log_action_classic : &blk_log_action;
+ has_cg = t->action & __BLK_TA_CGROUP;
- if (t->action == BLK_TN_MESSAGE) {
- log_action(iter, long_act ? "message" : "m");
- blk_log_msg(s, iter->ent);
+ if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) {
+ log_action(iter, long_act ? "message" : "m", has_cg);
+ blk_log_msg(s, iter->ent, has_cg);
return trace_handle_return(s);
}
if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
trace_seq_printf(s, "Unknown action %x\n", what);
else {
- log_action(iter, what2act[what].act[long_act]);
- what2act[what].print(s, iter->ent);
+ log_action(iter, what2act[what].act[long_act], has_cg);
+ what2act[what].print(s, iter->ent, has_cg);
}
return trace_handle_return(s);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 96cea88fa00f..6abfafd7f173 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2828,13 +2828,14 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
if (!command || !ftrace_enabled) {
/*
- * If these are per_cpu ops, they still need their
- * per_cpu field freed. Since, function tracing is
+ * If these are dynamic or per_cpu ops, they still
+ * need their data freed. Since function tracing is
* not currently active, we can just free them
* without synchronizing all CPUs.
*/
- if (ops->flags & FTRACE_OPS_FL_PER_CPU)
- per_cpu_ops_free(ops);
+ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU))
+ goto free_ops;
+
return 0;
}
@@ -2900,6 +2901,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
if (IS_ENABLED(CONFIG_PREEMPT))
synchronize_rcu_tasks();
+ free_ops:
arch_ftrace_trampoline_free(ops);
if (ops->flags & FTRACE_OPS_FL_PER_CPU)
@@ -5690,10 +5692,51 @@ static int referenced_filters(struct dyn_ftrace *rec)
return cnt;
}
+static void
+clear_mod_from_hash(struct ftrace_page *pg, struct ftrace_hash *hash)
+{
+ struct ftrace_func_entry *entry;
+ struct dyn_ftrace *rec;
+ int i;
+
+ if (ftrace_hash_empty(hash))
+ return;
+
+ for (i = 0; i < pg->index; i++) {
+ rec = &pg->records[i];
+ entry = __ftrace_lookup_ip(hash, rec->ip);
+ /*
+ * Do not allow this rec to match again.
+ * Yeah, it may waste some memory, but will be removed
+ * if/when the hash is modified again.
+ */
+ if (entry)
+ entry->ip = 0;
+ }
+}
+
+/* Clear any records from hashes */
+static void clear_mod_from_hashes(struct ftrace_page *pg)
+{
+ struct trace_array *tr;
+
+ mutex_lock(&trace_types_lock);
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (!tr->ops || !tr->ops->func_hash)
+ continue;
+ mutex_lock(&tr->ops->func_hash->regex_lock);
+ clear_mod_from_hash(pg, tr->ops->func_hash->filter_hash);
+ clear_mod_from_hash(pg, tr->ops->func_hash->notrace_hash);
+ mutex_unlock(&tr->ops->func_hash->regex_lock);
+ }
+ mutex_unlock(&trace_types_lock);
+}
+
void ftrace_release_mod(struct module *mod)
{
struct dyn_ftrace *rec;
struct ftrace_page **last_pg;
+ struct ftrace_page *tmp_page = NULL;
struct ftrace_page *pg;
int order;
@@ -5723,14 +5766,25 @@ void ftrace_release_mod(struct module *mod)
ftrace_update_tot_cnt -= pg->index;
*last_pg = pg->next;
- order = get_count_order(pg->size / ENTRIES_PER_PAGE);
- free_pages((unsigned long)pg->records, order);
- kfree(pg);
+
+ pg->next = tmp_page;
+ tmp_page = pg;
} else
last_pg = &pg->next;
}
out_unlock:
mutex_unlock(&ftrace_lock);
+
+ for (pg = tmp_page; pg; pg = tmp_page) {
+
+ /* Needs to be called outside of ftrace_lock */
+ clear_mod_from_hashes(pg);
+
+ order = get_count_order(pg->size / ENTRIES_PER_PAGE);
+ free_pages((unsigned long)pg->records, order);
+ tmp_page = pg->next;
+ kfree(pg);
+ }
}
void ftrace_module_enable(struct module *mod)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 44004d8aa3b3..5360b7aec57a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1702,6 +1702,9 @@ void tracing_reset_all_online_cpus(void)
struct trace_array *tr;
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (!tr->clear_trace)
+ continue;
+ tr->clear_trace = false;
tracing_reset_online_cpus(&tr->trace_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
tracing_reset_online_cpus(&tr->max_buffer);
@@ -2799,11 +2802,17 @@ static char *get_trace_buf(void)
if (!buffer || buffer->nesting >= 4)
return NULL;
- return &buffer->buffer[buffer->nesting++][0];
+ buffer->nesting++;
+
+ /* Interrupts must see nesting incremented before we use the buffer */
+ barrier();
+ return &buffer->buffer[buffer->nesting][0];
}
static void put_trace_buf(void)
{
+ /* Don't let the decrement of nesting leak before this */
+ barrier();
this_cpu_dec(trace_percpu_buffer->nesting);
}
@@ -6220,7 +6229,7 @@ static int tracing_set_clock(struct trace_array *tr, const char *clockstr)
tracing_reset_online_cpus(&tr->trace_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
- if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer)
+ if (tr->max_buffer.buffer)
ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func);
tracing_reset_online_cpus(&tr->max_buffer);
#endif
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 490ba229931d..fb5d54d0d1b3 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -245,6 +245,7 @@ struct trace_array {
int stop_count;
int clock_id;
int nr_topts;
+ bool clear_trace;
struct tracer *current_trace;
unsigned int trace_flags;
unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE];
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 36132f9280e6..87468398b9ed 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -406,7 +406,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
if (file->flags & EVENT_FILE_FL_RECORDED_TGID) {
tracing_stop_tgid_record();
- clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
+ clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
}
call->class->reg(call, TRACE_REG_UNREGISTER, file);
@@ -466,7 +466,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
set_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags);
/* WAS_ENABLED gets set but never cleared. */
- call->flags |= TRACE_EVENT_FL_WAS_ENABLED;
+ set_bit(EVENT_FILE_FL_WAS_ENABLED_BIT, &file->flags);
}
break;
}
@@ -2058,6 +2058,10 @@ static void event_remove(struct trace_event_call *call)
do_for_each_event_file(tr, file) {
if (file->event_call != call)
continue;
+
+ if (file->flags & EVENT_FILE_FL_WAS_ENABLED)
+ tr->clear_trace = true;
+
ftrace_event_enable_disable(file, 0);
/*
* The do_for_each_event_file() is
@@ -2396,15 +2400,11 @@ static void trace_module_add_events(struct module *mod)
static void trace_module_remove_events(struct module *mod)
{
struct trace_event_call *call, *p;
- bool clear_trace = false;
down_write(&trace_event_sem);
list_for_each_entry_safe(call, p, &ftrace_events, list) {
- if (call->mod == mod) {
- if (call->flags & TRACE_EVENT_FL_WAS_ENABLED)
- clear_trace = true;
+ if (call->mod == mod)
__trace_remove_event_call(call);
- }
}
up_write(&trace_event_sem);
@@ -2416,8 +2416,7 @@ static void trace_module_remove_events(struct module *mod)
* over from this module may be passed to the new module events and
* unexpected results may occur.
*/
- if (clear_trace)
- tracing_reset_all_online_cpus();
+ tracing_reset_all_online_cpus();
}
static int trace_module_notify(struct notifier_block *self,
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 181e139a8057..61e7f0678d33 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -702,7 +702,7 @@ static void append_filter_err(struct filter_parse_state *ps,
int pos = ps->lasterr_pos;
char *buf, *pbuf;
- buf = (char *)__get_free_page(GFP_TEMPORARY);
+ buf = (char *)__get_free_page(GFP_KERNEL);
if (!buf)
return;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d56123cdcc89..b8f1f54731af 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -1543,7 +1543,7 @@ fs_initcall(init_graph_tracefs);
static __init int init_graph_trace(void)
{
- max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
+ max_bytes_for_cpu = snprintf(NULL, 0, "%u", nr_cpu_ids - 1);
if (!register_trace_event(&graph_trace_entry_event)) {
pr_warn("Warning: could not register graph trace events\n");
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index cb917cebae29..b17ec642793b 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -273,7 +273,7 @@ static int trace_selftest_ops(struct trace_array *tr, int cnt)
goto out_free;
if (cnt > 1) {
if (trace_selftest_test_global_cnt == 0)
- goto out;
+ goto out_free;
}
if (trace_selftest_test_dyn_cnt == 0)
goto out_free;
diff --git a/kernel/umh.c b/kernel/umh.c
new file mode 100644
index 000000000000..6ff9905250ff
--- /dev/null
+++ b/kernel/umh.c
@@ -0,0 +1,568 @@
+/*
+ * umh - the kernel usermode helper
+ */
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/binfmts.h>
+#include <linux/syscalls.h>
+#include <linux/unistd.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/completion.h>
+#include <linux/cred.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/workqueue.h>
+#include <linux/security.h>
+#include <linux/mount.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/resource.h>
+#include <linux/notifier.h>
+#include <linux/suspend.h>
+#include <linux/rwsem.h>
+#include <linux/ptrace.h>
+#include <linux/async.h>
+#include <linux/uaccess.h>
+
+#include <trace/events/module.h>
+
+#define CAP_BSET (void *)1
+#define CAP_PI (void *)2
+
+static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
+static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
+static DEFINE_SPINLOCK(umh_sysctl_lock);
+static DECLARE_RWSEM(umhelper_sem);
+
+static void call_usermodehelper_freeinfo(struct subprocess_info *info)
+{
+ if (info->cleanup)
+ (*info->cleanup)(info);
+ kfree(info);
+}
+
+static void umh_complete(struct subprocess_info *sub_info)
+{
+ struct completion *comp = xchg(&sub_info->complete, NULL);
+ /*
+ * See call_usermodehelper_exec(). If xchg() returns NULL
+ * we own sub_info, the UMH_KILLABLE caller has gone away
+ * or the caller used UMH_NO_WAIT.
+ */
+ if (comp)
+ complete(comp);
+ else
+ call_usermodehelper_freeinfo(sub_info);
+}
+
+/*
+ * This is the task which runs the usermode application
+ */
+static int call_usermodehelper_exec_async(void *data)
+{
+ struct subprocess_info *sub_info = data;
+ struct cred *new;
+ int retval;
+
+ spin_lock_irq(&current->sighand->siglock);
+ flush_signal_handlers(current, 1);
+ spin_unlock_irq(&current->sighand->siglock);
+
+ /*
+ * Our parent (unbound workqueue) runs with elevated scheduling
+ * priority. Avoid propagating that into the userspace child.
+ */
+ set_user_nice(current, 0);
+
+ retval = -ENOMEM;
+ new = prepare_kernel_cred(current);
+ if (!new)
+ goto out;
+
+ spin_lock(&umh_sysctl_lock);
+ new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
+ new->cap_inheritable = cap_intersect(usermodehelper_inheritable,
+ new->cap_inheritable);
+ spin_unlock(&umh_sysctl_lock);
+
+ if (sub_info->init) {
+ retval = sub_info->init(sub_info, new);
+ if (retval) {
+ abort_creds(new);
+ goto out;
+ }
+ }
+
+ commit_creds(new);
+
+ retval = do_execve(getname_kernel(sub_info->path),
+ (const char __user *const __user *)sub_info->argv,
+ (const char __user *const __user *)sub_info->envp);
+out:
+ sub_info->retval = retval;
+ /*
+ * call_usermodehelper_exec_sync() will call umh_complete
+ * if UMH_WAIT_PROC.
+ */
+ if (!(sub_info->wait & UMH_WAIT_PROC))
+ umh_complete(sub_info);
+ if (!retval)
+ return 0;
+ do_exit(0);
+}
+
+/* Handles UMH_WAIT_PROC. */
+static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
+{
+ pid_t pid;
+
+ /* If SIGCHLD is ignored sys_wait4 won't populate the status. */
+ kernel_sigaction(SIGCHLD, SIG_DFL);
+ pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
+ if (pid < 0) {
+ sub_info->retval = pid;
+ } else {
+ int ret = -ECHILD;
+ /*
+ * Normally it is bogus to call wait4() from in-kernel because
+ * wait4() wants to write the exit code to a userspace address.
+ * But call_usermodehelper_exec_sync() always runs as kernel
+ * thread (workqueue) and put_user() to a kernel address works
+ * OK for kernel threads, due to their having an mm_segment_t
+ * which spans the entire address space.
+ *
+ * Thus the __user pointer cast is valid here.
+ */
+ sys_wait4(pid, (int __user *)&ret, 0, NULL);
+
+ /*
+ * If ret is 0, either call_usermodehelper_exec_async failed and
+ * the real error code is already in sub_info->retval or
+ * sub_info->retval is 0 anyway, so don't mess with it then.
+ */
+ if (ret)
+ sub_info->retval = ret;
+ }
+
+ /* Restore default kernel sig handler */
+ kernel_sigaction(SIGCHLD, SIG_IGN);
+
+ umh_complete(sub_info);
+}
+
+/*
+ * We need to create the usermodehelper kernel thread from a task that is affine
+ * to an optimized set of CPUs (or nohz housekeeping ones) such that they
+ * inherit the widest affinity irrespective of call_usermodehelper() callers with
+ * possibly reduced affinity (eg: per-cpu workqueues). We don't want
+ * usermodehelper targets to contend a busy CPU.
+ *
+ * Unbound workqueues provide such wide affinity and allow to block on
+ * UMH_WAIT_PROC requests without blocking pending requests (up to some limit).
+ *
+ * Besides, workqueues provide the privilege level that caller might not have
+ * to perform the usermodehelper request.
+ *
+ */
+static void call_usermodehelper_exec_work(struct work_struct *work)
+{
+ struct subprocess_info *sub_info =
+ container_of(work, struct subprocess_info, work);
+
+ if (sub_info->wait & UMH_WAIT_PROC) {
+ call_usermodehelper_exec_sync(sub_info);
+ } else {
+ pid_t pid;
+ /*
+ * Use CLONE_PARENT to reparent it to kthreadd; we do not
+ * want to pollute current->children, and we need a parent
+ * that always ignores SIGCHLD to ensure auto-reaping.
+ */
+ pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
+ CLONE_PARENT | SIGCHLD);
+ if (pid < 0) {
+ sub_info->retval = pid;
+ umh_complete(sub_info);
+ }
+ }
+}
+
+/*
+ * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
+ * (used for preventing user land processes from being created after the user
+ * land has been frozen during a system-wide hibernation or suspend operation).
+ * Should always be manipulated under umhelper_sem acquired for write.
+ */
+static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED;
+
+/* Number of helpers running */
+static atomic_t running_helpers = ATOMIC_INIT(0);
+
+/*
+ * Wait queue head used by usermodehelper_disable() to wait for all running
+ * helpers to finish.
+ */
+static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
+
+/*
+ * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled
+ * to become 'false'.
+ */
+static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq);
+
+/*
+ * Time to wait for running_helpers to become zero before the setting of
+ * usermodehelper_disabled in usermodehelper_disable() fails
+ */
+#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
+
+int usermodehelper_read_trylock(void)
+{
+ DEFINE_WAIT(wait);
+ int ret = 0;
+
+ down_read(&umhelper_sem);
+ for (;;) {
+ prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
+ TASK_INTERRUPTIBLE);
+ if (!usermodehelper_disabled)
+ break;
+
+ if (usermodehelper_disabled == UMH_DISABLED)
+ ret = -EAGAIN;
+
+ up_read(&umhelper_sem);
+
+ if (ret)
+ break;
+
+ schedule();
+ try_to_freeze();
+
+ down_read(&umhelper_sem);
+ }
+ finish_wait(&usermodehelper_disabled_waitq, &wait);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(usermodehelper_read_trylock);
+
+long usermodehelper_read_lock_wait(long timeout)
+{
+ DEFINE_WAIT(wait);
+
+ if (timeout < 0)
+ return -EINVAL;
+
+ down_read(&umhelper_sem);
+ for (;;) {
+ prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (!usermodehelper_disabled)
+ break;
+
+ up_read(&umhelper_sem);
+
+ timeout = schedule_timeout(timeout);
+ if (!timeout)
+ break;
+
+ down_read(&umhelper_sem);
+ }
+ finish_wait(&usermodehelper_disabled_waitq, &wait);
+ return timeout;
+}
+EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait);
+
+void usermodehelper_read_unlock(void)
+{
+ up_read(&umhelper_sem);
+}
+EXPORT_SYMBOL_GPL(usermodehelper_read_unlock);
+
+/**
+ * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled.
+ * @depth: New value to assign to usermodehelper_disabled.
+ *
+ * Change the value of usermodehelper_disabled (under umhelper_sem locked for
+ * writing) and wakeup tasks waiting for it to change.
+ */
+void __usermodehelper_set_disable_depth(enum umh_disable_depth depth)
+{
+ down_write(&umhelper_sem);
+ usermodehelper_disabled = depth;
+ wake_up(&usermodehelper_disabled_waitq);
+ up_write(&umhelper_sem);
+}
+
+/**
+ * __usermodehelper_disable - Prevent new helpers from being started.
+ * @depth: New value to assign to usermodehelper_disabled.
+ *
+ * Set usermodehelper_disabled to @depth and wait for running helpers to exit.
+ */
+int __usermodehelper_disable(enum umh_disable_depth depth)
+{
+ long retval;
+
+ if (!depth)
+ return -EINVAL;
+
+ down_write(&umhelper_sem);
+ usermodehelper_disabled = depth;
+ up_write(&umhelper_sem);
+
+ /*
+ * From now on call_usermodehelper_exec() won't start any new
+ * helpers, so it is sufficient if running_helpers turns out to
+ * be zero at one point (it may be increased later, but that
+ * doesn't matter).
+ */
+ retval = wait_event_timeout(running_helpers_waitq,
+ atomic_read(&running_helpers) == 0,
+ RUNNING_HELPERS_TIMEOUT);
+ if (retval)
+ return 0;
+
+ __usermodehelper_set_disable_depth(UMH_ENABLED);
+ return -EAGAIN;
+}
+
+static void helper_lock(void)
+{
+ atomic_inc(&running_helpers);
+ smp_mb__after_atomic();
+}
+
+static void helper_unlock(void)
+{
+ if (atomic_dec_and_test(&running_helpers))
+ wake_up(&running_helpers_waitq);
+}
+
+/**
+ * call_usermodehelper_setup - prepare to call a usermode helper
+ * @path: path to usermode executable
+ * @argv: arg vector for process
+ * @envp: environment for process
+ * @gfp_mask: gfp mask for memory allocation
+ * @cleanup: a cleanup function
+ * @init: an init function
+ * @data: arbitrary context sensitive data
+ *
+ * Returns either %NULL on allocation failure, or a subprocess_info
+ * structure. This should be passed to call_usermodehelper_exec to
+ * exec the process and free the structure.
+ *
+ * The init function is used to customize the helper process prior to
+ * exec. A non-zero return code causes the process to error out, exit,
+ * and return the failure to the calling process
+ *
+ * The cleanup function is called just before the subprocess_info is about to
+ * be freed. This can be used for freeing the argv and envp. The
+ * function must be runnable in either a process context or the
+ * context in which call_usermodehelper_exec is called.
+ */
+struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
+ char **envp, gfp_t gfp_mask,
+ int (*init)(struct subprocess_info *info, struct cred *new),
+ void (*cleanup)(struct subprocess_info *info),
+ void *data)
+{
+ struct subprocess_info *sub_info;
+ sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
+ if (!sub_info)
+ goto out;
+
+ INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
+
+#ifdef CONFIG_STATIC_USERMODEHELPER
+ sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH;
+#else
+ sub_info->path = path;
+#endif
+ sub_info->argv = argv;
+ sub_info->envp = envp;
+
+ sub_info->cleanup = cleanup;
+ sub_info->init = init;
+ sub_info->data = data;
+ out:
+ return sub_info;
+}
+EXPORT_SYMBOL(call_usermodehelper_setup);
+
+/**
+ * call_usermodehelper_exec - start a usermode application
+ * @sub_info: information about the subprocess
+ * @wait: wait for the application to finish and return status.
+ * when UMH_NO_WAIT don't wait at all, but you get no useful error back
+ * when the program couldn't be exec'ed. This makes it safe to call
+ * from interrupt context.
+ *
+ * Runs a user-space application. The application is started
+ * asynchronously if wait is not set, and runs as a child of system workqueues.
+ * (ie. it runs with full root capabilities and optimized affinity).
+ */
+int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
+{
+ DECLARE_COMPLETION_ONSTACK(done);
+ int retval = 0;
+
+ if (!sub_info->path) {
+ call_usermodehelper_freeinfo(sub_info);
+ return -EINVAL;
+ }
+ helper_lock();
+ if (usermodehelper_disabled) {
+ retval = -EBUSY;
+ goto out;
+ }
+
+ /*
+ * If there is no binary for us to call, then just return and get out of
+ * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and
+ * disable all call_usermodehelper() calls.
+ */
+ if (strlen(sub_info->path) == 0)
+ goto out;
+
+ /*
+ * Set the completion pointer only if there is a waiter.
+ * This makes it possible to use umh_complete to free
+ * the data structure in case of UMH_NO_WAIT.
+ */
+ sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
+ sub_info->wait = wait;
+
+ queue_work(system_unbound_wq, &sub_info->work);
+ if (wait == UMH_NO_WAIT) /* task has freed sub_info */
+ goto unlock;
+
+ if (wait & UMH_KILLABLE) {
+ retval = wait_for_completion_killable(&done);
+ if (!retval)
+ goto wait_done;
+
+ /* umh_complete() will see NULL and free sub_info */
+ if (xchg(&sub_info->complete, NULL))
+ goto unlock;
+ /* fallthrough, umh_complete() was already called */
+ }
+
+ wait_for_completion(&done);
+wait_done:
+ retval = sub_info->retval;
+out:
+ call_usermodehelper_freeinfo(sub_info);
+unlock:
+ helper_unlock();
+ return retval;
+}
+EXPORT_SYMBOL(call_usermodehelper_exec);
+
+/**
+ * call_usermodehelper() - prepare and start a usermode application
+ * @path: path to usermode executable
+ * @argv: arg vector for process
+ * @envp: environment for process
+ * @wait: wait for the application to finish and return status.
+ * when UMH_NO_WAIT don't wait at all, but you get no useful error back
+ * when the program couldn't be exec'ed. This makes it safe to call
+ * from interrupt context.
+ *
+ * This function is the equivalent to use call_usermodehelper_setup() and
+ * call_usermodehelper_exec().
+ */
+int call_usermodehelper(const char *path, char **argv, char **envp, int wait)
+{
+ struct subprocess_info *info;
+ gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
+
+ info = call_usermodehelper_setup(path, argv, envp, gfp_mask,
+ NULL, NULL, NULL);
+ if (info == NULL)
+ return -ENOMEM;
+
+ return call_usermodehelper_exec(info, wait);
+}
+EXPORT_SYMBOL(call_usermodehelper);
+
+static int proc_cap_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
+ kernel_cap_t new_cap;
+ int err, i;
+
+ if (write && (!capable(CAP_SETPCAP) ||
+ !capable(CAP_SYS_MODULE)))
+ return -EPERM;
+
+ /*
+ * convert from the global kernel_cap_t to the ulong array to print to
+ * userspace if this is a read.
+ */
+ spin_lock(&umh_sysctl_lock);
+ for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) {
+ if (table->data == CAP_BSET)
+ cap_array[i] = usermodehelper_bset.cap[i];
+ else if (table->data == CAP_PI)
+ cap_array[i] = usermodehelper_inheritable.cap[i];
+ else
+ BUG();
+ }
+ spin_unlock(&umh_sysctl_lock);
+
+ t = *table;
+ t.data = &cap_array;
+
+ /*
+ * actually read or write an array of ulongs from userspace. Remember
+ * these are least significant 32 bits first
+ */
+ err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+
+ /*
+ * convert from the sysctl array of ulongs to the kernel_cap_t
+ * internal representation
+ */
+ for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
+ new_cap.cap[i] = cap_array[i];
+
+ /*
+ * Drop everything not in the new_cap (but don't add things)
+ */
+ spin_lock(&umh_sysctl_lock);
+ if (write) {
+ if (table->data == CAP_BSET)
+ usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
+ if (table->data == CAP_PI)
+ usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
+ }
+ spin_unlock(&umh_sysctl_lock);
+
+ return 0;
+}
+
+struct ctl_table usermodehelper_table[] = {
+ {
+ .procname = "bset",
+ .data = CAP_BSET,
+ .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+ .mode = 0600,
+ .proc_handler = proc_cap_handler,
+ },
+ {
+ .procname = "inheritable",
+ .data = CAP_PI,
+ .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+ .mode = 0600,
+ .proc_handler = proc_cap_handler,
+ },
+ { }
+};
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 2f735cbe05e8..c490f1e4313b 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -986,17 +986,21 @@ bool userns_may_setgroups(const struct user_namespace *ns)
}
/*
- * Returns true if @ns is the same namespace as or a descendant of
- * @target_ns.
+ * Returns true if @child is the same namespace as or a descendant of
+ * @ancestor.
*/
+bool in_userns(const struct user_namespace *ancestor,
+ const struct user_namespace *child)
+{
+ const struct user_namespace *ns;
+ for (ns = child; ns->level > ancestor->level; ns = ns->parent)
+ ;
+ return (ns == ancestor);
+}
+
bool current_in_userns(const struct user_namespace *target_ns)
{
- struct user_namespace *ns;
- for (ns = current_user_ns(); ns; ns = ns->parent) {
- if (ns == target_ns)
- return true;
- }
- return false;
+ return in_userns(target_ns, current_user_ns());
}
static inline struct user_namespace *to_user_ns(struct ns_common *ns)
OpenPOWER on IntegriCloud