diff options
Diffstat (limited to 'kernel')
53 files changed, 1293 insertions, 962 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 9c323a6daa46..ed470aac53da 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,12 +5,13 @@ obj-y = fork.o exec_domain.o panic.o \ cpu.o exit.o softirq.o resource.o \ sysctl.o sysctl_binary.o capability.o ptrace.o user.o \ - signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ + signal.o sys.o umh.o workqueue.o pid.o task_work.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o +obj-$(CONFIG_MODULES) += kmod.o obj-$(CONFIG_MULTIUSER) += groups.o ifdef CONFIG_FUNCTION_TRACER diff --git a/kernel/acct.c b/kernel/acct.c index 5b1284370367..5e72af29ab73 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -516,7 +516,7 @@ static void do_acct_process(struct bsd_acct_struct *acct) if (file_start_write_trylock(file)) { /* it's been opened O_APPEND, so position is irrelevant */ loff_t pos = 0; - __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos); + __kernel_write(file, &ac, sizeof(acct_t), &pos); file_end_write(file); } out: diff --git a/kernel/audit.c b/kernel/audit.c index 6dd556931739..be1c28fd4d57 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1662,7 +1662,7 @@ static inline void audit_get_stamp(struct audit_context *ctx, struct timespec64 *t, unsigned int *serial) { if (!ctx || !auditsc_get_stamp(ctx, t, serial)) { - ktime_get_real_ts64(t); + *t = current_kernel_time64(); *serial = audit_serial(); } } @@ -1833,7 +1833,7 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) } /** - * audit_log_hex - convert a buffer to hex and append it to the audit skb + * audit_log_n_hex - convert a buffer to hex and append it to the audit skb * @ab: the audit_buffer * @buf: buffer to convert to hex * @len: length of @buf to be converted diff --git a/kernel/audit.h b/kernel/audit.h index b331d9b83f63..9b110ae17ee3 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -182,7 +182,7 @@ struct audit_context { mqd_t mqdes; size_t msg_len; unsigned int msg_prio; - struct timespec abs_timeout; + struct timespec64 abs_timeout; } mq_sendrecv; struct { int oflag; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3260ba2312a9..ecc23e25c9eb 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1235,11 +1235,11 @@ static void show_special(struct audit_context *context, int *call_panic) case AUDIT_MQ_SENDRECV: audit_log_format(ab, "mqdes=%d msg_len=%zd msg_prio=%u " - "abs_timeout_sec=%ld abs_timeout_nsec=%ld", + "abs_timeout_sec=%lld abs_timeout_nsec=%ld", context->mq_sendrecv.mqdes, context->mq_sendrecv.msg_len, context->mq_sendrecv.msg_prio, - context->mq_sendrecv.abs_timeout.tv_sec, + (long long) context->mq_sendrecv.abs_timeout.tv_sec, context->mq_sendrecv.abs_timeout.tv_nsec); break; case AUDIT_MQ_NOTIFY: @@ -1462,7 +1462,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts } /** - * audit_free - free a per-task audit context + * __audit_free - free a per-task audit context * @tsk: task whose audit context block to free * * Called from copy_process and do_exit @@ -1489,7 +1489,7 @@ void __audit_free(struct task_struct *tsk) } /** - * audit_syscall_entry - fill in an audit record at syscall entry + * __audit_syscall_entry - fill in an audit record at syscall entry * @major: major syscall type (function) * @a1: additional syscall register 1 * @a2: additional syscall register 2 @@ -1536,14 +1536,14 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, return; context->serial = 0; - ktime_get_real_ts64(&context->ctime); + context->ctime = current_kernel_time64(); context->in_syscall = 1; context->current_state = state; context->ppid = 0; } /** - * audit_syscall_exit - deallocate audit context after a system call + * __audit_syscall_exit - deallocate audit context after a system call * @success: success value of the syscall * @return_code: return value of the syscall * @@ -1705,7 +1705,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context, } /** - * audit_reusename - fill out filename with info from existing entry + * __audit_reusename - fill out filename with info from existing entry * @uptr: userland ptr to pathname * * Search the audit_names list for the current audit context. If there is an @@ -1730,7 +1730,7 @@ __audit_reusename(const __user char *uptr) } /** - * audit_getname - add a name to the list + * __audit_getname - add a name to the list * @name: name to add * * Add a name to the list of audit names for this context. @@ -2083,15 +2083,15 @@ void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) * */ void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, - const struct timespec *abs_timeout) + const struct timespec64 *abs_timeout) { struct audit_context *context = current->audit_context; - struct timespec *p = &context->mq_sendrecv.abs_timeout; + struct timespec64 *p = &context->mq_sendrecv.abs_timeout; if (abs_timeout) - memcpy(p, abs_timeout, sizeof(struct timespec)); + memcpy(p, abs_timeout, sizeof(*p)); else - memset(p, 0, sizeof(struct timespec)); + memset(p, 0, sizeof(*p)); context->mq_sendrecv.mqdes = mqdes; context->mq_sendrecv.msg_len = msg_len; @@ -2135,7 +2135,7 @@ void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) } /** - * audit_ipc_obj - record audit data for ipc object + * __audit_ipc_obj - record audit data for ipc object * @ipcp: ipc permissions * */ @@ -2151,7 +2151,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp) } /** - * audit_ipc_set_perm - record audit data for new ipc permissions + * __audit_ipc_set_perm - record audit data for new ipc permissions * @qbytes: msgq bytes * @uid: msgq user id * @gid: msgq group id @@ -2180,7 +2180,7 @@ void __audit_bprm(struct linux_binprm *bprm) /** - * audit_socketcall - record audit data for sys_socketcall + * __audit_socketcall - record audit data for sys_socketcall * @nargs: number of args, which should not be more than AUDITSC_ARGS. * @args: args array * @@ -2211,7 +2211,7 @@ void __audit_fd_pair(int fd1, int fd2) } /** - * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto + * __audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto * @len: data length in user space * @a: data address in kernel space * diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index ecf9f99ecc57..959c9a07f318 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -159,7 +159,7 @@ static void dev_map_free(struct bpf_map *map) unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu); while (!bitmap_empty(bitmap, dtab->map.max_entries)) - cpu_relax(); + cond_resched(); } for (i = 0; i < dtab->map.max_entries; i++) { diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index f6ffde9c6a68..6424ce0e4969 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -792,7 +792,7 @@ out_progs: return err; } -int sock_map_attach_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) +int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct bpf_prog *orig; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 70ad8e220343..cb17e1cd1d43 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1096,10 +1096,10 @@ static int bpf_obj_get(const union bpf_attr *attr) #define BPF_PROG_ATTACH_LAST_FIELD attach_flags -static int sockmap_get_from_fd(const union bpf_attr *attr) +static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) { + struct bpf_prog *prog = NULL; int ufd = attr->target_fd; - struct bpf_prog *prog; struct bpf_map *map; struct fd f; int err; @@ -1109,16 +1109,20 @@ static int sockmap_get_from_fd(const union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - prog = bpf_prog_get_type(attr->attach_bpf_fd, BPF_PROG_TYPE_SK_SKB); - if (IS_ERR(prog)) { - fdput(f); - return PTR_ERR(prog); + if (attach) { + prog = bpf_prog_get_type(attr->attach_bpf_fd, + BPF_PROG_TYPE_SK_SKB); + if (IS_ERR(prog)) { + fdput(f); + return PTR_ERR(prog); + } } - err = sock_map_attach_prog(map, prog, attr->attach_type); + err = sock_map_prog(map, prog, attr->attach_type); if (err) { fdput(f); - bpf_prog_put(prog); + if (prog) + bpf_prog_put(prog); return err; } @@ -1155,7 +1159,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr); + return sockmap_get_from_fd(attr, true); default: return -EINVAL; } @@ -1204,7 +1208,10 @@ static int bpf_prog_detach(const union bpf_attr *attr) ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false); cgroup_put(cgrp); break; - + case BPF_SK_SKB_STREAM_PARSER: + case BPF_SK_SKB_STREAM_VERDICT: + ret = sockmap_get_from_fd(attr, false); + break; default: return -EINVAL; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d690c7dd1f1a..477b6932c3c1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4203,6 +4203,22 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } + if (insn->imm == BPF_FUNC_redirect_map) { + u64 addr = (unsigned long)prog; + struct bpf_insn r4_ld[] = { + BPF_LD_IMM64(BPF_REG_4, addr), + *insn, + }; + cnt = ARRAY_SIZE(r4_ld); + + new_prog = bpf_patch_insn_data(env, i + delta, r4_ld, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + } patch_call_imm: fn = prog->aux->ops->get_func_proto(insn->imm); /* all functions that have prototype and verifier allowed diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4f2196a00953..d6551cd45238 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1879,7 +1879,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops; root->kf_root = kernfs_create_root(kf_sops, - KERNFS_ROOT_CREATE_DEACTIVATED, + KERNFS_ROOT_CREATE_DEACTIVATED | + KERNFS_ROOT_SUPPORT_EXPORTOP, root_cgrp); if (IS_ERR(root->kf_root)) { ret = PTR_ERR(root->kf_root); @@ -5256,6 +5257,18 @@ static int __init cgroup_wq_init(void) } core_initcall(cgroup_wq_init); +void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, + char *buf, size_t buflen) +{ + struct kernfs_node *kn; + + kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id); + if (!kn) + return; + kernfs_path(kn, buf, buflen); + kernfs_put(kn); +} + /* * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 67230ecf2ce1..4657e2924ecb 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2275,6 +2275,13 @@ retry: mutex_unlock(&cpuset_mutex); } +static bool force_rebuild; + +void cpuset_force_rebuild(void) +{ + force_rebuild = true; +} + /** * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset * @@ -2349,8 +2356,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work) } /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated) + if (cpus_updated || force_rebuild) { + force_rebuild = false; rebuild_sched_domains(); + } } void cpuset_update_active_cpus(void) @@ -2363,6 +2372,11 @@ void cpuset_update_active_cpus(void) schedule_work(&cpuset_hotplug_work); } +void cpuset_wait_for_hotplug(void) +{ + flush_work(&cpuset_hotplug_work); +} + /* * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. * Call this routine anytime after node_states[N_MEMORY] changes. diff --git a/kernel/compat.c b/kernel/compat.c index 6f0a0e723a06..772e038d04d9 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -200,29 +200,6 @@ int compat_put_timespec(const struct timespec *ts, void __user *uts) } EXPORT_SYMBOL_GPL(compat_put_timespec); -int compat_convert_timespec(struct timespec __user **kts, - const void __user *cts) -{ - struct timespec ts; - struct timespec __user *uts; - - if (!cts || COMPAT_USE_64BIT_TIME) { - *kts = (struct timespec __user *)cts; - return 0; - } - - uts = compat_alloc_user_space(sizeof(ts)); - if (!uts) - return -EFAULT; - if (compat_get_timespec(&ts, cts)) - return -EFAULT; - if (copy_to_user(uts, &ts, sizeof(ts))) - return -EFAULT; - - *kts = uts; - return 0; -} - int get_compat_itimerval(struct itimerval *o, const struct compat_itimerval __user *i) { struct compat_itimerval v32; diff --git a/kernel/exit.c b/kernel/exit.c index a35d8a17e01f..3481ababd06a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1615,7 +1615,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, user_access_begin(); unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); - unsafe_put_user((short)info.cause, &infop->si_code, Efault); + unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); @@ -1741,7 +1741,7 @@ COMPAT_SYSCALL_DEFINE5(waitid, user_access_begin(); unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); - unsafe_put_user((short)info.cause, &infop->si_code, Efault); + unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); diff --git a/kernel/fork.c b/kernel/fork.c index 24a4c0be80d5..10646182440f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -37,6 +37,7 @@ #include <linux/binfmts.h> #include <linux/mman.h> #include <linux/mmu_notifier.h> +#include <linux/hmm.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/vmacache.h> @@ -824,6 +825,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_owner(mm, p); RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_mm_init(mm); + hmm_mm_init(mm); init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; @@ -903,6 +905,7 @@ void __mmdrop(struct mm_struct *mm) BUG_ON(mm == &init_mm); mm_free_pgd(mm); destroy_context(mm); + hmm_mm_destroy(mm); mmu_notifier_mm_destroy(mm); check_mm(mm); put_user_ns(mm->user_ns); @@ -1459,8 +1462,7 @@ static void rt_mutex_init_task(struct task_struct *p) { raw_spin_lock_init(&p->pi_lock); #ifdef CONFIG_RT_MUTEXES - p->pi_waiters = RB_ROOT; - p->pi_waiters_leftmost = NULL; + p->pi_waiters = RB_ROOT_CACHED; p->pi_top_task = NULL; p->pi_blocked_on = NULL; #endif @@ -1567,10 +1569,6 @@ static __latent_entropy struct task_struct *copy_process( return ERR_PTR(-EINVAL); } - retval = security_task_create(clone_flags); - if (retval) - goto fork_out; - retval = -ENOMEM; p = dup_task_struct(current, node); if (!p) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 73be2b3909bd..82afb7ed369f 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -421,10 +421,8 @@ static void free_desc(unsigned int irq) * The sysfs entry must be serialized against a concurrent * irq_sysfs_init() as well. */ - mutex_lock(&sparse_irq_lock); kobject_del(&desc->kobj); delete_irq_desc(irq); - mutex_unlock(&sparse_irq_lock); /* * We free the descriptor, masks and stat fields via RCU. That @@ -462,20 +460,15 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, desc = alloc_desc(start + i, node, flags, mask, owner); if (!desc) goto err; - mutex_lock(&sparse_irq_lock); irq_insert_desc(start + i, desc); irq_sysfs_add(start + i, desc); - mutex_unlock(&sparse_irq_lock); } + bitmap_set(allocated_irqs, start, cnt); return start; err: for (i--; i >= 0; i--) free_desc(start + i); - - mutex_lock(&sparse_irq_lock); - bitmap_clear(allocated_irqs, start, cnt); - mutex_unlock(&sparse_irq_lock); return -ENOMEM; } @@ -575,6 +568,7 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, desc->owner = owner; } + bitmap_set(allocated_irqs, start, cnt); return start; } @@ -670,10 +664,10 @@ void irq_free_descs(unsigned int from, unsigned int cnt) if (from >= nr_irqs || (from + cnt) > nr_irqs) return; + mutex_lock(&sparse_irq_lock); for (i = 0; i < cnt; i++) free_desc(from + i); - mutex_lock(&sparse_irq_lock); bitmap_clear(allocated_irqs, from, cnt); mutex_unlock(&sparse_irq_lock); } @@ -720,19 +714,15 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, from, cnt, 0); ret = -EEXIST; if (irq >=0 && start != irq) - goto err; + goto unlock; if (start + cnt > nr_irqs) { ret = irq_expand_nr_irqs(start + cnt); if (ret) - goto err; + goto unlock; } - - bitmap_set(allocated_irqs, start, cnt); - mutex_unlock(&sparse_irq_lock); - return alloc_descs(start, cnt, node, affinity, owner); - -err: + ret = alloc_descs(start, cnt, node, affinity, owner); +unlock: mutex_unlock(&sparse_irq_lock); return ret; } diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 48eadf416c24..3fa4bd59f569 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -315,11 +315,12 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, ops->set_desc(arg, desc); /* Assumes the domain mutex is held! */ - ret = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg); + ret = irq_domain_alloc_irqs_hierarchy(domain, desc->irq, 1, + arg); if (ret) break; - irq_set_msi_desc_off(virq, 0, desc); + irq_set_msi_desc_off(desc->irq, 0, desc); } if (ret) { diff --git a/kernel/kcov.c b/kernel/kcov.c index cd771993f96f..3f693a0f6f3e 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -270,6 +270,7 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) static const struct file_operations kcov_fops = { .open = kcov_open, .unlocked_ioctl = kcov_ioctl, + .compat_ioctl = kcov_ioctl, .mmap = kcov_mmap, .release = kcov_close, }; diff --git a/kernel/kmod.c b/kernel/kmod.c index 2f37acde640b..bc6addd9152b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -1,23 +1,6 @@ /* - kmod, the new module loader (replaces kerneld) - Kirk Petersen - - Reorganized not to be a daemon by Adam Richter, with guidance - from Greg Zornetzer. - - Modified to avoid chroot and file sharing problems. - Mikael Pettersson - - Limit the concurrent number of kmod modprobes to catch loops from - "modprobe needs a service that is in a module". - Keith Owens <kaos@ocs.com.au> December 1999 - - Unblock all signals when we exec a usermode process. - Shuu Yamaguchi <shuu@wondernetworkresources.com> December 2000 - - call_usermodehelper wait flag, and remove exec_usermodehelper. - Rusty Russell <rusty@rustcorp.com.au> Jan 2003 -*/ + * kmod - the kernel module loader + */ #include <linux/module.h> #include <linux/sched.h> #include <linux/sched/task.h> @@ -45,15 +28,6 @@ #include <trace/events/module.h> -#define CAP_BSET (void *)1 -#define CAP_PI (void *)2 - -static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; -static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; -static DEFINE_SPINLOCK(umh_sysctl_lock); -static DECLARE_RWSEM(umhelper_sem); - -#ifdef CONFIG_MODULES /* * Assuming: * @@ -202,536 +176,3 @@ int __request_module(bool wait, const char *fmt, ...) return ret; } EXPORT_SYMBOL(__request_module); - -#endif /* CONFIG_MODULES */ - -static void call_usermodehelper_freeinfo(struct subprocess_info *info) -{ - if (info->cleanup) - (*info->cleanup)(info); - kfree(info); -} - -static void umh_complete(struct subprocess_info *sub_info) -{ - struct completion *comp = xchg(&sub_info->complete, NULL); - /* - * See call_usermodehelper_exec(). If xchg() returns NULL - * we own sub_info, the UMH_KILLABLE caller has gone away - * or the caller used UMH_NO_WAIT. - */ - if (comp) - complete(comp); - else - call_usermodehelper_freeinfo(sub_info); -} - -/* - * This is the task which runs the usermode application - */ -static int call_usermodehelper_exec_async(void *data) -{ - struct subprocess_info *sub_info = data; - struct cred *new; - int retval; - - spin_lock_irq(¤t->sighand->siglock); - flush_signal_handlers(current, 1); - spin_unlock_irq(¤t->sighand->siglock); - - /* - * Our parent (unbound workqueue) runs with elevated scheduling - * priority. Avoid propagating that into the userspace child. - */ - set_user_nice(current, 0); - - retval = -ENOMEM; - new = prepare_kernel_cred(current); - if (!new) - goto out; - - spin_lock(&umh_sysctl_lock); - new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); - new->cap_inheritable = cap_intersect(usermodehelper_inheritable, - new->cap_inheritable); - spin_unlock(&umh_sysctl_lock); - - if (sub_info->init) { - retval = sub_info->init(sub_info, new); - if (retval) { - abort_creds(new); - goto out; - } - } - - commit_creds(new); - - retval = do_execve(getname_kernel(sub_info->path), - (const char __user *const __user *)sub_info->argv, - (const char __user *const __user *)sub_info->envp); -out: - sub_info->retval = retval; - /* - * call_usermodehelper_exec_sync() will call umh_complete - * if UHM_WAIT_PROC. - */ - if (!(sub_info->wait & UMH_WAIT_PROC)) - umh_complete(sub_info); - if (!retval) - return 0; - do_exit(0); -} - -/* Handles UMH_WAIT_PROC. */ -static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) -{ - pid_t pid; - - /* If SIGCLD is ignored sys_wait4 won't populate the status. */ - kernel_sigaction(SIGCHLD, SIG_DFL); - pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); - if (pid < 0) { - sub_info->retval = pid; - } else { - int ret = -ECHILD; - /* - * Normally it is bogus to call wait4() from in-kernel because - * wait4() wants to write the exit code to a userspace address. - * But call_usermodehelper_exec_sync() always runs as kernel - * thread (workqueue) and put_user() to a kernel address works - * OK for kernel threads, due to their having an mm_segment_t - * which spans the entire address space. - * - * Thus the __user pointer cast is valid here. - */ - sys_wait4(pid, (int __user *)&ret, 0, NULL); - - /* - * If ret is 0, either call_usermodehelper_exec_async failed and - * the real error code is already in sub_info->retval or - * sub_info->retval is 0 anyway, so don't mess with it then. - */ - if (ret) - sub_info->retval = ret; - } - - /* Restore default kernel sig handler */ - kernel_sigaction(SIGCHLD, SIG_IGN); - - umh_complete(sub_info); -} - -/* - * We need to create the usermodehelper kernel thread from a task that is affine - * to an optimized set of CPUs (or nohz housekeeping ones) such that they - * inherit a widest affinity irrespective of call_usermodehelper() callers with - * possibly reduced affinity (eg: per-cpu workqueues). We don't want - * usermodehelper targets to contend a busy CPU. - * - * Unbound workqueues provide such wide affinity and allow to block on - * UMH_WAIT_PROC requests without blocking pending request (up to some limit). - * - * Besides, workqueues provide the privilege level that caller might not have - * to perform the usermodehelper request. - * - */ -static void call_usermodehelper_exec_work(struct work_struct *work) -{ - struct subprocess_info *sub_info = - container_of(work, struct subprocess_info, work); - - if (sub_info->wait & UMH_WAIT_PROC) { - call_usermodehelper_exec_sync(sub_info); - } else { - pid_t pid; - /* - * Use CLONE_PARENT to reparent it to kthreadd; we do not - * want to pollute current->children, and we need a parent - * that always ignores SIGCHLD to ensure auto-reaping. - */ - pid = kernel_thread(call_usermodehelper_exec_async, sub_info, - CLONE_PARENT | SIGCHLD); - if (pid < 0) { - sub_info->retval = pid; - umh_complete(sub_info); - } - } -} - -/* - * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY - * (used for preventing user land processes from being created after the user - * land has been frozen during a system-wide hibernation or suspend operation). - * Should always be manipulated under umhelper_sem acquired for write. - */ -static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED; - -/* Number of helpers running */ -static atomic_t running_helpers = ATOMIC_INIT(0); - -/* - * Wait queue head used by usermodehelper_disable() to wait for all running - * helpers to finish. - */ -static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); - -/* - * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled - * to become 'false'. - */ -static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq); - -/* - * Time to wait for running_helpers to become zero before the setting of - * usermodehelper_disabled in usermodehelper_disable() fails - */ -#define RUNNING_HELPERS_TIMEOUT (5 * HZ) - -int usermodehelper_read_trylock(void) -{ - DEFINE_WAIT(wait); - int ret = 0; - - down_read(&umhelper_sem); - for (;;) { - prepare_to_wait(&usermodehelper_disabled_waitq, &wait, - TASK_INTERRUPTIBLE); - if (!usermodehelper_disabled) - break; - - if (usermodehelper_disabled == UMH_DISABLED) - ret = -EAGAIN; - - up_read(&umhelper_sem); - - if (ret) - break; - - schedule(); - try_to_freeze(); - - down_read(&umhelper_sem); - } - finish_wait(&usermodehelper_disabled_waitq, &wait); - return ret; -} -EXPORT_SYMBOL_GPL(usermodehelper_read_trylock); - -long usermodehelper_read_lock_wait(long timeout) -{ - DEFINE_WAIT(wait); - - if (timeout < 0) - return -EINVAL; - - down_read(&umhelper_sem); - for (;;) { - prepare_to_wait(&usermodehelper_disabled_waitq, &wait, - TASK_UNINTERRUPTIBLE); - if (!usermodehelper_disabled) - break; - - up_read(&umhelper_sem); - - timeout = schedule_timeout(timeout); - if (!timeout) - break; - - down_read(&umhelper_sem); - } - finish_wait(&usermodehelper_disabled_waitq, &wait); - return timeout; -} -EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait); - -void usermodehelper_read_unlock(void) -{ - up_read(&umhelper_sem); -} -EXPORT_SYMBOL_GPL(usermodehelper_read_unlock); - -/** - * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. - * @depth: New value to assign to usermodehelper_disabled. - * - * Change the value of usermodehelper_disabled (under umhelper_sem locked for - * writing) and wakeup tasks waiting for it to change. - */ -void __usermodehelper_set_disable_depth(enum umh_disable_depth depth) -{ - down_write(&umhelper_sem); - usermodehelper_disabled = depth; - wake_up(&usermodehelper_disabled_waitq); - up_write(&umhelper_sem); -} - -/** - * __usermodehelper_disable - Prevent new helpers from being started. - * @depth: New value to assign to usermodehelper_disabled. - * - * Set usermodehelper_disabled to @depth and wait for running helpers to exit. - */ -int __usermodehelper_disable(enum umh_disable_depth depth) -{ - long retval; - - if (!depth) - return -EINVAL; - - down_write(&umhelper_sem); - usermodehelper_disabled = depth; - up_write(&umhelper_sem); - - /* - * From now on call_usermodehelper_exec() won't start any new - * helpers, so it is sufficient if running_helpers turns out to - * be zero at one point (it may be increased later, but that - * doesn't matter). - */ - retval = wait_event_timeout(running_helpers_waitq, - atomic_read(&running_helpers) == 0, - RUNNING_HELPERS_TIMEOUT); - if (retval) - return 0; - - __usermodehelper_set_disable_depth(UMH_ENABLED); - return -EAGAIN; -} - -static void helper_lock(void) -{ - atomic_inc(&running_helpers); - smp_mb__after_atomic(); -} - -static void helper_unlock(void) -{ - if (atomic_dec_and_test(&running_helpers)) - wake_up(&running_helpers_waitq); -} - -/** - * call_usermodehelper_setup - prepare to call a usermode helper - * @path: path to usermode executable - * @argv: arg vector for process - * @envp: environment for process - * @gfp_mask: gfp mask for memory allocation - * @cleanup: a cleanup function - * @init: an init function - * @data: arbitrary context sensitive data - * - * Returns either %NULL on allocation failure, or a subprocess_info - * structure. This should be passed to call_usermodehelper_exec to - * exec the process and free the structure. - * - * The init function is used to customize the helper process prior to - * exec. A non-zero return code causes the process to error out, exit, - * and return the failure to the calling process - * - * The cleanup function is just before ethe subprocess_info is about to - * be freed. This can be used for freeing the argv and envp. The - * Function must be runnable in either a process context or the - * context in which call_usermodehelper_exec is called. - */ -struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, - char **envp, gfp_t gfp_mask, - int (*init)(struct subprocess_info *info, struct cred *new), - void (*cleanup)(struct subprocess_info *info), - void *data) -{ - struct subprocess_info *sub_info; - sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask); - if (!sub_info) - goto out; - - INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); - -#ifdef CONFIG_STATIC_USERMODEHELPER - sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH; -#else - sub_info->path = path; -#endif - sub_info->argv = argv; - sub_info->envp = envp; - - sub_info->cleanup = cleanup; - sub_info->init = init; - sub_info->data = data; - out: - return sub_info; -} -EXPORT_SYMBOL(call_usermodehelper_setup); - -/** - * call_usermodehelper_exec - start a usermode application - * @sub_info: information about the subprocessa - * @wait: wait for the application to finish and return status. - * when UMH_NO_WAIT don't wait at all, but you get no useful error back - * when the program couldn't be exec'ed. This makes it safe to call - * from interrupt context. - * - * Runs a user-space application. The application is started - * asynchronously if wait is not set, and runs as a child of system workqueues. - * (ie. it runs with full root capabilities and optimized affinity). - */ -int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) -{ - DECLARE_COMPLETION_ONSTACK(done); - int retval = 0; - - if (!sub_info->path) { - call_usermodehelper_freeinfo(sub_info); - return -EINVAL; - } - helper_lock(); - if (usermodehelper_disabled) { - retval = -EBUSY; - goto out; - } - - /* - * If there is no binary for us to call, then just return and get out of - * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and - * disable all call_usermodehelper() calls. - */ - if (strlen(sub_info->path) == 0) - goto out; - - /* - * Set the completion pointer only if there is a waiter. - * This makes it possible to use umh_complete to free - * the data structure in case of UMH_NO_WAIT. - */ - sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; - sub_info->wait = wait; - - queue_work(system_unbound_wq, &sub_info->work); - if (wait == UMH_NO_WAIT) /* task has freed sub_info */ - goto unlock; - - if (wait & UMH_KILLABLE) { - retval = wait_for_completion_killable(&done); - if (!retval) - goto wait_done; - - /* umh_complete() will see NULL and free sub_info */ - if (xchg(&sub_info->complete, NULL)) - goto unlock; - /* fallthrough, umh_complete() was already called */ - } - - wait_for_completion(&done); -wait_done: - retval = sub_info->retval; -out: - call_usermodehelper_freeinfo(sub_info); -unlock: - helper_unlock(); - return retval; -} -EXPORT_SYMBOL(call_usermodehelper_exec); - -/** - * call_usermodehelper() - prepare and start a usermode application - * @path: path to usermode executable - * @argv: arg vector for process - * @envp: environment for process - * @wait: wait for the application to finish and return status. - * when UMH_NO_WAIT don't wait at all, but you get no useful error back - * when the program couldn't be exec'ed. This makes it safe to call - * from interrupt context. - * - * This function is the equivalent to use call_usermodehelper_setup() and - * call_usermodehelper_exec(). - */ -int call_usermodehelper(const char *path, char **argv, char **envp, int wait) -{ - struct subprocess_info *info; - gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; - - info = call_usermodehelper_setup(path, argv, envp, gfp_mask, - NULL, NULL, NULL); - if (info == NULL) - return -ENOMEM; - - return call_usermodehelper_exec(info, wait); -} -EXPORT_SYMBOL(call_usermodehelper); - -static int proc_cap_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table t; - unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; - kernel_cap_t new_cap; - int err, i; - - if (write && (!capable(CAP_SETPCAP) || - !capable(CAP_SYS_MODULE))) - return -EPERM; - - /* - * convert from the global kernel_cap_t to the ulong array to print to - * userspace if this is a read. - */ - spin_lock(&umh_sysctl_lock); - for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) { - if (table->data == CAP_BSET) - cap_array[i] = usermodehelper_bset.cap[i]; - else if (table->data == CAP_PI) - cap_array[i] = usermodehelper_inheritable.cap[i]; - else - BUG(); - } - spin_unlock(&umh_sysctl_lock); - - t = *table; - t.data = &cap_array; - - /* - * actually read or write and array of ulongs from userspace. Remember - * these are least significant 32 bits first - */ - err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); - if (err < 0) - return err; - - /* - * convert from the sysctl array of ulongs to the kernel_cap_t - * internal representation - */ - for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) - new_cap.cap[i] = cap_array[i]; - - /* - * Drop everything not in the new_cap (but don't add things) - */ - spin_lock(&umh_sysctl_lock); - if (write) { - if (table->data == CAP_BSET) - usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap); - if (table->data == CAP_PI) - usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap); - } - spin_unlock(&umh_sysctl_lock); - - return 0; -} - -struct ctl_table usermodehelper_table[] = { - { - .procname = "bset", - .data = CAP_BSET, - .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), - .mode = 0600, - .proc_handler = proc_cap_handler, - }, - { - .procname = "inheritable", - .data = CAP_PI, - .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), - .mode = 0600, - .proc_handler = proc_cap_handler, - }, - { } -}; diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index ac35e648b0e5..f4a74e78d467 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -58,7 +58,7 @@ static void printk_lock(struct rt_mutex *lock, int print_owner) void rt_mutex_debug_task_free(struct task_struct *task) { - DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters)); + DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root)); DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 649dc9d3951a..6f3dba6e4e9e 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -271,10 +271,10 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, static void rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) { - struct rb_node **link = &lock->waiters.rb_node; + struct rb_node **link = &lock->waiters.rb_root.rb_node; struct rb_node *parent = NULL; struct rt_mutex_waiter *entry; - int leftmost = 1; + bool leftmost = true; while (*link) { parent = *link; @@ -283,15 +283,12 @@ rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) link = &parent->rb_left; } else { link = &parent->rb_right; - leftmost = 0; + leftmost = false; } } - if (leftmost) - lock->waiters_leftmost = &waiter->tree_entry; - rb_link_node(&waiter->tree_entry, parent, link); - rb_insert_color(&waiter->tree_entry, &lock->waiters); + rb_insert_color_cached(&waiter->tree_entry, &lock->waiters, leftmost); } static void @@ -300,20 +297,17 @@ rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) if (RB_EMPTY_NODE(&waiter->tree_entry)) return; - if (lock->waiters_leftmost == &waiter->tree_entry) - lock->waiters_leftmost = rb_next(&waiter->tree_entry); - - rb_erase(&waiter->tree_entry, &lock->waiters); + rb_erase_cached(&waiter->tree_entry, &lock->waiters); RB_CLEAR_NODE(&waiter->tree_entry); } static void rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) { - struct rb_node **link = &task->pi_waiters.rb_node; + struct rb_node **link = &task->pi_waiters.rb_root.rb_node; struct rb_node *parent = NULL; struct rt_mutex_waiter *entry; - int leftmost = 1; + bool leftmost = true; while (*link) { parent = *link; @@ -322,15 +316,12 @@ rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) link = &parent->rb_left; } else { link = &parent->rb_right; - leftmost = 0; + leftmost = false; } } - if (leftmost) - task->pi_waiters_leftmost = &waiter->pi_tree_entry; - rb_link_node(&waiter->pi_tree_entry, parent, link); - rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters); + rb_insert_color_cached(&waiter->pi_tree_entry, &task->pi_waiters, leftmost); } static void @@ -339,10 +330,7 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) if (RB_EMPTY_NODE(&waiter->pi_tree_entry)) return; - if (task->pi_waiters_leftmost == &waiter->pi_tree_entry) - task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry); - - rb_erase(&waiter->pi_tree_entry, &task->pi_waiters); + rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters); RB_CLEAR_NODE(&waiter->pi_tree_entry); } @@ -1657,8 +1645,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name, { lock->owner = NULL; raw_spin_lock_init(&lock->wait_lock); - lock->waiters = RB_ROOT; - lock->waiters_leftmost = NULL; + lock->waiters = RB_ROOT_CACHED; if (name && key) debug_rt_mutex_init(lock, name, key); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 8d039b928d61..7453be0485a5 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -45,7 +45,7 @@ struct rt_mutex_waiter { static inline int rt_mutex_has_waiters(struct rt_mutex *lock) { - return !RB_EMPTY_ROOT(&lock->waiters); + return !RB_EMPTY_ROOT(&lock->waiters.rb_root); } static inline struct rt_mutex_waiter * @@ -53,8 +53,8 @@ rt_mutex_top_waiter(struct rt_mutex *lock) { struct rt_mutex_waiter *w; - w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter, - tree_entry); + w = rb_entry(lock->waiters.rb_leftmost, + struct rt_mutex_waiter, tree_entry); BUG_ON(w->lock != lock); return w; @@ -62,14 +62,14 @@ rt_mutex_top_waiter(struct rt_mutex *lock) static inline int task_has_pi_waiters(struct task_struct *p) { - return !RB_EMPTY_ROOT(&p->pi_waiters); + return !RB_EMPTY_ROOT(&p->pi_waiters.rb_root); } static inline struct rt_mutex_waiter * task_top_pi_waiter(struct task_struct *p) { - return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter, - pi_tree_entry); + return rb_entry(p->pi_waiters.rb_leftmost, + struct rt_mutex_waiter, pi_tree_entry); } #else diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 39f56c870051..0e4cd64ad2c0 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -362,7 +362,7 @@ static int *get_random_order(int count) int *order; int n, r, tmp; - order = kmalloc_array(count, sizeof(*order), GFP_TEMPORARY); + order = kmalloc_array(count, sizeof(*order), GFP_KERNEL); if (!order) return order; diff --git a/kernel/memremap.c b/kernel/memremap.c index 066e73c2fcc9..6bcbfbf1a8fd 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -11,13 +11,14 @@ * General Public License for more details. */ #include <linux/radix-tree.h> -#include <linux/memremap.h> #include <linux/device.h> #include <linux/types.h> #include <linux/pfn_t.h> #include <linux/io.h> #include <linux/mm.h> #include <linux/memory_hotplug.h> +#include <linux/swap.h> +#include <linux/swapops.h> #ifndef ioremap_cache /* temporary while we convert existing ioremap_cache users to memremap */ @@ -219,6 +220,34 @@ static unsigned long order_at(struct resource *res, unsigned long pgoff) for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \ pgoff += 1UL << order, order = order_at((res), pgoff)) +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) +int device_private_entry_fault(struct vm_area_struct *vma, + unsigned long addr, + swp_entry_t entry, + unsigned int flags, + pmd_t *pmdp) +{ + struct page *page = device_private_entry_to_page(entry); + + /* + * The page_fault() callback must migrate page back to system memory + * so that CPU can access it. This might fail for various reasons + * (device issue, device was unsafely unplugged, ...). When such + * error conditions happen, the callback must return VM_FAULT_SIGBUS. + * + * Note that because memory cgroup charges are accounted to the device + * memory, this should never fail because of memory restrictions (but + * allocation of regular system page might still fail because we are + * out of memory). + * + * There is a more in-depth description of what that callback can and + * cannot do, in include/linux/memremap.h + */ + return page->pgmap->page_fault(vma, addr, page, flags, pmdp); +} +EXPORT_SYMBOL(device_private_entry_fault); +#endif /* CONFIG_DEVICE_PRIVATE */ + static void pgmap_radix_release(struct resource *res) { unsigned long pgoff, order; @@ -356,6 +385,10 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, } pgmap->ref = ref; pgmap->res = &page_map->res; + pgmap->type = MEMORY_DEVICE_HOST; + pgmap->page_fault = NULL; + pgmap->page_free = NULL; + pgmap->data = NULL; mutex_lock(&pgmap_lock); error = 0; @@ -466,3 +499,28 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) return pgmap ? pgmap->altmap : NULL; } #endif /* CONFIG_ZONE_DEVICE */ + + +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) +void put_zone_device_private_or_public_page(struct page *page) +{ + int count = page_ref_dec_return(page); + + /* + * If refcount is 1 then page is freed and refcount is stable as nobody + * holds a reference on the page. + */ + if (count == 1) { + /* Clear Active bit in case of parallel mark_page_accessed */ + __ClearPageActive(page); + __ClearPageWaiters(page); + + page->mapping = NULL; + mem_cgroup_uncharge(page); + + page->pgmap->page_free(page, page->pgmap->data); + } else if (!count) + __put_page(page); +} +EXPORT_SYMBOL(put_zone_device_private_or_public_page); +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ diff --git a/kernel/module.c b/kernel/module.c index 40f983cbea81..de66ec825992 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2707,21 +2707,21 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) } #endif /* CONFIG_KALLSYMS */ -static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) +static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsigned int num) { if (!debug) return; #ifdef CONFIG_DYNAMIC_DEBUG - if (ddebug_add_module(debug, num, debug->modname)) + if (ddebug_add_module(debug, num, mod->name)) pr_err("dynamic debug error adding module: %s\n", debug->modname); #endif } -static void dynamic_debug_remove(struct _ddebug *debug) +static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) { if (debug) - ddebug_remove_module(debug->modname); + ddebug_remove_module(mod->name); } void * __weak module_alloc(unsigned long size) @@ -3715,7 +3715,7 @@ static int load_module(struct load_info *info, const char __user *uargs, goto free_arch_cleanup; } - dynamic_debug_setup(info->debug, info->num_debug); + dynamic_debug_setup(mod, info->debug, info->num_debug); /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */ ftrace_module_init(mod); @@ -3779,7 +3779,7 @@ static int load_module(struct load_info *info, const char __user *uargs, module_disable_nx(mod); ddebug_cleanup: - dynamic_debug_remove(info->debug); + dynamic_debug_remove(mod, info->debug); synchronize_sched(); kfree(mod->args); free_arch_cleanup: diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 74a5a7255b4d..4918314893bc 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -101,6 +101,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns int i; int err; + err = -EINVAL; + if (!in_userns(parent_pid_ns->user_ns, user_ns)) + goto out; + err = -ENOSPC; if (level > MAX_PID_NS_LEVEL) goto out; diff --git a/kernel/power/process.c b/kernel/power/process.c index 78672d324a6e..50f25cb370c6 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -20,8 +20,9 @@ #include <linux/workqueue.h> #include <linux/kmod.h> #include <trace/events/power.h> +#include <linux/cpuset.h> -/* +/* * Timeout for stopping processes */ unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC; @@ -202,6 +203,8 @@ void thaw_processes(void) __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); + cpuset_wait_for_hotplug(); + read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 57d22571f306..d7cdc426ee38 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -242,8 +242,7 @@ static void hib_end_io(struct bio *bio) if (bio->bi_status) { printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", - imajor(bio->bi_bdev->bd_inode), - iminor(bio->bi_bdev->bd_inode), + MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); } @@ -270,7 +269,7 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); - bio->bi_bdev = hib_resume_bdev; + bio_set_dev(bio, hib_resume_bdev); bio_set_op_attrs(bio, op, op_flags); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fc47863f629c..512f7c2baedd 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -649,7 +649,7 @@ static int syslog_action_restricted(int type) type != SYSLOG_ACTION_SIZE_BUFFER; } -int check_syslog_permissions(int type, int source) +static int check_syslog_permissions(int type, int source) { /* * If this is from /proc/kmsg and we've already opened it, then we've @@ -677,7 +677,6 @@ int check_syslog_permissions(int type, int source) ok: return security_syslog(type); } -EXPORT_SYMBOL_GPL(check_syslog_permissions); static void append_char(char **pp, char *e, char c) { @@ -1435,7 +1434,7 @@ int do_syslog(int type, char __user *buf, int len, int source) error = check_syslog_permissions(type, source); if (error) - goto out; + return error; switch (type) { case SYSLOG_ACTION_CLOSE: /* Close log */ @@ -1443,20 +1442,16 @@ int do_syslog(int type, char __user *buf, int len, int source) case SYSLOG_ACTION_OPEN: /* Open log */ break; case SYSLOG_ACTION_READ: /* Read from log */ - error = -EINVAL; if (!buf || len < 0) - goto out; - error = 0; + return -EINVAL; if (!len) - goto out; - if (!access_ok(VERIFY_WRITE, buf, len)) { - error = -EFAULT; - goto out; - } + return 0; + if (!access_ok(VERIFY_WRITE, buf, len)) + return -EFAULT; error = wait_event_interruptible(log_wait, syslog_seq != log_next_seq); if (error) - goto out; + return error; error = syslog_print(buf, len); break; /* Read/clear last kernel messages */ @@ -1465,16 +1460,12 @@ int do_syslog(int type, char __user *buf, int len, int source) /* FALL THRU */ /* Read last kernel messages */ case SYSLOG_ACTION_READ_ALL: - error = -EINVAL; if (!buf || len < 0) - goto out; - error = 0; + return -EINVAL; if (!len) - goto out; - if (!access_ok(VERIFY_WRITE, buf, len)) { - error = -EFAULT; - goto out; - } + return 0; + if (!access_ok(VERIFY_WRITE, buf, len)) + return -EFAULT; error = syslog_print_all(buf, len, clear); break; /* Clear ring buffer */ @@ -1496,15 +1487,13 @@ int do_syslog(int type, char __user *buf, int len, int source) break; /* Set level of messages printed to console */ case SYSLOG_ACTION_CONSOLE_LEVEL: - error = -EINVAL; if (len < 1 || len > 8) - goto out; + return -EINVAL; if (len < minimum_console_loglevel) len = minimum_console_loglevel; console_loglevel = len; /* Implicitly re-enable logging to console */ saved_console_loglevel = LOGLEVEL_DEFAULT; - error = 0; break; /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: @@ -1526,7 +1515,6 @@ int do_syslog(int type, char __user *buf, int len, int source) u64 seq = syslog_seq; u32 idx = syslog_idx; - error = 0; while (seq < log_next_seq) { struct printk_log *msg = log_from_idx(idx); @@ -1546,7 +1534,7 @@ int do_syslog(int type, char __user *buf, int len, int source) error = -EINVAL; break; } -out: + return error; } @@ -1698,10 +1686,10 @@ asmlinkage int vprintk_emit(int facility, int level, { static char textbuf[LOG_LINE_MAX]; char *text = textbuf; - size_t text_len = 0; + size_t text_len; enum log_flags lflags = 0; unsigned long flags; - int printed_len = 0; + int printed_len; bool in_sched = false; if (level == LOGLEVEL_SCHED) { @@ -1754,7 +1742,7 @@ asmlinkage int vprintk_emit(int facility, int level, if (dict) lflags |= LOG_PREFIX|LOG_NEWLINE; - printed_len += log_output(facility, level, lflags, dict, dictlen, text, text_len); + printed_len = log_output(facility, level, lflags, dict, dictlen, text, text_len); logbuf_unlock_irqrestore(flags); @@ -2650,9 +2638,8 @@ void __init console_init(void) * makes it difficult to diagnose problems that occur during this time. * * To mitigate this problem somewhat, only unregister consoles whose memory - * intersects with the init section. Note that code exists elsewhere to get - * rid of the boot console as soon as the proper console shows up, so there - * won't be side-effects from postponing the removal. + * intersects with the init section. Note that all other boot consoles will + * get unregistred when the real preferred console is registered. */ static int __init printk_late_init(void) { @@ -2660,16 +2647,23 @@ static int __init printk_late_init(void) int ret; for_each_console(con) { - if (!keep_bootcon && con->flags & CON_BOOT) { + if (!(con->flags & CON_BOOT)) + continue; + + /* Check addresses that might be used for enabled consoles. */ + if (init_section_intersects(con, sizeof(*con)) || + init_section_contains(con->write, 0) || + init_section_contains(con->read, 0) || + init_section_contains(con->device, 0) || + init_section_contains(con->unblank, 0) || + init_section_contains(con->data, 0)) { /* - * Make sure to unregister boot consoles whose data - * resides in the init section before the init section - * is discarded. Boot consoles whose data will stick - * around will automatically be unregistered when the - * proper console replaces them. + * Please, consider moving the reported consoles out + * of the init section. */ - if (init_section_intersects(con, sizeof(*con))) - unregister_console(con); + pr_warn("bootconsole [%s%d] uses init memory and must be disabled even before the real one is ready\n", + con->name, con->index); + unregister_console(con); } } ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL, diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 60f356d91060..84b1367935e4 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -728,8 +728,7 @@ static int ptrace_peek_siginfo(struct task_struct *child, if (unlikely(in_compat_syscall())) { compat_siginfo_t __user *uinfo = compat_ptr(data); - if (copy_siginfo_to_user32(uinfo, &info) || - __put_user(info.si_code, &uinfo->si_code)) { + if (copy_siginfo_to_user32(uinfo, &info)) { ret = -EFAULT; break; } @@ -739,8 +738,7 @@ static int ptrace_peek_siginfo(struct task_struct *child, { siginfo_t __user *uinfo = (siginfo_t __user *) data; - if (copy_siginfo_to_user(uinfo, &info) || - __put_user(info.si_code, &uinfo->si_code)) { + if (copy_siginfo_to_user(uinfo, &info)) { ret = -EFAULT; break; } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 84fe96641b2e..1250e4bd4b85 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4091,7 +4091,7 @@ static void __init rcu_init_geometry(void) if (rcu_fanout_leaf == RCU_FANOUT_LEAF && nr_cpu_ids == NR_CPUS) return; - pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n", + pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n", rcu_fanout_leaf, nr_cpu_ids); /* diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 55bde94b9572..e012b9be777e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -89,7 +89,7 @@ static void __init rcu_bootup_announce_oddness(void) if (rcu_fanout_leaf != RCU_FANOUT_LEAF) pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) - pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); + pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.\n", NR_CPUS, nr_cpu_ids); #ifdef CONFIG_RCU_BOOST pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY); #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6d2c7ff9ba98..18a6966567da 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1173,6 +1173,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || lockdep_is_held(&task_rq(p)->lock))); #endif + /* + * Clearly, migrating tasks to offline CPUs is a fairly daft thing. + */ + WARN_ON_ONCE(!cpu_online(new_cpu)); #endif trace_sched_migrate_task(p, new_cpu); @@ -5556,16 +5560,15 @@ static void cpuset_cpu_active(void) * operation in the resume sequence, just build a single sched * domain, ignoring cpusets. */ - num_cpus_frozen--; - if (likely(num_cpus_frozen)) { - partition_sched_domains(1, NULL, NULL); + partition_sched_domains(1, NULL, NULL); + if (--num_cpus_frozen) return; - } /* * This is the last CPU online operation. So fall through and * restore the original sched domains by considering the * cpuset configurations. */ + cpuset_force_rebuild(); } cpuset_update_active_cpus(); } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9e38df7649f4..0191ec7667c3 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -296,7 +296,7 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) { struct sched_dl_entity *dl_se = &p->dl; - return dl_rq->rb_leftmost == &dl_se->rb_node; + return dl_rq->root.rb_leftmost == &dl_se->rb_node; } void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) @@ -320,7 +320,7 @@ void init_dl_bw(struct dl_bw *dl_b) void init_dl_rq(struct dl_rq *dl_rq) { - dl_rq->rb_root = RB_ROOT; + dl_rq->root = RB_ROOT_CACHED; #ifdef CONFIG_SMP /* zero means no -deadline tasks */ @@ -328,7 +328,7 @@ void init_dl_rq(struct dl_rq *dl_rq) dl_rq->dl_nr_migratory = 0; dl_rq->overloaded = 0; - dl_rq->pushable_dl_tasks_root = RB_ROOT; + dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED; #else init_dl_bw(&dl_rq->dl_bw); #endif @@ -410,10 +410,10 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) { struct dl_rq *dl_rq = &rq->dl; - struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node; + struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_root.rb_node; struct rb_node *parent = NULL; struct task_struct *entry; - int leftmost = 1; + bool leftmost = true; BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); @@ -425,17 +425,16 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) link = &parent->rb_left; else { link = &parent->rb_right; - leftmost = 0; + leftmost = false; } } - if (leftmost) { - dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks; + if (leftmost) dl_rq->earliest_dl.next = p->dl.deadline; - } rb_link_node(&p->pushable_dl_tasks, parent, link); - rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); + rb_insert_color_cached(&p->pushable_dl_tasks, + &dl_rq->pushable_dl_tasks_root, leftmost); } static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) @@ -445,24 +444,23 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) if (RB_EMPTY_NODE(&p->pushable_dl_tasks)) return; - if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) { + if (dl_rq->pushable_dl_tasks_root.rb_leftmost == &p->pushable_dl_tasks) { struct rb_node *next_node; next_node = rb_next(&p->pushable_dl_tasks); - dl_rq->pushable_dl_tasks_leftmost = next_node; if (next_node) { dl_rq->earliest_dl.next = rb_entry(next_node, struct task_struct, pushable_dl_tasks)->dl.deadline; } } - rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); + rb_erase_cached(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); RB_CLEAR_NODE(&p->pushable_dl_tasks); } static inline int has_pushable_dl_tasks(struct rq *rq) { - return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root); + return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root); } static int push_dl_task(struct rq *rq); @@ -1266,7 +1264,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) dl_rq->earliest_dl.next = 0; cpudl_clear(&rq->rd->cpudl, rq->cpu); } else { - struct rb_node *leftmost = dl_rq->rb_leftmost; + struct rb_node *leftmost = dl_rq->root.rb_leftmost; struct sched_dl_entity *entry; entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); @@ -1313,7 +1311,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rb_node **link = &dl_rq->rb_root.rb_node; + struct rb_node **link = &dl_rq->root.rb_root.rb_node; struct rb_node *parent = NULL; struct sched_dl_entity *entry; int leftmost = 1; @@ -1331,11 +1329,8 @@ static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) } } - if (leftmost) - dl_rq->rb_leftmost = &dl_se->rb_node; - rb_link_node(&dl_se->rb_node, parent, link); - rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root); + rb_insert_color_cached(&dl_se->rb_node, &dl_rq->root, leftmost); inc_dl_tasks(dl_se, dl_rq); } @@ -1347,14 +1342,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) if (RB_EMPTY_NODE(&dl_se->rb_node)) return; - if (dl_rq->rb_leftmost == &dl_se->rb_node) { - struct rb_node *next_node; - - next_node = rb_next(&dl_se->rb_node); - dl_rq->rb_leftmost = next_node; - } - - rb_erase(&dl_se->rb_node, &dl_rq->rb_root); + rb_erase_cached(&dl_se->rb_node, &dl_rq->root); RB_CLEAR_NODE(&dl_se->rb_node); dec_dl_tasks(dl_se, dl_rq); @@ -1647,7 +1635,7 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p) static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, struct dl_rq *dl_rq) { - struct rb_node *left = dl_rq->rb_leftmost; + struct rb_node *left = rb_first_cached(&dl_rq->root); if (!left) return NULL; @@ -1771,7 +1759,7 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) */ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu) { - struct rb_node *next_node = rq->dl.pushable_dl_tasks_leftmost; + struct rb_node *next_node = rq->dl.pushable_dl_tasks_root.rb_leftmost; struct task_struct *p = NULL; if (!has_pushable_dl_tasks(rq)) @@ -1945,7 +1933,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) if (!has_pushable_dl_tasks(rq)) return NULL; - p = rb_entry(rq->dl.pushable_dl_tasks_leftmost, + p = rb_entry(rq->dl.pushable_dl_tasks_root.rb_leftmost, struct task_struct, pushable_dl_tasks); BUG_ON(rq->cpu != task_cpu(p)); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4a23bbc3111b..01217fb5a5de 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -181,11 +181,16 @@ static const struct file_operations sched_feat_fops = { .release = single_release, }; +__read_mostly bool sched_debug_enabled; + static __init int sched_init_debug(void) { debugfs_create_file("sched_features", 0644, NULL, NULL, &sched_feat_fops); + debugfs_create_bool("sched_debug", 0644, NULL, + &sched_debug_enabled); + return 0; } late_initcall(sched_init_debug); @@ -530,7 +535,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(cfs_rq->exec_clock)); raw_spin_lock_irqsave(&rq->lock, flags); - if (cfs_rq->rb_leftmost) + if (rb_first_cached(&cfs_rq->tasks_timeline)) MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; last = __pick_last_entity(cfs_rq); if (last) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8bc0a883d190..70ba32e08a23 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -513,6 +513,7 @@ static inline int entity_before(struct sched_entity *a, static void update_min_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; + struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline); u64 vruntime = cfs_rq->min_vruntime; @@ -523,10 +524,9 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) curr = NULL; } - if (cfs_rq->rb_leftmost) { - struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, - struct sched_entity, - run_node); + if (leftmost) { /* non-empty tree */ + struct sched_entity *se; + se = rb_entry(leftmost, struct sched_entity, run_node); if (!curr) vruntime = se->vruntime; @@ -547,10 +547,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; + struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node; struct rb_node *parent = NULL; struct sched_entity *entry; - int leftmost = 1; + bool leftmost = true; /* * Find the right place in the rbtree: @@ -566,36 +566,23 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) link = &parent->rb_left; } else { link = &parent->rb_right; - leftmost = 0; + leftmost = false; } } - /* - * Maintain a cache of leftmost tree entries (it is frequently - * used): - */ - if (leftmost) - cfs_rq->rb_leftmost = &se->run_node; - rb_link_node(&se->run_node, parent, link); - rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); + rb_insert_color_cached(&se->run_node, + &cfs_rq->tasks_timeline, leftmost); } static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (cfs_rq->rb_leftmost == &se->run_node) { - struct rb_node *next_node; - - next_node = rb_next(&se->run_node); - cfs_rq->rb_leftmost = next_node; - } - - rb_erase(&se->run_node, &cfs_rq->tasks_timeline); + rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); } struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) { - struct rb_node *left = cfs_rq->rb_leftmost; + struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline); if (!left) return NULL; @@ -616,7 +603,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) #ifdef CONFIG_SCHED_DEBUG struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { - struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); + struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); if (!last) return NULL; @@ -5437,7 +5424,7 @@ wake_affine_llc(struct sched_domain *sd, struct task_struct *p, return false; /* if this cache has capacity, come here */ - if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1) + if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running) return true; /* @@ -7721,7 +7708,7 @@ next_group: * number. * * Return: 1 when packing is required and a task should be moved to - * this CPU. The amount of the imbalance is returned in *imbalance. + * this CPU. The amount of the imbalance is returned in env->imbalance. * * @env: The load balancing environment. * @sds: Statistics of the sched_domain which is to be packed @@ -8450,6 +8437,12 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) this_rq->idle_stamp = rq_clock(this_rq); /* + * Do not pull tasks towards !active CPUs... + */ + if (!cpu_active(this_cpu)) + return 0; + + /* * This is OK, because current is on_cpu, which avoids it being picked * for load-balance and preemption/IRQs are still disabled avoiding * further scheduler activity on it and we're being very careful to @@ -8556,6 +8549,13 @@ static int active_load_balance_cpu_stop(void *data) struct rq_flags rf; rq_lock_irq(busiest_rq, &rf); + /* + * Between queueing the stop-work and running it is a hole in which + * CPUs can become inactive. We should not move tasks from or to + * inactive CPUs. + */ + if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu)) + goto out_unlock; /* make sure the requested cpu hasn't gone down in the meantime */ if (unlikely(busiest_cpu != smp_processor_id() || @@ -9312,7 +9312,7 @@ static void set_curr_task_fair(struct rq *rq) void init_cfs_rq(struct cfs_rq *cfs_rq) { - cfs_rq->tasks_timeline = RB_ROOT; + cfs_rq->tasks_timeline = RB_ROOT_CACHED; cfs_rq->min_vruntime = (u64)(-(1LL << 20)); #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6ed7962dc896..14db76cd496f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -426,8 +426,7 @@ struct cfs_rq { u64 min_vruntime_copy; #endif - struct rb_root tasks_timeline; - struct rb_node *rb_leftmost; + struct rb_root_cached tasks_timeline; /* * 'curr' points to currently running entity on this cfs_rq. @@ -550,8 +549,7 @@ struct rt_rq { /* Deadline class' related fields in a runqueue */ struct dl_rq { /* runqueue is an rbtree, ordered by deadline */ - struct rb_root rb_root; - struct rb_node *rb_leftmost; + struct rb_root_cached root; unsigned long dl_nr_running; @@ -575,8 +573,7 @@ struct dl_rq { * an rb-tree, ordered by tasks' deadlines, with caching * of the leftmost (earliest deadline) element. */ - struct rb_root pushable_dl_tasks_root; - struct rb_node *pushable_dl_tasks_leftmost; + struct rb_root_cached pushable_dl_tasks_root; #else struct dl_bw dl_bw; #endif @@ -1954,6 +1951,8 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); #ifdef CONFIG_SCHED_DEBUG +extern bool sched_debug_enabled; + extern void print_cfs_stats(struct seq_file *m, int cpu); extern void print_rt_stats(struct seq_file *m, int cpu); extern void print_dl_stats(struct seq_file *m, int cpu); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 6f7b43982f73..f1cf4f306a82 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -14,11 +14,9 @@ cpumask_var_t sched_domains_tmpmask2; #ifdef CONFIG_SCHED_DEBUG -static __read_mostly int sched_debug_enabled; - static int __init sched_debug_setup(char *str) { - sched_debug_enabled = 1; + sched_debug_enabled = true; return 0; } @@ -473,7 +471,7 @@ static int __init isolated_cpu_setup(char *str) alloc_bootmem_cpumask_var(&cpu_isolated_map); ret = cpulist_parse(str, cpu_isolated_map); if (ret) { - pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); + pr_err("sched: Error, all isolcpus= values must be between 0 and %u\n", nr_cpu_ids); return 0; } return 1; diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index d6afed6d0752..98feab7933c7 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -53,6 +53,12 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry } EXPORT_SYMBOL(remove_wait_queue); +/* + * Scan threshold to break wait queue walk. + * This allows a waker to take a break from holding the + * wait queue lock during the wait queue walk. + */ +#define WAITQUEUE_WALK_BREAK_CNT 64 /* * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just @@ -63,18 +69,67 @@ EXPORT_SYMBOL(remove_wait_queue); * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns * zero in this (rare) case, and we handle it by continuing to scan the queue. */ -static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, - int nr_exclusive, int wake_flags, void *key) +static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, + int nr_exclusive, int wake_flags, void *key, + wait_queue_entry_t *bookmark) { wait_queue_entry_t *curr, *next; + int cnt = 0; + + if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) { + curr = list_next_entry(bookmark, entry); + + list_del(&bookmark->entry); + bookmark->flags = 0; + } else + curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry); - list_for_each_entry_safe(curr, next, &wq_head->head, entry) { + if (&curr->entry == &wq_head->head) + return nr_exclusive; + + list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) { unsigned flags = curr->flags; - int ret = curr->func(curr, mode, wake_flags, key); + int ret; + + if (flags & WQ_FLAG_BOOKMARK) + continue; + + ret = curr->func(curr, mode, wake_flags, key); if (ret < 0) break; if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; + + if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) && + (&next->entry != &wq_head->head)) { + bookmark->flags = WQ_FLAG_BOOKMARK; + list_add_tail(&bookmark->entry, &next->entry); + break; + } + } + return nr_exclusive; +} + +static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode, + int nr_exclusive, int wake_flags, void *key) +{ + unsigned long flags; + wait_queue_entry_t bookmark; + + bookmark.flags = 0; + bookmark.private = NULL; + bookmark.func = NULL; + INIT_LIST_HEAD(&bookmark.entry); + + spin_lock_irqsave(&wq_head->lock, flags); + nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark); + spin_unlock_irqrestore(&wq_head->lock, flags); + + while (bookmark.flags & WQ_FLAG_BOOKMARK) { + spin_lock_irqsave(&wq_head->lock, flags); + nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, + wake_flags, key, &bookmark); + spin_unlock_irqrestore(&wq_head->lock, flags); } } @@ -91,11 +146,7 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, void *key) { - unsigned long flags; - - spin_lock_irqsave(&wq_head->lock, flags); - __wake_up_common(wq_head, mode, nr_exclusive, 0, key); - spin_unlock_irqrestore(&wq_head->lock, flags); + __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key); } EXPORT_SYMBOL(__wake_up); @@ -104,16 +155,23 @@ EXPORT_SYMBOL(__wake_up); */ void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr) { - __wake_up_common(wq_head, mode, nr, 0, NULL); + __wake_up_common(wq_head, mode, nr, 0, NULL, NULL); } EXPORT_SYMBOL_GPL(__wake_up_locked); void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key) { - __wake_up_common(wq_head, mode, 1, 0, key); + __wake_up_common(wq_head, mode, 1, 0, key, NULL); } EXPORT_SYMBOL_GPL(__wake_up_locked_key); +void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head, + unsigned int mode, void *key, wait_queue_entry_t *bookmark) +{ + __wake_up_common(wq_head, mode, 1, 0, key, bookmark); +} +EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark); + /** * __wake_up_sync_key - wake up threads blocked on a waitqueue. * @wq_head: the waitqueue @@ -134,7 +192,6 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key); void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, void *key) { - unsigned long flags; int wake_flags = 1; /* XXX WF_SYNC */ if (unlikely(!wq_head)) @@ -143,9 +200,7 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, if (unlikely(nr_exclusive != 1)) wake_flags = 0; - spin_lock_irqsave(&wq_head->lock, flags); - __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key); - spin_unlock_irqrestore(&wq_head->lock, flags); + __wake_up_common_lock(wq_head, mode, nr_exclusive, wake_flags, key); } EXPORT_SYMBOL_GPL(__wake_up_sync_key); diff --git a/kernel/signal.c b/kernel/signal.c index ed804a470dcd..800a18f77732 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2686,6 +2686,51 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, } #endif +enum siginfo_layout siginfo_layout(int sig, int si_code) +{ + enum siginfo_layout layout = SIL_KILL; + if ((si_code > SI_USER) && (si_code < SI_KERNEL)) { + static const struct { + unsigned char limit, layout; + } filter[] = { + [SIGILL] = { NSIGILL, SIL_FAULT }, + [SIGFPE] = { NSIGFPE, SIL_FAULT }, + [SIGSEGV] = { NSIGSEGV, SIL_FAULT }, + [SIGBUS] = { NSIGBUS, SIL_FAULT }, + [SIGTRAP] = { NSIGTRAP, SIL_FAULT }, +#if defined(SIGMET) && defined(NSIGEMT) + [SIGEMT] = { NSIGEMT, SIL_FAULT }, +#endif + [SIGCHLD] = { NSIGCHLD, SIL_CHLD }, + [SIGPOLL] = { NSIGPOLL, SIL_POLL }, +#ifdef __ARCH_SIGSYS + [SIGSYS] = { NSIGSYS, SIL_SYS }, +#endif + }; + if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit)) + layout = filter[sig].layout; + else if (si_code <= NSIGPOLL) + layout = SIL_POLL; + } else { + if (si_code == SI_TIMER) + layout = SIL_TIMER; + else if (si_code == SI_SIGIO) + layout = SIL_POLL; + else if (si_code < 0) + layout = SIL_RT; + /* Tests to support buggy kernel ABIs */ +#ifdef TRAP_FIXME + if ((sig == SIGTRAP) && (si_code == TRAP_FIXME)) + layout = SIL_FAULT; +#endif +#ifdef FPE_FIXME + if ((sig == SIGFPE) && (si_code == FPE_FIXME)) + layout = SIL_FAULT; +#endif + } + return layout; +} + #ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) @@ -2708,22 +2753,20 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) */ err = __put_user(from->si_signo, &to->si_signo); err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user((short)from->si_code, &to->si_code); - switch (from->si_code & __SI_MASK) { - case __SI_KILL: + err |= __put_user(from->si_code, &to->si_code); + switch (siginfo_layout(from->si_signo, from->si_code)) { + case SIL_KILL: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); break; - case __SI_TIMER: - err |= __put_user(from->si_tid, &to->si_tid); - err |= __put_user(from->si_overrun, &to->si_overrun); - err |= __put_user(from->si_ptr, &to->si_ptr); + case SIL_TIMER: + /* Unreached SI_TIMER is negative */ break; - case __SI_POLL: + case SIL_POLL: err |= __put_user(from->si_band, &to->si_band); err |= __put_user(from->si_fd, &to->si_fd); break; - case __SI_FAULT: + case SIL_FAULT: err |= __put_user(from->si_addr, &to->si_addr); #ifdef __ARCH_SI_TRAPNO err |= __put_user(from->si_trapno, &to->si_trapno); @@ -2748,30 +2791,25 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) err |= __put_user(from->si_pkey, &to->si_pkey); #endif break; - case __SI_CHLD: + case SIL_CHLD: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_status, &to->si_status); err |= __put_user(from->si_utime, &to->si_utime); err |= __put_user(from->si_stime, &to->si_stime); break; - case __SI_RT: /* This is not generated by the kernel as of now. */ - case __SI_MESGQ: /* But this is */ + case SIL_RT: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_ptr, &to->si_ptr); break; #ifdef __ARCH_SIGSYS - case __SI_SYS: + case SIL_SYS: err |= __put_user(from->si_call_addr, &to->si_call_addr); err |= __put_user(from->si_syscall, &to->si_syscall); err |= __put_user(from->si_arch, &to->si_arch); break; #endif - default: /* this is just in case for now ... */ - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - break; } return err; } diff --git a/kernel/smp.c b/kernel/smp.c index 81cfca9b4cc3..c94dd85c8d41 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -550,7 +550,7 @@ static int __init maxcpus(char *str) early_param("maxcpus", maxcpus); /* Setup number of possible processor ids */ -int nr_cpu_ids __read_mostly = NR_CPUS; +unsigned int nr_cpu_ids __read_mostly = NR_CPUS; EXPORT_SYMBOL(nr_cpu_ids); /* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ diff --git a/kernel/sys.c b/kernel/sys.c index 2855ee73acd0..9aebc2935013 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1896,15 +1896,11 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map) /* * Finally, make sure the caller has the rights to - * change /proc/pid/exe link: only local root should + * change /proc/pid/exe link: only local sys admin should * be allowed to. */ if (prctl_map->exe_fd != (u32)-1) { - struct user_namespace *ns = current_user_ns(); - const struct cred *cred = current_cred(); - - if (!uid_eq(cred->uid, make_kuid(ns, 0)) || - !gid_eq(cred->gid, make_kgid(ns, 0))) + if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN)) goto out; } diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 02e1859f2ca8..58ea8c03662e 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -986,8 +986,9 @@ static ssize_t bin_intvec(struct file *file, size_t length = oldlen / sizeof(*vec); char *str, *end; int i; + loff_t pos = 0; - result = kernel_read(file, 0, buffer, BUFSZ - 1); + result = kernel_read(file, buffer, BUFSZ - 1, &pos); if (result < 0) goto out_kfree; @@ -1016,6 +1017,7 @@ static ssize_t bin_intvec(struct file *file, size_t length = newlen / sizeof(*vec); char *str, *end; int i; + loff_t pos = 0; str = buffer; end = str + BUFSZ; @@ -1029,7 +1031,7 @@ static ssize_t bin_intvec(struct file *file, str += scnprintf(str, end - str, "%lu\t", value); } - result = kernel_write(file, buffer, str - buffer, 0); + result = kernel_write(file, buffer, str - buffer, &pos); if (result < 0) goto out_kfree; } @@ -1057,8 +1059,9 @@ static ssize_t bin_ulongvec(struct file *file, size_t length = oldlen / sizeof(*vec); char *str, *end; int i; + loff_t pos = 0; - result = kernel_read(file, 0, buffer, BUFSZ - 1); + result = kernel_read(file, buffer, BUFSZ - 1, &pos); if (result < 0) goto out_kfree; @@ -1087,6 +1090,7 @@ static ssize_t bin_ulongvec(struct file *file, size_t length = newlen / sizeof(*vec); char *str, *end; int i; + loff_t pos = 0; str = buffer; end = str + BUFSZ; @@ -1100,7 +1104,7 @@ static ssize_t bin_ulongvec(struct file *file, str += scnprintf(str, end - str, "%lu\t", value); } - result = kernel_write(file, buffer, str - buffer, 0); + result = kernel_write(file, buffer, str - buffer, &pos); if (result < 0) goto out_kfree; } @@ -1120,8 +1124,9 @@ static ssize_t bin_uuid(struct file *file, if (oldval && oldlen) { char buf[UUID_STRING_LEN + 1]; uuid_t uuid; + loff_t pos = 0; - result = kernel_read(file, 0, buf, sizeof(buf) - 1); + result = kernel_read(file, buf, sizeof(buf) - 1, &pos); if (result < 0) goto out; @@ -1154,8 +1159,9 @@ static ssize_t bin_dn_node_address(struct file *file, char buf[15], *nodep; unsigned long area, node; __le16 dnaddr; + loff_t pos = 0; - result = kernel_read(file, 0, buf, sizeof(buf) - 1); + result = kernel_read(file, buf, sizeof(buf) - 1, &pos); if (result < 0) goto out; @@ -1188,6 +1194,7 @@ static ssize_t bin_dn_node_address(struct file *file, __le16 dnaddr; char buf[15]; int len; + loff_t pos = 0; result = -EINVAL; if (newlen != sizeof(dnaddr)) @@ -1201,7 +1208,7 @@ static ssize_t bin_dn_node_address(struct file *file, le16_to_cpu(dnaddr) >> 10, le16_to_cpu(dnaddr) & 0x3ff); - result = kernel_write(file, buf, len, 0); + result = kernel_write(file, buf, len, &pos); if (result < 0) goto out; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8ea4fb315719..2cafb49aa65e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2316,7 +2316,7 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } EXPORT_SYMBOL(hardpps); -#endif +#endif /* CONFIG_NTP_PPS */ /** * xtime_update() - advances the timekeeping infrastructure diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index bc364f86100a..2a685b45b73b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -27,6 +27,7 @@ #include <linux/time.h> #include <linux/uaccess.h> #include <linux/list.h> +#include <linux/blk-cgroup.h> #include "../../block/blk.h" @@ -46,10 +47,16 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock); /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 +#define TRACE_BLK_OPT_CGROUP 0x2 +#define TRACE_BLK_OPT_CGNAME 0x4 static struct tracer_opt blk_tracer_opts[] = { /* Default disable the minimalistic output */ { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, +#ifdef CONFIG_BLK_CGROUP + { TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) }, + { TRACER_OPT(blk_cgname, TRACE_BLK_OPT_CGNAME) }, +#endif { } }; @@ -68,7 +75,8 @@ static void blk_unregister_tracepoints(void); * Send out a notify message. */ static void trace_note(struct blk_trace *bt, pid_t pid, int action, - const void *data, size_t len) + const void *data, size_t len, + union kernfs_node_id *cgid) { struct blk_io_trace *t; struct ring_buffer_event *event = NULL; @@ -76,12 +84,13 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, int pc = 0; int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; + ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; if (blk_tracer) { buffer = blk_tr->trace_buffer.buffer; pc = preempt_count(); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + len, + sizeof(*t) + len + cgid_len, 0, pc); if (!event) return; @@ -92,17 +101,19 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, if (!bt->rchan) return; - t = relay_reserve(bt->rchan, sizeof(*t) + len); + t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len); if (t) { t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; t->time = ktime_to_ns(ktime_get()); record_it: t->device = bt->dev; - t->action = action; + t->action = action | (cgid ? __BLK_TN_CGROUP : 0); t->pid = pid; t->cpu = cpu; - t->pdu_len = len; - memcpy((void *) t + sizeof(*t), data, len); + t->pdu_len = len + cgid_len; + if (cgid) + memcpy((void *)t + sizeof(*t), cgid, cgid_len); + memcpy((void *) t + sizeof(*t) + cgid_len, data, len); if (blk_tracer) trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); @@ -122,7 +133,7 @@ static void trace_note_tsk(struct task_struct *tsk) spin_lock_irqsave(&running_trace_lock, flags); list_for_each_entry(bt, &running_trace_list, running_list) { trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, - sizeof(tsk->comm)); + sizeof(tsk->comm), NULL); } spin_unlock_irqrestore(&running_trace_lock, flags); } @@ -139,11 +150,12 @@ static void trace_note_time(struct blk_trace *bt) words[1] = now.tv_nsec; local_irq_save(flags); - trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words)); + trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), NULL); local_irq_restore(flags); } -void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) +void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, + const char *fmt, ...) { int n; va_list args; @@ -167,7 +179,14 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); + if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) + blkcg = NULL; +#ifdef CONFIG_BLK_CGROUP + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, + blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : NULL); +#else + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL); +#endif local_irq_restore(flags); } EXPORT_SYMBOL_GPL(__trace_note_message); @@ -204,7 +223,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, int op, int op_flags, u32 what, int error, int pdu_len, - void *pdu_data) + void *pdu_data, union kernfs_node_id *cgid) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; @@ -215,6 +234,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, pid_t pid; int cpu, pc = 0; bool blk_tracer = blk_tracer_enabled; + ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; @@ -229,6 +249,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= BLK_TC_ACT(BLK_TC_DISCARD); if (op == REQ_OP_FLUSH) what |= BLK_TC_ACT(BLK_TC_FLUSH); + if (cgid) + what |= __BLK_TA_CGROUP; pid = tsk->pid; if (act_log_check(bt, what, sector, pid)) @@ -241,7 +263,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, buffer = blk_tr->trace_buffer.buffer; pc = preempt_count(); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + pdu_len, + sizeof(*t) + pdu_len + cgid_len, 0, pc); if (!event) return; @@ -258,7 +280,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, * from coming in and stepping on our toes. */ local_irq_save(flags); - t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); + t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len); if (t) { sequence = per_cpu_ptr(bt->sequence, cpu); @@ -280,10 +302,12 @@ record_it: t->action = what; t->device = bt->dev; t->error = error; - t->pdu_len = pdu_len; + t->pdu_len = pdu_len + cgid_len; + if (cgid_len) + memcpy((void *)t + sizeof(*t), cgid, cgid_len); if (pdu_len) - memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); + memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); if (blk_tracer) { trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); @@ -359,7 +383,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, return PTR_ERR(msg); bt = filp->private_data; - __trace_note_message(bt, "%s", msg); + __trace_note_message(bt, NULL, "%s", msg); kfree(msg); return count; @@ -684,6 +708,36 @@ void blk_trace_shutdown(struct request_queue *q) } } +#ifdef CONFIG_BLK_CGROUP +static union kernfs_node_id * +blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +{ + struct blk_trace *bt = q->blk_trace; + + if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) + return NULL; + + if (!bio->bi_css) + return NULL; + return cgroup_get_kernfs_id(bio->bi_css->cgroup); +} +#else +static union kernfs_node_id * +blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +{ + return NULL; +} +#endif + +static union kernfs_node_id * +blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) +{ + if (!rq->bio) + return NULL; + /* Use the first bio */ + return blk_trace_bio_get_cgid(q, rq->bio); +} + /* * blktrace probes */ @@ -694,13 +748,15 @@ void blk_trace_shutdown(struct request_queue *q) * @error: return status to log * @nr_bytes: number of completed bytes * @what: the action + * @cgid: the cgroup info * * Description: * Records an action against a request. Will log the bio offset + size. * **/ static void blk_add_trace_rq(struct request *rq, int error, - unsigned int nr_bytes, u32 what) + unsigned int nr_bytes, u32 what, + union kernfs_node_id *cgid) { struct blk_trace *bt = rq->q->blk_trace; @@ -713,32 +769,36 @@ static void blk_add_trace_rq(struct request *rq, int error, what |= BLK_TC_ACT(BLK_TC_FS); __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), - rq->cmd_flags, what, error, 0, NULL); + rq->cmd_flags, what, error, 0, NULL, cgid); } static void blk_add_trace_rq_insert(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT, + blk_trace_request_get_cgid(q, rq)); } static void blk_add_trace_rq_issue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE, + blk_trace_request_get_cgid(q, rq)); } static void blk_add_trace_rq_requeue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE, + blk_trace_request_get_cgid(q, rq)); } static void blk_add_trace_rq_complete(void *ignore, struct request *rq, int error, unsigned int nr_bytes) { - blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE); + blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE, + blk_trace_request_get_cgid(rq->q, rq)); } /** @@ -753,7 +813,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, * **/ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, - u32 what, int error) + u32 what, int error, union kernfs_node_id *cgid) { struct blk_trace *bt = q->blk_trace; @@ -761,20 +821,22 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, return; __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_opf, what, error, 0, NULL); + bio_op(bio), bio->bi_opf, what, error, 0, NULL, cgid); } static void blk_add_trace_bio_bounce(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); + blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_complete(void *ignore, struct request_queue *q, struct bio *bio, int error) { - blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_backmerge(void *ignore, @@ -782,7 +844,8 @@ static void blk_add_trace_bio_backmerge(void *ignore, struct request *rq, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); + blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_frontmerge(void *ignore, @@ -790,13 +853,15 @@ static void blk_add_trace_bio_frontmerge(void *ignore, struct request *rq, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); + blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_queue(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0); + blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_getrq(void *ignore, @@ -804,13 +869,14 @@ static void blk_add_trace_getrq(void *ignore, struct bio *bio, int rw) { if (bio) - blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); + blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0, + blk_trace_bio_get_cgid(q, bio)); else { struct blk_trace *bt = q->blk_trace; if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, - NULL); + NULL, NULL); } } @@ -820,13 +886,14 @@ static void blk_add_trace_sleeprq(void *ignore, struct bio *bio, int rw) { if (bio) - blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); + blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0, + blk_trace_bio_get_cgid(q, bio)); else { struct blk_trace *bt = q->blk_trace; if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, - 0, 0, NULL); + 0, 0, NULL, NULL); } } @@ -835,7 +902,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); + __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, @@ -852,7 +919,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, else what = BLK_TA_UNPLUG_TIMER; - __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); + __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL); } } @@ -868,7 +935,7 @@ static void blk_add_trace_split(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu), - &rpdu); + &rpdu, blk_trace_bio_get_cgid(q, bio)); } } @@ -896,12 +963,12 @@ static void blk_add_trace_bio_remap(void *ignore, return; r.device_from = cpu_to_be32(dev); - r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); + r.device_to = cpu_to_be32(bio_dev(bio)); r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status, - sizeof(r), &r); + sizeof(r), &r, blk_trace_bio_get_cgid(q, bio)); } /** @@ -934,7 +1001,7 @@ static void blk_add_trace_rq_remap(void *ignore, __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rq_data_dir(rq), 0, BLK_TA_REMAP, 0, - sizeof(r), &r); + sizeof(r), &r, blk_trace_request_get_cgid(q, rq)); } /** @@ -958,7 +1025,8 @@ void blk_add_driver_data(struct request_queue *q, return; __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, - BLK_TA_DRV_DATA, 0, len, data); + BLK_TA_DRV_DATA, 0, len, data, + blk_trace_request_get_cgid(q, rq)); } EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -1031,7 +1099,7 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) int i = 0; int tc = t->action >> BLK_TC_SHIFT; - if (t->action == BLK_TN_MESSAGE) { + if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) { rwbs[i++] = 'N'; goto out; } @@ -1066,9 +1134,21 @@ const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) return (const struct blk_io_trace *)ent; } -static inline const void *pdu_start(const struct trace_entry *ent) +static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg) +{ + return (void *)(te_blk_io_trace(ent) + 1) + + (has_cg ? sizeof(union kernfs_node_id) : 0); +} + +static inline const void *cgid_start(const struct trace_entry *ent) +{ + return (void *)(te_blk_io_trace(ent) + 1); +} + +static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg) { - return te_blk_io_trace(ent) + 1; + return te_blk_io_trace(ent)->pdu_len - + (has_cg ? sizeof(union kernfs_node_id) : 0); } static inline u32 t_action(const struct trace_entry *ent) @@ -1096,16 +1176,16 @@ static inline __u16 t_error(const struct trace_entry *ent) return te_blk_io_trace(ent)->error; } -static __u64 get_pdu_int(const struct trace_entry *ent) +static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg) { - const __u64 *val = pdu_start(ent); + const __u64 *val = pdu_start(ent, has_cg); return be64_to_cpu(*val); } static void get_pdu_remap(const struct trace_entry *ent, - struct blk_io_trace_remap *r) + struct blk_io_trace_remap *r, bool has_cg) { - const struct blk_io_trace_remap *__r = pdu_start(ent); + const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg); __u64 sector_from = __r->sector_from; r->device_from = be32_to_cpu(__r->device_from); @@ -1113,9 +1193,11 @@ static void get_pdu_remap(const struct trace_entry *ent, r->sector_from = be64_to_cpu(sector_from); } -typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act); +typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act, + bool has_cg); -static void blk_log_action_classic(struct trace_iterator *iter, const char *act) +static void blk_log_action_classic(struct trace_iterator *iter, const char *act, + bool has_cg) { char rwbs[RWBS_LEN]; unsigned long long ts = iter->ts; @@ -1131,24 +1213,43 @@ static void blk_log_action_classic(struct trace_iterator *iter, const char *act) secs, nsec_rem, iter->ent->pid, act, rwbs); } -static void blk_log_action(struct trace_iterator *iter, const char *act) +static void blk_log_action(struct trace_iterator *iter, const char *act, + bool has_cg) { char rwbs[RWBS_LEN]; const struct blk_io_trace *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); - trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", - MAJOR(t->device), MINOR(t->device), act, rwbs); -} - -static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) + if (has_cg) { + const union kernfs_node_id *id = cgid_start(iter->ent); + + if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) { + char blkcg_name_buf[NAME_MAX + 1] = "<...>"; + + cgroup_path_from_kernfs_id(id, blkcg_name_buf, + sizeof(blkcg_name_buf)); + trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ", + MAJOR(t->device), MINOR(t->device), + blkcg_name_buf, act, rwbs); + } else + trace_seq_printf(&iter->seq, + "%3d,%-3d %x,%-x %2s %3s ", + MAJOR(t->device), MINOR(t->device), + id->ino, id->generation, act, rwbs); + } else + trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", + MAJOR(t->device), MINOR(t->device), act, rwbs); +} + +static void blk_log_dump_pdu(struct trace_seq *s, + const struct trace_entry *ent, bool has_cg) { const unsigned char *pdu_buf; int pdu_len; int i, end; - pdu_buf = pdu_start(ent); - pdu_len = te_blk_io_trace(ent)->pdu_len; + pdu_buf = pdu_start(ent, has_cg); + pdu_len = pdu_real_len(ent, has_cg); if (!pdu_len) return; @@ -1179,7 +1280,7 @@ static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) trace_seq_puts(s, ") "); } -static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; @@ -1187,7 +1288,7 @@ static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { trace_seq_printf(s, "%u ", t_bytes(ent)); - blk_log_dump_pdu(s, ent); + blk_log_dump_pdu(s, ent, has_cg); trace_seq_printf(s, "[%s]\n", cmd); } else { if (t_sec(ent)) @@ -1199,10 +1300,10 @@ static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) } static void blk_log_with_error(struct trace_seq *s, - const struct trace_entry *ent) + const struct trace_entry *ent, bool has_cg) { if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { - blk_log_dump_pdu(s, ent); + blk_log_dump_pdu(s, ent, has_cg); trace_seq_printf(s, "[%d]\n", t_error(ent)); } else { if (t_sec(ent)) @@ -1215,18 +1316,18 @@ static void blk_log_with_error(struct trace_seq *s, } } -static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { struct blk_io_trace_remap r = { .device_from = 0, }; - get_pdu_remap(ent, &r); + get_pdu_remap(ent, &r, has_cg); trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", t_sector(ent), t_sec(ent), MAJOR(r.device_from), MINOR(r.device_from), (unsigned long long)r.sector_from); } -static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; @@ -1235,30 +1336,31 @@ static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) trace_seq_printf(s, "[%s]\n", cmd); } -static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); - trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); + trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent, has_cg)); } -static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), - get_pdu_int(ent), cmd); + get_pdu_int(ent, has_cg), cmd); } -static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent, + bool has_cg) { - const struct blk_io_trace *t = te_blk_io_trace(ent); - trace_seq_putmem(s, t + 1, t->pdu_len); + trace_seq_putmem(s, pdu_start(ent, has_cg), + pdu_real_len(ent, has_cg)); trace_seq_putc(s, '\n'); } @@ -1298,7 +1400,8 @@ static void blk_tracer_reset(struct trace_array *tr) static const struct { const char *act[2]; - void (*print)(struct trace_seq *s, const struct trace_entry *ent); + void (*print)(struct trace_seq *s, const struct trace_entry *ent, + bool has_cg); } what2act[] = { [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, @@ -1326,23 +1429,25 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, u16 what; bool long_act; blk_log_action_t *log_action; + bool has_cg; t = te_blk_io_trace(iter->ent); - what = t->action & ((1 << BLK_TC_SHIFT) - 1); + what = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP; long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE); log_action = classic ? &blk_log_action_classic : &blk_log_action; + has_cg = t->action & __BLK_TA_CGROUP; - if (t->action == BLK_TN_MESSAGE) { - log_action(iter, long_act ? "message" : "m"); - blk_log_msg(s, iter->ent); + if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) { + log_action(iter, long_act ? "message" : "m", has_cg); + blk_log_msg(s, iter->ent, has_cg); return trace_handle_return(s); } if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) trace_seq_printf(s, "Unknown action %x\n", what); else { - log_action(iter, what2act[what].act[long_act]); - what2act[what].print(s, iter->ent); + log_action(iter, what2act[what].act[long_act], has_cg); + what2act[what].print(s, iter->ent, has_cg); } return trace_handle_return(s); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 96cea88fa00f..6abfafd7f173 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2828,13 +2828,14 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) if (!command || !ftrace_enabled) { /* - * If these are per_cpu ops, they still need their - * per_cpu field freed. Since, function tracing is + * If these are dynamic or per_cpu ops, they still + * need their data freed. Since, function tracing is * not currently active, we can just free them * without synchronizing all CPUs. */ - if (ops->flags & FTRACE_OPS_FL_PER_CPU) - per_cpu_ops_free(ops); + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) + goto free_ops; + return 0; } @@ -2900,6 +2901,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) if (IS_ENABLED(CONFIG_PREEMPT)) synchronize_rcu_tasks(); + free_ops: arch_ftrace_trampoline_free(ops); if (ops->flags & FTRACE_OPS_FL_PER_CPU) @@ -5690,10 +5692,51 @@ static int referenced_filters(struct dyn_ftrace *rec) return cnt; } +static void +clear_mod_from_hash(struct ftrace_page *pg, struct ftrace_hash *hash) +{ + struct ftrace_func_entry *entry; + struct dyn_ftrace *rec; + int i; + + if (ftrace_hash_empty(hash)) + return; + + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + entry = __ftrace_lookup_ip(hash, rec->ip); + /* + * Do not allow this rec to match again. + * Yeah, it may waste some memory, but will be removed + * if/when the hash is modified again. + */ + if (entry) + entry->ip = 0; + } +} + +/* Clear any records from hashs */ +static void clear_mod_from_hashes(struct ftrace_page *pg) +{ + struct trace_array *tr; + + mutex_lock(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!tr->ops || !tr->ops->func_hash) + continue; + mutex_lock(&tr->ops->func_hash->regex_lock); + clear_mod_from_hash(pg, tr->ops->func_hash->filter_hash); + clear_mod_from_hash(pg, tr->ops->func_hash->notrace_hash); + mutex_unlock(&tr->ops->func_hash->regex_lock); + } + mutex_unlock(&trace_types_lock); +} + void ftrace_release_mod(struct module *mod) { struct dyn_ftrace *rec; struct ftrace_page **last_pg; + struct ftrace_page *tmp_page = NULL; struct ftrace_page *pg; int order; @@ -5723,14 +5766,25 @@ void ftrace_release_mod(struct module *mod) ftrace_update_tot_cnt -= pg->index; *last_pg = pg->next; - order = get_count_order(pg->size / ENTRIES_PER_PAGE); - free_pages((unsigned long)pg->records, order); - kfree(pg); + + pg->next = tmp_page; + tmp_page = pg; } else last_pg = &pg->next; } out_unlock: mutex_unlock(&ftrace_lock); + + for (pg = tmp_page; pg; pg = tmp_page) { + + /* Needs to be called outside of ftrace_lock */ + clear_mod_from_hashes(pg); + + order = get_count_order(pg->size / ENTRIES_PER_PAGE); + free_pages((unsigned long)pg->records, order); + tmp_page = pg->next; + kfree(pg); + } } void ftrace_module_enable(struct module *mod) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 44004d8aa3b3..5360b7aec57a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1702,6 +1702,9 @@ void tracing_reset_all_online_cpus(void) struct trace_array *tr; list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!tr->clear_trace) + continue; + tr->clear_trace = false; tracing_reset_online_cpus(&tr->trace_buffer); #ifdef CONFIG_TRACER_MAX_TRACE tracing_reset_online_cpus(&tr->max_buffer); @@ -2799,11 +2802,17 @@ static char *get_trace_buf(void) if (!buffer || buffer->nesting >= 4) return NULL; - return &buffer->buffer[buffer->nesting++][0]; + buffer->nesting++; + + /* Interrupts must see nesting incremented before we use the buffer */ + barrier(); + return &buffer->buffer[buffer->nesting][0]; } static void put_trace_buf(void) { + /* Don't let the decrement of nesting leak before this */ + barrier(); this_cpu_dec(trace_percpu_buffer->nesting); } @@ -6220,7 +6229,7 @@ static int tracing_set_clock(struct trace_array *tr, const char *clockstr) tracing_reset_online_cpus(&tr->trace_buffer); #ifdef CONFIG_TRACER_MAX_TRACE - if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer) + if (tr->max_buffer.buffer) ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); tracing_reset_online_cpus(&tr->max_buffer); #endif diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 490ba229931d..fb5d54d0d1b3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -245,6 +245,7 @@ struct trace_array { int stop_count; int clock_id; int nr_topts; + bool clear_trace; struct tracer *current_trace; unsigned int trace_flags; unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 36132f9280e6..87468398b9ed 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -406,7 +406,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, if (file->flags & EVENT_FILE_FL_RECORDED_TGID) { tracing_stop_tgid_record(); - clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); + clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); } call->class->reg(call, TRACE_REG_UNREGISTER, file); @@ -466,7 +466,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, set_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags); /* WAS_ENABLED gets set but never cleared. */ - call->flags |= TRACE_EVENT_FL_WAS_ENABLED; + set_bit(EVENT_FILE_FL_WAS_ENABLED_BIT, &file->flags); } break; } @@ -2058,6 +2058,10 @@ static void event_remove(struct trace_event_call *call) do_for_each_event_file(tr, file) { if (file->event_call != call) continue; + + if (file->flags & EVENT_FILE_FL_WAS_ENABLED) + tr->clear_trace = true; + ftrace_event_enable_disable(file, 0); /* * The do_for_each_event_file() is @@ -2396,15 +2400,11 @@ static void trace_module_add_events(struct module *mod) static void trace_module_remove_events(struct module *mod) { struct trace_event_call *call, *p; - bool clear_trace = false; down_write(&trace_event_sem); list_for_each_entry_safe(call, p, &ftrace_events, list) { - if (call->mod == mod) { - if (call->flags & TRACE_EVENT_FL_WAS_ENABLED) - clear_trace = true; + if (call->mod == mod) __trace_remove_event_call(call); - } } up_write(&trace_event_sem); @@ -2416,8 +2416,7 @@ static void trace_module_remove_events(struct module *mod) * over from this module may be passed to the new module events and * unexpected results may occur. */ - if (clear_trace) - tracing_reset_all_online_cpus(); + tracing_reset_all_online_cpus(); } static int trace_module_notify(struct notifier_block *self, diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 181e139a8057..61e7f0678d33 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -702,7 +702,7 @@ static void append_filter_err(struct filter_parse_state *ps, int pos = ps->lasterr_pos; char *buf, *pbuf; - buf = (char *)__get_free_page(GFP_TEMPORARY); + buf = (char *)__get_free_page(GFP_KERNEL); if (!buf) return; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d56123cdcc89..b8f1f54731af 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1543,7 +1543,7 @@ fs_initcall(init_graph_tracefs); static __init int init_graph_trace(void) { - max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); + max_bytes_for_cpu = snprintf(NULL, 0, "%u", nr_cpu_ids - 1); if (!register_trace_event(&graph_trace_entry_event)) { pr_warn("Warning: could not register graph trace events\n"); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index cb917cebae29..b17ec642793b 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -273,7 +273,7 @@ static int trace_selftest_ops(struct trace_array *tr, int cnt) goto out_free; if (cnt > 1) { if (trace_selftest_test_global_cnt == 0) - goto out; + goto out_free; } if (trace_selftest_test_dyn_cnt == 0) goto out_free; diff --git a/kernel/umh.c b/kernel/umh.c new file mode 100644 index 000000000000..6ff9905250ff --- /dev/null +++ b/kernel/umh.c @@ -0,0 +1,568 @@ +/* + * umh - the kernel usermode helper + */ +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/binfmts.h> +#include <linux/syscalls.h> +#include <linux/unistd.h> +#include <linux/kmod.h> +#include <linux/slab.h> +#include <linux/completion.h> +#include <linux/cred.h> +#include <linux/file.h> +#include <linux/fdtable.h> +#include <linux/workqueue.h> +#include <linux/security.h> +#include <linux/mount.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/resource.h> +#include <linux/notifier.h> +#include <linux/suspend.h> +#include <linux/rwsem.h> +#include <linux/ptrace.h> +#include <linux/async.h> +#include <linux/uaccess.h> + +#include <trace/events/module.h> + +#define CAP_BSET (void *)1 +#define CAP_PI (void *)2 + +static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; +static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; +static DEFINE_SPINLOCK(umh_sysctl_lock); +static DECLARE_RWSEM(umhelper_sem); + +static void call_usermodehelper_freeinfo(struct subprocess_info *info) +{ + if (info->cleanup) + (*info->cleanup)(info); + kfree(info); +} + +static void umh_complete(struct subprocess_info *sub_info) +{ + struct completion *comp = xchg(&sub_info->complete, NULL); + /* + * See call_usermodehelper_exec(). If xchg() returns NULL + * we own sub_info, the UMH_KILLABLE caller has gone away + * or the caller used UMH_NO_WAIT. + */ + if (comp) + complete(comp); + else + call_usermodehelper_freeinfo(sub_info); +} + +/* + * This is the task which runs the usermode application + */ +static int call_usermodehelper_exec_async(void *data) +{ + struct subprocess_info *sub_info = data; + struct cred *new; + int retval; + + spin_lock_irq(¤t->sighand->siglock); + flush_signal_handlers(current, 1); + spin_unlock_irq(¤t->sighand->siglock); + + /* + * Our parent (unbound workqueue) runs with elevated scheduling + * priority. Avoid propagating that into the userspace child. + */ + set_user_nice(current, 0); + + retval = -ENOMEM; + new = prepare_kernel_cred(current); + if (!new) + goto out; + + spin_lock(&umh_sysctl_lock); + new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); + new->cap_inheritable = cap_intersect(usermodehelper_inheritable, + new->cap_inheritable); + spin_unlock(&umh_sysctl_lock); + + if (sub_info->init) { + retval = sub_info->init(sub_info, new); + if (retval) { + abort_creds(new); + goto out; + } + } + + commit_creds(new); + + retval = do_execve(getname_kernel(sub_info->path), + (const char __user *const __user *)sub_info->argv, + (const char __user *const __user *)sub_info->envp); +out: + sub_info->retval = retval; + /* + * call_usermodehelper_exec_sync() will call umh_complete + * if UHM_WAIT_PROC. + */ + if (!(sub_info->wait & UMH_WAIT_PROC)) + umh_complete(sub_info); + if (!retval) + return 0; + do_exit(0); +} + +/* Handles UMH_WAIT_PROC. */ +static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) +{ + pid_t pid; + + /* If SIGCLD is ignored sys_wait4 won't populate the status. */ + kernel_sigaction(SIGCHLD, SIG_DFL); + pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); + if (pid < 0) { + sub_info->retval = pid; + } else { + int ret = -ECHILD; + /* + * Normally it is bogus to call wait4() from in-kernel because + * wait4() wants to write the exit code to a userspace address. + * But call_usermodehelper_exec_sync() always runs as kernel + * thread (workqueue) and put_user() to a kernel address works + * OK for kernel threads, due to their having an mm_segment_t + * which spans the entire address space. + * + * Thus the __user pointer cast is valid here. + */ + sys_wait4(pid, (int __user *)&ret, 0, NULL); + + /* + * If ret is 0, either call_usermodehelper_exec_async failed and + * the real error code is already in sub_info->retval or + * sub_info->retval is 0 anyway, so don't mess with it then. + */ + if (ret) + sub_info->retval = ret; + } + + /* Restore default kernel sig handler */ + kernel_sigaction(SIGCHLD, SIG_IGN); + + umh_complete(sub_info); +} + +/* + * We need to create the usermodehelper kernel thread from a task that is affine + * to an optimized set of CPUs (or nohz housekeeping ones) such that they + * inherit a widest affinity irrespective of call_usermodehelper() callers with + * possibly reduced affinity (eg: per-cpu workqueues). We don't want + * usermodehelper targets to contend a busy CPU. + * + * Unbound workqueues provide such wide affinity and allow to block on + * UMH_WAIT_PROC requests without blocking pending request (up to some limit). + * + * Besides, workqueues provide the privilege level that caller might not have + * to perform the usermodehelper request. + * + */ +static void call_usermodehelper_exec_work(struct work_struct *work) +{ + struct subprocess_info *sub_info = + container_of(work, struct subprocess_info, work); + + if (sub_info->wait & UMH_WAIT_PROC) { + call_usermodehelper_exec_sync(sub_info); + } else { + pid_t pid; + /* + * Use CLONE_PARENT to reparent it to kthreadd; we do not + * want to pollute current->children, and we need a parent + * that always ignores SIGCHLD to ensure auto-reaping. + */ + pid = kernel_thread(call_usermodehelper_exec_async, sub_info, + CLONE_PARENT | SIGCHLD); + if (pid < 0) { + sub_info->retval = pid; + umh_complete(sub_info); + } + } +} + +/* + * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY + * (used for preventing user land processes from being created after the user + * land has been frozen during a system-wide hibernation or suspend operation). + * Should always be manipulated under umhelper_sem acquired for write. + */ +static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED; + +/* Number of helpers running */ +static atomic_t running_helpers = ATOMIC_INIT(0); + +/* + * Wait queue head used by usermodehelper_disable() to wait for all running + * helpers to finish. + */ +static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); + +/* + * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled + * to become 'false'. + */ +static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq); + +/* + * Time to wait for running_helpers to become zero before the setting of + * usermodehelper_disabled in usermodehelper_disable() fails + */ +#define RUNNING_HELPERS_TIMEOUT (5 * HZ) + +int usermodehelper_read_trylock(void) +{ + DEFINE_WAIT(wait); + int ret = 0; + + down_read(&umhelper_sem); + for (;;) { + prepare_to_wait(&usermodehelper_disabled_waitq, &wait, + TASK_INTERRUPTIBLE); + if (!usermodehelper_disabled) + break; + + if (usermodehelper_disabled == UMH_DISABLED) + ret = -EAGAIN; + + up_read(&umhelper_sem); + + if (ret) + break; + + schedule(); + try_to_freeze(); + + down_read(&umhelper_sem); + } + finish_wait(&usermodehelper_disabled_waitq, &wait); + return ret; +} +EXPORT_SYMBOL_GPL(usermodehelper_read_trylock); + +long usermodehelper_read_lock_wait(long timeout) +{ + DEFINE_WAIT(wait); + + if (timeout < 0) + return -EINVAL; + + down_read(&umhelper_sem); + for (;;) { + prepare_to_wait(&usermodehelper_disabled_waitq, &wait, + TASK_UNINTERRUPTIBLE); + if (!usermodehelper_disabled) + break; + + up_read(&umhelper_sem); + + timeout = schedule_timeout(timeout); + if (!timeout) + break; + + down_read(&umhelper_sem); + } + finish_wait(&usermodehelper_disabled_waitq, &wait); + return timeout; +} +EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait); + +void usermodehelper_read_unlock(void) +{ + up_read(&umhelper_sem); +} +EXPORT_SYMBOL_GPL(usermodehelper_read_unlock); + +/** + * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. + * @depth: New value to assign to usermodehelper_disabled. + * + * Change the value of usermodehelper_disabled (under umhelper_sem locked for + * writing) and wakeup tasks waiting for it to change. + */ +void __usermodehelper_set_disable_depth(enum umh_disable_depth depth) +{ + down_write(&umhelper_sem); + usermodehelper_disabled = depth; + wake_up(&usermodehelper_disabled_waitq); + up_write(&umhelper_sem); +} + +/** + * __usermodehelper_disable - Prevent new helpers from being started. + * @depth: New value to assign to usermodehelper_disabled. + * + * Set usermodehelper_disabled to @depth and wait for running helpers to exit. + */ +int __usermodehelper_disable(enum umh_disable_depth depth) +{ + long retval; + + if (!depth) + return -EINVAL; + + down_write(&umhelper_sem); + usermodehelper_disabled = depth; + up_write(&umhelper_sem); + + /* + * From now on call_usermodehelper_exec() won't start any new + * helpers, so it is sufficient if running_helpers turns out to + * be zero at one point (it may be increased later, but that + * doesn't matter). + */ + retval = wait_event_timeout(running_helpers_waitq, + atomic_read(&running_helpers) == 0, + RUNNING_HELPERS_TIMEOUT); + if (retval) + return 0; + + __usermodehelper_set_disable_depth(UMH_ENABLED); + return -EAGAIN; +} + +static void helper_lock(void) +{ + atomic_inc(&running_helpers); + smp_mb__after_atomic(); +} + +static void helper_unlock(void) +{ + if (atomic_dec_and_test(&running_helpers)) + wake_up(&running_helpers_waitq); +} + +/** + * call_usermodehelper_setup - prepare to call a usermode helper + * @path: path to usermode executable + * @argv: arg vector for process + * @envp: environment for process + * @gfp_mask: gfp mask for memory allocation + * @cleanup: a cleanup function + * @init: an init function + * @data: arbitrary context sensitive data + * + * Returns either %NULL on allocation failure, or a subprocess_info + * structure. This should be passed to call_usermodehelper_exec to + * exec the process and free the structure. + * + * The init function is used to customize the helper process prior to + * exec. A non-zero return code causes the process to error out, exit, + * and return the failure to the calling process + * + * The cleanup function is just before ethe subprocess_info is about to + * be freed. This can be used for freeing the argv and envp. The + * Function must be runnable in either a process context or the + * context in which call_usermodehelper_exec is called. + */ +struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, + char **envp, gfp_t gfp_mask, + int (*init)(struct subprocess_info *info, struct cred *new), + void (*cleanup)(struct subprocess_info *info), + void *data) +{ + struct subprocess_info *sub_info; + sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask); + if (!sub_info) + goto out; + + INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); + +#ifdef CONFIG_STATIC_USERMODEHELPER + sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH; +#else + sub_info->path = path; +#endif + sub_info->argv = argv; + sub_info->envp = envp; + + sub_info->cleanup = cleanup; + sub_info->init = init; + sub_info->data = data; + out: + return sub_info; +} +EXPORT_SYMBOL(call_usermodehelper_setup); + +/** + * call_usermodehelper_exec - start a usermode application + * @sub_info: information about the subprocessa + * @wait: wait for the application to finish and return status. + * when UMH_NO_WAIT don't wait at all, but you get no useful error back + * when the program couldn't be exec'ed. This makes it safe to call + * from interrupt context. + * + * Runs a user-space application. The application is started + * asynchronously if wait is not set, and runs as a child of system workqueues. + * (ie. it runs with full root capabilities and optimized affinity). + */ +int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) +{ + DECLARE_COMPLETION_ONSTACK(done); + int retval = 0; + + if (!sub_info->path) { + call_usermodehelper_freeinfo(sub_info); + return -EINVAL; + } + helper_lock(); + if (usermodehelper_disabled) { + retval = -EBUSY; + goto out; + } + + /* + * If there is no binary for us to call, then just return and get out of + * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and + * disable all call_usermodehelper() calls. + */ + if (strlen(sub_info->path) == 0) + goto out; + + /* + * Set the completion pointer only if there is a waiter. + * This makes it possible to use umh_complete to free + * the data structure in case of UMH_NO_WAIT. + */ + sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; + sub_info->wait = wait; + + queue_work(system_unbound_wq, &sub_info->work); + if (wait == UMH_NO_WAIT) /* task has freed sub_info */ + goto unlock; + + if (wait & UMH_KILLABLE) { + retval = wait_for_completion_killable(&done); + if (!retval) + goto wait_done; + + /* umh_complete() will see NULL and free sub_info */ + if (xchg(&sub_info->complete, NULL)) + goto unlock; + /* fallthrough, umh_complete() was already called */ + } + + wait_for_completion(&done); +wait_done: + retval = sub_info->retval; +out: + call_usermodehelper_freeinfo(sub_info); +unlock: + helper_unlock(); + return retval; +} +EXPORT_SYMBOL(call_usermodehelper_exec); + +/** + * call_usermodehelper() - prepare and start a usermode application + * @path: path to usermode executable + * @argv: arg vector for process + * @envp: environment for process + * @wait: wait for the application to finish and return status. + * when UMH_NO_WAIT don't wait at all, but you get no useful error back + * when the program couldn't be exec'ed. This makes it safe to call + * from interrupt context. + * + * This function is the equivalent to use call_usermodehelper_setup() and + * call_usermodehelper_exec(). + */ +int call_usermodehelper(const char *path, char **argv, char **envp, int wait) +{ + struct subprocess_info *info; + gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; + + info = call_usermodehelper_setup(path, argv, envp, gfp_mask, + NULL, NULL, NULL); + if (info == NULL) + return -ENOMEM; + + return call_usermodehelper_exec(info, wait); +} +EXPORT_SYMBOL(call_usermodehelper); + +static int proc_cap_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; + kernel_cap_t new_cap; + int err, i; + + if (write && (!capable(CAP_SETPCAP) || + !capable(CAP_SYS_MODULE))) + return -EPERM; + + /* + * convert from the global kernel_cap_t to the ulong array to print to + * userspace if this is a read. + */ + spin_lock(&umh_sysctl_lock); + for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) { + if (table->data == CAP_BSET) + cap_array[i] = usermodehelper_bset.cap[i]; + else if (table->data == CAP_PI) + cap_array[i] = usermodehelper_inheritable.cap[i]; + else + BUG(); + } + spin_unlock(&umh_sysctl_lock); + + t = *table; + t.data = &cap_array; + + /* + * actually read or write and array of ulongs from userspace. Remember + * these are least significant 32 bits first + */ + err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + + /* + * convert from the sysctl array of ulongs to the kernel_cap_t + * internal representation + */ + for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) + new_cap.cap[i] = cap_array[i]; + + /* + * Drop everything not in the new_cap (but don't add things) + */ + spin_lock(&umh_sysctl_lock); + if (write) { + if (table->data == CAP_BSET) + usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap); + if (table->data == CAP_PI) + usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap); + } + spin_unlock(&umh_sysctl_lock); + + return 0; +} + +struct ctl_table usermodehelper_table[] = { + { + .procname = "bset", + .data = CAP_BSET, + .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), + .mode = 0600, + .proc_handler = proc_cap_handler, + }, + { + .procname = "inheritable", + .data = CAP_PI, + .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), + .mode = 0600, + .proc_handler = proc_cap_handler, + }, + { } +}; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 2f735cbe05e8..c490f1e4313b 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -986,17 +986,21 @@ bool userns_may_setgroups(const struct user_namespace *ns) } /* - * Returns true if @ns is the same namespace as or a descendant of - * @target_ns. + * Returns true if @child is the same namespace or a descendant of + * @ancestor. */ +bool in_userns(const struct user_namespace *ancestor, + const struct user_namespace *child) +{ + const struct user_namespace *ns; + for (ns = child; ns->level > ancestor->level; ns = ns->parent) + ; + return (ns == ancestor); +} + bool current_in_userns(const struct user_namespace *target_ns) { - struct user_namespace *ns; - for (ns = current_user_ns(); ns; ns = ns->parent) { - if (ns == target_ns) - return true; - } - return false; + return in_userns(target_ns, current_user_ns()); } static inline struct user_namespace *to_user_ns(struct ns_common *ns) |