summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile5
-rw-r--r--kernel/acct.c2
-rw-r--r--kernel/bpf/syscall.c1
-rw-r--r--kernel/bpf/verifier.c2
-rw-r--r--kernel/cgroup/Makefile6
-rw-r--r--kernel/cgroup/cgroup-internal.h214
-rw-r--r--kernel/cgroup/cgroup-v1.c1398
-rw-r--r--kernel/cgroup/cgroup.c (renamed from kernel/cgroup.c)2082
-rw-r--r--kernel/cgroup/cpuset.c (renamed from kernel/cpuset.c)2
-rw-r--r--kernel/cgroup/freezer.c (renamed from kernel/cgroup_freezer.c)0
-rw-r--r--kernel/cgroup/namespace.c155
-rw-r--r--kernel/cgroup/pids.c (renamed from kernel/cgroup_pids.c)2
-rw-r--r--kernel/cgroup/rdma.c619
-rw-r--r--kernel/configs/android-base.config2
-rw-r--r--kernel/configs/android-recommended.config1
-rw-r--r--kernel/cpu.c4
-rw-r--r--kernel/cred.c1
-rw-r--r--kernel/debug/debug_core.c5
-rw-r--r--kernel/debug/gdbstub.c1
-rw-r--r--kernel/debug/kdb/kdb_bt.c3
-rw-r--r--kernel/debug/kdb/kdb_main.c3
-rw-r--r--kernel/delayacct.c2
-rw-r--r--kernel/events/callchain.c2
-rw-r--r--kernel/events/core.c26
-rw-r--r--kernel/events/uprobes.c30
-rw-r--r--kernel/exit.c23
-rw-r--r--kernel/fork.c87
-rw-r--r--kernel/futex.c4
-rw-r--r--kernel/hung_task.c3
-rw-r--r--kernel/irq/manage.c4
-rw-r--r--kernel/jump_label.c153
-rw-r--r--kernel/kmod.c2
-rw-r--r--kernel/ksysfs.c2
-rw-r--r--kernel/kthread.c2
-rw-r--r--kernel/latencytop.c2
-rw-r--r--kernel/locking/lockdep.c2
-rw-r--r--kernel/locking/locktorture.c2
-rw-r--r--kernel/locking/mutex.c4
-rw-r--r--kernel/locking/qspinlock_stat.h1
-rw-r--r--kernel/locking/rtmutex-debug.c1
-rw-r--r--kernel/locking/rtmutex.c4
-rw-r--r--kernel/locking/rtmutex_common.h1
-rw-r--r--kernel/locking/rwsem-spinlock.c3
-rw-r--r--kernel/locking/rwsem-xadd.c4
-rw-r--r--kernel/locking/rwsem.c1
-rw-r--r--kernel/locking/semaphore.c1
-rw-r--r--kernel/memremap.c6
-rw-r--r--kernel/notifier.c2
-rw-r--r--kernel/panic.c4
-rw-r--r--kernel/pid.c1
-rw-r--r--kernel/pid_namespace.c3
-rw-r--r--kernel/power/hibernate.c94
-rw-r--r--kernel/power/process.c2
-rw-r--r--kernel/power/snapshot.c1
-rw-r--r--kernel/printk/printk.c3
-rw-r--r--kernel/profile.c2
-rw-r--r--kernel/ptrace.c3
-rw-r--r--kernel/rcu/rcuperf.c1
-rw-r--r--kernel/rcu/rcutorture.c3
-rw-r--r--kernel/rcu/srcu.c2
-rw-r--r--kernel/rcu/tiny.c14
-rw-r--r--kernel/rcu/tree.c4
-rw-r--r--kernel/rcu/tree.h1
-rw-r--r--kernel/rcu/tree_plugin.h2
-rw-r--r--kernel/rcu/update.c4
-rw-r--r--kernel/relay.c6
-rw-r--r--kernel/sched/autogroup.h1
-rw-r--r--kernel/sched/clock.c2
-rw-r--r--kernel/sched/completion.c3
-rw-r--r--kernel/sched/core.c72
-rw-r--r--kernel/sched/cpudeadline.c4
-rw-r--r--kernel/sched/cpufreq_schedutil.c1
-rw-r--r--kernel/sched/cpupri.c4
-rw-r--r--kernel/sched/cputime.c6
-rw-r--r--kernel/sched/deadline.c35
-rw-r--r--kernel/sched/debug.c3
-rw-r--r--kernel/sched/fair.c29
-rw-r--r--kernel/sched/idle.c1
-rw-r--r--kernel/sched/loadavg.c1
-rw-r--r--kernel/sched/rt.c29
-rw-r--r--kernel/sched/sched.h24
-rw-r--r--kernel/sched/stats.h111
-rw-r--r--kernel/sched/swait.c2
-rw-r--r--kernel/sched/wait.c3
-rw-r--r--kernel/seccomp.c1
-rw-r--r--kernel/signal.c24
-rw-r--r--kernel/smp.c1
-rw-r--r--kernel/smpboot.c1
-rw-r--r--kernel/sys.c29
-rw-r--r--kernel/sysctl.c1
-rw-r--r--kernel/time/alarmtimer.c2
-rw-r--r--kernel/time/hrtimer.c4
-rw-r--r--kernel/time/itimer.c2
-rw-r--r--kernel/time/posix-cpu-timers.c3
-rw-r--r--kernel/time/posix-timers.c1
-rw-r--r--kernel/time/sched_clock.c1
-rw-r--r--kernel/time/tick-sched.c6
-rw-r--r--kernel/time/timekeeping.c2
-rw-r--r--kernel/time/timer.c4
-rw-r--r--kernel/torture.c3
-rw-r--r--kernel/trace/ftrace.c373
-rw-r--r--kernel/trace/ring_buffer.c1
-rw-r--r--kernel/trace/ring_buffer_benchmark.c1
-rw-r--r--kernel/trace/trace.c17
-rw-r--r--kernel/trace/trace.h83
-rw-r--r--kernel/trace/trace_benchmark.c4
-rw-r--r--kernel/trace/trace_branch.c83
-rw-r--r--kernel/trace/trace_clock.c1
-rw-r--r--kernel/trace/trace_entries.h6
-rw-r--r--kernel/trace/trace_events_hist.c1
-rw-r--r--kernel/trace/trace_events_trigger.c1
-rw-r--r--kernel/trace/trace_hwlat.c40
-rw-r--r--kernel/trace/trace_kprobe.c2
-rw-r--r--kernel/trace/trace_output.c40
-rw-r--r--kernel/trace/trace_probe.c50
-rw-r--r--kernel/trace/trace_selftest.c1
-rw-r--r--kernel/trace/trace_stack.c1
-rw-r--r--kernel/trace/trace_uprobe.c5
-rw-r--r--kernel/tracepoint.c3
-rw-r--r--kernel/tsacct.c4
-rw-r--r--kernel/ucount.c7
-rw-r--r--kernel/uid16.c1
-rw-r--r--kernel/user.c1
-rw-r--r--kernel/user_namespace.c1
-rw-r--r--kernel/utsname.c2
-rw-r--r--kernel/utsname_sysctl.c1
-rw-r--r--kernel/watchdog.c3
-rw-r--r--kernel/watchdog_hld.c27
128 files changed, 3778 insertions, 2430 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 12c679f769c6..b302b4731d16 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -64,10 +64,7 @@ obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
-obj-$(CONFIG_CGROUPS) += cgroup.o
-obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
-obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
-obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CGROUPS) += cgroup/
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
diff --git a/kernel/acct.c b/kernel/acct.c
index ca9cb55b5855..5b1284370367 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -56,6 +56,8 @@
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/uaccess.h>
+#include <linux/sched/cputime.h>
+
#include <asm/div64.h>
#include <linux/blkdev.h> /* sector_div */
#include <linux/pid_namespace.h>
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 461eb1e66a0f..7af0dcc5d755 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -13,6 +13,7 @@
#include <linux/bpf_trace.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
+#include <linux/sched/signal.h>
#include <linux/vmalloc.h>
#include <linux/mmzone.h>
#include <linux/anon_inodes.h>
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d2bded2b250c..3fc6e39b223e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2776,7 +2776,7 @@ static int do_check(struct bpf_verifier_env *env)
class = BPF_CLASS(insn->code);
if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
- verbose("BPF program is too large. Proccessed %d insn\n",
+ verbose("BPF program is too large. Processed %d insn\n",
insn_processed);
return -E2BIG;
}
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
new file mode 100644
index 000000000000..387348a40c64
--- /dev/null
+++ b/kernel/cgroup/Makefile
@@ -0,0 +1,6 @@
+obj-y := cgroup.o namespace.o cgroup-v1.o
+
+obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
+obj-$(CONFIG_CGROUP_PIDS) += pids.o
+obj-$(CONFIG_CGROUP_RDMA) += rdma.o
+obj-$(CONFIG_CPUSETS) += cpuset.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
new file mode 100644
index 000000000000..9203bfb05603
--- /dev/null
+++ b/kernel/cgroup/cgroup-internal.h
@@ -0,0 +1,214 @@
+#ifndef __CGROUP_INTERNAL_H
+#define __CGROUP_INTERNAL_H
+
+#include <linux/cgroup.h>
+#include <linux/kernfs.h>
+#include <linux/workqueue.h>
+#include <linux/list.h>
+
+/*
+ * A cgroup can be associated with multiple css_sets as different tasks may
+ * belong to different cgroups on different hierarchies. In the other
+ * direction, a css_set is naturally associated with multiple cgroups.
+ * This M:N relationship is represented by the following link structure
+ * which exists for each association and allows traversing the associations
+ * from both sides.
+ */
+struct cgrp_cset_link {
+ /* the cgroup and css_set this link associates */
+ struct cgroup *cgrp;
+ struct css_set *cset;
+
+ /* list of cgrp_cset_links anchored at cgrp->cset_links */
+ struct list_head cset_link;
+
+ /* list of cgrp_cset_links anchored at css_set->cgrp_links */
+ struct list_head cgrp_link;
+};
+
+/* used to track tasks and csets during migration */
+struct cgroup_taskset {
+ /* the src and dst cset list running through cset->mg_node */
+ struct list_head src_csets;
+ struct list_head dst_csets;
+
+ /* the subsys currently being processed */
+ int ssid;
+
+ /*
+ * Fields for cgroup_taskset_*() iteration.
+ *
+ * Before migration is committed, the target migration tasks are on
+ * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
+ * the csets on ->dst_csets. ->csets point to either ->src_csets
+ * or ->dst_csets depending on whether migration is committed.
+ *
+ * ->cur_csets and ->cur_task point to the current task position
+ * during iteration.
+ */
+ struct list_head *csets;
+ struct css_set *cur_cset;
+ struct task_struct *cur_task;
+};
+
+/* migration context also tracks preloading */
+struct cgroup_mgctx {
+ /*
+ * Preloaded source and destination csets. Used to guarantee
+ * atomic success or failure on actual migration.
+ */
+ struct list_head preloaded_src_csets;
+ struct list_head preloaded_dst_csets;
+
+ /* tasks and csets to migrate */
+ struct cgroup_taskset tset;
+
+ /* subsystems affected by migration */
+ u16 ss_mask;
+};
+
+#define CGROUP_TASKSET_INIT(tset) \
+{ \
+ .src_csets = LIST_HEAD_INIT(tset.src_csets), \
+ .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \
+ .csets = &tset.src_csets, \
+}
+
+#define CGROUP_MGCTX_INIT(name) \
+{ \
+ LIST_HEAD_INIT(name.preloaded_src_csets), \
+ LIST_HEAD_INIT(name.preloaded_dst_csets), \
+ CGROUP_TASKSET_INIT(name.tset), \
+}
+
+#define DEFINE_CGROUP_MGCTX(name) \
+ struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
+
+struct cgroup_sb_opts {
+ u16 subsys_mask;
+ unsigned int flags;
+ char *release_agent;
+ bool cpuset_clone_children;
+ char *name;
+ /* User explicitly requested empty subsystem */
+ bool none;
+};
+
+extern struct mutex cgroup_mutex;
+extern spinlock_t css_set_lock;
+extern struct cgroup_subsys *cgroup_subsys[];
+extern struct list_head cgroup_roots;
+extern struct file_system_type cgroup_fs_type;
+
+/* iterate across the hierarchies */
+#define for_each_root(root) \
+ list_for_each_entry((root), &cgroup_roots, root_list)
+
+/**
+ * for_each_subsys - iterate all enabled cgroup subsystems
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ */
+#define for_each_subsys(ss, ssid) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
+ (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
+
+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
+{
+ return !(cgrp->self.flags & CSS_ONLINE);
+}
+
+static inline bool notify_on_release(const struct cgroup *cgrp)
+{
+ return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+}
+
+void put_css_set_locked(struct css_set *cset);
+
+static inline void put_css_set(struct css_set *cset)
+{
+ unsigned long flags;
+
+ /*
+ * Ensure that the refcount doesn't hit zero while any readers
+ * can see it. Similar to atomic_dec_and_lock(), but for an
+ * rwlock
+ */
+ if (atomic_add_unless(&cset->refcount, -1, 1))
+ return;
+
+ spin_lock_irqsave(&css_set_lock, flags);
+ put_css_set_locked(cset);
+ spin_unlock_irqrestore(&css_set_lock, flags);
+}
+
+/*
+ * refcounted get/put for css_set objects
+ */
+static inline void get_css_set(struct css_set *cset)
+{
+ atomic_inc(&cset->refcount);
+}
+
+bool cgroup_ssid_enabled(int ssid);
+bool cgroup_on_dfl(const struct cgroup *cgrp);
+
+struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
+struct cgroup *task_cgroup_from_root(struct task_struct *task,
+ struct cgroup_root *root);
+struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline);
+void cgroup_kn_unlock(struct kernfs_node *kn);
+int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
+ struct cgroup_namespace *ns);
+
+void cgroup_free_root(struct cgroup_root *root);
+void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts);
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
+int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
+struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
+ struct cgroup_root *root, unsigned long magic,
+ struct cgroup_namespace *ns);
+
+bool cgroup_may_migrate_to(struct cgroup *dst_cgrp);
+void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
+void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp,
+ struct cgroup_mgctx *mgctx);
+int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx);
+int cgroup_migrate(struct task_struct *leader, bool threadgroup,
+ struct cgroup_mgctx *mgctx);
+
+int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
+ bool threadgroup);
+ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off, bool threadgroup);
+ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
+ loff_t off);
+
+void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
+
+int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode);
+int cgroup_rmdir(struct kernfs_node *kn);
+int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
+ struct kernfs_root *kf_root);
+
+/*
+ * namespace.c
+ */
+extern const struct proc_ns_operations cgroupns_operations;
+
+/*
+ * cgroup-v1.c
+ */
+extern struct cftype cgroup1_base_files[];
+extern const struct file_operations proc_cgroupstats_operations;
+extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
+
+bool cgroup1_ssid_disabled(int ssid);
+void cgroup1_pidlist_destroy_all(struct cgroup *cgrp);
+void cgroup1_release_agent(struct work_struct *work);
+void cgroup1_check_for_release(struct cgroup *cgrp);
+struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
+ void *data, unsigned long magic,
+ struct cgroup_namespace *ns);
+
+#endif /* __CGROUP_INTERNAL_H */
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
new file mode 100644
index 000000000000..56eba9caa632
--- /dev/null
+++ b/kernel/cgroup/cgroup-v1.c
@@ -0,0 +1,1398 @@
+#include "cgroup-internal.h"
+
+#include <linux/ctype.h>
+#include <linux/kmod.h>
+#include <linux/sort.h>
+#include <linux/delay.h>
+#include <linux/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/delayacct.h>
+#include <linux/pid_namespace.h>
+#include <linux/cgroupstats.h>
+
+#include <trace/events/cgroup.h>
+
+/*
+ * pidlists linger the following amount before being destroyed. The goal
+ * is avoiding frequent destruction in the middle of consecutive read calls
+ * Expiring in the middle is a performance problem not a correctness one.
+ * 1 sec should be enough.
+ */
+#define CGROUP_PIDLIST_DESTROY_DELAY HZ
+
+/* Controllers blocked by the commandline in v1 */
+static u16 cgroup_no_v1_mask;
+
+/*
+ * pidlist destructions need to be flushed on cgroup destruction. Use a
+ * separate workqueue as flush domain.
+ */
+static struct workqueue_struct *cgroup_pidlist_destroy_wq;
+
+/*
+ * Protects cgroup_subsys->release_agent_path. Modifying it also requires
+ * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
+ */
+static DEFINE_SPINLOCK(release_agent_path_lock);
+
+bool cgroup1_ssid_disabled(int ssid)
+{
+ return cgroup_no_v1_mask & (1 << ssid);
+}
+
+/**
+ * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
+ * @from: attach to all cgroups of a given task
+ * @tsk: the task to be attached
+ */
+int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
+{
+ struct cgroup_root *root;
+ int retval = 0;
+
+ mutex_lock(&cgroup_mutex);
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+ for_each_root(root) {
+ struct cgroup *from_cgrp;
+
+ if (root == &cgrp_dfl_root)
+ continue;
+
+ spin_lock_irq(&css_set_lock);
+ from_cgrp = task_cgroup_from_root(from, root);
+ spin_unlock_irq(&css_set_lock);
+
+ retval = cgroup_attach_task(from_cgrp, tsk, false);
+ if (retval)
+ break;
+ }
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+ mutex_unlock(&cgroup_mutex);
+
+ return retval;
+}
+EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
+
+/**
+ * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
+ * @to: cgroup to which the tasks will be moved
+ * @from: cgroup in which the tasks currently reside
+ *
+ * Locking rules between cgroup_post_fork() and the migration path
+ * guarantee that, if a task is forking while being migrated, the new child
+ * is guaranteed to be either visible in the source cgroup after the
+ * parent's migration is complete or put into the target cgroup. No task
+ * can slip out of migration through forking.
+ */
+int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
+{
+ DEFINE_CGROUP_MGCTX(mgctx);
+ struct cgrp_cset_link *link;
+ struct css_task_iter it;
+ struct task_struct *task;
+ int ret;
+
+ if (cgroup_on_dfl(to))
+ return -EINVAL;
+
+ if (!cgroup_may_migrate_to(to))
+ return -EBUSY;
+
+ mutex_lock(&cgroup_mutex);
+
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+
+ /* all tasks in @from are being moved, all csets are source */
+ spin_lock_irq(&css_set_lock);
+ list_for_each_entry(link, &from->cset_links, cset_link)
+ cgroup_migrate_add_src(link->cset, to, &mgctx);
+ spin_unlock_irq(&css_set_lock);
+
+ ret = cgroup_migrate_prepare_dst(&mgctx);
+ if (ret)
+ goto out_err;
+
+ /*
+ * Migrate tasks one-by-one until @from is empty. This fails iff
+ * ->can_attach() fails.
+ */
+ do {
+ css_task_iter_start(&from->self, &it);
+ task = css_task_iter_next(&it);
+ if (task)
+ get_task_struct(task);
+ css_task_iter_end(&it);
+
+ if (task) {
+ ret = cgroup_migrate(task, false, &mgctx);
+ if (!ret)
+ trace_cgroup_transfer_tasks(to, task, false);
+ put_task_struct(task);
+ }
+ } while (task && !ret);
+out_err:
+ cgroup_migrate_finish(&mgctx);
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+
+/*
+ * Stuff for reading the 'tasks'/'procs' files.
+ *
+ * Reading this file can return large amounts of data if a cgroup has
+ * *lots* of attached tasks. So it may need several calls to read(),
+ * but we cannot guarantee that the information we produce is correct
+ * unless we produce it entirely atomically.
+ *
+ */
+
+/* which pidlist file are we talking about? */
+enum cgroup_filetype {
+ CGROUP_FILE_PROCS,
+ CGROUP_FILE_TASKS,
+};
+
+/*
+ * A pidlist is a list of pids that virtually represents the contents of one
+ * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
+ * a pair (one each for procs, tasks) for each pid namespace that's relevant
+ * to the cgroup.
+ */
+struct cgroup_pidlist {
+ /*
+ * used to find which pidlist is wanted. doesn't change as long as
+ * this particular list stays in the list.
+ */
+ struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
+ /* array of xids */
+ pid_t *list;
+ /* how many elements the above list has */
+ int length;
+ /* each of these stored in a list by its cgroup */
+ struct list_head links;
+ /* pointer to the cgroup we belong to, for list removal purposes */
+ struct cgroup *owner;
+ /* for delayed destruction */
+ struct delayed_work destroy_dwork;
+};
+
+/*
+ * The following two functions "fix" the issue where there are more pids
+ * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
+ * TODO: replace with a kernel-wide solution to this problem
+ */
+#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
+static void *pidlist_allocate(int count)
+{
+ if (PIDLIST_TOO_LARGE(count))
+ return vmalloc(count * sizeof(pid_t));
+ else
+ return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
+}
+
+static void pidlist_free(void *p)
+{
+ kvfree(p);
+}
+
+/*
+ * Used to destroy all pidlists lingering waiting for destroy timer. None
+ * should be left afterwards.
+ */
+void cgroup1_pidlist_destroy_all(struct cgroup *cgrp)
+{
+ struct cgroup_pidlist *l, *tmp_l;
+
+ mutex_lock(&cgrp->pidlist_mutex);
+ list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
+ mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
+ mutex_unlock(&cgrp->pidlist_mutex);
+
+ flush_workqueue(cgroup_pidlist_destroy_wq);
+ BUG_ON(!list_empty(&cgrp->pidlists));
+}
+
+static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
+ destroy_dwork);
+ struct cgroup_pidlist *tofree = NULL;
+
+ mutex_lock(&l->owner->pidlist_mutex);
+
+ /*
+ * Destroy iff we didn't get queued again. The state won't change
+ * as destroy_dwork can only be queued while locked.
+ */
+ if (!delayed_work_pending(dwork)) {
+ list_del(&l->links);
+ pidlist_free(l->list);
+ put_pid_ns(l->key.ns);
+ tofree = l;
+ }
+
+ mutex_unlock(&l->owner->pidlist_mutex);
+ kfree(tofree);
+}
+
+/*
+ * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
+ * Returns the number of unique elements.
+ */
+static int pidlist_uniq(pid_t *list, int length)
+{
+ int src, dest = 1;
+
+ /*
+ * we presume the 0th element is unique, so i starts at 1. trivial
+ * edge cases first; no work needs to be done for either
+ */
+ if (length == 0 || length == 1)
+ return length;
+ /* src and dest walk down the list; dest counts unique elements */
+ for (src = 1; src < length; src++) {
+ /* find next unique element */
+ while (list[src] == list[src-1]) {
+ src++;
+ if (src == length)
+ goto after;
+ }
+ /* dest always points to where the next unique element goes */
+ list[dest] = list[src];
+ dest++;
+ }
+after:
+ return dest;
+}
+
+/*
+ * The two pid files - task and cgroup.procs - guaranteed that the result
+ * is sorted, which forced this whole pidlist fiasco. As pid order is
+ * different per namespace, each namespace needs differently sorted list,
+ * making it impossible to use, for example, single rbtree of member tasks
+ * sorted by task pointer. As pidlists can be fairly large, allocating one
+ * per open file is dangerous, so cgroup had to implement shared pool of
+ * pidlists keyed by cgroup and namespace.
+ */
+static int cmppid(const void *a, const void *b)
+{
+ return *(pid_t *)a - *(pid_t *)b;
+}
+
+static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
+ enum cgroup_filetype type)
+{
+ struct cgroup_pidlist *l;
+ /* don't need task_nsproxy() if we're looking at ourself */
+ struct pid_namespace *ns = task_active_pid_ns(current);
+
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
+ list_for_each_entry(l, &cgrp->pidlists, links)
+ if (l->key.type == type && l->key.ns == ns)
+ return l;
+ return NULL;
+}
+
+/*
+ * find the appropriate pidlist for our purpose (given procs vs tasks)
+ * returns with the lock on that pidlist already held, and takes care
+ * of the use count, or returns NULL with no locks held if we're out of
+ * memory.
+ */
+static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
+ enum cgroup_filetype type)
+{
+ struct cgroup_pidlist *l;
+
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
+ l = cgroup_pidlist_find(cgrp, type);
+ if (l)
+ return l;
+
+ /* entry not found; create a new one */
+ l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
+ if (!l)
+ return l;
+
+ INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
+ l->key.type = type;
+ /* don't need task_nsproxy() if we're looking at ourself */
+ l->key.ns = get_pid_ns(task_active_pid_ns(current));
+ l->owner = cgrp;
+ list_add(&l->links, &cgrp->pidlists);
+ return l;
+}
+
+/**
+ * cgroup_task_count - count the number of tasks in a cgroup.
+ * @cgrp: the cgroup in question
+ *
+ * Return the number of tasks in the cgroup. The returned number can be
+ * higher than the actual number of tasks due to css_set references from
+ * namespace roots and temporary usages.
+ */
+static int cgroup_task_count(const struct cgroup *cgrp)
+{
+ int count = 0;
+ struct cgrp_cset_link *link;
+
+ spin_lock_irq(&css_set_lock);
+ list_for_each_entry(link, &cgrp->cset_links, cset_link)
+ count += atomic_read(&link->cset->refcount);
+ spin_unlock_irq(&css_set_lock);
+ return count;
+}
+
+/*
+ * Load a cgroup's pidarray with either procs' tgids or tasks' pids
+ */
+static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
+ struct cgroup_pidlist **lp)
+{
+ pid_t *array;
+ int length;
+ int pid, n = 0; /* used for populating the array */
+ struct css_task_iter it;
+ struct task_struct *tsk;
+ struct cgroup_pidlist *l;
+
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
+ /*
+ * If cgroup gets more users after we read count, we won't have
+ * enough space - tough. This race is indistinguishable to the
+ * caller from the case that the additional cgroup users didn't
+ * show up until sometime later on.
+ */
+ length = cgroup_task_count(cgrp);
+ array = pidlist_allocate(length);
+ if (!array)
+ return -ENOMEM;
+ /* now, populate the array */
+ css_task_iter_start(&cgrp->self, &it);
+ while ((tsk = css_task_iter_next(&it))) {
+ if (unlikely(n == length))
+ break;
+ /* get tgid or pid for procs or tasks file respectively */
+ if (type == CGROUP_FILE_PROCS)
+ pid = task_tgid_vnr(tsk);
+ else
+ pid = task_pid_vnr(tsk);
+ if (pid > 0) /* make sure to only use valid results */
+ array[n++] = pid;
+ }
+ css_task_iter_end(&it);
+ length = n;
+ /* now sort & (if procs) strip out duplicates */
+ sort(array, length, sizeof(pid_t), cmppid, NULL);
+ if (type == CGROUP_FILE_PROCS)
+ length = pidlist_uniq(array, length);
+
+ l = cgroup_pidlist_find_create(cgrp, type);
+ if (!l) {
+ pidlist_free(array);
+ return -ENOMEM;
+ }
+
+ /* store array, freeing old if necessary */
+ pidlist_free(l->list);
+ l->list = array;
+ l->length = length;
+ *lp = l;
+ return 0;
+}
+
+/*
+ * seq_file methods for the tasks/procs files. The seq_file position is the
+ * next pid to display; the seq_file iterator is a pointer to the pid
+ * in the cgroup->l->list array.
+ */
+
+static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
+{
+ /*
+ * Initially we receive a position value that corresponds to
+ * one more than the last pid shown (or 0 on the first call or
+ * after a seek to the start). Use a binary-search to find the
+ * next pid to display, if any
+ */
+ struct kernfs_open_file *of = s->private;
+ struct cgroup *cgrp = seq_css(s)->cgroup;
+ struct cgroup_pidlist *l;
+ enum cgroup_filetype type = seq_cft(s)->private;
+ int index = 0, pid = *pos;
+ int *iter, ret;
+
+ mutex_lock(&cgrp->pidlist_mutex);
+
+ /*
+ * !NULL @of->priv indicates that this isn't the first start()
+ * after open. If the matching pidlist is around, we can use that.
+ * Look for it. Note that @of->priv can't be used directly. It
+ * could already have been destroyed.
+ */
+ if (of->priv)
+ of->priv = cgroup_pidlist_find(cgrp, type);
+
+ /*
+ * Either this is the first start() after open or the matching
+ * pidlist has been destroyed inbetween. Create a new one.
+ */
+ if (!of->priv) {
+ ret = pidlist_array_load(cgrp, type,
+ (struct cgroup_pidlist **)&of->priv);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+ l = of->priv;
+
+ if (pid) {
+ int end = l->length;
+
+ while (index < end) {
+ int mid = (index + end) / 2;
+ if (l->list[mid] == pid) {
+ index = mid;
+ break;
+ } else if (l->list[mid] <= pid)
+ index = mid + 1;
+ else
+ end = mid;
+ }
+ }
+ /* If we're off the end of the array, we're done */
+ if (index >= l->length)
+ return NULL;
+ /* Update the abstract position to be the actual pid that we found */
+ iter = l->list + index;
+ *pos = *iter;
+ return iter;
+}
+
+static void cgroup_pidlist_stop(struct seq_file *s, void *v)
+{
+ struct kernfs_open_file *of = s->private;
+ struct cgroup_pidlist *l = of->priv;
+
+ if (l)
+ mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
+ CGROUP_PIDLIST_DESTROY_DELAY);
+ mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
+}
+
+static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct kernfs_open_file *of = s->private;
+ struct cgroup_pidlist *l = of->priv;
+ pid_t *p = v;
+ pid_t *end = l->list + l->length;
+ /*
+ * Advance to the next pid in the array. If this goes off the
+ * end, we're done
+ */
+ p++;
+ if (p >= end) {
+ return NULL;
+ } else {
+ *pos = *p;
+ return p;
+ }
+}
+
+static int cgroup_pidlist_show(struct seq_file *s, void *v)
+{
+ seq_printf(s, "%d\n", *(int *)v);
+
+ return 0;
+}
+
+static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup_procs_write(of, buf, nbytes, off, false);
+}
+
+static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+
+ BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENODEV;
+ spin_lock(&release_agent_path_lock);
+ strlcpy(cgrp->root->release_agent_path, strstrip(buf),
+ sizeof(cgrp->root->release_agent_path));
+ spin_unlock(&release_agent_path_lock);
+ cgroup_kn_unlock(of->kn);
+ return nbytes;
+}
+
+static int cgroup_release_agent_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ spin_lock(&release_agent_path_lock);
+ seq_puts(seq, cgrp->root->release_agent_path);
+ spin_unlock(&release_agent_path_lock);
+ seq_putc(seq, '\n');
+ return 0;
+}
+
+static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
+{
+ seq_puts(seq, "0\n");
+ return 0;
+}
+
+static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return notify_on_release(css->cgroup);
+}
+
+static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
+{
+ if (val)
+ set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
+ else
+ clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
+ return 0;
+}
+
+static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+}
+
+static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
+{
+ if (val)
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+ else
+ clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+ return 0;
+}
+
+/* cgroup core interface files for the legacy hierarchies */
+struct cftype cgroup1_base_files[] = {
+ {
+ .name = "cgroup.procs",
+ .seq_start = cgroup_pidlist_start,
+ .seq_next = cgroup_pidlist_next,
+ .seq_stop = cgroup_pidlist_stop,
+ .seq_show = cgroup_pidlist_show,
+ .private = CGROUP_FILE_PROCS,
+ .write = cgroup_procs_write,
+ },
+ {
+ .name = "cgroup.clone_children",
+ .read_u64 = cgroup_clone_children_read,
+ .write_u64 = cgroup_clone_children_write,
+ },
+ {
+ .name = "cgroup.sane_behavior",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_sane_behavior_show,
+ },
+ {
+ .name = "tasks",
+ .seq_start = cgroup_pidlist_start,
+ .seq_next = cgroup_pidlist_next,
+ .seq_stop = cgroup_pidlist_stop,
+ .seq_show = cgroup_pidlist_show,
+ .private = CGROUP_FILE_TASKS,
+ .write = cgroup_tasks_write,
+ },
+ {
+ .name = "notify_on_release",
+ .read_u64 = cgroup_read_notify_on_release,
+ .write_u64 = cgroup_write_notify_on_release,
+ },
+ {
+ .name = "release_agent",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_release_agent_show,
+ .write = cgroup_release_agent_write,
+ .max_write_len = PATH_MAX - 1,
+ },
+ { } /* terminate */
+};
+
+/* Display information about each subsystem and each hierarchy */
+static int proc_cgroupstats_show(struct seq_file *m, void *v)
+{
+ struct cgroup_subsys *ss;
+ int i;
+
+ seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
+ /*
+ * ideally we don't want subsystems moving around while we do this.
+ * cgroup_mutex is also necessary to guarantee an atomic snapshot of
+ * subsys/hierarchy state.
+ */
+ mutex_lock(&cgroup_mutex);
+
+ for_each_subsys(ss, i)
+ seq_printf(m, "%s\t%d\t%d\t%d\n",
+ ss->legacy_name, ss->root->hierarchy_id,
+ atomic_read(&ss->root->nr_cgrps),
+ cgroup_ssid_enabled(i));
+
+ mutex_unlock(&cgroup_mutex);
+ return 0;
+}
+
+static int cgroupstats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, proc_cgroupstats_show, NULL);
+}
+
+const struct file_operations proc_cgroupstats_operations = {
+ .open = cgroupstats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/**
+ * cgroupstats_build - build and fill cgroupstats
+ * @stats: cgroupstats to fill information into
+ * @dentry: A dentry entry belonging to the cgroup for which stats have
+ * been requested.
+ *
+ * Build and fill cgroupstats so that taskstats can export it to user
+ * space.
+ */
+int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
+{
+ struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
+ struct cgroup *cgrp;
+ struct css_task_iter it;
+ struct task_struct *tsk;
+
+ /* it should be kernfs_node belonging to cgroupfs and is a directory */
+ if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
+ kernfs_type(kn) != KERNFS_DIR)
+ return -EINVAL;
+
+ mutex_lock(&cgroup_mutex);
+
+ /*
+ * We aren't being called from kernfs and there's no guarantee on
+ * @kn->priv's validity. For this and css_tryget_online_from_dir(),
+ * @kn->priv is RCU safe. Let's do the RCU dancing.
+ */
+ rcu_read_lock();
+ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
+ if (!cgrp || cgroup_is_dead(cgrp)) {
+ rcu_read_unlock();
+ mutex_unlock(&cgroup_mutex);
+ return -ENOENT;
+ }
+ rcu_read_unlock();
+
+ css_task_iter_start(&cgrp->self, &it);
+ while ((tsk = css_task_iter_next(&it))) {
+ switch (tsk->state) {
+ case TASK_RUNNING:
+ stats->nr_running++;
+ break;
+ case TASK_INTERRUPTIBLE:
+ stats->nr_sleeping++;
+ break;
+ case TASK_UNINTERRUPTIBLE:
+ stats->nr_uninterruptible++;
+ break;
+ case TASK_STOPPED:
+ stats->nr_stopped++;
+ break;
+ default:
+ if (delayacct_is_task_waiting_on_io(tsk))
+ stats->nr_io_wait++;
+ break;
+ }
+ }
+ css_task_iter_end(&it);
+
+ mutex_unlock(&cgroup_mutex);
+ return 0;
+}
+
+void cgroup1_check_for_release(struct cgroup *cgrp)
+{
+ if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
+ !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
+ schedule_work(&cgrp->release_agent_work);
+}
+
+/*
+ * Notify userspace when a cgroup is released, by running the
+ * configured release agent with the name of the cgroup (path
+ * relative to the root of cgroup file system) as the argument.
+ *
+ * Most likely, this user command will try to rmdir this cgroup.
+ *
+ * This races with the possibility that some other task will be
+ * attached to this cgroup before it is removed, or that some other
+ * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
+ * The presumed 'rmdir' will fail quietly if this cgroup is no longer
+ * unused, and this cgroup will be reprieved from its death sentence,
+ * to continue to serve a useful existence. Next time it's released,
+ * we will get notified again, if it still has 'notify_on_release' set.
+ *
+ * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
+ * means only wait until the task is successfully execve()'d. The
+ * separate release agent task is forked by call_usermodehelper(),
+ * then control in this thread returns here, without waiting for the
+ * release agent task. We don't bother to wait because the caller of
+ * this routine has no use for the exit status of the release agent
+ * task, so no sense holding our caller up for that.
+ */
+void cgroup1_release_agent(struct work_struct *work)
+{
+ struct cgroup *cgrp =
+ container_of(work, struct cgroup, release_agent_work);
+ char *pathbuf = NULL, *agentbuf = NULL;
+ char *argv[3], *envp[3];
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+
+ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
+ agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
+ if (!pathbuf || !agentbuf)
+ goto out;
+
+ spin_lock_irq(&css_set_lock);
+ ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
+ spin_unlock_irq(&css_set_lock);
+ if (ret < 0 || ret >= PATH_MAX)
+ goto out;
+
+ argv[0] = agentbuf;
+ argv[1] = pathbuf;
+ argv[2] = NULL;
+
+ /* minimal command environment */
+ envp[0] = "HOME=/";
+ envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+ envp[2] = NULL;
+
+ mutex_unlock(&cgroup_mutex);
+ call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+ goto out_free;
+out:
+ mutex_unlock(&cgroup_mutex);
+out_free:
+ kfree(agentbuf);
+ kfree(pathbuf);
+}
+
+/*
+ * cgroup_rename - Only allow simple rename of directories in place.
+ */
+static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
+ const char *new_name_str)
+{
+ struct cgroup *cgrp = kn->priv;
+ int ret;
+
+ if (kernfs_type(kn) != KERNFS_DIR)
+ return -ENOTDIR;
+ if (kn->parent != new_parent)
+ return -EIO;
+
+ /*
+ * We're gonna grab cgroup_mutex which nests outside kernfs
+ * active_ref. kernfs_rename() doesn't require active_ref
+ * protection. Break them before grabbing cgroup_mutex.
+ */
+ kernfs_break_active_protection(new_parent);
+ kernfs_break_active_protection(kn);
+
+ mutex_lock(&cgroup_mutex);
+
+ ret = kernfs_rename(kn, new_parent, new_name_str);
+ if (!ret)
+ trace_cgroup_rename(cgrp);
+
+ mutex_unlock(&cgroup_mutex);
+
+ kernfs_unbreak_active_protection(kn);
+ kernfs_unbreak_active_protection(new_parent);
+ return ret;
+}
+
+static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
+{
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ for_each_subsys(ss, ssid)
+ if (root->subsys_mask & (1 << ssid))
+ seq_show_option(seq, ss->legacy_name, NULL);
+ if (root->flags & CGRP_ROOT_NOPREFIX)
+ seq_puts(seq, ",noprefix");
+ if (root->flags & CGRP_ROOT_XATTR)
+ seq_puts(seq, ",xattr");
+
+ spin_lock(&release_agent_path_lock);
+ if (strlen(root->release_agent_path))
+ seq_show_option(seq, "release_agent",
+ root->release_agent_path);
+ spin_unlock(&release_agent_path_lock);
+
+ if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
+ seq_puts(seq, ",clone_children");
+ if (strlen(root->name))
+ seq_show_option(seq, "name", root->name);
+ return 0;
+}
+
+static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
+{
+ char *token, *o = data;
+ bool all_ss = false, one_ss = false;
+ u16 mask = U16_MAX;
+ struct cgroup_subsys *ss;
+ int nr_opts = 0;
+ int i;
+
+#ifdef CONFIG_CPUSETS
+ mask = ~((u16)1 << cpuset_cgrp_id);
+#endif
+
+ memset(opts, 0, sizeof(*opts));
+
+ while ((token = strsep(&o, ",")) != NULL) {
+ nr_opts++;
+
+ if (!*token)
+ return -EINVAL;
+ if (!strcmp(token, "none")) {
+ /* Explicitly have no subsystems */
+ opts->none = true;
+ continue;
+ }
+ if (!strcmp(token, "all")) {
+ /* Mutually exclusive option 'all' + subsystem name */
+ if (one_ss)
+ return -EINVAL;
+ all_ss = true;
+ continue;
+ }
+ if (!strcmp(token, "noprefix")) {
+ opts->flags |= CGRP_ROOT_NOPREFIX;
+ continue;
+ }
+ if (!strcmp(token, "clone_children")) {
+ opts->cpuset_clone_children = true;
+ continue;
+ }
+ if (!strcmp(token, "xattr")) {
+ opts->flags |= CGRP_ROOT_XATTR;
+ continue;
+ }
+ if (!strncmp(token, "release_agent=", 14)) {
+ /* Specifying two release agents is forbidden */
+ if (opts->release_agent)
+ return -EINVAL;
+ opts->release_agent =
+ kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
+ if (!opts->release_agent)
+ return -ENOMEM;
+ continue;
+ }
+ if (!strncmp(token, "name=", 5)) {
+ const char *name = token + 5;
+ /* Can't specify an empty name */
+ if (!strlen(name))
+ return -EINVAL;
+ /* Must match [\w.-]+ */
+ for (i = 0; i < strlen(name); i++) {
+ char c = name[i];
+ if (isalnum(c))
+ continue;
+ if ((c == '.') || (c == '-') || (c == '_'))
+ continue;
+ return -EINVAL;
+ }
+ /* Specifying two names is forbidden */
+ if (opts->name)
+ return -EINVAL;
+ opts->name = kstrndup(name,
+ MAX_CGROUP_ROOT_NAMELEN - 1,
+ GFP_KERNEL);
+ if (!opts->name)
+ return -ENOMEM;
+
+ continue;
+ }
+
+ for_each_subsys(ss, i) {
+ if (strcmp(token, ss->legacy_name))
+ continue;
+ if (!cgroup_ssid_enabled(i))
+ continue;
+ if (cgroup1_ssid_disabled(i))
+ continue;
+
+ /* Mutually exclusive option 'all' + subsystem name */
+ if (all_ss)
+ return -EINVAL;
+ opts->subsys_mask |= (1 << i);
+ one_ss = true;
+
+ break;
+ }
+ if (i == CGROUP_SUBSYS_COUNT)
+ return -ENOENT;
+ }
+
+ /*
+ * If the 'all' option was specified select all the subsystems,
+ * otherwise if 'none', 'name=' and a subsystem name options were
+ * not specified, let's default to 'all'
+ */
+ if (all_ss || (!one_ss && !opts->none && !opts->name))
+ for_each_subsys(ss, i)
+ if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
+ opts->subsys_mask |= (1 << i);
+
+ /*
+ * We either have to specify by name or by subsystems. (So all
+ * empty hierarchies must have a name).
+ */
+ if (!opts->subsys_mask && !opts->name)
+ return -EINVAL;
+
+ /*
+ * Option noprefix was introduced just for backward compatibility
+ * with the old cpuset, so we allow noprefix only if mounting just
+ * the cpuset subsystem.
+ */
+ if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
+ return -EINVAL;
+
+ /* Can't specify "none" and some subsystems */
+ if (opts->subsys_mask && opts->none)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
+{
+ int ret = 0;
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
+ struct cgroup_sb_opts opts;
+ u16 added_mask, removed_mask;
+
+ cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
+
+ /* See what subsystems are wanted */
+ ret = parse_cgroupfs_options(data, &opts);
+ if (ret)
+ goto out_unlock;
+
+ if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
+ pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
+ task_tgid_nr(current), current->comm);
+
+ added_mask = opts.subsys_mask & ~root->subsys_mask;
+ removed_mask = root->subsys_mask & ~opts.subsys_mask;
+
+ /* Don't allow flags or name to change at remount */
+ if ((opts.flags ^ root->flags) ||
+ (opts.name && strcmp(opts.name, root->name))) {
+ pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
+ opts.flags, opts.name ?: "", root->flags, root->name);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /* remounting is not allowed for populated hierarchies */
+ if (!list_empty(&root->cgrp.self.children)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ ret = rebind_subsystems(root, added_mask);
+ if (ret)
+ goto out_unlock;
+
+ WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
+
+ if (opts.release_agent) {
+ spin_lock(&release_agent_path_lock);
+ strcpy(root->release_agent_path, opts.release_agent);
+ spin_unlock(&release_agent_path_lock);
+ }
+
+ trace_cgroup_remount(root);
+
+ out_unlock:
+ kfree(opts.release_agent);
+ kfree(opts.name);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+
+struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
+ .rename = cgroup1_rename,
+ .show_options = cgroup1_show_options,
+ .remount_fs = cgroup1_remount,
+ .mkdir = cgroup_mkdir,
+ .rmdir = cgroup_rmdir,
+ .show_path = cgroup_show_path,
+};
+
+struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
+ void *data, unsigned long magic,
+ struct cgroup_namespace *ns)
+{
+ struct super_block *pinned_sb = NULL;
+ struct cgroup_sb_opts opts;
+ struct cgroup_root *root;
+ struct cgroup_subsys *ss;
+ struct dentry *dentry;
+ int i, ret;
+
+ cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
+
+ /* First find the desired set of subsystems */
+ ret = parse_cgroupfs_options(data, &opts);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Destruction of cgroup root is asynchronous, so subsystems may
+ * still be dying after the previous unmount. Let's drain the
+ * dying subsystems. We just need to ensure that the ones
+ * unmounted previously finish dying and don't care about new ones
+ * starting. Testing ref liveliness is good enough.
+ */
+ for_each_subsys(ss, i) {
+ if (!(opts.subsys_mask & (1 << i)) ||
+ ss->root == &cgrp_dfl_root)
+ continue;
+
+ if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+ mutex_unlock(&cgroup_mutex);
+ msleep(10);
+ ret = restart_syscall();
+ goto out_free;
+ }
+ cgroup_put(&ss->root->cgrp);
+ }
+
+ for_each_root(root) {
+ bool name_match = false;
+
+ if (root == &cgrp_dfl_root)
+ continue;
+
+ /*
+ * If we asked for a name then it must match. Also, if
+ * name matches but sybsys_mask doesn't, we should fail.
+ * Remember whether name matched.
+ */
+ if (opts.name) {
+ if (strcmp(opts.name, root->name))
+ continue;
+ name_match = true;
+ }
+
+ /*
+ * If we asked for subsystems (or explicitly for no
+ * subsystems) then they must match.
+ */
+ if ((opts.subsys_mask || opts.none) &&
+ (opts.subsys_mask != root->subsys_mask)) {
+ if (!name_match)
+ continue;
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ if (root->flags ^ opts.flags)
+ pr_warn("new mount options do not match the existing superblock, will be ignored\n");
+
+ /*
+ * We want to reuse @root whose lifetime is governed by its
+ * ->cgrp. Let's check whether @root is alive and keep it
+ * that way. As cgroup_kill_sb() can happen anytime, we
+ * want to block it by pinning the sb so that @root doesn't
+ * get killed before mount is complete.
+ *
+ * With the sb pinned, tryget_live can reliably indicate
+ * whether @root can be reused. If it's being killed,
+ * drain it. We can use wait_queue for the wait but this
+ * path is super cold. Let's just sleep a bit and retry.
+ */
+ pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
+ if (IS_ERR(pinned_sb) ||
+ !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+ mutex_unlock(&cgroup_mutex);
+ if (!IS_ERR_OR_NULL(pinned_sb))
+ deactivate_super(pinned_sb);
+ msleep(10);
+ ret = restart_syscall();
+ goto out_free;
+ }
+
+ ret = 0;
+ goto out_unlock;
+ }
+
+ /*
+ * No such thing, create a new one. name= matching without subsys
+ * specification is allowed for already existing hierarchies but we
+ * can't create new one without subsys specification.
+ */
+ if (!opts.subsys_mask && !opts.none) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /* Hierarchies may only be created in the initial cgroup namespace. */
+ if (ns != &init_cgroup_ns) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
+ root = kzalloc(sizeof(*root), GFP_KERNEL);
+ if (!root) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ init_cgroup_root(root, &opts);
+
+ ret = cgroup_setup_root(root, opts.subsys_mask);
+ if (ret)
+ cgroup_free_root(root);
+
+out_unlock:
+ mutex_unlock(&cgroup_mutex);
+out_free:
+ kfree(opts.release_agent);
+ kfree(opts.name);
+
+ if (ret)
+ return ERR_PTR(ret);
+
+ dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
+ CGROUP_SUPER_MAGIC, ns);
+
+ /*
+ * If @pinned_sb, we're reusing an existing root and holding an
+ * extra ref on its sb. Mount is complete. Put the extra ref.
+ */
+ if (pinned_sb)
+ deactivate_super(pinned_sb);
+
+ return dentry;
+}
+
+static int __init cgroup1_wq_init(void)
+{
+ /*
+ * Used to destroy pidlists and separate to serve as flush domain.
+ * Cap @max_active to 1 too.
+ */
+ cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
+ 0, 1);
+ BUG_ON(!cgroup_pidlist_destroy_wq);
+ return 0;
+}
+core_initcall(cgroup1_wq_init);
+
+static int __init cgroup_no_v1(char *str)
+{
+ struct cgroup_subsys *ss;
+ char *token;
+ int i;
+
+ while ((token = strsep(&str, ",")) != NULL) {
+ if (!*token)
+ continue;
+
+ if (!strcmp(token, "all")) {
+ cgroup_no_v1_mask = U16_MAX;
+ break;
+ }
+
+ for_each_subsys(ss, i) {
+ if (strcmp(token, ss->name) &&
+ strcmp(token, ss->legacy_name))
+ continue;
+
+ cgroup_no_v1_mask |= 1 << i;
+ }
+ }
+ return 1;
+}
+__setup("cgroup_no_v1=", cgroup_no_v1);
+
+
+#ifdef CONFIG_CGROUP_DEBUG
+static struct cgroup_subsys_state *
+debug_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
+
+ if (!css)
+ return ERR_PTR(-ENOMEM);
+
+ return css;
+}
+
+static void debug_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css);
+}
+
+static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return cgroup_task_count(css->cgroup);
+}
+
+static u64 current_css_set_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return (u64)(unsigned long)current->cgroups;
+}
+
+static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ u64 count;
+
+ rcu_read_lock();
+ count = atomic_read(&task_css_set(current)->refcount);
+ rcu_read_unlock();
+ return count;
+}
+
+static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
+{
+ struct cgrp_cset_link *link;
+ struct css_set *cset;
+ char *name_buf;
+
+ name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!name_buf)
+ return -ENOMEM;
+
+ spin_lock_irq(&css_set_lock);
+ rcu_read_lock();
+ cset = rcu_dereference(current->cgroups);
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+ struct cgroup *c = link->cgrp;
+
+ cgroup_name(c, name_buf, NAME_MAX + 1);
+ seq_printf(seq, "Root %d group %s\n",
+ c->root->hierarchy_id, name_buf);
+ }
+ rcu_read_unlock();
+ spin_unlock_irq(&css_set_lock);
+ kfree(name_buf);
+ return 0;
+}
+
+#define MAX_TASKS_SHOWN_PER_CSS 25
+static int cgroup_css_links_read(struct seq_file *seq, void *v)
+{
+ struct cgroup_subsys_state *css = seq_css(seq);
+ struct cgrp_cset_link *link;
+
+ spin_lock_irq(&css_set_lock);
+ list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
+ struct css_set *cset = link->cset;
+ struct task_struct *task;
+ int count = 0;
+
+ seq_printf(seq, "css_set %p\n", cset);
+
+ list_for_each_entry(task, &cset->tasks, cg_list) {
+ if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+ goto overflow;
+ seq_printf(seq, " task %d\n", task_pid_vnr(task));
+ }
+
+ list_for_each_entry(task, &cset->mg_tasks, cg_list) {
+ if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+ goto overflow;
+ seq_printf(seq, " task %d\n", task_pid_vnr(task));
+ }
+ continue;
+ overflow:
+ seq_puts(seq, " ...\n");
+ }
+ spin_unlock_irq(&css_set_lock);
+ return 0;
+}
+
+static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ return (!cgroup_is_populated(css->cgroup) &&
+ !css_has_online_children(&css->cgroup->self));
+}
+
+static struct cftype debug_files[] = {
+ {
+ .name = "taskcount",
+ .read_u64 = debug_taskcount_read,
+ },
+
+ {
+ .name = "current_css_set",
+ .read_u64 = current_css_set_read,
+ },
+
+ {
+ .name = "current_css_set_refcount",
+ .read_u64 = current_css_set_refcount_read,
+ },
+
+ {
+ .name = "current_css_set_cg_links",
+ .seq_show = current_css_set_cg_links_read,
+ },
+
+ {
+ .name = "cgroup_css_links",
+ .seq_show = cgroup_css_links_read,
+ },
+
+ {
+ .name = "releasable",
+ .read_u64 = releasable_read,
+ },
+
+ { } /* terminate */
+};
+
+struct cgroup_subsys debug_cgrp_subsys = {
+ .css_alloc = debug_css_alloc,
+ .css_free = debug_css_free,
+ .legacy_cftypes = debug_files,
+};
+#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup.c b/kernel/cgroup/cgroup.c
index 53bbca7c4859..0125589c7428 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -28,35 +28,27 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/cgroup.h>
+#include "cgroup-internal.h"
+
#include <linux/cred.h>
-#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
-#include <linux/list.h>
#include <linux/magic.h>
-#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
+#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/percpu-rwsem.h>
#include <linux/string.h>
-#include <linux/sort.h>
-#include <linux/kmod.h>
-#include <linux/delayacct.h>
-#include <linux/cgroupstats.h>
#include <linux/hashtable.h>
-#include <linux/pid_namespace.h>
#include <linux/idr.h>
-#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/kthread.h>
-#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/cpuset.h>
#include <linux/proc_ns.h>
@@ -67,14 +59,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/cgroup.h>
-/*
- * pidlists linger the following amount before being destroyed. The goal
- * is avoiding frequent destruction in the middle of consecutive read calls
- * Expiring in the middle is a performance problem not a correctness one.
- * 1 sec should be enough.
- */
-#define CGROUP_PIDLIST_DESTROY_DELAY HZ
-
#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
MAX_CFTYPE_NAME + 2)
@@ -88,14 +72,12 @@
* These locks are exported if CONFIG_PROVE_RCU so that accessors in
* cgroup.h can use them for lockdep annotations.
*/
-#ifdef CONFIG_PROVE_RCU
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);
+
+#ifdef CONFIG_PROVE_RCU
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
-#else
-static DEFINE_MUTEX(cgroup_mutex);
-static DEFINE_SPINLOCK(css_set_lock);
#endif
/*
@@ -110,12 +92,6 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
*/
static DEFINE_SPINLOCK(cgroup_file_kn_lock);
-/*
- * Protects cgroup_subsys->release_agent_path. Modifying it also requires
- * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
- */
-static DEFINE_SPINLOCK(release_agent_path_lock);
-
struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
#define cgroup_assert_mutex_or_rcu_locked() \
@@ -131,15 +107,9 @@ struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
*/
static struct workqueue_struct *cgroup_destroy_wq;
-/*
- * pidlist destructions need to be flushed on cgroup destruction. Use a
- * separate workqueue as flush domain.
- */
-static struct workqueue_struct *cgroup_pidlist_destroy_wq;
-
/* generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
-static struct cgroup_subsys *cgroup_subsys[] = {
+struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
@@ -186,18 +156,14 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
*/
static bool cgrp_dfl_visible;
-/* Controllers blocked by the commandline in v1 */
-static u16 cgroup_no_v1_mask;
-
/* some controllers are not supported in the default hierarchy */
static u16 cgrp_dfl_inhibit_ss_mask;
/* some controllers are implicitly enabled on the default hierarchy */
-static unsigned long cgrp_dfl_implicit_ss_mask;
+static u16 cgrp_dfl_implicit_ss_mask;
/* The list of hierarchy roots */
-
-static LIST_HEAD(cgroup_roots);
+LIST_HEAD(cgroup_roots);
static int cgroup_root_count;
/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
@@ -213,13 +179,13 @@ static DEFINE_IDR(cgroup_hierarchy_idr);
static u64 css_serial_nr_next = 1;
/*
- * These bitmask flags indicate whether tasks in the fork and exit paths have
- * fork/exit handlers to call. This avoids us having to do extra work in the
- * fork/exit path to check which subsystems have fork/exit callbacks.
+ * These bitmasks identify subsystems with specific features to avoid
+ * having to do iterative checks repeatedly.
*/
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
static u16 have_free_callback __read_mostly;
+static u16 have_canfork_callback __read_mostly;
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
@@ -230,15 +196,9 @@ struct cgroup_namespace init_cgroup_ns = {
.root_cset = &init_css_set,
};
-/* Ditto for the can_fork callback. */
-static u16 have_canfork_callback __read_mostly;
-
static struct file_system_type cgroup2_fs_type;
-static struct cftype cgroup_dfl_base_files[];
-static struct cftype cgroup_legacy_base_files[];
+static struct cftype cgroup_base_files[];
-static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
-static void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_advance(struct css_task_iter *it);
@@ -259,7 +219,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
* is fine for individual subsystems but unsuitable for cgroup core. This
* is slower static_key_enabled() based test indexed by @ssid.
*/
-static bool cgroup_ssid_enabled(int ssid)
+bool cgroup_ssid_enabled(int ssid)
{
if (CGROUP_SUBSYS_COUNT == 0)
return false;
@@ -267,11 +227,6 @@ static bool cgroup_ssid_enabled(int ssid)
return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
}
-static bool cgroup_ssid_no_v1(int ssid)
-{
- return cgroup_no_v1_mask & (1 << ssid);
-}
-
/**
* cgroup_on_dfl - test whether a cgroup is on the default hierarchy
* @cgrp: the cgroup of interest
@@ -325,7 +280,7 @@ static bool cgroup_ssid_no_v1(int ssid)
*
* - debug: disallowed on the default hierarchy.
*/
-static bool cgroup_on_dfl(const struct cgroup *cgrp)
+bool cgroup_on_dfl(const struct cgroup *cgrp)
{
return cgrp->root == &cgrp_dfl_root;
}
@@ -481,12 +436,6 @@ out_unlock:
return css;
}
-/* convenient tests for these bits */
-static inline bool cgroup_is_dead(const struct cgroup *cgrp)
-{
- return !(cgrp->self.flags & CSS_ONLINE);
-}
-
static void cgroup_get(struct cgroup *cgrp)
{
WARN_ON_ONCE(cgroup_is_dead(cgrp));
@@ -518,11 +467,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
}
EXPORT_SYMBOL_GPL(of_css);
-static int notify_on_release(const struct cgroup *cgrp)
-{
- return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
-}
-
/**
* for_each_css - iterate all css's of a cgroup
* @css: the iteration cursor
@@ -553,15 +497,6 @@ static int notify_on_release(const struct cgroup *cgrp)
else
/**
- * for_each_subsys - iterate all enabled cgroup subsystems
- * @ss: the iteration cursor
- * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
- */
-#define for_each_subsys(ss, ssid) \
- for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
- (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
-
-/**
* do_each_subsys_mask - filter for_each_subsys with a bitmask
* @ss: the iteration cursor
* @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
@@ -585,10 +520,6 @@ static int notify_on_release(const struct cgroup *cgrp)
} \
} while (false)
-/* iterate across the hierarchies */
-#define for_each_root(root) \
- list_for_each_entry((root), &cgroup_roots, root_list)
-
/* iterate over child cgrps, lock should be held throughout iteration */
#define cgroup_for_each_live_child(child, cgrp) \
list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
@@ -615,29 +546,6 @@ static int notify_on_release(const struct cgroup *cgrp)
; \
else
-static void cgroup_release_agent(struct work_struct *work);
-static void check_for_release(struct cgroup *cgrp);
-
-/*
- * A cgroup can be associated with multiple css_sets as different tasks may
- * belong to different cgroups on different hierarchies. In the other
- * direction, a css_set is naturally associated with multiple cgroups.
- * This M:N relationship is represented by the following link structure
- * which exists for each association and allows traversing the associations
- * from both sides.
- */
-struct cgrp_cset_link {
- /* the cgroup and css_set this link associates */
- struct cgroup *cgrp;
- struct css_set *cset;
-
- /* list of cgrp_cset_links anchored at cgrp->cset_links */
- struct list_head cset_link;
-
- /* list of cgrp_cset_links anchored at css_set->cgrp_links */
- struct list_head cgrp_link;
-};
-
/*
* The default css_set - used by init and its children prior to any
* hierarchies being mounted. It contains a pointer to the root state
@@ -647,12 +555,12 @@ struct cgrp_cset_link {
*/
struct css_set init_css_set = {
.refcount = ATOMIC_INIT(1),
- .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
+ .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
+ .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
- .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
};
static int css_set_count = 1; /* 1 for init_css_set */
@@ -699,7 +607,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
if (!trigger)
break;
- check_for_release(cgrp);
+ cgroup1_check_for_release(cgrp);
cgroup_file_notify(&cgrp->events_file);
cgrp = cgroup_parent(cgrp);
@@ -808,7 +716,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
return key;
}
-static void put_css_set_locked(struct css_set *cset)
+void put_css_set_locked(struct css_set *cset)
{
struct cgrp_cset_link *link, *tmp_link;
struct cgroup_subsys *ss;
@@ -838,31 +746,6 @@ static void put_css_set_locked(struct css_set *cset)
kfree_rcu(cset, rcu_head);
}
-static void put_css_set(struct css_set *cset)
-{
- unsigned long flags;
-
- /*
- * Ensure that the refcount doesn't hit zero while any readers
- * can see it. Similar to atomic_dec_and_lock(), but for an
- * rwlock
- */
- if (atomic_add_unless(&cset->refcount, -1, 1))
- return;
-
- spin_lock_irqsave(&css_set_lock, flags);
- put_css_set_locked(cset);
- spin_unlock_irqrestore(&css_set_lock, flags);
-}
-
-/*
- * refcounted get/put for css_set objects
- */
-static inline void get_css_set(struct css_set *cset)
-{
- atomic_inc(&cset->refcount);
-}
-
/**
* compare_css_sets - helper function for find_existing_css_set().
* @cset: candidate css_set being tested
@@ -1095,13 +978,13 @@ static struct css_set *find_css_set(struct css_set *old_cset,
}
atomic_set(&cset->refcount, 1);
- INIT_LIST_HEAD(&cset->cgrp_links);
INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->mg_tasks);
- INIT_LIST_HEAD(&cset->mg_preload_node);
- INIT_LIST_HEAD(&cset->mg_node);
INIT_LIST_HEAD(&cset->task_iters);
INIT_HLIST_NODE(&cset->hlist);
+ INIT_LIST_HEAD(&cset->cgrp_links);
+ INIT_LIST_HEAD(&cset->mg_preload_node);
+ INIT_LIST_HEAD(&cset->mg_node);
/* Copy the set of subsystem state objects generated in
* find_existing_css_set() */
@@ -1138,7 +1021,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
return cset;
}
-static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
+struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
struct cgroup *root_cgrp = kf_root->kn->priv;
@@ -1166,7 +1049,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
}
-static void cgroup_free_root(struct cgroup_root *root)
+void cgroup_free_root(struct cgroup_root *root)
{
if (root) {
idr_destroy(&root->cgroup_idr);
@@ -1283,8 +1166,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
* Return the cgroup for "task" from the given hierarchy. Must be
* called with cgroup_mutex and css_set_lock held.
*/
-static struct cgroup *task_cgroup_from_root(struct task_struct *task,
- struct cgroup_root *root)
+struct cgroup *task_cgroup_from_root(struct task_struct *task,
+ struct cgroup_root *root)
{
/*
* No need to lock the task - since we hold cgroup_mutex the
@@ -1321,7 +1204,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
*/
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
-static const struct file_operations proc_cgroupstats_operations;
static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
char *buf)
@@ -1415,7 +1297,7 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
* inaccessible any time. If the caller intends to continue to access the
* cgroup, it should pin it before invoking this function.
*/
-static void cgroup_kn_unlock(struct kernfs_node *kn)
+void cgroup_kn_unlock(struct kernfs_node *kn)
{
struct cgroup *cgrp;
@@ -1447,8 +1329,7 @@ static void cgroup_kn_unlock(struct kernfs_node *kn)
* locking under kernfs active protection and allows all kernfs operations
* including self-removal.
*/
-static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn,
- bool drain_offline)
+struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
{
struct cgroup *cgrp;
@@ -1532,9 +1413,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
if (!css->ss) {
if (cgroup_on_dfl(cgrp))
- cfts = cgroup_dfl_base_files;
+ cfts = cgroup_base_files;
else
- cfts = cgroup_legacy_base_files;
+ cfts = cgroup1_base_files;
return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
}
@@ -1559,7 +1440,7 @@ err:
return ret;
}
-static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
+int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss;
@@ -1629,8 +1510,8 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
return 0;
}
-static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
- struct kernfs_root *kf_root)
+int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
+ struct kernfs_root *kf_root)
{
int len = 0;
char *buf = NULL;
@@ -1656,237 +1537,10 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
return len;
}
-static int cgroup_show_options(struct seq_file *seq,
- struct kernfs_root *kf_root)
-{
- struct cgroup_root *root = cgroup_root_from_kf(kf_root);
- struct cgroup_subsys *ss;
- int ssid;
-
- if (root != &cgrp_dfl_root)
- for_each_subsys(ss, ssid)
- if (root->subsys_mask & (1 << ssid))
- seq_show_option(seq, ss->legacy_name, NULL);
- if (root->flags & CGRP_ROOT_NOPREFIX)
- seq_puts(seq, ",noprefix");
- if (root->flags & CGRP_ROOT_XATTR)
- seq_puts(seq, ",xattr");
-
- spin_lock(&release_agent_path_lock);
- if (strlen(root->release_agent_path))
- seq_show_option(seq, "release_agent",
- root->release_agent_path);
- spin_unlock(&release_agent_path_lock);
-
- if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
- seq_puts(seq, ",clone_children");
- if (strlen(root->name))
- seq_show_option(seq, "name", root->name);
- return 0;
-}
-
-struct cgroup_sb_opts {
- u16 subsys_mask;
- unsigned int flags;
- char *release_agent;
- bool cpuset_clone_children;
- char *name;
- /* User explicitly requested empty subsystem */
- bool none;
-};
-
-static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
-{
- char *token, *o = data;
- bool all_ss = false, one_ss = false;
- u16 mask = U16_MAX;
- struct cgroup_subsys *ss;
- int nr_opts = 0;
- int i;
-
-#ifdef CONFIG_CPUSETS
- mask = ~((u16)1 << cpuset_cgrp_id);
-#endif
-
- memset(opts, 0, sizeof(*opts));
-
- while ((token = strsep(&o, ",")) != NULL) {
- nr_opts++;
-
- if (!*token)
- return -EINVAL;
- if (!strcmp(token, "none")) {
- /* Explicitly have no subsystems */
- opts->none = true;
- continue;
- }
- if (!strcmp(token, "all")) {
- /* Mutually exclusive option 'all' + subsystem name */
- if (one_ss)
- return -EINVAL;
- all_ss = true;
- continue;
- }
- if (!strcmp(token, "noprefix")) {
- opts->flags |= CGRP_ROOT_NOPREFIX;
- continue;
- }
- if (!strcmp(token, "clone_children")) {
- opts->cpuset_clone_children = true;
- continue;
- }
- if (!strcmp(token, "xattr")) {
- opts->flags |= CGRP_ROOT_XATTR;
- continue;
- }
- if (!strncmp(token, "release_agent=", 14)) {
- /* Specifying two release agents is forbidden */
- if (opts->release_agent)
- return -EINVAL;
- opts->release_agent =
- kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
- if (!opts->release_agent)
- return -ENOMEM;
- continue;
- }
- if (!strncmp(token, "name=", 5)) {
- const char *name = token + 5;
- /* Can't specify an empty name */
- if (!strlen(name))
- return -EINVAL;
- /* Must match [\w.-]+ */
- for (i = 0; i < strlen(name); i++) {
- char c = name[i];
- if (isalnum(c))
- continue;
- if ((c == '.') || (c == '-') || (c == '_'))
- continue;
- return -EINVAL;
- }
- /* Specifying two names is forbidden */
- if (opts->name)
- return -EINVAL;
- opts->name = kstrndup(name,
- MAX_CGROUP_ROOT_NAMELEN - 1,
- GFP_KERNEL);
- if (!opts->name)
- return -ENOMEM;
-
- continue;
- }
-
- for_each_subsys(ss, i) {
- if (strcmp(token, ss->legacy_name))
- continue;
- if (!cgroup_ssid_enabled(i))
- continue;
- if (cgroup_ssid_no_v1(i))
- continue;
-
- /* Mutually exclusive option 'all' + subsystem name */
- if (all_ss)
- return -EINVAL;
- opts->subsys_mask |= (1 << i);
- one_ss = true;
-
- break;
- }
- if (i == CGROUP_SUBSYS_COUNT)
- return -ENOENT;
- }
-
- /*
- * If the 'all' option was specified select all the subsystems,
- * otherwise if 'none', 'name=' and a subsystem name options were
- * not specified, let's default to 'all'
- */
- if (all_ss || (!one_ss && !opts->none && !opts->name))
- for_each_subsys(ss, i)
- if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i))
- opts->subsys_mask |= (1 << i);
-
- /*
- * We either have to specify by name or by subsystems. (So all
- * empty hierarchies must have a name).
- */
- if (!opts->subsys_mask && !opts->name)
- return -EINVAL;
-
- /*
- * Option noprefix was introduced just for backward compatibility
- * with the old cpuset, so we allow noprefix only if mounting just
- * the cpuset subsystem.
- */
- if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
- return -EINVAL;
-
- /* Can't specify "none" and some subsystems */
- if (opts->subsys_mask && opts->none)
- return -EINVAL;
-
- return 0;
-}
-
static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
{
- int ret = 0;
- struct cgroup_root *root = cgroup_root_from_kf(kf_root);
- struct cgroup_sb_opts opts;
- u16 added_mask, removed_mask;
-
- if (root == &cgrp_dfl_root) {
- pr_err("remount is not allowed\n");
- return -EINVAL;
- }
-
- cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
-
- /* See what subsystems are wanted */
- ret = parse_cgroupfs_options(data, &opts);
- if (ret)
- goto out_unlock;
-
- if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
- pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
- task_tgid_nr(current), current->comm);
-
- added_mask = opts.subsys_mask & ~root->subsys_mask;
- removed_mask = root->subsys_mask & ~opts.subsys_mask;
-
- /* Don't allow flags or name to change at remount */
- if ((opts.flags ^ root->flags) ||
- (opts.name && strcmp(opts.name, root->name))) {
- pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
- opts.flags, opts.name ?: "", root->flags, root->name);
- ret = -EINVAL;
- goto out_unlock;
- }
-
- /* remounting is not allowed for populated hierarchies */
- if (!list_empty(&root->cgrp.self.children)) {
- ret = -EBUSY;
- goto out_unlock;
- }
-
- ret = rebind_subsystems(root, added_mask);
- if (ret)
- goto out_unlock;
-
- WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
-
- if (opts.release_agent) {
- spin_lock(&release_agent_path_lock);
- strcpy(root->release_agent_path, opts.release_agent);
- spin_unlock(&release_agent_path_lock);
- }
-
- trace_cgroup_remount(root);
-
- out_unlock:
- kfree(opts.release_agent);
- kfree(opts.name);
- mutex_unlock(&cgroup_mutex);
- return ret;
+ pr_err("remount is not allowed\n");
+ return -EINVAL;
}
/*
@@ -1964,11 +1618,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
init_waitqueue_head(&cgrp->offline_waitq);
- INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
+ INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}
-static void init_cgroup_root(struct cgroup_root *root,
- struct cgroup_sb_opts *opts)
+void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
{
struct cgroup *cgrp = &root->cgrp;
@@ -1987,10 +1640,11 @@ static void init_cgroup_root(struct cgroup_root *root,
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
-static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
+ struct kernfs_syscall_ops *kf_sops;
struct css_set *cset;
int i, ret;
@@ -2022,7 +1676,10 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
if (ret)
goto cancel_ref;
- root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
+ kf_sops = root == &cgrp_dfl_root ?
+ &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
+
+ root->kf_root = kernfs_create_root(kf_sops,
KERNFS_ROOT_CREATE_DEACTIVATED,
root_cgrp);
if (IS_ERR(root->kf_root)) {
@@ -2080,182 +1737,18 @@ out:
return ret;
}
-static struct dentry *cgroup_mount(struct file_system_type *fs_type,
- int flags, const char *unused_dev_name,
- void *data)
+struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
+ struct cgroup_root *root, unsigned long magic,
+ struct cgroup_namespace *ns)
{
- bool is_v2 = fs_type == &cgroup2_fs_type;
- struct super_block *pinned_sb = NULL;
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
- struct cgroup_subsys *ss;
- struct cgroup_root *root;
- struct cgroup_sb_opts opts;
struct dentry *dentry;
- int ret;
- int i;
bool new_sb;
- get_cgroup_ns(ns);
-
- /* Check if the caller has permission to mount. */
- if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
- put_cgroup_ns(ns);
- return ERR_PTR(-EPERM);
- }
-
- /*
- * The first time anyone tries to mount a cgroup, enable the list
- * linking each css_set to its tasks and fix up all existing tasks.
- */
- if (!use_task_css_set_links)
- cgroup_enable_task_cg_lists();
-
- if (is_v2) {
- if (data) {
- pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
- put_cgroup_ns(ns);
- return ERR_PTR(-EINVAL);
- }
- cgrp_dfl_visible = true;
- root = &cgrp_dfl_root;
- cgroup_get(&root->cgrp);
- goto out_mount;
- }
-
- cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
-
- /* First find the desired set of subsystems */
- ret = parse_cgroupfs_options(data, &opts);
- if (ret)
- goto out_unlock;
-
- /*
- * Destruction of cgroup root is asynchronous, so subsystems may
- * still be dying after the previous unmount. Let's drain the
- * dying subsystems. We just need to ensure that the ones
- * unmounted previously finish dying and don't care about new ones
- * starting. Testing ref liveliness is good enough.
- */
- for_each_subsys(ss, i) {
- if (!(opts.subsys_mask & (1 << i)) ||
- ss->root == &cgrp_dfl_root)
- continue;
-
- if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
- mutex_unlock(&cgroup_mutex);
- msleep(10);
- ret = restart_syscall();
- goto out_free;
- }
- cgroup_put(&ss->root->cgrp);
- }
-
- for_each_root(root) {
- bool name_match = false;
-
- if (root == &cgrp_dfl_root)
- continue;
-
- /*
- * If we asked for a name then it must match. Also, if
- * name matches but sybsys_mask doesn't, we should fail.
- * Remember whether name matched.
- */
- if (opts.name) {
- if (strcmp(opts.name, root->name))
- continue;
- name_match = true;
- }
-
- /*
- * If we asked for subsystems (or explicitly for no
- * subsystems) then they must match.
- */
- if ((opts.subsys_mask || opts.none) &&
- (opts.subsys_mask != root->subsys_mask)) {
- if (!name_match)
- continue;
- ret = -EBUSY;
- goto out_unlock;
- }
-
- if (root->flags ^ opts.flags)
- pr_warn("new mount options do not match the existing superblock, will be ignored\n");
-
- /*
- * We want to reuse @root whose lifetime is governed by its
- * ->cgrp. Let's check whether @root is alive and keep it
- * that way. As cgroup_kill_sb() can happen anytime, we
- * want to block it by pinning the sb so that @root doesn't
- * get killed before mount is complete.
- *
- * With the sb pinned, tryget_live can reliably indicate
- * whether @root can be reused. If it's being killed,
- * drain it. We can use wait_queue for the wait but this
- * path is super cold. Let's just sleep a bit and retry.
- */
- pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
- if (IS_ERR(pinned_sb) ||
- !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
- mutex_unlock(&cgroup_mutex);
- if (!IS_ERR_OR_NULL(pinned_sb))
- deactivate_super(pinned_sb);
- msleep(10);
- ret = restart_syscall();
- goto out_free;
- }
-
- ret = 0;
- goto out_unlock;
- }
+ dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);
/*
- * No such thing, create a new one. name= matching without subsys
- * specification is allowed for already existing hierarchies but we
- * can't create new one without subsys specification.
- */
- if (!opts.subsys_mask && !opts.none) {
- ret = -EINVAL;
- goto out_unlock;
- }
-
- /* Hierarchies may only be created in the initial cgroup namespace. */
- if (ns != &init_cgroup_ns) {
- ret = -EPERM;
- goto out_unlock;
- }
-
- root = kzalloc(sizeof(*root), GFP_KERNEL);
- if (!root) {
- ret = -ENOMEM;
- goto out_unlock;
- }
-
- init_cgroup_root(root, &opts);
-
- ret = cgroup_setup_root(root, opts.subsys_mask);
- if (ret)
- cgroup_free_root(root);
-
-out_unlock:
- mutex_unlock(&cgroup_mutex);
-out_free:
- kfree(opts.release_agent);
- kfree(opts.name);
-
- if (ret) {
- put_cgroup_ns(ns);
- return ERR_PTR(ret);
- }
-out_mount:
- dentry = kernfs_mount(fs_type, flags, root->kf_root,
- is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
- &new_sb);
-
- /*
- * In non-init cgroup namespace, instead of root cgroup's
- * dentry, we return the dentry corresponding to the
- * cgroupns->root_cgrp.
+ * In non-init cgroup namespace, instead of root cgroup's dentry,
+ * we return the dentry corresponding to the cgroupns->root_cgrp.
*/
if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
struct dentry *nsdentry;
@@ -2277,13 +1770,45 @@ out_mount:
if (IS_ERR(dentry) || !new_sb)
cgroup_put(&root->cgrp);
+ return dentry;
+}
+
+static struct dentry *cgroup_mount(struct file_system_type *fs_type,
+ int flags, const char *unused_dev_name,
+ void *data)
+{
+ struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+ struct dentry *dentry;
+
+ get_cgroup_ns(ns);
+
+ /* Check if the caller has permission to mount. */
+ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
+ put_cgroup_ns(ns);
+ return ERR_PTR(-EPERM);
+ }
+
/*
- * If @pinned_sb, we're reusing an existing root and holding an
- * extra ref on its sb. Mount is complete. Put the extra ref.
+ * The first time anyone tries to mount a cgroup, enable the list
+ * linking each css_set to its tasks and fix up all existing tasks.
*/
- if (pinned_sb) {
- WARN_ON(new_sb);
- deactivate_super(pinned_sb);
+ if (!use_task_css_set_links)
+ cgroup_enable_task_cg_lists();
+
+ if (fs_type == &cgroup2_fs_type) {
+ if (data) {
+ pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
+ put_cgroup_ns(ns);
+ return ERR_PTR(-EINVAL);
+ }
+ cgrp_dfl_visible = true;
+ cgroup_get(&cgrp_dfl_root.cgrp);
+
+ dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
+ CGROUP2_SUPER_MAGIC, ns);
+ } else {
+ dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
+ CGROUP_SUPER_MAGIC, ns);
}
put_cgroup_ns(ns);
@@ -2311,7 +1836,7 @@ static void cgroup_kill_sb(struct super_block *sb)
kernfs_kill_sb(sb);
}
-static struct file_system_type cgroup_fs_type = {
+struct file_system_type cgroup_fs_type = {
.name = "cgroup",
.mount = cgroup_mount,
.kill_sb = cgroup_kill_sb,
@@ -2325,8 +1850,8 @@ static struct file_system_type cgroup2_fs_type = {
.fs_flags = FS_USERNS_MOUNT,
};
-static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
- struct cgroup_namespace *ns)
+int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
+ struct cgroup_namespace *ns)
{
struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
@@ -2389,49 +1914,18 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
}
EXPORT_SYMBOL_GPL(task_cgroup_path);
-/* used to track tasks and other necessary states during migration */
-struct cgroup_taskset {
- /* the src and dst cset list running through cset->mg_node */
- struct list_head src_csets;
- struct list_head dst_csets;
-
- /* the subsys currently being processed */
- int ssid;
-
- /*
- * Fields for cgroup_taskset_*() iteration.
- *
- * Before migration is committed, the target migration tasks are on
- * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
- * the csets on ->dst_csets. ->csets point to either ->src_csets
- * or ->dst_csets depending on whether migration is committed.
- *
- * ->cur_csets and ->cur_task point to the current task position
- * during iteration.
- */
- struct list_head *csets;
- struct css_set *cur_cset;
- struct task_struct *cur_task;
-};
-
-#define CGROUP_TASKSET_INIT(tset) (struct cgroup_taskset){ \
- .src_csets = LIST_HEAD_INIT(tset.src_csets), \
- .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \
- .csets = &tset.src_csets, \
-}
-
/**
- * cgroup_taskset_add - try to add a migration target task to a taskset
+ * cgroup_migrate_add_task - add a migration target task to a migration context
* @task: target task
- * @tset: target taskset
+ * @mgctx: target migration context
*
- * Add @task, which is a migration target, to @tset. This function becomes
- * noop if @task doesn't need to be migrated. @task's css_set should have
- * been added as a migration source and @task->cg_list will be moved from
- * the css_set's tasks list to mg_tasks one.
+ * Add @task, which is a migration target, to @mgctx->tset. This function
+ * becomes noop if @task doesn't need to be migrated. @task's css_set
+ * should have been added as a migration source and @task->cg_list will be
+ * moved from the css_set's tasks list to mg_tasks one.
*/
-static void cgroup_taskset_add(struct task_struct *task,
- struct cgroup_taskset *tset)
+static void cgroup_migrate_add_task(struct task_struct *task,
+ struct cgroup_mgctx *mgctx)
{
struct css_set *cset;
@@ -2451,10 +1945,11 @@ static void cgroup_taskset_add(struct task_struct *task,
list_move_tail(&task->cg_list, &cset->mg_tasks);
if (list_empty(&cset->mg_node))
- list_add_tail(&cset->mg_node, &tset->src_csets);
+ list_add_tail(&cset->mg_node,
+ &mgctx->tset.src_csets);
if (list_empty(&cset->mg_dst_cset->mg_node))
- list_move_tail(&cset->mg_dst_cset->mg_node,
- &tset->dst_csets);
+ list_add_tail(&cset->mg_dst_cset->mg_node,
+ &mgctx->tset.dst_csets);
}
/**
@@ -2521,17 +2016,16 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
/**
* cgroup_taskset_migrate - migrate a taskset
- * @tset: taget taskset
- * @root: cgroup root the migration is taking place on
+ * @mgctx: migration context
*
- * Migrate tasks in @tset as setup by migration preparation functions.
+ * Migrate tasks in @mgctx as setup by migration preparation functions.
* This function fails iff one of the ->can_attach callbacks fails and
- * guarantees that either all or none of the tasks in @tset are migrated.
- * @tset is consumed regardless of success.
+ * guarantees that either all or none of the tasks in @mgctx are migrated.
+ * @mgctx is consumed regardless of success.
*/
-static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
- struct cgroup_root *root)
+static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
{
+ struct cgroup_taskset *tset = &mgctx->tset;
struct cgroup_subsys *ss;
struct task_struct *task, *tmp_task;
struct css_set *cset, *tmp_cset;
@@ -2542,7 +2036,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
return 0;
/* check that we can legitimately attach to the cgroup */
- do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
if (ss->can_attach) {
tset->ssid = ssid;
ret = ss->can_attach(tset);
@@ -2578,7 +2072,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
*/
tset->csets = &tset->dst_csets;
- do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
if (ss->attach) {
tset->ssid = ssid;
ss->attach(tset);
@@ -2589,7 +2083,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
goto out_release_tset;
out_cancel_attach:
- do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
if (ssid == failed_ssid)
break;
if (ss->cancel_attach) {
@@ -2616,7 +2110,7 @@ out_release_tset:
* zero for migration destination cgroups with tasks so that child cgroups
* don't compete against tasks.
*/
-static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
+bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
{
return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
!dst_cgrp->subtree_control;
@@ -2624,25 +2118,31 @@ static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
/**
* cgroup_migrate_finish - cleanup after attach
- * @preloaded_csets: list of preloaded css_sets
+ * @mgctx: migration context
*
* Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See
* those functions for details.
*/
-static void cgroup_migrate_finish(struct list_head *preloaded_csets)
+void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
{
+ LIST_HEAD(preloaded);
struct css_set *cset, *tmp_cset;
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
- list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
+
+ list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
+ list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
+
+ list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
cset->mg_src_cgrp = NULL;
cset->mg_dst_cgrp = NULL;
cset->mg_dst_cset = NULL;
list_del_init(&cset->mg_preload_node);
put_css_set_locked(cset);
}
+
spin_unlock_irq(&css_set_lock);
}
@@ -2650,10 +2150,10 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
* cgroup_migrate_add_src - add a migration source css_set
* @src_cset: the source css_set to add
* @dst_cgrp: the destination cgroup
- * @preloaded_csets: list of preloaded css_sets
+ * @mgctx: migration context
*
* Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin
- * @src_cset and add it to @preloaded_csets, which should later be cleaned
+ * @src_cset and add it to @mgctx->src_csets, which should later be cleaned
* up by cgroup_migrate_finish().
*
* This function may be called without holding cgroup_threadgroup_rwsem
@@ -2662,9 +2162,9 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
* into play and the preloaded css_sets are guaranteed to cover all
* migrations.
*/
-static void cgroup_migrate_add_src(struct css_set *src_cset,
- struct cgroup *dst_cgrp,
- struct list_head *preloaded_csets)
+void cgroup_migrate_add_src(struct css_set *src_cset,
+ struct cgroup *dst_cgrp,
+ struct cgroup_mgctx *mgctx)
{
struct cgroup *src_cgrp;
@@ -2692,33 +2192,35 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
src_cset->mg_src_cgrp = src_cgrp;
src_cset->mg_dst_cgrp = dst_cgrp;
get_css_set(src_cset);
- list_add(&src_cset->mg_preload_node, preloaded_csets);
+ list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
}
/**
* cgroup_migrate_prepare_dst - prepare destination css_sets for migration
- * @preloaded_csets: list of preloaded source css_sets
+ * @mgctx: migration context
*
* Tasks are about to be moved and all the source css_sets have been
- * preloaded to @preloaded_csets. This function looks up and pins all
- * destination css_sets, links each to its source, and append them to
- * @preloaded_csets.
+ * preloaded to @mgctx->preloaded_src_csets. This function looks up and
+ * pins all destination css_sets, links each to its source, and append them
+ * to @mgctx->preloaded_dst_csets.
*
* This function must be called after cgroup_migrate_add_src() has been
* called on each migration source css_set. After migration is performed
* using cgroup_migrate(), cgroup_migrate_finish() must be called on
- * @preloaded_csets.
+ * @mgctx.
*/
-static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
+int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
{
- LIST_HEAD(csets);
struct css_set *src_cset, *tmp_cset;
lockdep_assert_held(&cgroup_mutex);
/* look up the dst cset for each src cset and link it to src */
- list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
+ list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
+ mg_preload_node) {
struct css_set *dst_cset;
+ struct cgroup_subsys *ss;
+ int ssid;
dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
if (!dst_cset)
@@ -2743,15 +2245,19 @@ static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
src_cset->mg_dst_cset = dst_cset;
if (list_empty(&dst_cset->mg_preload_node))
- list_add(&dst_cset->mg_preload_node, &csets);
+ list_add_tail(&dst_cset->mg_preload_node,
+ &mgctx->preloaded_dst_csets);
else
put_css_set(dst_cset);
+
+ for_each_subsys(ss, ssid)
+ if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
+ mgctx->ss_mask |= 1 << ssid;
}
- list_splice_tail(&csets, preloaded_csets);
return 0;
err:
- cgroup_migrate_finish(&csets);
+ cgroup_migrate_finish(mgctx);
return -ENOMEM;
}
@@ -2759,7 +2265,7 @@ err:
* cgroup_migrate - migrate a process or task to a cgroup
* @leader: the leader of the process or the task to migrate
* @threadgroup: whether @leader points to the whole process or a single task
- * @root: cgroup root migration is taking place on
+ * @mgctx: migration context
*
* Migrate a process or task denoted by @leader. If migrating a process,
* the caller must be holding cgroup_threadgroup_rwsem. The caller is also
@@ -2773,10 +2279,9 @@ err:
* decided for all targets by invoking group_migrate_prepare_dst() before
* actually starting migrating.
*/
-static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
- struct cgroup_root *root)
+int cgroup_migrate(struct task_struct *leader, bool threadgroup,
+ struct cgroup_mgctx *mgctx)
{
- struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
struct task_struct *task;
/*
@@ -2788,14 +2293,14 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
rcu_read_lock();
task = leader;
do {
- cgroup_taskset_add(task, &tset);
+ cgroup_migrate_add_task(task, mgctx);
if (!threadgroup)
break;
} while_each_thread(leader, task);
rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
- return cgroup_taskset_migrate(&tset, root);
+ return cgroup_migrate_execute(mgctx);
}
/**
@@ -2806,10 +2311,10 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
*
* Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
*/
-static int cgroup_attach_task(struct cgroup *dst_cgrp,
- struct task_struct *leader, bool threadgroup)
+int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
+ bool threadgroup)
{
- LIST_HEAD(preloaded_csets);
+ DEFINE_CGROUP_MGCTX(mgctx);
struct task_struct *task;
int ret;
@@ -2821,8 +2326,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
rcu_read_lock();
task = leader;
do {
- cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
- &preloaded_csets);
+ cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
if (!threadgroup)
break;
} while_each_thread(leader, task);
@@ -2830,11 +2334,11 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
spin_unlock_irq(&css_set_lock);
/* prepare dst csets and commit */
- ret = cgroup_migrate_prepare_dst(&preloaded_csets);
+ ret = cgroup_migrate_prepare_dst(&mgctx);
if (!ret)
- ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root);
+ ret = cgroup_migrate(leader, threadgroup, &mgctx);
- cgroup_migrate_finish(&preloaded_csets);
+ cgroup_migrate_finish(&mgctx);
if (!ret)
trace_cgroup_attach_task(dst_cgrp, leader, threadgroup);
@@ -2846,20 +2350,9 @@ static int cgroup_procs_write_permission(struct task_struct *task,
struct cgroup *dst_cgrp,
struct kernfs_open_file *of)
{
- const struct cred *cred = current_cred();
- const struct cred *tcred = get_task_cred(task);
int ret = 0;
- /*
- * even if we're attaching all tasks in the thread group, we only
- * need to check permissions on one of them.
- */
- if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
- !uid_eq(cred->euid, tcred->uid) &&
- !uid_eq(cred->euid, tcred->suid))
- ret = -EACCES;
-
- if (!ret && cgroup_on_dfl(dst_cgrp)) {
+ if (cgroup_on_dfl(dst_cgrp)) {
struct super_block *sb = of->file->f_path.dentry->d_sb;
struct cgroup *cgrp;
struct inode *inode;
@@ -2877,9 +2370,21 @@ static int cgroup_procs_write_permission(struct task_struct *task,
ret = inode_permission(inode, MAY_WRITE);
iput(inode);
}
+ } else {
+ const struct cred *cred = current_cred();
+ const struct cred *tcred = get_task_cred(task);
+
+ /*
+ * even if we're attaching all tasks in the thread group,
+ * we only need to check permissions on one of them.
+ */
+ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+ !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->euid, tcred->suid))
+ ret = -EACCES;
+ put_cred(tcred);
}
- put_cred(tcred);
return ret;
}
@@ -2888,8 +2393,8 @@ static int cgroup_procs_write_permission(struct task_struct *task,
* function to attach either it or all tasks in its threadgroup. Will lock
* cgroup_mutex and threadgroup.
*/
-static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, loff_t off, bool threadgroup)
+ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off, bool threadgroup)
{
struct task_struct *tsk;
struct cgroup_subsys *ss;
@@ -2950,86 +2455,12 @@ out_unlock_threadgroup:
return ret ?: nbytes;
}
-/**
- * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
- * @from: attach to all cgroups of a given task
- * @tsk: the task to be attached
- */
-int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
-{
- struct cgroup_root *root;
- int retval = 0;
-
- mutex_lock(&cgroup_mutex);
- percpu_down_write(&cgroup_threadgroup_rwsem);
- for_each_root(root) {
- struct cgroup *from_cgrp;
-
- if (root == &cgrp_dfl_root)
- continue;
-
- spin_lock_irq(&css_set_lock);
- from_cgrp = task_cgroup_from_root(from, root);
- spin_unlock_irq(&css_set_lock);
-
- retval = cgroup_attach_task(from_cgrp, tsk, false);
- if (retval)
- break;
- }
- percpu_up_write(&cgroup_threadgroup_rwsem);
- mutex_unlock(&cgroup_mutex);
-
- return retval;
-}
-EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
-
-static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- return __cgroup_procs_write(of, buf, nbytes, off, false);
-}
-
-static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
+ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
+ loff_t off)
{
return __cgroup_procs_write(of, buf, nbytes, off, true);
}
-static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- struct cgroup *cgrp;
-
- BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
-
- cgrp = cgroup_kn_lock_live(of->kn, false);
- if (!cgrp)
- return -ENODEV;
- spin_lock(&release_agent_path_lock);
- strlcpy(cgrp->root->release_agent_path, strstrip(buf),
- sizeof(cgrp->root->release_agent_path));
- spin_unlock(&release_agent_path_lock);
- cgroup_kn_unlock(of->kn);
- return nbytes;
-}
-
-static int cgroup_release_agent_show(struct seq_file *seq, void *v)
-{
- struct cgroup *cgrp = seq_css(seq)->cgroup;
-
- spin_lock(&release_agent_path_lock);
- seq_puts(seq, cgrp->root->release_agent_path);
- spin_unlock(&release_agent_path_lock);
- seq_putc(seq, '\n');
- return 0;
-}
-
-static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
-{
- seq_puts(seq, "0\n");
- return 0;
-}
-
static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
{
struct cgroup_subsys *ss;
@@ -3075,8 +2506,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
*/
static int cgroup_update_dfl_csses(struct cgroup *cgrp)
{
- LIST_HEAD(preloaded_csets);
- struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
+ DEFINE_CGROUP_MGCTX(mgctx);
struct cgroup_subsys_state *d_css;
struct cgroup *dsct;
struct css_set *src_cset;
@@ -3092,33 +2522,28 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
struct cgrp_cset_link *link;
list_for_each_entry(link, &dsct->cset_links, cset_link)
- cgroup_migrate_add_src(link->cset, dsct,
- &preloaded_csets);
+ cgroup_migrate_add_src(link->cset, dsct, &mgctx);
}
spin_unlock_irq(&css_set_lock);
/* NULL dst indicates self on default hierarchy */
- ret = cgroup_migrate_prepare_dst(&preloaded_csets);
+ ret = cgroup_migrate_prepare_dst(&mgctx);
if (ret)
goto out_finish;
spin_lock_irq(&css_set_lock);
- list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
+ list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
struct task_struct *task, *ntask;
- /* src_csets precede dst_csets, break on the first dst_cset */
- if (!src_cset->mg_src_cgrp)
- break;
-
/* all tasks in src_csets need to be migrated */
list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
- cgroup_taskset_add(task, &tset);
+ cgroup_migrate_add_task(task, &mgctx);
}
spin_unlock_irq(&css_set_lock);
- ret = cgroup_taskset_migrate(&tset, cgrp->root);
+ ret = cgroup_migrate_execute(&mgctx);
out_finish:
- cgroup_migrate_finish(&preloaded_csets);
+ cgroup_migrate_finish(&mgctx);
percpu_up_write(&cgroup_threadgroup_rwsem);
return ret;
}
@@ -3131,7 +2556,7 @@ out_finish:
* controller while the previous css is still around. This function grabs
* cgroup_mutex and drains the previous css instances of @cgrp's subtree.
*/
-static void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
+void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
__acquires(&cgroup_mutex)
{
struct cgroup *dsct;
@@ -3503,6 +2928,23 @@ static int cgroup_events_show(struct seq_file *seq, void *v)
return 0;
}
+static int cgroup_file_open(struct kernfs_open_file *of)
+{
+ struct cftype *cft = of->kn->priv;
+
+ if (cft->open)
+ return cft->open(of);
+ return 0;
+}
+
+static void cgroup_file_release(struct kernfs_open_file *of)
+{
+ struct cftype *cft = of->kn->priv;
+
+ if (cft->release)
+ cft->release(of);
+}
+
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
@@ -3553,7 +2995,8 @@ static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
{
- seq_cft(seq)->seq_stop(seq, v);
+ if (seq_cft(seq)->seq_stop)
+ seq_cft(seq)->seq_stop(seq, v);
}
static int cgroup_seqfile_show(struct seq_file *m, void *arg)
@@ -3575,12 +3018,16 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
static struct kernfs_ops cgroup_kf_single_ops = {
.atomic_write_len = PAGE_SIZE,
+ .open = cgroup_file_open,
+ .release = cgroup_file_release,
.write = cgroup_file_write,
.seq_show = cgroup_seqfile_show,
};
static struct kernfs_ops cgroup_kf_ops = {
.atomic_write_len = PAGE_SIZE,
+ .open = cgroup_file_open,
+ .release = cgroup_file_release,
.write = cgroup_file_write,
.seq_start = cgroup_seqfile_start,
.seq_next = cgroup_seqfile_next,
@@ -3588,48 +3035,6 @@ static struct kernfs_ops cgroup_kf_ops = {
.seq_show = cgroup_seqfile_show,
};
-/*
- * cgroup_rename - Only allow simple rename of directories in place.
- */
-static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
- const char *new_name_str)
-{
- struct cgroup *cgrp = kn->priv;
- int ret;
-
- if (kernfs_type(kn) != KERNFS_DIR)
- return -ENOTDIR;
- if (kn->parent != new_parent)
- return -EIO;
-
- /*
- * This isn't a proper migration and its usefulness is very
- * limited. Disallow on the default hierarchy.
- */
- if (cgroup_on_dfl(cgrp))
- return -EPERM;
-
- /*
- * We're gonna grab cgroup_mutex which nests outside kernfs
- * active_ref. kernfs_rename() doesn't require active_ref
- * protection. Break them before grabbing cgroup_mutex.
- */
- kernfs_break_active_protection(new_parent);
- kernfs_break_active_protection(kn);
-
- mutex_lock(&cgroup_mutex);
-
- ret = kernfs_rename(kn, new_parent, new_name_str);
- if (!ret)
- trace_cgroup_rename(cgrp);
-
- mutex_unlock(&cgroup_mutex);
-
- kernfs_unbreak_active_protection(kn);
- kernfs_unbreak_active_protection(new_parent);
- return ret;
-}
-
/* set uid and gid of cgroup dirs and files to that of the creator */
static int cgroup_kn_set_ugid(struct kernfs_node *kn)
{
@@ -3926,26 +3331,6 @@ void cgroup_file_notify(struct cgroup_file *cfile)
}
/**
- * cgroup_task_count - count the number of tasks in a cgroup.
- * @cgrp: the cgroup in question
- *
- * Return the number of tasks in the cgroup. The returned number can be
- * higher than the actual number of tasks due to css_set references from
- * namespace roots and temporary usages.
- */
-static int cgroup_task_count(const struct cgroup *cgrp)
-{
- int count = 0;
- struct cgrp_cset_link *link;
-
- spin_lock_irq(&css_set_lock);
- list_for_each_entry(link, &cgrp->cset_links, cset_link)
- count += atomic_read(&link->cset->refcount);
- spin_unlock_irq(&css_set_lock);
- return count;
-}
-
-/**
* css_next_child - find the next child of a given css
* @pos: the current position (%NULL to initiate traversal)
* @parent: css whose children to walk
@@ -4343,560 +3728,69 @@ void css_task_iter_end(struct css_task_iter *it)
put_task_struct(it->cur_task);
}
-/**
- * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
- * @to: cgroup to which the tasks will be moved
- * @from: cgroup in which the tasks currently reside
- *
- * Locking rules between cgroup_post_fork() and the migration path
- * guarantee that, if a task is forking while being migrated, the new child
- * is guaranteed to be either visible in the source cgroup after the
- * parent's migration is complete or put into the target cgroup. No task
- * can slip out of migration through forking.
- */
-int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
-{
- LIST_HEAD(preloaded_csets);
- struct cgrp_cset_link *link;
- struct css_task_iter it;
- struct task_struct *task;
- int ret;
-
- if (!cgroup_may_migrate_to(to))
- return -EBUSY;
-
- mutex_lock(&cgroup_mutex);
-
- percpu_down_write(&cgroup_threadgroup_rwsem);
-
- /* all tasks in @from are being moved, all csets are source */
- spin_lock_irq(&css_set_lock);
- list_for_each_entry(link, &from->cset_links, cset_link)
- cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
- spin_unlock_irq(&css_set_lock);
-
- ret = cgroup_migrate_prepare_dst(&preloaded_csets);
- if (ret)
- goto out_err;
-
- /*
- * Migrate tasks one-by-one until @from is empty. This fails iff
- * ->can_attach() fails.
- */
- do {
- css_task_iter_start(&from->self, &it);
- task = css_task_iter_next(&it);
- if (task)
- get_task_struct(task);
- css_task_iter_end(&it);
-
- if (task) {
- ret = cgroup_migrate(task, false, to->root);
- if (!ret)
- trace_cgroup_transfer_tasks(to, task, false);
- put_task_struct(task);
- }
- } while (task && !ret);
-out_err:
- cgroup_migrate_finish(&preloaded_csets);
- percpu_up_write(&cgroup_threadgroup_rwsem);
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-
-/*
- * Stuff for reading the 'tasks'/'procs' files.
- *
- * Reading this file can return large amounts of data if a cgroup has
- * *lots* of attached tasks. So it may need several calls to read(),
- * but we cannot guarantee that the information we produce is correct
- * unless we produce it entirely atomically.
- *
- */
-
-/* which pidlist file are we talking about? */
-enum cgroup_filetype {
- CGROUP_FILE_PROCS,
- CGROUP_FILE_TASKS,
-};
-
-/*
- * A pidlist is a list of pids that virtually represents the contents of one
- * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
- * a pair (one each for procs, tasks) for each pid namespace that's relevant
- * to the cgroup.
- */
-struct cgroup_pidlist {
- /*
- * used to find which pidlist is wanted. doesn't change as long as
- * this particular list stays in the list.
- */
- struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
- /* array of xids */
- pid_t *list;
- /* how many elements the above list has */
- int length;
- /* each of these stored in a list by its cgroup */
- struct list_head links;
- /* pointer to the cgroup we belong to, for list removal purposes */
- struct cgroup *owner;
- /* for delayed destruction */
- struct delayed_work destroy_dwork;
-};
-
-/*
- * The following two functions "fix" the issue where there are more pids
- * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
- * TODO: replace with a kernel-wide solution to this problem
- */
-#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
-static void *pidlist_allocate(int count)
-{
- if (PIDLIST_TOO_LARGE(count))
- return vmalloc(count * sizeof(pid_t));
- else
- return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
-}
-
-static void pidlist_free(void *p)
-{
- kvfree(p);
-}
-
-/*
- * Used to destroy all pidlists lingering waiting for destroy timer. None
- * should be left afterwards.
- */
-static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
-{
- struct cgroup_pidlist *l, *tmp_l;
-
- mutex_lock(&cgrp->pidlist_mutex);
- list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
- mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
- mutex_unlock(&cgrp->pidlist_mutex);
-
- flush_workqueue(cgroup_pidlist_destroy_wq);
- BUG_ON(!list_empty(&cgrp->pidlists));
-}
-
-static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
-{
- struct delayed_work *dwork = to_delayed_work(work);
- struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
- destroy_dwork);
- struct cgroup_pidlist *tofree = NULL;
-
- mutex_lock(&l->owner->pidlist_mutex);
-
- /*
- * Destroy iff we didn't get queued again. The state won't change
- * as destroy_dwork can only be queued while locked.
- */
- if (!delayed_work_pending(dwork)) {
- list_del(&l->links);
- pidlist_free(l->list);
- put_pid_ns(l->key.ns);
- tofree = l;
- }
-
- mutex_unlock(&l->owner->pidlist_mutex);
- kfree(tofree);
-}
-
-/*
- * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
- * Returns the number of unique elements.
- */
-static int pidlist_uniq(pid_t *list, int length)
-{
- int src, dest = 1;
-
- /*
- * we presume the 0th element is unique, so i starts at 1. trivial
- * edge cases first; no work needs to be done for either
- */
- if (length == 0 || length == 1)
- return length;
- /* src and dest walk down the list; dest counts unique elements */
- for (src = 1; src < length; src++) {
- /* find next unique element */
- while (list[src] == list[src-1]) {
- src++;
- if (src == length)
- goto after;
- }
- /* dest always points to where the next unique element goes */
- list[dest] = list[src];
- dest++;
- }
-after:
- return dest;
-}
-
-/*
- * The two pid files - task and cgroup.procs - guaranteed that the result
- * is sorted, which forced this whole pidlist fiasco. As pid order is
- * different per namespace, each namespace needs differently sorted list,
- * making it impossible to use, for example, single rbtree of member tasks
- * sorted by task pointer. As pidlists can be fairly large, allocating one
- * per open file is dangerous, so cgroup had to implement shared pool of
- * pidlists keyed by cgroup and namespace.
- *
- * All this extra complexity was caused by the original implementation
- * committing to an entirely unnecessary property. In the long term, we
- * want to do away with it. Explicitly scramble sort order if on the
- * default hierarchy so that no such expectation exists in the new
- * interface.
- *
- * Scrambling is done by swapping every two consecutive bits, which is
- * non-identity one-to-one mapping which disturbs sort order sufficiently.
- */
-static pid_t pid_fry(pid_t pid)
+static void cgroup_procs_release(struct kernfs_open_file *of)
{
- unsigned a = pid & 0x55555555;
- unsigned b = pid & 0xAAAAAAAA;
-
- return (a << 1) | (b >> 1);
-}
-
-static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
-{
- if (cgroup_on_dfl(cgrp))
- return pid_fry(pid);
- else
- return pid;
-}
-
-static int cmppid(const void *a, const void *b)
-{
- return *(pid_t *)a - *(pid_t *)b;
-}
-
-static int fried_cmppid(const void *a, const void *b)
-{
- return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
-}
-
-static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
- enum cgroup_filetype type)
-{
- struct cgroup_pidlist *l;
- /* don't need task_nsproxy() if we're looking at ourself */
- struct pid_namespace *ns = task_active_pid_ns(current);
-
- lockdep_assert_held(&cgrp->pidlist_mutex);
-
- list_for_each_entry(l, &cgrp->pidlists, links)
- if (l->key.type == type && l->key.ns == ns)
- return l;
- return NULL;
-}
-
-/*
- * find the appropriate pidlist for our purpose (given procs vs tasks)
- * returns with the lock on that pidlist already held, and takes care
- * of the use count, or returns NULL with no locks held if we're out of
- * memory.
- */
-static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
- enum cgroup_filetype type)
-{
- struct cgroup_pidlist *l;
-
- lockdep_assert_held(&cgrp->pidlist_mutex);
-
- l = cgroup_pidlist_find(cgrp, type);
- if (l)
- return l;
-
- /* entry not found; create a new one */
- l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
- if (!l)
- return l;
-
- INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
- l->key.type = type;
- /* don't need task_nsproxy() if we're looking at ourself */
- l->key.ns = get_pid_ns(task_active_pid_ns(current));
- l->owner = cgrp;
- list_add(&l->links, &cgrp->pidlists);
- return l;
-}
-
-/*
- * Load a cgroup's pidarray with either procs' tgids or tasks' pids
- */
-static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
- struct cgroup_pidlist **lp)
-{
- pid_t *array;
- int length;
- int pid, n = 0; /* used for populating the array */
- struct css_task_iter it;
- struct task_struct *tsk;
- struct cgroup_pidlist *l;
-
- lockdep_assert_held(&cgrp->pidlist_mutex);
-
- /*
- * If cgroup gets more users after we read count, we won't have
- * enough space - tough. This race is indistinguishable to the
- * caller from the case that the additional cgroup users didn't
- * show up until sometime later on.
- */
- length = cgroup_task_count(cgrp);
- array = pidlist_allocate(length);
- if (!array)
- return -ENOMEM;
- /* now, populate the array */
- css_task_iter_start(&cgrp->self, &it);
- while ((tsk = css_task_iter_next(&it))) {
- if (unlikely(n == length))
- break;
- /* get tgid or pid for procs or tasks file respectively */
- if (type == CGROUP_FILE_PROCS)
- pid = task_tgid_vnr(tsk);
- else
- pid = task_pid_vnr(tsk);
- if (pid > 0) /* make sure to only use valid results */
- array[n++] = pid;
- }
- css_task_iter_end(&it);
- length = n;
- /* now sort & (if procs) strip out duplicates */
- if (cgroup_on_dfl(cgrp))
- sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
- else
- sort(array, length, sizeof(pid_t), cmppid, NULL);
- if (type == CGROUP_FILE_PROCS)
- length = pidlist_uniq(array, length);
-
- l = cgroup_pidlist_find_create(cgrp, type);
- if (!l) {
- pidlist_free(array);
- return -ENOMEM;
+ if (of->priv) {
+ css_task_iter_end(of->priv);
+ kfree(of->priv);
}
-
- /* store array, freeing old if necessary */
- pidlist_free(l->list);
- l->list = array;
- l->length = length;
- *lp = l;
- return 0;
}
-/**
- * cgroupstats_build - build and fill cgroupstats
- * @stats: cgroupstats to fill information into
- * @dentry: A dentry entry belonging to the cgroup for which stats have
- * been requested.
- *
- * Build and fill cgroupstats so that taskstats can export it to user
- * space.
- */
-int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
+static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
{
- struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
- struct cgroup *cgrp;
- struct css_task_iter it;
- struct task_struct *tsk;
-
- /* it should be kernfs_node belonging to cgroupfs and is a directory */
- if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
- kernfs_type(kn) != KERNFS_DIR)
- return -EINVAL;
-
- mutex_lock(&cgroup_mutex);
-
- /*
- * We aren't being called from kernfs and there's no guarantee on
- * @kn->priv's validity. For this and css_tryget_online_from_dir(),
- * @kn->priv is RCU safe. Let's do the RCU dancing.
- */
- rcu_read_lock();
- cgrp = rcu_dereference(kn->priv);
- if (!cgrp || cgroup_is_dead(cgrp)) {
- rcu_read_unlock();
- mutex_unlock(&cgroup_mutex);
- return -ENOENT;
- }
- rcu_read_unlock();
+ struct kernfs_open_file *of = s->private;
+ struct css_task_iter *it = of->priv;
+ struct task_struct *task;
- css_task_iter_start(&cgrp->self, &it);
- while ((tsk = css_task_iter_next(&it))) {
- switch (tsk->state) {
- case TASK_RUNNING:
- stats->nr_running++;
- break;
- case TASK_INTERRUPTIBLE:
- stats->nr_sleeping++;
- break;
- case TASK_UNINTERRUPTIBLE:
- stats->nr_uninterruptible++;
- break;
- case TASK_STOPPED:
- stats->nr_stopped++;
- break;
- default:
- if (delayacct_is_task_waiting_on_io(tsk))
- stats->nr_io_wait++;
- break;
- }
- }
- css_task_iter_end(&it);
+ do {
+ task = css_task_iter_next(it);
+ } while (task && !thread_group_leader(task));
- mutex_unlock(&cgroup_mutex);
- return 0;
+ return task;
}
-
-/*
- * seq_file methods for the tasks/procs files. The seq_file position is the
- * next pid to display; the seq_file iterator is a pointer to the pid
- * in the cgroup->l->list array.
- */
-
-static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
+static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
{
- /*
- * Initially we receive a position value that corresponds to
- * one more than the last pid shown (or 0 on the first call or
- * after a seek to the start). Use a binary-search to find the
- * next pid to display, if any
- */
struct kernfs_open_file *of = s->private;
struct cgroup *cgrp = seq_css(s)->cgroup;
- struct cgroup_pidlist *l;
- enum cgroup_filetype type = seq_cft(s)->private;
- int index = 0, pid = *pos;
- int *iter, ret;
-
- mutex_lock(&cgrp->pidlist_mutex);
+ struct css_task_iter *it = of->priv;
/*
- * !NULL @of->priv indicates that this isn't the first start()
- * after open. If the matching pidlist is around, we can use that.
- * Look for it. Note that @of->priv can't be used directly. It
- * could already have been destroyed.
+ * When a seq_file is seeked, it's always traversed sequentially
+ * from position 0, so we can simply keep iterating on !0 *pos.
*/
- if (of->priv)
- of->priv = cgroup_pidlist_find(cgrp, type);
-
- /*
- * Either this is the first start() after open or the matching
- * pidlist has been destroyed inbetween. Create a new one.
- */
- if (!of->priv) {
- ret = pidlist_array_load(cgrp, type,
- (struct cgroup_pidlist **)&of->priv);
- if (ret)
- return ERR_PTR(ret);
- }
- l = of->priv;
-
- if (pid) {
- int end = l->length;
-
- while (index < end) {
- int mid = (index + end) / 2;
- if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
- index = mid;
- break;
- } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
- index = mid + 1;
- else
- end = mid;
- }
- }
- /* If we're off the end of the array, we're done */
- if (index >= l->length)
- return NULL;
- /* Update the abstract position to be the actual pid that we found */
- iter = l->list + index;
- *pos = cgroup_pid_fry(cgrp, *iter);
- return iter;
-}
-
-static void cgroup_pidlist_stop(struct seq_file *s, void *v)
-{
- struct kernfs_open_file *of = s->private;
- struct cgroup_pidlist *l = of->priv;
-
- if (l)
- mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
- CGROUP_PIDLIST_DESTROY_DELAY);
- mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
-}
+ if (!it) {
+ if (WARN_ON_ONCE((*pos)++))
+ return ERR_PTR(-EINVAL);
-static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
-{
- struct kernfs_open_file *of = s->private;
- struct cgroup_pidlist *l = of->priv;
- pid_t *p = v;
- pid_t *end = l->list + l->length;
- /*
- * Advance to the next pid in the array. If this goes off the
- * end, we're done
- */
- p++;
- if (p >= end) {
- return NULL;
- } else {
- *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
- return p;
+ it = kzalloc(sizeof(*it), GFP_KERNEL);
+ if (!it)
+ return ERR_PTR(-ENOMEM);
+ of->priv = it;
+ css_task_iter_start(&cgrp->self, it);
+ } else if (!(*pos)++) {
+ css_task_iter_end(it);
+ css_task_iter_start(&cgrp->self, it);
}
-}
-
-static int cgroup_pidlist_show(struct seq_file *s, void *v)
-{
- seq_printf(s, "%d\n", *(int *)v);
- return 0;
+ return cgroup_procs_next(s, NULL, NULL);
}
-static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
- struct cftype *cft)
+static int cgroup_procs_show(struct seq_file *s, void *v)
{
- return notify_on_release(css->cgroup);
-}
-
-static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
- struct cftype *cft, u64 val)
-{
- if (val)
- set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
- else
- clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
- return 0;
-}
-
-static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
-}
-
-static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
- struct cftype *cft, u64 val)
-{
- if (val)
- set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
- else
- clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+ seq_printf(s, "%d\n", task_tgid_vnr(v));
return 0;
}
/* cgroup core interface files for the default hierarchy */
-static struct cftype cgroup_dfl_base_files[] = {
+static struct cftype cgroup_base_files[] = {
{
.name = "cgroup.procs",
.file_offset = offsetof(struct cgroup, procs_file),
- .seq_start = cgroup_pidlist_start,
- .seq_next = cgroup_pidlist_next,
- .seq_stop = cgroup_pidlist_stop,
- .seq_show = cgroup_pidlist_show,
- .private = CGROUP_FILE_PROCS,
+ .release = cgroup_procs_release,
+ .seq_start = cgroup_procs_start,
+ .seq_next = cgroup_procs_next,
+ .seq_show = cgroup_procs_show,
.write = cgroup_procs_write,
},
{
@@ -4917,51 +3811,6 @@ static struct cftype cgroup_dfl_base_files[] = {
{ } /* terminate */
};
-/* cgroup core interface files for the legacy hierarchies */
-static struct cftype cgroup_legacy_base_files[] = {
- {
- .name = "cgroup.procs",
- .seq_start = cgroup_pidlist_start,
- .seq_next = cgroup_pidlist_next,
- .seq_stop = cgroup_pidlist_stop,
- .seq_show = cgroup_pidlist_show,
- .private = CGROUP_FILE_PROCS,
- .write = cgroup_procs_write,
- },
- {
- .name = "cgroup.clone_children",
- .read_u64 = cgroup_clone_children_read,
- .write_u64 = cgroup_clone_children_write,
- },
- {
- .name = "cgroup.sane_behavior",
- .flags = CFTYPE_ONLY_ON_ROOT,
- .seq_show = cgroup_sane_behavior_show,
- },
- {
- .name = "tasks",
- .seq_start = cgroup_pidlist_start,
- .seq_next = cgroup_pidlist_next,
- .seq_stop = cgroup_pidlist_stop,
- .seq_show = cgroup_pidlist_show,
- .private = CGROUP_FILE_TASKS,
- .write = cgroup_tasks_write,
- },
- {
- .name = "notify_on_release",
- .read_u64 = cgroup_read_notify_on_release,
- .write_u64 = cgroup_write_notify_on_release,
- },
- {
- .name = "release_agent",
- .flags = CFTYPE_ONLY_ON_ROOT,
- .seq_show = cgroup_release_agent_show,
- .write = cgroup_release_agent_write,
- .max_write_len = PATH_MAX - 1,
- },
- { } /* terminate */
-};
-
/*
* css destruction is four-stage process.
*
@@ -5007,7 +3856,7 @@ static void css_free_work_fn(struct work_struct *work)
} else {
/* cgroup free path */
atomic_dec(&cgrp->root->nr_cgrps);
- cgroup_pidlist_destroy_all(cgrp);
+ cgroup1_pidlist_destroy_all(cgrp);
cancel_work_sync(&cgrp->release_agent_work);
if (cgroup_parent(cgrp)) {
@@ -5302,8 +4151,7 @@ out_free_cgrp:
return ERR_PTR(ret);
}
-static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
- umode_t mode)
+int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
{
struct cgroup *parent, *cgrp;
struct kernfs_node *kn;
@@ -5507,7 +4355,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
*/
kernfs_remove(cgrp->kn);
- check_for_release(cgroup_parent(cgrp));
+ cgroup1_check_for_release(cgroup_parent(cgrp));
/* put the base reference */
percpu_ref_kill(&cgrp->self.refcnt);
@@ -5515,7 +4363,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
return 0;
};
-static int cgroup_rmdir(struct kernfs_node *kn)
+int cgroup_rmdir(struct kernfs_node *kn)
{
struct cgroup *cgrp;
int ret = 0;
@@ -5535,10 +4383,8 @@ static int cgroup_rmdir(struct kernfs_node *kn)
static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
.remount_fs = cgroup_remount,
- .show_options = cgroup_show_options,
.mkdir = cgroup_mkdir,
.rmdir = cgroup_rmdir,
- .rename = cgroup_rename,
.show_path = cgroup_show_path,
};
@@ -5646,8 +4492,8 @@ int __init cgroup_init(void)
BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
- BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
- BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
/*
* The latency of the synchronize_sched() is too high for cgroups,
@@ -5697,7 +4543,7 @@ int __init cgroup_init(void)
continue;
}
- if (cgroup_ssid_no_v1(ssid))
+ if (cgroup1_ssid_disabled(ssid))
printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
ss->name);
@@ -5744,15 +4590,6 @@ static int __init cgroup_wq_init(void)
*/
cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
BUG_ON(!cgroup_destroy_wq);
-
- /*
- * Used to destroy pidlists and separate to serve as flush domain.
- * Cap @max_active to 1 too.
- */
- cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
- 0, 1);
- BUG_ON(!cgroup_pidlist_destroy_wq);
-
return 0;
}
core_initcall(cgroup_wq_init);
@@ -5835,42 +4672,6 @@ out:
return retval;
}
-/* Display information about each subsystem and each hierarchy */
-static int proc_cgroupstats_show(struct seq_file *m, void *v)
-{
- struct cgroup_subsys *ss;
- int i;
-
- seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
- /*
- * ideally we don't want subsystems moving around while we do this.
- * cgroup_mutex is also necessary to guarantee an atomic snapshot of
- * subsys/hierarchy state.
- */
- mutex_lock(&cgroup_mutex);
-
- for_each_subsys(ss, i)
- seq_printf(m, "%s\t%d\t%d\t%d\n",
- ss->legacy_name, ss->root->hierarchy_id,
- atomic_read(&ss->root->nr_cgrps),
- cgroup_ssid_enabled(i));
-
- mutex_unlock(&cgroup_mutex);
- return 0;
-}
-
-static int cgroupstats_open(struct inode *inode, struct file *file)
-{
- return single_open(file, proc_cgroupstats_show, NULL);
-}
-
-static const struct file_operations proc_cgroupstats_operations = {
- .open = cgroupstats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
/**
* cgroup_fork - initialize cgroup related fields during copy_process()
* @child: pointer to task_struct of forking parent process.
@@ -6050,76 +4851,6 @@ void cgroup_free(struct task_struct *task)
put_css_set(cset);
}
-static void check_for_release(struct cgroup *cgrp)
-{
- if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
- !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
- schedule_work(&cgrp->release_agent_work);
-}
-
-/*
- * Notify userspace when a cgroup is released, by running the
- * configured release agent with the name of the cgroup (path
- * relative to the root of cgroup file system) as the argument.
- *
- * Most likely, this user command will try to rmdir this cgroup.
- *
- * This races with the possibility that some other task will be
- * attached to this cgroup before it is removed, or that some other
- * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
- * The presumed 'rmdir' will fail quietly if this cgroup is no longer
- * unused, and this cgroup will be reprieved from its death sentence,
- * to continue to serve a useful existence. Next time it's released,
- * we will get notified again, if it still has 'notify_on_release' set.
- *
- * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
- * means only wait until the task is successfully execve()'d. The
- * separate release agent task is forked by call_usermodehelper(),
- * then control in this thread returns here, without waiting for the
- * release agent task. We don't bother to wait because the caller of
- * this routine has no use for the exit status of the release agent
- * task, so no sense holding our caller up for that.
- */
-static void cgroup_release_agent(struct work_struct *work)
-{
- struct cgroup *cgrp =
- container_of(work, struct cgroup, release_agent_work);
- char *pathbuf = NULL, *agentbuf = NULL;
- char *argv[3], *envp[3];
- int ret;
-
- mutex_lock(&cgroup_mutex);
-
- pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
- agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
- if (!pathbuf || !agentbuf)
- goto out;
-
- spin_lock_irq(&css_set_lock);
- ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
- spin_unlock_irq(&css_set_lock);
- if (ret < 0 || ret >= PATH_MAX)
- goto out;
-
- argv[0] = agentbuf;
- argv[1] = pathbuf;
- argv[2] = NULL;
-
- /* minimal command environment */
- envp[0] = "HOME=/";
- envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
- envp[2] = NULL;
-
- mutex_unlock(&cgroup_mutex);
- call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
- goto out_free;
-out:
- mutex_unlock(&cgroup_mutex);
-out_free:
- kfree(agentbuf);
- kfree(pathbuf);
-}
-
static int __init cgroup_disable(char *str)
{
struct cgroup_subsys *ss;
@@ -6141,33 +4872,6 @@ static int __init cgroup_disable(char *str)
}
__setup("cgroup_disable=", cgroup_disable);
-static int __init cgroup_no_v1(char *str)
-{
- struct cgroup_subsys *ss;
- char *token;
- int i;
-
- while ((token = strsep(&str, ",")) != NULL) {
- if (!*token)
- continue;
-
- if (!strcmp(token, "all")) {
- cgroup_no_v1_mask = U16_MAX;
- break;
- }
-
- for_each_subsys(ss, i) {
- if (strcmp(token, ss->name) &&
- strcmp(token, ss->legacy_name))
- continue;
-
- cgroup_no_v1_mask |= 1 << i;
- }
- }
- return 1;
-}
-__setup("cgroup_no_v1=", cgroup_no_v1);
-
/**
* css_tryget_online_from_dir - get corresponding css from a cgroup dentry
* @dentry: directory dentry of interest
@@ -6197,7 +4901,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
* have been or be removed at any point. @kn->priv is RCU
* protected for this access. See css_release_work_fn() for details.
*/
- cgrp = rcu_dereference(kn->priv);
+ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
if (cgrp)
css = cgroup_css(cgrp, ss);
@@ -6349,154 +5053,6 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
#endif /* CONFIG_SOCK_CGROUP_DATA */
-/* cgroup namespaces */
-
-static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
-{
- return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
-}
-
-static void dec_cgroup_namespaces(struct ucounts *ucounts)
-{
- dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
-}
-
-static struct cgroup_namespace *alloc_cgroup_ns(void)
-{
- struct cgroup_namespace *new_ns;
- int ret;
-
- new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
- if (!new_ns)
- return ERR_PTR(-ENOMEM);
- ret = ns_alloc_inum(&new_ns->ns);
- if (ret) {
- kfree(new_ns);
- return ERR_PTR(ret);
- }
- atomic_set(&new_ns->count, 1);
- new_ns->ns.ops = &cgroupns_operations;
- return new_ns;
-}
-
-void free_cgroup_ns(struct cgroup_namespace *ns)
-{
- put_css_set(ns->root_cset);
- dec_cgroup_namespaces(ns->ucounts);
- put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
- kfree(ns);
-}
-EXPORT_SYMBOL(free_cgroup_ns);
-
-struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
- struct user_namespace *user_ns,
- struct cgroup_namespace *old_ns)
-{
- struct cgroup_namespace *new_ns;
- struct ucounts *ucounts;
- struct css_set *cset;
-
- BUG_ON(!old_ns);
-
- if (!(flags & CLONE_NEWCGROUP)) {
- get_cgroup_ns(old_ns);
- return old_ns;
- }
-
- /* Allow only sysadmin to create cgroup namespace. */
- if (!ns_capable(user_ns, CAP_SYS_ADMIN))
- return ERR_PTR(-EPERM);
-
- ucounts = inc_cgroup_namespaces(user_ns);
- if (!ucounts)
- return ERR_PTR(-ENOSPC);
-
- /* It is not safe to take cgroup_mutex here */
- spin_lock_irq(&css_set_lock);
- cset = task_css_set(current);
- get_css_set(cset);
- spin_unlock_irq(&css_set_lock);
-
- new_ns = alloc_cgroup_ns();
- if (IS_ERR(new_ns)) {
- put_css_set(cset);
- dec_cgroup_namespaces(ucounts);
- return new_ns;
- }
-
- new_ns->user_ns = get_user_ns(user_ns);
- new_ns->ucounts = ucounts;
- new_ns->root_cset = cset;
-
- return new_ns;
-}
-
-static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
-{
- return container_of(ns, struct cgroup_namespace, ns);
-}
-
-static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
-{
- struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
-
- if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
- !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
- return -EPERM;
-
- /* Don't need to do anything if we are attaching to our own cgroupns. */
- if (cgroup_ns == nsproxy->cgroup_ns)
- return 0;
-
- get_cgroup_ns(cgroup_ns);
- put_cgroup_ns(nsproxy->cgroup_ns);
- nsproxy->cgroup_ns = cgroup_ns;
-
- return 0;
-}
-
-static struct ns_common *cgroupns_get(struct task_struct *task)
-{
- struct cgroup_namespace *ns = NULL;
- struct nsproxy *nsproxy;
-
- task_lock(task);
- nsproxy = task->nsproxy;
- if (nsproxy) {
- ns = nsproxy->cgroup_ns;
- get_cgroup_ns(ns);
- }
- task_unlock(task);
-
- return ns ? &ns->ns : NULL;
-}
-
-static void cgroupns_put(struct ns_common *ns)
-{
- put_cgroup_ns(to_cg_ns(ns));
-}
-
-static struct user_namespace *cgroupns_owner(struct ns_common *ns)
-{
- return to_cg_ns(ns)->user_ns;
-}
-
-const struct proc_ns_operations cgroupns_operations = {
- .name = "cgroup",
- .type = CLONE_NEWCGROUP,
- .get = cgroupns_get,
- .put = cgroupns_put,
- .install = cgroupns_install,
- .owner = cgroupns_owner,
-};
-
-static __init int cgroup_namespaces_init(void)
-{
- return 0;
-}
-subsys_initcall(cgroup_namespaces_init);
-
#ifdef CONFIG_CGROUP_BPF
int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
enum bpf_attach_type type, bool overridable)
@@ -6510,149 +5066,3 @@ int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
return ret;
}
#endif /* CONFIG_CGROUP_BPF */
-
-#ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *
-debug_css_alloc(struct cgroup_subsys_state *parent_css)
-{
- struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
-
- if (!css)
- return ERR_PTR(-ENOMEM);
-
- return css;
-}
-
-static void debug_css_free(struct cgroup_subsys_state *css)
-{
- kfree(css);
-}
-
-static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return cgroup_task_count(css->cgroup);
-}
-
-static u64 current_css_set_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return (u64)(unsigned long)current->cgroups;
-}
-
-static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- u64 count;
-
- rcu_read_lock();
- count = atomic_read(&task_css_set(current)->refcount);
- rcu_read_unlock();
- return count;
-}
-
-static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
-{
- struct cgrp_cset_link *link;
- struct css_set *cset;
- char *name_buf;
-
- name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
- if (!name_buf)
- return -ENOMEM;
-
- spin_lock_irq(&css_set_lock);
- rcu_read_lock();
- cset = rcu_dereference(current->cgroups);
- list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
- struct cgroup *c = link->cgrp;
-
- cgroup_name(c, name_buf, NAME_MAX + 1);
- seq_printf(seq, "Root %d group %s\n",
- c->root->hierarchy_id, name_buf);
- }
- rcu_read_unlock();
- spin_unlock_irq(&css_set_lock);
- kfree(name_buf);
- return 0;
-}
-
-#define MAX_TASKS_SHOWN_PER_CSS 25
-static int cgroup_css_links_read(struct seq_file *seq, void *v)
-{
- struct cgroup_subsys_state *css = seq_css(seq);
- struct cgrp_cset_link *link;
-
- spin_lock_irq(&css_set_lock);
- list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
- struct css_set *cset = link->cset;
- struct task_struct *task;
- int count = 0;
-
- seq_printf(seq, "css_set %p\n", cset);
-
- list_for_each_entry(task, &cset->tasks, cg_list) {
- if (count++ > MAX_TASKS_SHOWN_PER_CSS)
- goto overflow;
- seq_printf(seq, " task %d\n", task_pid_vnr(task));
- }
-
- list_for_each_entry(task, &cset->mg_tasks, cg_list) {
- if (count++ > MAX_TASKS_SHOWN_PER_CSS)
- goto overflow;
- seq_printf(seq, " task %d\n", task_pid_vnr(task));
- }
- continue;
- overflow:
- seq_puts(seq, " ...\n");
- }
- spin_unlock_irq(&css_set_lock);
- return 0;
-}
-
-static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
-{
- return (!cgroup_is_populated(css->cgroup) &&
- !css_has_online_children(&css->cgroup->self));
-}
-
-static struct cftype debug_files[] = {
- {
- .name = "taskcount",
- .read_u64 = debug_taskcount_read,
- },
-
- {
- .name = "current_css_set",
- .read_u64 = current_css_set_read,
- },
-
- {
- .name = "current_css_set_refcount",
- .read_u64 = current_css_set_refcount_read,
- },
-
- {
- .name = "current_css_set_cg_links",
- .seq_show = current_css_set_cg_links_read,
- },
-
- {
- .name = "cgroup_css_links",
- .seq_show = cgroup_css_links_read,
- },
-
- {
- .name = "releasable",
- .read_u64 = releasable_read,
- },
-
- { } /* terminate */
-};
-
-struct cgroup_subsys debug_cgrp_subsys = {
- .css_alloc = debug_css_alloc,
- .css_free = debug_css_free,
- .legacy_cftypes = debug_files,
-};
-#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cpuset.c b/kernel/cgroup/cpuset.c
index b3088886cd37..0f41292be0fb 100644
--- a/kernel/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -44,6 +44,8 @@
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/task.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup/freezer.c
index 1b72d56edce5..1b72d56edce5 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup/freezer.c
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
new file mode 100644
index 000000000000..96d38dab6fb2
--- /dev/null
+++ b/kernel/cgroup/namespace.c
@@ -0,0 +1,155 @@
+#include "cgroup-internal.h"
+
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+#include <linux/nsproxy.h>
+#include <linux/proc_ns.h>
+
+
+/* cgroup namespaces */
+
+static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
+}
+
+static void dec_cgroup_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
+}
+
+static struct cgroup_namespace *alloc_cgroup_ns(void)
+{
+ struct cgroup_namespace *new_ns;
+ int ret;
+
+ new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
+ if (!new_ns)
+ return ERR_PTR(-ENOMEM);
+ ret = ns_alloc_inum(&new_ns->ns);
+ if (ret) {
+ kfree(new_ns);
+ return ERR_PTR(ret);
+ }
+ atomic_set(&new_ns->count, 1);
+ new_ns->ns.ops = &cgroupns_operations;
+ return new_ns;
+}
+
+void free_cgroup_ns(struct cgroup_namespace *ns)
+{
+ put_css_set(ns->root_cset);
+ dec_cgroup_namespaces(ns->ucounts);
+ put_user_ns(ns->user_ns);
+ ns_free_inum(&ns->ns);
+ kfree(ns);
+}
+EXPORT_SYMBOL(free_cgroup_ns);
+
+struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
+ struct user_namespace *user_ns,
+ struct cgroup_namespace *old_ns)
+{
+ struct cgroup_namespace *new_ns;
+ struct ucounts *ucounts;
+ struct css_set *cset;
+
+ BUG_ON(!old_ns);
+
+ if (!(flags & CLONE_NEWCGROUP)) {
+ get_cgroup_ns(old_ns);
+ return old_ns;
+ }
+
+ /* Allow only sysadmin to create cgroup namespace. */
+ if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ ucounts = inc_cgroup_namespaces(user_ns);
+ if (!ucounts)
+ return ERR_PTR(-ENOSPC);
+
+ /* It is not safe to take cgroup_mutex here */
+ spin_lock_irq(&css_set_lock);
+ cset = task_css_set(current);
+ get_css_set(cset);
+ spin_unlock_irq(&css_set_lock);
+
+ new_ns = alloc_cgroup_ns();
+ if (IS_ERR(new_ns)) {
+ put_css_set(cset);
+ dec_cgroup_namespaces(ucounts);
+ return new_ns;
+ }
+
+ new_ns->user_ns = get_user_ns(user_ns);
+ new_ns->ucounts = ucounts;
+ new_ns->root_cset = cset;
+
+ return new_ns;
+}
+
+static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct cgroup_namespace, ns);
+}
+
+static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+{
+ struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
+
+ if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
+ !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* Don't need to do anything if we are attaching to our own cgroupns. */
+ if (cgroup_ns == nsproxy->cgroup_ns)
+ return 0;
+
+ get_cgroup_ns(cgroup_ns);
+ put_cgroup_ns(nsproxy->cgroup_ns);
+ nsproxy->cgroup_ns = cgroup_ns;
+
+ return 0;
+}
+
+static struct ns_common *cgroupns_get(struct task_struct *task)
+{
+ struct cgroup_namespace *ns = NULL;
+ struct nsproxy *nsproxy;
+
+ task_lock(task);
+ nsproxy = task->nsproxy;
+ if (nsproxy) {
+ ns = nsproxy->cgroup_ns;
+ get_cgroup_ns(ns);
+ }
+ task_unlock(task);
+
+ return ns ? &ns->ns : NULL;
+}
+
+static void cgroupns_put(struct ns_common *ns)
+{
+ put_cgroup_ns(to_cg_ns(ns));
+}
+
+static struct user_namespace *cgroupns_owner(struct ns_common *ns)
+{
+ return to_cg_ns(ns)->user_ns;
+}
+
+const struct proc_ns_operations cgroupns_operations = {
+ .name = "cgroup",
+ .type = CLONE_NEWCGROUP,
+ .get = cgroupns_get,
+ .put = cgroupns_put,
+ .install = cgroupns_install,
+ .owner = cgroupns_owner,
+};
+
+static __init int cgroup_namespaces_init(void)
+{
+ return 0;
+}
+subsys_initcall(cgroup_namespaces_init);
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup/pids.c
index 2bd673783f1a..e756dae49300 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup/pids.c
@@ -214,7 +214,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
/*
* task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
- * on threadgroup_change_begin() held by the copy_process().
+ * on cgroup_threadgroup_change_begin() held by the copy_process().
*/
static int pids_can_fork(struct task_struct *task)
{
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
new file mode 100644
index 000000000000..defad3c5e7dc
--- /dev/null
+++ b/kernel/cgroup/rdma.c
@@ -0,0 +1,619 @@
+/*
+ * RDMA resource limiting controller for cgroups.
+ *
+ * Used to allow a cgroup hierarchy to stop processes from consuming
+ * additional RDMA resources after a certain limit is reached.
+ *
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/cgroup.h>
+#include <linux/parser.h>
+#include <linux/cgroup_rdma.h>
+
+#define RDMACG_MAX_STR "max"
+
+/*
+ * Protects list of resource pools maintained on per cgroup basis
+ * and rdma device list.
+ */
+static DEFINE_MUTEX(rdmacg_mutex);
+static LIST_HEAD(rdmacg_devices);
+
+enum rdmacg_file_type {
+ RDMACG_RESOURCE_TYPE_MAX,
+ RDMACG_RESOURCE_TYPE_STAT,
+};
+
+/*
+ * resource table definition as to be seen by the user.
+ * Need to add entries to it when more resources are
+ * added/defined at IB verb/core layer.
+ */
+static char const *rdmacg_resource_names[] = {
+ [RDMACG_RESOURCE_HCA_HANDLE] = "hca_handle",
+ [RDMACG_RESOURCE_HCA_OBJECT] = "hca_object",
+};
+
+/* resource tracker for each resource of rdma cgroup */
+struct rdmacg_resource {
+ int max;
+ int usage;
+};
+
+/*
+ * resource pool object which represents per cgroup, per device
+ * resources. There are multiple instances of this object per cgroup,
+ * therefore it cannot be embedded within rdma_cgroup structure. It
+ * is maintained as list.
+ */
+struct rdmacg_resource_pool {
+ struct rdmacg_device *device;
+ struct rdmacg_resource resources[RDMACG_RESOURCE_MAX];
+
+ struct list_head cg_node;
+ struct list_head dev_node;
+
+ /* count active user tasks of this pool */
+ u64 usage_sum;
+ /* total number counts which are set to max */
+ int num_max_cnt;
+};
+
+static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
+{
+ return container_of(css, struct rdma_cgroup, css);
+}
+
+static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
+{
+ return css_rdmacg(cg->css.parent);
+}
+
+static inline struct rdma_cgroup *get_current_rdmacg(void)
+{
+ return css_rdmacg(task_get_css(current, rdma_cgrp_id));
+}
+
+static void set_resource_limit(struct rdmacg_resource_pool *rpool,
+ int index, int new_max)
+{
+ if (new_max == S32_MAX) {
+ if (rpool->resources[index].max != S32_MAX)
+ rpool->num_max_cnt++;
+ } else {
+ if (rpool->resources[index].max == S32_MAX)
+ rpool->num_max_cnt--;
+ }
+ rpool->resources[index].max = new_max;
+}
+
+static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
+{
+ int i;
+
+ for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
+ set_resource_limit(rpool, i, S32_MAX);
+}
+
+static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
+{
+ lockdep_assert_held(&rdmacg_mutex);
+
+ list_del(&rpool->cg_node);
+ list_del(&rpool->dev_node);
+ kfree(rpool);
+}
+
+static struct rdmacg_resource_pool *
+find_cg_rpool_locked(struct rdma_cgroup *cg,
+ struct rdmacg_device *device)
+
+{
+ struct rdmacg_resource_pool *pool;
+
+ lockdep_assert_held(&rdmacg_mutex);
+
+ list_for_each_entry(pool, &cg->rpools, cg_node)
+ if (pool->device == device)
+ return pool;
+
+ return NULL;
+}
+
+static struct rdmacg_resource_pool *
+get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
+{
+ struct rdmacg_resource_pool *rpool;
+
+ rpool = find_cg_rpool_locked(cg, device);
+ if (rpool)
+ return rpool;
+
+ rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
+ if (!rpool)
+ return ERR_PTR(-ENOMEM);
+
+ rpool->device = device;
+ set_all_resource_max_limit(rpool);
+
+ INIT_LIST_HEAD(&rpool->cg_node);
+ INIT_LIST_HEAD(&rpool->dev_node);
+ list_add_tail(&rpool->cg_node, &cg->rpools);
+ list_add_tail(&rpool->dev_node, &device->rpools);
+ return rpool;
+}
+
+/**
+ * uncharge_cg_locked - uncharge resource for rdma cgroup
+ * @cg: pointer to cg to uncharge and all parents in hierarchy
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to uncharge in cg (resource pool)
+ *
+ * It also frees the resource pool which was created as part of
+ * charging operation when there are no resources attached to
+ * resource pool.
+ */
+static void
+uncharge_cg_locked(struct rdma_cgroup *cg,
+ struct rdmacg_device *device,
+ enum rdmacg_resource_type index)
+{
+ struct rdmacg_resource_pool *rpool;
+
+ rpool = find_cg_rpool_locked(cg, device);
+
+ /*
+ * rpool cannot be null at this stage. Let kernel operate in case
+ * if there a bug in IB stack or rdma controller, instead of crashing
+ * the system.
+ */
+ if (unlikely(!rpool)) {
+ pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
+ return;
+ }
+
+ rpool->resources[index].usage--;
+
+ /*
+ * A negative count (or overflow) is invalid,
+ * it indicates a bug in the rdma controller.
+ */
+ WARN_ON_ONCE(rpool->resources[index].usage < 0);
+ rpool->usage_sum--;
+ if (rpool->usage_sum == 0 &&
+ rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
+ /*
+ * No user of the rpool and all entries are set to max, so
+ * safe to delete this rpool.
+ */
+ free_cg_rpool_locked(rpool);
+ }
+}
+
+/**
+ * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
+ * @device: pointer to rdmacg device
+ * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
+ * stop uncharging
+ * @index: index of the resource to uncharge in cg in given resource pool
+ */
+static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
+ struct rdmacg_device *device,
+ struct rdma_cgroup *stop_cg,
+ enum rdmacg_resource_type index)
+{
+ struct rdma_cgroup *p;
+
+ mutex_lock(&rdmacg_mutex);
+
+ for (p = cg; p != stop_cg; p = parent_rdmacg(p))
+ uncharge_cg_locked(p, device, index);
+
+ mutex_unlock(&rdmacg_mutex);
+
+ css_put(&cg->css);
+}
+
+/**
+ * rdmacg_uncharge - hierarchically uncharge rdma resource count
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to uncharge in cgroup in given resource pool
+ */
+void rdmacg_uncharge(struct rdma_cgroup *cg,
+ struct rdmacg_device *device,
+ enum rdmacg_resource_type index)
+{
+ if (index >= RDMACG_RESOURCE_MAX)
+ return;
+
+ rdmacg_uncharge_hierarchy(cg, device, NULL, index);
+}
+EXPORT_SYMBOL(rdmacg_uncharge);
+
+/**
+ * rdmacg_try_charge - hierarchically try to charge the rdma resource
+ * @rdmacg: pointer to rdma cgroup which will own this resource
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to charge in cgroup (resource pool)
+ *
+ * This function follows charging resource in hierarchical way.
+ * It will fail if the charge would cause the new value to exceed the
+ * hierarchical limit.
+ * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
+ * Returns pointer to rdmacg for this resource when charging is successful.
+ *
+ * Charger needs to account resources on two criteria.
+ * (a) per cgroup & (b) per device resource usage.
+ * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
+ * the configured limits. Per device provides granular configuration
+ * in multi device usage. It allocates resource pool in the hierarchy
+ * for each parent it come across for first resource. Later on resource
+ * pool will be available. Therefore it will be much faster thereon
+ * to charge/uncharge.
+ */
+int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
+ struct rdmacg_device *device,
+ enum rdmacg_resource_type index)
+{
+ struct rdma_cgroup *cg, *p;
+ struct rdmacg_resource_pool *rpool;
+ s64 new;
+ int ret = 0;
+
+ if (index >= RDMACG_RESOURCE_MAX)
+ return -EINVAL;
+
+ /*
+ * hold on to css, as cgroup can be removed but resource
+ * accounting happens on css.
+ */
+ cg = get_current_rdmacg();
+
+ mutex_lock(&rdmacg_mutex);
+ for (p = cg; p; p = parent_rdmacg(p)) {
+ rpool = get_cg_rpool_locked(p, device);
+ if (IS_ERR(rpool)) {
+ ret = PTR_ERR(rpool);
+ goto err;
+ } else {
+ new = rpool->resources[index].usage + 1;
+ if (new > rpool->resources[index].max) {
+ ret = -EAGAIN;
+ goto err;
+ } else {
+ rpool->resources[index].usage = new;
+ rpool->usage_sum++;
+ }
+ }
+ }
+ mutex_unlock(&rdmacg_mutex);
+
+ *rdmacg = cg;
+ return 0;
+
+err:
+ mutex_unlock(&rdmacg_mutex);
+ rdmacg_uncharge_hierarchy(cg, device, p, index);
+ return ret;
+}
+EXPORT_SYMBOL(rdmacg_try_charge);
+
+/**
+ * rdmacg_register_device - register rdmacg device to rdma controller.
+ * @device: pointer to rdmacg device whose resources need to be accounted.
+ *
+ * If IB stack wish a device to participate in rdma cgroup resource
+ * tracking, it must invoke this API to register with rdma cgroup before
+ * any user space application can start using the RDMA resources.
+ * Returns 0 on success or EINVAL when table length given is beyond
+ * supported size.
+ */
+int rdmacg_register_device(struct rdmacg_device *device)
+{
+ INIT_LIST_HEAD(&device->dev_node);
+ INIT_LIST_HEAD(&device->rpools);
+
+ mutex_lock(&rdmacg_mutex);
+ list_add_tail(&device->dev_node, &rdmacg_devices);
+ mutex_unlock(&rdmacg_mutex);
+ return 0;
+}
+EXPORT_SYMBOL(rdmacg_register_device);
+
+/**
+ * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
+ * @device: pointer to rdmacg device which was previously registered with rdma
+ * controller using rdmacg_register_device().
+ *
+ * IB stack must invoke this after all the resources of the IB device
+ * are destroyed and after ensuring that no more resources will be created
+ * when this API is invoked.
+ */
+void rdmacg_unregister_device(struct rdmacg_device *device)
+{
+ struct rdmacg_resource_pool *rpool, *tmp;
+
+ /*
+ * Synchronize with any active resource settings,
+ * usage query happening via configfs.
+ */
+ mutex_lock(&rdmacg_mutex);
+ list_del_init(&device->dev_node);
+
+ /*
+ * Now that this device is off the cgroup list, its safe to free
+ * all the rpool resources.
+ */
+ list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
+ free_cg_rpool_locked(rpool);
+
+ mutex_unlock(&rdmacg_mutex);
+}
+EXPORT_SYMBOL(rdmacg_unregister_device);
+
+static int parse_resource(char *c, int *intval)
+{
+ substring_t argstr;
+ const char **table = &rdmacg_resource_names[0];
+ char *name, *value = c;
+ size_t len;
+ int ret, i = 0;
+
+ name = strsep(&value, "=");
+ if (!name || !value)
+ return -EINVAL;
+
+ len = strlen(value);
+
+ for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
+ if (strcmp(table[i], name))
+ continue;
+
+ argstr.from = value;
+ argstr.to = value + len;
+
+ ret = match_int(&argstr, intval);
+ if (ret >= 0) {
+ if (*intval < 0)
+ break;
+ return i;
+ }
+ if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
+ *intval = S32_MAX;
+ return i;
+ }
+ break;
+ }
+ return -EINVAL;
+}
+
+static int rdmacg_parse_limits(char *options,
+ int *new_limits, unsigned long *enables)
+{
+ char *c;
+ int err = -EINVAL;
+
+ /* parse resource options */
+ while ((c = strsep(&options, " ")) != NULL) {
+ int index, intval;
+
+ index = parse_resource(c, &intval);
+ if (index < 0)
+ goto err;
+
+ new_limits[index] = intval;
+ *enables |= BIT(index);
+ }
+ return 0;
+
+err:
+ return err;
+}
+
+static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
+{
+ struct rdmacg_device *device;
+
+ lockdep_assert_held(&rdmacg_mutex);
+
+ list_for_each_entry(device, &rdmacg_devices, dev_node)
+ if (!strcmp(name, device->name))
+ return device;
+
+ return NULL;
+}
+
+static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct rdma_cgroup *cg = css_rdmacg(of_css(of));
+ const char *dev_name;
+ struct rdmacg_resource_pool *rpool;
+ struct rdmacg_device *device;
+ char *options = strstrip(buf);
+ int *new_limits;
+ unsigned long enables = 0;
+ int i = 0, ret = 0;
+
+ /* extract the device name first */
+ dev_name = strsep(&options, " ");
+ if (!dev_name) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
+ if (!new_limits) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ret = rdmacg_parse_limits(options, new_limits, &enables);
+ if (ret)
+ goto parse_err;
+
+ /* acquire lock to synchronize with hot plug devices */
+ mutex_lock(&rdmacg_mutex);
+
+ device = rdmacg_get_device_locked(dev_name);
+ if (!device) {
+ ret = -ENODEV;
+ goto dev_err;
+ }
+
+ rpool = get_cg_rpool_locked(cg, device);
+ if (IS_ERR(rpool)) {
+ ret = PTR_ERR(rpool);
+ goto dev_err;
+ }
+
+ /* now set the new limits of the rpool */
+ for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
+ set_resource_limit(rpool, i, new_limits[i]);
+
+ if (rpool->usage_sum == 0 &&
+ rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
+ /*
+ * No user of the rpool and all entries are set to max, so
+ * safe to delete this rpool.
+ */
+ free_cg_rpool_locked(rpool);
+ }
+
+dev_err:
+ mutex_unlock(&rdmacg_mutex);
+
+parse_err:
+ kfree(new_limits);
+
+err:
+ return ret ?: nbytes;
+}
+
+static void print_rpool_values(struct seq_file *sf,
+ struct rdmacg_resource_pool *rpool)
+{
+ enum rdmacg_file_type sf_type;
+ int i;
+ u32 value;
+
+ sf_type = seq_cft(sf)->private;
+
+ for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
+ seq_puts(sf, rdmacg_resource_names[i]);
+ seq_putc(sf, '=');
+ if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
+ if (rpool)
+ value = rpool->resources[i].max;
+ else
+ value = S32_MAX;
+ } else {
+ if (rpool)
+ value = rpool->resources[i].usage;
+ else
+ value = 0;
+ }
+
+ if (value == S32_MAX)
+ seq_puts(sf, RDMACG_MAX_STR);
+ else
+ seq_printf(sf, "%d", value);
+ seq_putc(sf, ' ');
+ }
+}
+
+static int rdmacg_resource_read(struct seq_file *sf, void *v)
+{
+ struct rdmacg_device *device;
+ struct rdmacg_resource_pool *rpool;
+ struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
+
+ mutex_lock(&rdmacg_mutex);
+
+ list_for_each_entry(device, &rdmacg_devices, dev_node) {
+ seq_printf(sf, "%s ", device->name);
+
+ rpool = find_cg_rpool_locked(cg, device);
+ print_rpool_values(sf, rpool);
+
+ seq_putc(sf, '\n');
+ }
+
+ mutex_unlock(&rdmacg_mutex);
+ return 0;
+}
+
+static struct cftype rdmacg_files[] = {
+ {
+ .name = "max",
+ .write = rdmacg_resource_set_max,
+ .seq_show = rdmacg_resource_read,
+ .private = RDMACG_RESOURCE_TYPE_MAX,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "current",
+ .seq_show = rdmacg_resource_read,
+ .private = RDMACG_RESOURCE_TYPE_STAT,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ { } /* terminate */
+};
+
+static struct cgroup_subsys_state *
+rdmacg_css_alloc(struct cgroup_subsys_state *parent)
+{
+ struct rdma_cgroup *cg;
+
+ cg = kzalloc(sizeof(*cg), GFP_KERNEL);
+ if (!cg)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&cg->rpools);
+ return &cg->css;
+}
+
+static void rdmacg_css_free(struct cgroup_subsys_state *css)
+{
+ struct rdma_cgroup *cg = css_rdmacg(css);
+
+ kfree(cg);
+}
+
+/**
+ * rdmacg_css_offline - cgroup css_offline callback
+ * @css: css of interest
+ *
+ * This function is called when @css is about to go away and responsible
+ * for shooting down all rdmacg associated with @css. As part of that it
+ * marks all the resource pool entries to max value, so that when resources are
+ * uncharged, associated resource pool can be freed as well.
+ */
+static void rdmacg_css_offline(struct cgroup_subsys_state *css)
+{
+ struct rdma_cgroup *cg = css_rdmacg(css);
+ struct rdmacg_resource_pool *rpool;
+
+ mutex_lock(&rdmacg_mutex);
+
+ list_for_each_entry(rpool, &cg->rpools, cg_node)
+ set_all_resource_max_limit(rpool);
+
+ mutex_unlock(&rdmacg_mutex);
+}
+
+struct cgroup_subsys rdma_cgrp_subsys = {
+ .css_alloc = rdmacg_css_alloc,
+ .css_free = rdmacg_css_free,
+ .css_offline = rdmacg_css_offline,
+ .legacy_cftypes = rdmacg_files,
+ .dfl_cftypes = rdmacg_files,
+};
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
index 1a8f34f63601..26a06e09a5bd 100644
--- a/kernel/configs/android-base.config
+++ b/kernel/configs/android-base.config
@@ -21,6 +21,7 @@ CONFIG_CP15_BARRIER_EMULATION=y
CONFIG_DEFAULT_SECURITY_SELINUX=y
CONFIG_EMBEDDED=y
CONFIG_FB=y
+CONFIG_HARDENED_USERCOPY=y
CONFIG_HIGH_RES_TIMERS=y
CONFIG_INET6_AH=y
CONFIG_INET6_ESP=y
@@ -129,6 +130,7 @@ CONFIG_PPP_DEFLATE=y
CONFIG_PPP_MPPE=y
CONFIG_PREEMPT=y
CONFIG_QUOTA=y
+CONFIG_RANDOMIZE_BASE=y
CONFIG_RTC_CLASS=y
CONFIG_RT_GROUP_SCHED=y
CONFIG_SECCOMP=y
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
index 99127edc5204..28ee064b6744 100644
--- a/kernel/configs/android-recommended.config
+++ b/kernel/configs/android-recommended.config
@@ -1,4 +1,5 @@
# KEEP ALPHABETICALLY SORTED
+# CONFIG_AIO is not set
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
# CONFIG_INPUT_MOUSE is not set
# CONFIG_LEGACY_PTYS is not set
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 0a5f630f5c54..f7c063239fa5 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -7,7 +7,9 @@
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/notifier.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/task.h>
#include <linux/unistd.h>
#include <linux/cpu.h>
#include <linux/oom.h>
diff --git a/kernel/cred.c b/kernel/cred.c
index 5f264fb5737d..2bc66075740f 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -12,6 +12,7 @@
#include <linux/cred.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/sched/coredump.h>
#include <linux/key.h>
#include <linux/keyctl.h>
#include <linux/init_task.h>
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 79517e5549f1..65c0f1363788 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -49,6 +49,7 @@
#include <linux/init.h>
#include <linux/kgdb.h>
#include <linux/kdb.h>
+#include <linux/nmi.h>
#include <linux/pid.h>
#include <linux/smp.h>
#include <linux/mm.h>
@@ -232,9 +233,9 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
int i;
for (i = 0; i < VMACACHE_SIZE; i++) {
- if (!current->vmacache[i])
+ if (!current->vmacache.vmas[i])
continue;
- flush_cache_range(current->vmacache[i],
+ flush_cache_range(current->vmacache.vmas[i],
addr, addr + BREAK_INSTR_SIZE);
}
}
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 19d9a578c753..7510dc687c0d 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -29,6 +29,7 @@
*/
#include <linux/kernel.h>
+#include <linux/sched/signal.h>
#include <linux/kgdb.h>
#include <linux/kdb.h>
#include <linux/serial_core.h>
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index fe15fff5df53..6ad4a9fcbd6f 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -12,7 +12,8 @@
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
#include <linux/kdb.h>
#include <linux/nmi.h>
#include "kdb_private.h"
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index ca183919d302..c8146d53ca67 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -18,6 +18,9 @@
#include <linux/kmsg_dump.h>
#include <linux/reboot.h>
#include <linux/sched.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/stat.h>
+#include <linux/sched/debug.h>
#include <linux/sysrq.h>
#include <linux/smp.h>
#include <linux/utsname.h>
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 660549656991..4a1c33416b6a 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -14,6 +14,8 @@
*/
#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/cputime.h>
#include <linux/slab.h>
#include <linux/taskstats.h>
#include <linux/time.h>
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index e9fdb5203de5..c04917cad1bf 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -11,6 +11,8 @@
#include <linux/perf_event.h>
#include <linux/slab.h>
+#include <linux/sched/task_stack.h>
+
#include "internal.h"
struct callchain_cpus_entries {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 77a932b54a64..6f41548f2e32 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -46,6 +46,8 @@
#include <linux/filter.h>
#include <linux/namei.h>
#include <linux/parser.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/mm.h>
#include "internal.h"
@@ -455,7 +457,7 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
return ret;
@@ -3522,6 +3524,8 @@ static void perf_event_enable_on_exec(int ctxn)
if (enabled) {
clone_ctx = unclone_ctx(ctx);
ctx_resched(cpuctx, ctx, event_type);
+ } else {
+ ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
}
perf_ctx_unlock(cpuctx, ctx);
@@ -4925,9 +4929,9 @@ unlock:
rcu_read_unlock();
}
-static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int perf_mmap_fault(struct vm_fault *vmf)
{
- struct perf_event *event = vma->vm_file->private_data;
+ struct perf_event *event = vmf->vma->vm_file->private_data;
struct ring_buffer *rb;
int ret = VM_FAULT_SIGBUS;
@@ -4950,7 +4954,7 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
goto unlock;
get_page(vmf->page);
- vmf->page->mapping = vma->vm_file->f_mapping;
+ vmf->page->mapping = vmf->vma->vm_file->f_mapping;
vmf->page->index = vmf->pgoff;
ret = 0;
@@ -9955,6 +9959,7 @@ SYSCALL_DEFINE5(perf_event_open,
* of swizzling perf_event::ctx.
*/
perf_remove_from_context(group_leader, 0);
+ put_ctx(gctx);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
@@ -9993,13 +9998,6 @@ SYSCALL_DEFINE5(perf_event_open,
perf_event__state_init(group_leader);
perf_install_in_context(ctx, group_leader, group_leader->cpu);
get_ctx(ctx);
-
- /*
- * Now that all events are installed in @ctx, nothing
- * references @gctx anymore, so drop the last reference we have
- * on it.
- */
- put_ctx(gctx);
}
/*
@@ -10959,5 +10957,11 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
.css_alloc = perf_cgroup_css_alloc,
.css_free = perf_cgroup_css_free,
.attach = perf_cgroup_attach,
+ /*
+ * Implicitly enable on dfl hierarchy so that perf events can
+ * always be filtered by cgroup2 path as long as perf_event
+ * controller is not mounted on a legacy hierarchy.
+ */
+ .implicit_on_dfl = true,
};
#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index d416f3baf392..0e137f98a50c 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -27,6 +27,8 @@
#include <linux/pagemap.h> /* read_mapping_page */
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
#include <linux/export.h>
#include <linux/rmap.h> /* anon_vma_prepare */
#include <linux/mmu_notifier.h> /* set_pte_at_notify */
@@ -153,14 +155,19 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
struct page *old_page, struct page *new_page)
{
struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
- pte_t *ptep;
+ struct page_vma_mapped_walk pvmw = {
+ .page = old_page,
+ .vma = vma,
+ .address = addr,
+ };
int err;
/* For mmu_notifiers */
const unsigned long mmun_start = addr;
const unsigned long mmun_end = addr + PAGE_SIZE;
struct mem_cgroup *memcg;
+ VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page);
+
err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg,
false);
if (err)
@@ -171,11 +178,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
err = -EAGAIN;
- ptep = page_check_address(old_page, mm, addr, &ptl, 0);
- if (!ptep) {
+ if (!page_vma_mapped_walk(&pvmw)) {
mem_cgroup_cancel_charge(new_page, memcg, false);
goto unlock;
}
+ VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
get_page(new_page);
page_add_new_anon_rmap(new_page, vma, addr, false);
@@ -187,14 +194,15 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
inc_mm_counter(mm, MM_ANONPAGES);
}
- flush_cache_page(vma, addr, pte_pfn(*ptep));
- ptep_clear_flush_notify(vma, addr, ptep);
- set_pte_at_notify(mm, addr, ptep, mk_pte(new_page, vma->vm_page_prot));
+ flush_cache_page(vma, addr, pte_pfn(*pvmw.pte));
+ ptep_clear_flush_notify(vma, addr, pvmw.pte);
+ set_pte_at_notify(mm, addr, pvmw.pte,
+ mk_pte(new_page, vma->vm_page_prot));
page_remove_rmap(old_page, false);
if (!page_mapped(old_page))
try_to_free_swap(old_page);
- pte_unmap_unlock(ptep, ptl);
+ page_vma_mapped_walk_done(&pvmw);
if (vma->vm_flags & VM_LOCKED)
munlock_vma_page(old_page);
@@ -300,8 +308,8 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
retry:
/* Read the page with vaddr into memory */
- ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page,
- &vma, NULL);
+ ret = get_user_pages_remote(NULL, mm, vaddr, 1,
+ FOLL_FORCE | FOLL_SPLIT, &old_page, &vma, NULL);
if (ret <= 0)
return ret;
@@ -741,7 +749,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
continue;
}
- if (!atomic_inc_not_zero(&vma->vm_mm->mm_users))
+ if (!mmget_not_zero(vma->vm_mm))
continue;
info = prev;
diff --git a/kernel/exit.c b/kernel/exit.c
index 580da79e38ee..e126ebf2400c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -6,6 +6,12 @@
#include <linux/mm.h>
#include <linux/slab.h>
+#include <linux/sched/autogroup.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/stat.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/sched/cputime.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/capability.h>
@@ -45,6 +51,7 @@
#include <linux/task_io_accounting_ops.h>
#include <linux/tracehook.h>
#include <linux/fs_struct.h>
+#include <linux/userfaultfd_k.h>
#include <linux/init_task.h>
#include <linux/perf_event.h>
#include <trace/events/sched.h>
@@ -538,7 +545,7 @@ static void exit_mm(void)
__set_current_state(TASK_RUNNING);
down_read(&mm->mmap_sem);
}
- atomic_inc(&mm->mm_count);
+ mmgrab(mm);
BUG_ON(mm != current->active_mm);
/* more a memory barrier than a real lock */
task_lock(current);
@@ -547,6 +554,7 @@ static void exit_mm(void)
enter_lazy_tlb(mm, current);
task_unlock(current);
mm_update_next_owner(mm);
+ userfaultfd_exit(mm);
mmput(mm);
if (test_thread_flag(TIF_MEMDIE))
exit_oom_victim();
@@ -607,15 +615,18 @@ static struct task_struct *find_new_reaper(struct task_struct *father,
return thread;
if (father->signal->has_child_subreaper) {
+ unsigned int ns_level = task_pid(father)->level;
/*
* Find the first ->is_child_subreaper ancestor in our pid_ns.
- * We start from father to ensure we can not look into another
- * namespace, this is safe because all its threads are dead.
+ * We can't check reaper != child_reaper to ensure we do not
+ * cross the namespaces, the exiting parent could be injected
+ * by setns() + fork().
+ * We check pid->level, this is slightly more efficient than
+ * task_active_pid_ns(reaper) != task_active_pid_ns(father).
*/
- for (reaper = father;
- !same_thread_group(reaper, child_reaper);
+ for (reaper = father->real_parent;
+ task_pid(reaper)->level == ns_level;
reaper = reaper->real_parent) {
- /* call_usermodehelper() descendants need this check */
if (reaper == &init_task)
break;
if (!reaper->signal->is_child_subreaper)
diff --git a/kernel/fork.c b/kernel/fork.c
index ff82e24573b6..6c463c80e93d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -12,6 +12,16 @@
*/
#include <linux/slab.h>
+#include <linux/sched/autogroup.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
+#include <linux/sched/user.h>
+#include <linux/sched/numa_balancing.h>
+#include <linux/sched/stat.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/sched/cputime.h>
+#include <linux/rtmutex.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/module.h>
@@ -55,6 +65,7 @@
#include <linux/rmap.h>
#include <linux/ksm.h>
#include <linux/acct.h>
+#include <linux/userfaultfd_k.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
@@ -561,6 +572,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
struct rb_node **rb_link, *rb_parent;
int retval;
unsigned long charge;
+ LIST_HEAD(uf);
uprobe_start_dup_mmap();
if (down_write_killable(&oldmm->mmap_sem)) {
@@ -617,12 +629,13 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
if (retval)
goto fail_nomem_policy;
tmp->vm_mm = mm;
+ retval = dup_userfaultfd(tmp, &uf);
+ if (retval)
+ goto fail_nomem_anon_vma_fork;
if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
- tmp->vm_flags &=
- ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP);
+ tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
tmp->vm_next = tmp->vm_prev = NULL;
- tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
file = tmp->vm_file;
if (file) {
struct inode *inode = file_inode(file);
@@ -678,6 +691,7 @@ out:
up_write(&mm->mmap_sem);
flush_tlb_mm(oldmm);
up_write(&oldmm->mmap_sem);
+ dup_userfaultfd_complete(&uf);
fail_uprobe_end:
uprobe_end_dup_mmap();
return retval;
@@ -996,7 +1010,7 @@ struct mm_struct *get_task_mm(struct task_struct *task)
if (task->flags & PF_KTHREAD)
mm = NULL;
else
- atomic_inc(&mm->mm_users);
+ mmget(mm);
}
task_unlock(task);
return mm;
@@ -1184,7 +1198,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
vmacache_flush(tsk);
if (clone_flags & CLONE_VM) {
- atomic_inc(&oldmm->mm_users);
+ mmget(oldmm);
mm = oldmm;
goto good_mm;
}
@@ -1373,9 +1387,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
- sig->has_child_subreaper = current->signal->has_child_subreaper ||
- current->signal->is_child_subreaper;
-
mutex_init(&sig->cred_guard_mutex);
return 0;
@@ -1454,6 +1465,21 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
task->pids[type].pid = pid;
}
+static inline void rcu_copy_process(struct task_struct *p)
+{
+#ifdef CONFIG_PREEMPT_RCU
+ p->rcu_read_lock_nesting = 0;
+ p->rcu_read_unlock_special.s = 0;
+ p->rcu_blocked_node = NULL;
+ INIT_LIST_HEAD(&p->rcu_node_entry);
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
+#ifdef CONFIG_TASKS_RCU
+ p->rcu_tasks_holdout = false;
+ INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
+ p->rcu_tasks_idle_cpu = -1;
+#endif /* #ifdef CONFIG_TASKS_RCU */
+}
+
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
@@ -1745,7 +1771,7 @@ static __latent_entropy struct task_struct *copy_process(
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
- threadgroup_change_begin(current);
+ cgroup_threadgroup_change_begin(current);
/*
* Ensure that the cgroup subsystem policies allow the new process to be
* forked. It should be noted the the new process's css_set can be changed
@@ -1810,6 +1836,13 @@ static __latent_entropy struct task_struct *copy_process(
p->signal->leader_pid = pid;
p->signal->tty = tty_kref_get(current->signal->tty);
+ /*
+ * Inherit has_child_subreaper flag under the same
+ * tasklist_lock with adding child to the process tree
+ * for propagate_has_child_subreaper optimization.
+ */
+ p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
+ p->real_parent->signal->is_child_subreaper;
list_add_tail(&p->sibling, &p->real_parent->children);
list_add_tail_rcu(&p->tasks, &init_task.tasks);
attach_pid(p, PIDTYPE_PGID);
@@ -1835,7 +1868,7 @@ static __latent_entropy struct task_struct *copy_process(
proc_fork_connector(p);
cgroup_post_fork(p);
- threadgroup_change_end(current);
+ cgroup_threadgroup_change_end(current);
perf_event_fork(p);
trace_task_newtask(p, clone_flags);
@@ -1846,7 +1879,7 @@ static __latent_entropy struct task_struct *copy_process(
bad_fork_cancel_cgroup:
cgroup_cancel_fork(p);
bad_fork_free_pid:
- threadgroup_change_end(current);
+ cgroup_threadgroup_change_end(current);
if (pid != &init_struct_pid)
free_pid(pid);
bad_fork_cleanup_thread:
@@ -2063,6 +2096,38 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
}
#endif
+void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
+{
+ struct task_struct *leader, *parent, *child;
+ int res;
+
+ read_lock(&tasklist_lock);
+ leader = top = top->group_leader;
+down:
+ for_each_thread(leader, parent) {
+ list_for_each_entry(child, &parent->children, sibling) {
+ res = visitor(child, data);
+ if (res) {
+ if (res < 0)
+ goto out;
+ leader = child;
+ goto down;
+ }
+up:
+ ;
+ }
+ }
+
+ if (leader != top) {
+ child = leader;
+ parent = child->real_parent;
+ leader = parent->group_leader;
+ goto up;
+ }
+out:
+ read_unlock(&tasklist_lock);
+}
+
#ifndef ARCH_MIN_MMSTRUCT_ALIGN
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif
diff --git a/kernel/futex.c b/kernel/futex.c
index cdf365036141..229a744b1781 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -61,6 +61,8 @@
#include <linux/nsproxy.h>
#include <linux/ptrace.h>
#include <linux/sched/rt.h>
+#include <linux/sched/wake_q.h>
+#include <linux/sched/mm.h>
#include <linux/hugetlb.h>
#include <linux/freezer.h>
#include <linux/bootmem.h>
@@ -338,7 +340,7 @@ static inline bool should_fail_futex(bool fshared)
static inline void futex_get_mm(union futex_key *key)
{
- atomic_inc(&key->private.mm->mm_count);
+ mmgrab(key->private.mm);
/*
* Ensure futex_get_mm() implies a full barrier such that
* get_futex_key() implies a full barrier. This is relied upon
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 40c07e4fa116..f0f8e2a8496f 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -16,6 +16,9 @@
#include <linux/export.h>
#include <linux/sysctl.h>
#include <linux/utsname.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
+
#include <trace/events/sched.h>
/*
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 6b669593e7eb..a4afe5cc5af1 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -17,6 +17,8 @@
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>
+#include <linux/sched/task.h>
+#include <uapi/linux/sched/types.h>
#include <linux/task_work.h>
#include "internals.h"
@@ -353,7 +355,7 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)
return 0;
/*
- * Preserve the managed affinity setting and an userspace affinity
+ * Preserve the managed affinity setting and a userspace affinity
* setup, but make sure that one of the targets is online.
*/
if (irqd_affinity_is_managed(&desc->irq_data) ||
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index a9b8cf500591..6c9cb208ac48 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -236,12 +236,28 @@ void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry
static inline struct jump_entry *static_key_entries(struct static_key *key)
{
- return (struct jump_entry *)((unsigned long)key->entries & ~JUMP_TYPE_MASK);
+ WARN_ON_ONCE(key->type & JUMP_TYPE_LINKED);
+ return (struct jump_entry *)(key->type & ~JUMP_TYPE_MASK);
}
static inline bool static_key_type(struct static_key *key)
{
- return (unsigned long)key->entries & JUMP_TYPE_MASK;
+ return key->type & JUMP_TYPE_TRUE;
+}
+
+static inline bool static_key_linked(struct static_key *key)
+{
+ return key->type & JUMP_TYPE_LINKED;
+}
+
+static inline void static_key_clear_linked(struct static_key *key)
+{
+ key->type &= ~JUMP_TYPE_LINKED;
+}
+
+static inline void static_key_set_linked(struct static_key *key)
+{
+ key->type |= JUMP_TYPE_LINKED;
}
static inline struct static_key *jump_entry_key(struct jump_entry *entry)
@@ -254,6 +270,26 @@ static bool jump_entry_branch(struct jump_entry *entry)
return (unsigned long)entry->key & 1UL;
}
+/***
+ * A 'struct static_key' uses a union such that it either points directly
+ * to a table of 'struct jump_entry' or to a linked list of modules which in
+ * turn point to 'struct jump_entry' tables.
+ *
+ * The two lower bits of the pointer are used to keep track of which pointer
+ * type is in use and to store the initial branch direction, we use an access
+ * function which preserves these bits.
+ */
+static void static_key_set_entries(struct static_key *key,
+ struct jump_entry *entries)
+{
+ unsigned long type;
+
+ WARN_ON_ONCE((unsigned long)entries & JUMP_TYPE_MASK);
+ type = key->type & JUMP_TYPE_MASK;
+ key->entries = entries;
+ key->type |= type;
+}
+
static enum jump_label_type jump_label_type(struct jump_entry *entry)
{
struct static_key *key = jump_entry_key(entry);
@@ -313,13 +349,7 @@ void __init jump_label_init(void)
continue;
key = iterk;
- /*
- * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
- */
- *((unsigned long *)&key->entries) += (unsigned long)iter;
-#ifdef CONFIG_MODULES
- key->next = NULL;
-#endif
+ static_key_set_entries(key, iter);
}
static_key_initialized = true;
jump_label_unlock();
@@ -343,6 +373,29 @@ struct static_key_mod {
struct module *mod;
};
+static inline struct static_key_mod *static_key_mod(struct static_key *key)
+{
+ WARN_ON_ONCE(!(key->type & JUMP_TYPE_LINKED));
+ return (struct static_key_mod *)(key->type & ~JUMP_TYPE_MASK);
+}
+
+/***
+ * key->type and key->next are the same via union.
+ * This sets key->next and preserves the type bits.
+ *
+ * See additional comments above static_key_set_entries().
+ */
+static void static_key_set_mod(struct static_key *key,
+ struct static_key_mod *mod)
+{
+ unsigned long type;
+
+ WARN_ON_ONCE((unsigned long)mod & JUMP_TYPE_MASK);
+ type = key->type & JUMP_TYPE_MASK;
+ key->next = mod;
+ key->type |= type;
+}
+
static int __jump_label_mod_text_reserved(void *start, void *end)
{
struct module *mod;
@@ -365,11 +418,23 @@ static void __jump_label_mod_update(struct static_key *key)
{
struct static_key_mod *mod;
- for (mod = key->next; mod; mod = mod->next) {
- struct module *m = mod->mod;
+ for (mod = static_key_mod(key); mod; mod = mod->next) {
+ struct jump_entry *stop;
+ struct module *m;
+
+ /*
+ * NULL if the static_key is defined in a module
+ * that does not use it
+ */
+ if (!mod->entries)
+ continue;
- __jump_label_update(key, mod->entries,
- m->jump_entries + m->num_jump_entries);
+ m = mod->mod;
+ if (!m)
+ stop = __stop___jump_table;
+ else
+ stop = m->jump_entries + m->num_jump_entries;
+ __jump_label_update(key, mod->entries, stop);
}
}
@@ -404,7 +469,7 @@ static int jump_label_add_module(struct module *mod)
struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
struct jump_entry *iter;
struct static_key *key = NULL;
- struct static_key_mod *jlm;
+ struct static_key_mod *jlm, *jlm2;
/* if the module doesn't have jump label entries, just return */
if (iter_start == iter_stop)
@@ -421,20 +486,32 @@ static int jump_label_add_module(struct module *mod)
key = iterk;
if (within_module(iter->key, mod)) {
- /*
- * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
- */
- *((unsigned long *)&key->entries) += (unsigned long)iter;
- key->next = NULL;
+ static_key_set_entries(key, iter);
continue;
}
jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL);
if (!jlm)
return -ENOMEM;
+ if (!static_key_linked(key)) {
+ jlm2 = kzalloc(sizeof(struct static_key_mod),
+ GFP_KERNEL);
+ if (!jlm2) {
+ kfree(jlm);
+ return -ENOMEM;
+ }
+ preempt_disable();
+ jlm2->mod = __module_address((unsigned long)key);
+ preempt_enable();
+ jlm2->entries = static_key_entries(key);
+ jlm2->next = NULL;
+ static_key_set_mod(key, jlm2);
+ static_key_set_linked(key);
+ }
jlm->mod = mod;
jlm->entries = iter;
- jlm->next = key->next;
- key->next = jlm;
+ jlm->next = static_key_mod(key);
+ static_key_set_mod(key, jlm);
+ static_key_set_linked(key);
/* Only update if we've changed from our initial state */
if (jump_label_type(iter) != jump_label_init_type(iter))
@@ -461,16 +538,34 @@ static void jump_label_del_module(struct module *mod)
if (within_module(iter->key, mod))
continue;
+ /* No memory during module load */
+ if (WARN_ON(!static_key_linked(key)))
+ continue;
+
prev = &key->next;
- jlm = key->next;
+ jlm = static_key_mod(key);
while (jlm && jlm->mod != mod) {
prev = &jlm->next;
jlm = jlm->next;
}
- if (jlm) {
+ /* No memory during module load */
+ if (WARN_ON(!jlm))
+ continue;
+
+ if (prev == &key->next)
+ static_key_set_mod(key, jlm->next);
+ else
*prev = jlm->next;
+
+ kfree(jlm);
+
+ jlm = static_key_mod(key);
+ /* if only one etry is left, fold it back into the static_key */
+ if (jlm->next == NULL) {
+ static_key_set_entries(key, jlm->entries);
+ static_key_clear_linked(key);
kfree(jlm);
}
}
@@ -499,8 +594,10 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
case MODULE_STATE_COMING:
jump_label_lock();
ret = jump_label_add_module(mod);
- if (ret)
+ if (ret) {
+ WARN(1, "Failed to allocatote memory: jump_label may not work properly.\n");
jump_label_del_module(mod);
+ }
jump_label_unlock();
break;
case MODULE_STATE_GOING:
@@ -561,11 +658,14 @@ int jump_label_text_reserved(void *start, void *end)
static void jump_label_update(struct static_key *key)
{
struct jump_entry *stop = __stop___jump_table;
- struct jump_entry *entry = static_key_entries(key);
+ struct jump_entry *entry;
#ifdef CONFIG_MODULES
struct module *mod;
- __jump_label_mod_update(key);
+ if (static_key_linked(key)) {
+ __jump_label_mod_update(key);
+ return;
+ }
preempt_disable();
mod = __module_address((unsigned long)key);
@@ -573,6 +673,7 @@ static void jump_label_update(struct static_key *key)
stop = mod->jump_entries + mod->num_jump_entries;
preempt_enable();
#endif
+ entry = static_key_entries(key);
/* if there are no users, entry can be NULL */
if (entry)
__jump_label_update(key, entry, stop);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 0c407f905ca4..563f97e2be36 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -20,6 +20,8 @@
*/
#include <linux/module.h>
#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/binfmts.h>
#include <linux/syscalls.h>
#include <linux/unistd.h>
#include <linux/kmod.h>
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index ee1bc1bb8feb..0999679d6f26 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -195,7 +195,7 @@ static ssize_t notes_read(struct file *filp, struct kobject *kobj,
return count;
}
-static struct bin_attribute notes_attr = {
+static struct bin_attribute notes_attr __ro_after_init = {
.attr = {
.name = "notes",
.mode = S_IRUGO,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 8461a4372e8a..2f26adea0f84 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -5,7 +5,9 @@
* even if we're invoked from userspace (think modprobe, hotplug cpu,
* etc.).
*/
+#include <uapi/linux/sched/types.h>
#include <linux/sched.h>
+#include <linux/sched/task.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/err.h>
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index b5c30d9f46c5..96b4179cee6a 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -55,6 +55,8 @@
#include <linux/latencytop.h>
#include <linux/export.h>
#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/stat.h>
#include <linux/list.h>
#include <linux/stacktrace.h>
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 9812e5dd409e..12e38c213b70 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -28,6 +28,8 @@
#define DISABLE_BRANCH_PROFILING
#include <linux/mutex.h>
#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/task.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 28350dc8ecbb..f24582d4dad3 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -32,6 +32,8 @@
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
+#include <uapi/linux/sched/types.h>
+#include <linux/rtmutex.h>
#include <linux/atomic.h>
#include <linux/moduleparam.h>
#include <linux/delay.h>
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index ad2d9e22697b..198527a62149 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -19,8 +19,10 @@
*/
#include <linux/mutex.h>
#include <linux/ww_mutex.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/sched/rt.h>
+#include <linux/sched/wake_q.h>
+#include <linux/sched/debug.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index e852be4851fc..4a30ef63c607 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -63,6 +63,7 @@ enum qlock_stats {
*/
#include <linux/debugfs.h>
#include <linux/sched.h>
+#include <linux/sched/clock.h>
#include <linux/fs.h>
static const char * const qstat_names[qstat_num + 1] = {
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 62b6cee8ea7f..97ee9df32e0f 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -18,6 +18,7 @@
*/
#include <linux/sched.h>
#include <linux/sched/rt.h>
+#include <linux/sched/debug.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/spinlock.h>
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index d340be3a488f..6edc32ecd9c5 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -12,9 +12,11 @@
*/
#include <linux/spinlock.h>
#include <linux/export.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/sched/rt.h>
#include <linux/sched/deadline.h>
+#include <linux/sched/wake_q.h>
+#include <linux/sched/debug.h>
#include <linux/timer.h>
#include "rtmutex_common.h"
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 990134617b4c..856dfff5c33a 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -13,6 +13,7 @@
#define __KERNEL_RTMUTEX_COMMON_H
#include <linux/rtmutex.h>
+#include <linux/sched/wake_q.h>
/*
* This is the control structure for tasks blocked on a rt_mutex,
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 5eacab880f67..7bc24d477805 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -6,7 +6,8 @@
* - Derived also from comments by Linus
*/
#include <linux/rwsem.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
#include <linux/export.h>
enum rwsem_waiter_type {
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 2ad8d8dc3bb1..34e727f18e49 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -10,10 +10,12 @@
* and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
*/
#include <linux/rwsem.h>
-#include <linux/sched.h>
#include <linux/init.h>
#include <linux/export.h>
+#include <linux/sched/signal.h>
#include <linux/sched/rt.h>
+#include <linux/sched/wake_q.h>
+#include <linux/sched/debug.h>
#include <linux/osq_lock.h>
#include "rwsem.h"
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 45ba475d4be3..90a74ccd85a4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -7,6 +7,7 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
+#include <linux/sched/debug.h>
#include <linux/export.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 9512e37637dc..561acdd39960 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -29,6 +29,7 @@
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/sched.h>
+#include <linux/sched/debug.h>
#include <linux/semaphore.h>
#include <linux/spinlock.h>
#include <linux/ftrace.h>
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 9ecedc28b928..06123234f118 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -246,9 +246,13 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
/* pages are dead and unused, undo the arch mapping */
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(resource_size(res), SECTION_SIZE);
+
+ lock_device_hotplug();
mem_hotplug_begin();
arch_remove_memory(align_start, align_size);
mem_hotplug_done();
+ unlock_device_hotplug();
+
untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
pgmap_radix_release(res);
dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc,
@@ -360,9 +364,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
if (error)
goto err_pfn_remap;
+ lock_device_hotplug();
mem_hotplug_begin();
error = arch_add_memory(nid, align_start, align_size, true);
mem_hotplug_done();
+ unlock_device_hotplug();
if (error)
goto err_add_memory;
diff --git a/kernel/notifier.c b/kernel/notifier.c
index fd2c9acbcc19..6196af8a8223 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -95,7 +95,7 @@ static int notifier_call_chain(struct notifier_block **nl,
if (nr_calls)
(*nr_calls)++;
- if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
+ if (ret & NOTIFY_STOP_MASK)
break;
nb = next_nb;
nr_to_call--;
diff --git a/kernel/panic.c b/kernel/panic.c
index b95959733ce0..a58932b41700 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -9,6 +9,7 @@
* to indicate a major problem.
*/
#include <linux/debug_locks.h>
+#include <linux/sched/debug.h>
#include <linux/interrupt.h>
#include <linux/kmsg_dump.h>
#include <linux/kallsyms.h>
@@ -273,7 +274,8 @@ void panic(const char *fmt, ...)
extern int stop_a_enabled;
/* Make sure the user can actually press Stop-A (L1-A) */
stop_a_enabled = 1;
- pr_emerg("Press Stop-A (L1-A) to return to the boot prom\n");
+ pr_emerg("Press Stop-A (L1-A) from sun keyboard or send break\n"
+ "twice on console to return to the boot prom\n");
}
#endif
#if defined(CONFIG_S390)
diff --git a/kernel/pid.c b/kernel/pid.c
index 0291804151b5..0143ac0ddceb 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -38,6 +38,7 @@
#include <linux/syscalls.h>
#include <linux/proc_ns.h>
#include <linux/proc_fs.h>
+#include <linux/sched/task.h>
#define pid_hashfn(nr, ns) \
hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index eef2ce968636..de461aa0bf9a 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -12,12 +12,15 @@
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/syscalls.h>
+#include <linux/cred.h>
#include <linux/err.h>
#include <linux/acct.h>
#include <linux/slab.h>
#include <linux/proc_ns.h>
#include <linux/reboot.h>
#include <linux/export.h>
+#include <linux/sched/task.h>
+#include <linux/sched/signal.h>
struct pid_cache {
int nr_ids;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 86385af1080f..a8b978c35a6a 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -10,6 +10,8 @@
* This file is released under the GPLv2.
*/
+#define pr_fmt(fmt) "PM: " fmt
+
#include <linux/export.h>
#include <linux/suspend.h>
#include <linux/syscalls.h>
@@ -21,6 +23,7 @@
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/pm.h>
+#include <linux/nmi.h>
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/freezer.h>
@@ -104,7 +107,7 @@ EXPORT_SYMBOL(system_entering_hibernation);
#ifdef CONFIG_PM_DEBUG
static void hibernation_debug_sleep(void)
{
- printk(KERN_INFO "hibernation debug: Waiting for 5 seconds.\n");
+ pr_info("hibernation debug: Waiting for 5 seconds.\n");
mdelay(5000);
}
@@ -250,10 +253,9 @@ void swsusp_show_speed(ktime_t start, ktime_t stop,
centisecs = 1; /* avoid div-by-zero */
k = nr_pages * (PAGE_SIZE / 1024);
kps = (k * 100) / centisecs;
- printk(KERN_INFO "PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n",
- msg, k,
- centisecs / 100, centisecs % 100,
- kps / 1000, (kps % 1000) / 10);
+ pr_info("%s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n",
+ msg, k, centisecs / 100, centisecs % 100, kps / 1000,
+ (kps % 1000) / 10);
}
/**
@@ -271,8 +273,7 @@ static int create_image(int platform_mode)
error = dpm_suspend_end(PMSG_FREEZE);
if (error) {
- printk(KERN_ERR "PM: Some devices failed to power down, "
- "aborting hibernation\n");
+ pr_err("Some devices failed to power down, aborting hibernation\n");
return error;
}
@@ -288,8 +289,7 @@ static int create_image(int platform_mode)
error = syscore_suspend();
if (error) {
- printk(KERN_ERR "PM: Some system devices failed to power down, "
- "aborting hibernation\n");
+ pr_err("Some system devices failed to power down, aborting hibernation\n");
goto Enable_irqs;
}
@@ -304,8 +304,8 @@ static int create_image(int platform_mode)
restore_processor_state();
trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
if (error)
- printk(KERN_ERR "PM: Error %d creating hibernation image\n",
- error);
+ pr_err("Error %d creating hibernation image\n", error);
+
if (!in_suspend) {
events_check_enabled = false;
clear_free_pages();
@@ -432,8 +432,7 @@ static int resume_target_kernel(bool platform_mode)
error = dpm_suspend_end(PMSG_QUIESCE);
if (error) {
- printk(KERN_ERR "PM: Some devices failed to power down, "
- "aborting resume\n");
+ pr_err("Some devices failed to power down, aborting resume\n");
return error;
}
@@ -608,6 +607,22 @@ static void power_down(void)
{
#ifdef CONFIG_SUSPEND
int error;
+
+ if (hibernation_mode == HIBERNATION_SUSPEND) {
+ error = suspend_devices_and_enter(PM_SUSPEND_MEM);
+ if (error) {
+ hibernation_mode = hibernation_ops ?
+ HIBERNATION_PLATFORM :
+ HIBERNATION_SHUTDOWN;
+ } else {
+ /* Restore swap signature. */
+ error = swsusp_unmark();
+ if (error)
+ pr_err("Swap will be unusable! Try swapon -a.\n");
+
+ return;
+ }
+ }
#endif
switch (hibernation_mode) {
@@ -620,32 +635,13 @@ static void power_down(void)
if (pm_power_off)
kernel_power_off();
break;
-#ifdef CONFIG_SUSPEND
- case HIBERNATION_SUSPEND:
- error = suspend_devices_and_enter(PM_SUSPEND_MEM);
- if (error) {
- if (hibernation_ops)
- hibernation_mode = HIBERNATION_PLATFORM;
- else
- hibernation_mode = HIBERNATION_SHUTDOWN;
- power_down();
- }
- /*
- * Restore swap signature.
- */
- error = swsusp_unmark();
- if (error)
- printk(KERN_ERR "PM: Swap will be unusable! "
- "Try swapon -a.\n");
- return;
-#endif
}
kernel_halt();
/*
* Valid image is on the disk, if we continue we risk serious data
* corruption after resume.
*/
- printk(KERN_CRIT "PM: Please power down manually\n");
+ pr_crit("Power down manually\n");
while (1)
cpu_relax();
}
@@ -655,7 +651,7 @@ static int load_image_and_restore(void)
int error;
unsigned int flags;
- pr_debug("PM: Loading hibernation image.\n");
+ pr_debug("Loading hibernation image.\n");
lock_device_hotplug();
error = create_basic_memory_bitmaps();
@@ -667,7 +663,7 @@ static int load_image_and_restore(void)
if (!error)
hibernation_restore(flags & SF_PLATFORM_MODE);
- printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
+ pr_err("Failed to load hibernation image, recovering.\n");
swsusp_free();
free_basic_memory_bitmaps();
Unlock:
@@ -685,7 +681,7 @@ int hibernate(void)
bool snapshot_test = false;
if (!hibernation_available()) {
- pr_debug("PM: Hibernation not available.\n");
+ pr_debug("Hibernation not available.\n");
return -EPERM;
}
@@ -703,9 +699,9 @@ int hibernate(void)
goto Exit;
}
- printk(KERN_INFO "PM: Syncing filesystems ... ");
+ pr_info("Syncing filesystems ... \n");
sys_sync();
- printk("done.\n");
+ pr_info("done.\n");
error = freeze_processes();
if (error)
@@ -731,7 +727,7 @@ int hibernate(void)
else
flags |= SF_CRC32_MODE;
- pr_debug("PM: writing image.\n");
+ pr_debug("Writing image.\n");
error = swsusp_write(flags);
swsusp_free();
if (!error) {
@@ -743,7 +739,7 @@ int hibernate(void)
in_suspend = 0;
pm_restore_gfp_mask();
} else {
- pr_debug("PM: Image restored successfully.\n");
+ pr_debug("Image restored successfully.\n");
}
Free_bitmaps:
@@ -751,7 +747,7 @@ int hibernate(void)
Thaw:
unlock_device_hotplug();
if (snapshot_test) {
- pr_debug("PM: Checking hibernation image\n");
+ pr_debug("Checking hibernation image\n");
error = swsusp_check();
if (!error)
error = load_image_and_restore();
@@ -815,10 +811,10 @@ static int software_resume(void)
goto Unlock;
}
- pr_debug("PM: Checking hibernation image partition %s\n", resume_file);
+ pr_debug("Checking hibernation image partition %s\n", resume_file);
if (resume_delay) {
- printk(KERN_INFO "Waiting %dsec before reading resume device...\n",
+ pr_info("Waiting %dsec before reading resume device ...\n",
resume_delay);
ssleep(resume_delay);
}
@@ -857,10 +853,10 @@ static int software_resume(void)
}
Check_image:
- pr_debug("PM: Hibernation image partition %d:%d present\n",
+ pr_debug("Hibernation image partition %d:%d present\n",
MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
- pr_debug("PM: Looking for hibernation image.\n");
+ pr_debug("Looking for hibernation image.\n");
error = swsusp_check();
if (error)
goto Unlock;
@@ -879,7 +875,7 @@ static int software_resume(void)
goto Close_Finish;
}
- pr_debug("PM: Preparing processes for restore.\n");
+ pr_debug("Preparing processes for restore.\n");
error = freeze_processes();
if (error)
goto Close_Finish;
@@ -892,7 +888,7 @@ static int software_resume(void)
/* For success case, the suspend path will release the lock */
Unlock:
mutex_unlock(&pm_mutex);
- pr_debug("PM: Hibernation image not present or could not be loaded.\n");
+ pr_debug("Hibernation image not present or could not be loaded.\n");
return error;
Close_Finish:
swsusp_close(FMODE_READ);
@@ -1016,7 +1012,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
error = -EINVAL;
if (!error)
- pr_debug("PM: Hibernation mode set to '%s'\n",
+ pr_debug("Hibernation mode set to '%s'\n",
hibernation_modes[mode]);
unlock_system_sleep();
return error ? error : n;
@@ -1052,7 +1048,7 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
lock_system_sleep();
swsusp_resume_device = res;
unlock_system_sleep();
- printk(KERN_INFO "PM: Starting manual resume from disk\n");
+ pr_info("Starting manual resume from disk\n");
noresume = 0;
software_resume();
return n;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 2fba066e125f..c7209f060eeb 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -12,6 +12,8 @@
#include <linux/oom.h>
#include <linux/suspend.h>
#include <linux/module.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task.h>
#include <linux/syscalls.h>
#include <linux/freezer.h>
#include <linux/delay.h>
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 905d5bbd595f..d79a38de425a 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -22,6 +22,7 @@
#include <linux/device.h>
#include <linux/init.h>
#include <linux/bootmem.h>
+#include <linux/nmi.h>
#include <linux/syscalls.h>
#include <linux/console.h>
#include <linux/highmem.h>
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 34da86e73d00..2984fb0f0257 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -45,6 +45,9 @@
#include <linux/utsname.h>
#include <linux/ctype.h>
#include <linux/uio.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
#include <linux/uaccess.h>
#include <asm/sections.h>
diff --git a/kernel/profile.c b/kernel/profile.c
index f67ce0aa6bc4..9aa2a4445b0d 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -25,6 +25,8 @@
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
+#include <linux/sched/stat.h>
+
#include <asm/sections.h>
#include <asm/irq_regs.h>
#include <asm/ptrace.h>
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 49ba7c1ade9d..0af928712174 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -10,6 +10,9 @@
#include <linux/capability.h>
#include <linux/export.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
+#include <linux/sched/task.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/highmem.h>
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 123ccbd22449..a4a86fb47e4a 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -30,6 +30,7 @@
#include <linux/rcupdate.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
+#include <uapi/linux/sched/types.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/completion.h>
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index d81345be730e..cccc417a8135 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -32,7 +32,8 @@
#include <linux/smp.h>
#include <linux/rcupdate.h>
#include <linux/interrupt.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <uapi/linux/sched/types.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/completion.h>
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index e773129c8b08..ef3bcfb15b39 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -30,7 +30,7 @@
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
-#include <linux/rcupdate.h>
+#include <linux/rcupdate_wait.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/delay.h>
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index fa6a48d3917b..6ad330dbbae2 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -25,7 +25,7 @@
#include <linux/completion.h>
#include <linux/interrupt.h>
#include <linux/notifier.h>
-#include <linux/rcupdate.h>
+#include <linux/rcupdate_wait.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/mutex.h>
@@ -47,6 +47,18 @@ static void __call_rcu(struct rcu_head *head,
#include "tiny_plugin.h"
+void rcu_barrier_bh(void)
+{
+ wait_rcu_gp(call_rcu_bh);
+}
+EXPORT_SYMBOL(rcu_barrier_bh);
+
+void rcu_barrier_sched(void)
+{
+ wait_rcu_gp(call_rcu_sched);
+}
+EXPORT_SYMBOL(rcu_barrier_sched);
+
#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
/*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d80e0d2f68c6..50fee7689e71 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -32,9 +32,10 @@
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
-#include <linux/rcupdate.h>
+#include <linux/rcupdate_wait.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
+#include <linux/sched/debug.h>
#include <linux/nmi.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
@@ -49,6 +50,7 @@
#include <linux/kernel_stat.h>
#include <linux/wait.h>
#include <linux/kthread.h>
+#include <uapi/linux/sched/types.h>
#include <linux/prefetch.h>
#include <linux/delay.h>
#include <linux/stop_machine.h>
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index b60f2b6caa14..ec62a05bfdb3 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -24,6 +24,7 @@
#include <linux/cache.h>
#include <linux/spinlock.h>
+#include <linux/rtmutex.h>
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/seqlock.h>
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index a240f3308be6..0a62a8f1caac 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -27,7 +27,9 @@
#include <linux/delay.h>
#include <linux/gfp.h>
#include <linux/oom.h>
+#include <linux/sched/debug.h>
#include <linux/smpboot.h>
+#include <uapi/linux/sched/types.h>
#include "../time/tick-internal.h"
#ifdef CONFIG_RCU_BOOST
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 9e03db9ea9c0..55c8530316c7 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -36,7 +36,8 @@
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/percpu.h>
@@ -49,6 +50,7 @@
#include <linux/moduleparam.h>
#include <linux/kthread.h>
#include <linux/tick.h>
+#include <linux/rcupdate_wait.h>
#define CREATE_TRACE_POINTS
diff --git a/kernel/relay.c b/kernel/relay.c
index 8f18d314a96a..0e413d9eec8a 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -39,10 +39,10 @@ static void relay_file_mmap_close(struct vm_area_struct *vma)
/*
* fault() vm_op implementation for relay file mapping.
*/
-static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int relay_buf_fault(struct vm_fault *vmf)
{
struct page *page;
- struct rchan_buf *buf = vma->vm_private_data;
+ struct rchan_buf *buf = vmf->vma->vm_private_data;
pgoff_t pgoff = vmf->pgoff;
if (!buf)
@@ -847,7 +847,7 @@ void relay_close(struct rchan *chan)
if (chan->last_toobig)
printk(KERN_WARNING "relay: one or more items not logged "
- "[item size (%Zd) > sub-buffer size (%Zd)]\n",
+ "[item size (%zd) > sub-buffer size (%zd)]\n",
chan->last_toobig, chan->subbuf_size);
list_del(&chan->list);
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
index 890c95f2587a..ce40c810cd5c 100644
--- a/kernel/sched/autogroup.h
+++ b/kernel/sched/autogroup.h
@@ -2,6 +2,7 @@
#include <linux/kref.h>
#include <linux/rwsem.h>
+#include <linux/sched/autogroup.h>
struct autogroup {
/*
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index ad64efe41722..a08795e21628 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -58,6 +58,8 @@
#include <linux/percpu.h>
#include <linux/ktime.h>
#include <linux/sched.h>
+#include <linux/nmi.h>
+#include <linux/sched/clock.h>
#include <linux/static_key.h>
#include <linux/workqueue.h>
#include <linux/compiler.h>
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index f063a25d4449..53f9558fa925 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -11,7 +11,8 @@
* Waiting for completion is a typically sync point, but not an exclusion point.
*/
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
#include <linux/completion.h>
/**
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e1ae6ac15eac..956383844116 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6,10 +6,15 @@
* Copyright (C) 1991-2002 Linus Torvalds
*/
#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <uapi/linux/sched/types.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/hotplug.h>
#include <linux/cpuset.h>
#include <linux/delayacct.h>
#include <linux/init_task.h>
#include <linux/context_tracking.h>
+#include <linux/rcupdate_wait.h>
#include <linux/blkdev.h>
#include <linux/kprobes.h>
@@ -981,7 +986,7 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_
return rq;
/* Affinity changed (again). */
- if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+ if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
return rq;
rq = move_queued_task(rq, p, dest_cpu);
@@ -1090,6 +1095,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
int ret = 0;
rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
if (p->flags & PF_KTHREAD) {
/*
@@ -1258,10 +1264,10 @@ static int migrate_swap_stop(void *data)
if (task_cpu(arg->src_task) != arg->src_cpu)
goto unlock;
- if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
+ if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
goto unlock;
- if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
+ if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
goto unlock;
__migrate_swap_task(arg->src_task, arg->dst_cpu);
@@ -1302,10 +1308,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
goto out;
- if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
+ if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
goto out;
- if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
+ if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
goto out;
trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
@@ -1489,14 +1495,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
for_each_cpu(dest_cpu, nodemask) {
if (!cpu_active(dest_cpu))
continue;
- if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+ if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
return dest_cpu;
}
}
for (;;) {
/* Any allowed, online CPU? */
- for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
+ for_each_cpu(dest_cpu, &p->cpus_allowed) {
if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
continue;
if (!cpu_online(dest_cpu))
@@ -1548,10 +1554,10 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
lockdep_assert_held(&p->pi_lock);
- if (tsk_nr_cpus_allowed(p) > 1)
+ if (p->nr_cpus_allowed > 1)
cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
else
- cpu = cpumask_any(tsk_cpus_allowed(p));
+ cpu = cpumask_any(&p->cpus_allowed);
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -1563,7 +1569,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
* [ this allows ->select_task() to simply return task_cpu(p) and
* not worry about this generic constraint ]
*/
- if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
+ if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
!cpu_online(cpu)))
cpu = select_fallback_rq(task_cpu(p), p);
@@ -2847,7 +2853,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
if (!mm) {
next->active_mm = oldmm;
- atomic_inc(&oldmm->mm_count);
+ mmgrab(oldmm);
enter_lazy_tlb(oldmm, next);
} else
switch_mm_irqs_off(oldmm, mm, next);
@@ -3210,6 +3216,15 @@ static inline void preempt_latency_start(int val) { }
static inline void preempt_latency_stop(int val) { }
#endif
+static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+ return p->preempt_disable_ip;
+#else
+ return 0;
+#endif
+}
+
/*
* Print scheduling while atomic bug:
*/
@@ -5232,6 +5247,9 @@ void sched_show_task(struct task_struct *p)
int ppid;
unsigned long state = p->state;
+ /* Make sure the string lines up properly with the number of task states: */
+ BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1);
+
if (!try_get_task_stack(p))
return;
if (state)
@@ -5460,7 +5478,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
if (curr_cpu == target_cpu)
return 0;
- if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
+ if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
return -EINVAL;
/* TODO: This is not properly updating schedstats */
@@ -5560,7 +5578,7 @@ static void migrate_tasks(struct rq *dead_rq)
{
struct rq *rq = dead_rq;
struct task_struct *next, *stop = rq->stop;
- struct rq_flags rf, old_rf;
+ struct rq_flags rf;
int dest_cpu;
/*
@@ -5579,7 +5597,9 @@ static void migrate_tasks(struct rq *dead_rq)
* class method both need to have an up-to-date
* value of rq->clock[_task]
*/
+ rq_pin_lock(rq, &rf);
update_rq_clock(rq);
+ rq_unpin_lock(rq, &rf);
for (;;) {
/*
@@ -5592,7 +5612,7 @@ static void migrate_tasks(struct rq *dead_rq)
/*
* pick_next_task() assumes pinned rq->lock:
*/
- rq_pin_lock(rq, &rf);
+ rq_repin_lock(rq, &rf);
next = pick_next_task(rq, &fake_task, &rf);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
@@ -5621,13 +5641,6 @@ static void migrate_tasks(struct rq *dead_rq)
continue;
}
- /*
- * __migrate_task() may return with a different
- * rq->lock held and a new cookie in 'rf', but we need
- * to preserve rf::clock_update_flags for 'dead_rq'.
- */
- old_rf = rf;
-
/* Find suitable destination for @next, with force if needed. */
dest_cpu = select_fallback_rq(dead_rq->cpu, next);
@@ -5636,7 +5649,6 @@ static void migrate_tasks(struct rq *dead_rq)
raw_spin_unlock(&rq->lock);
rq = dead_rq;
raw_spin_lock(&rq->lock);
- rf = old_rf;
}
raw_spin_unlock(&next->pi_lock);
}
@@ -6098,7 +6110,7 @@ void __init sched_init(void)
/*
* The boot idle thread does lazy MMU switching as well:
*/
- atomic_inc(&init_mm.mm_count);
+ mmgrab(&init_mm);
enter_lazy_tlb(&init_mm, current);
/*
@@ -6819,11 +6831,20 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
- sched_online_group(tg, parent);
-
return &tg->css;
}
+/* Expose task group only after completing cgroup initialization */
+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+ struct task_group *parent = css_tg(css->parent);
+
+ if (parent)
+ sched_online_group(tg, parent);
+ return 0;
+}
+
static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
@@ -7229,6 +7250,7 @@ static struct cftype cpu_files[] = {
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
+ .css_online = cpu_cgroup_css_online,
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.fork = cpu_cgroup_fork,
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index e73119013c53..fba235c7d026 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -128,10 +128,10 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
const struct sched_dl_entity *dl_se = &p->dl;
if (later_mask &&
- cpumask_and(later_mask, cp->free_cpus, tsk_cpus_allowed(p))) {
+ cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
best_cpu = cpumask_any(later_mask);
goto out;
- } else if (cpumask_test_cpu(cpudl_maximum(cp), tsk_cpus_allowed(p)) &&
+ } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
best_cpu = cpudl_maximum(cp);
if (later_mask)
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index fd4659313640..8f8de3d4d6b7 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -13,6 +13,7 @@
#include <linux/cpufreq.h>
#include <linux/kthread.h>
+#include <uapi/linux/sched/types.h>
#include <linux/slab.h>
#include <trace/events/power.h>
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 11e9705bf937..981fcd7dc394 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -103,11 +103,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
if (skip)
continue;
- if (cpumask_any_and(tsk_cpus_allowed(p), vec->mask) >= nr_cpu_ids)
+ if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
continue;
if (lowest_mask) {
- cpumask_and(lowest_mask, tsk_cpus_allowed(p), vec->mask);
+ cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
/*
* We have to ensure that we have at least one bit
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 2ecec3a4f1ee..f3778e2b46c8 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -4,12 +4,8 @@
#include <linux/kernel_stat.h>
#include <linux/static_key.h>
#include <linux/context_tracking.h>
-#include <linux/cputime.h>
+#include <linux/sched/cputime.h>
#include "sched.h"
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#endif
-
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 27737f34757d..99b2c33a9fbc 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -134,7 +134,7 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
struct task_struct *p = dl_task_of(dl_se);
- if (tsk_nr_cpus_allowed(p) > 1)
+ if (p->nr_cpus_allowed > 1)
dl_rq->dl_nr_migratory++;
update_dl_migration(dl_rq);
@@ -144,7 +144,7 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
struct task_struct *p = dl_task_of(dl_se);
- if (tsk_nr_cpus_allowed(p) > 1)
+ if (p->nr_cpus_allowed > 1)
dl_rq->dl_nr_migratory--;
update_dl_migration(dl_rq);
@@ -252,7 +252,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
* If we cannot preempt any rq, fall back to pick any
* online cpu.
*/
- cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));
+ cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
if (cpu >= nr_cpu_ids) {
/*
* Fail to find any suitable cpu.
@@ -958,7 +958,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
enqueue_dl_entity(&p->dl, pi_se, flags);
- if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
+ if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
}
@@ -1032,9 +1032,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
* try to make it stay here, it might be important.
*/
if (unlikely(dl_task(curr)) &&
- (tsk_nr_cpus_allowed(curr) < 2 ||
+ (curr->nr_cpus_allowed < 2 ||
!dl_entity_preempt(&p->dl, &curr->dl)) &&
- (tsk_nr_cpus_allowed(p) > 1)) {
+ (p->nr_cpus_allowed > 1)) {
int target = find_later_rq(p);
if (target != -1 &&
@@ -1055,7 +1055,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
* Current can't be migrated, useless to reschedule,
* let's hope p can move out.
*/
- if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
+ if (rq->curr->nr_cpus_allowed == 1 ||
cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
return;
@@ -1063,7 +1063,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
* p is migratable, so let's not schedule it and
* see if it is pushed or pulled somewhere else.
*/
- if (tsk_nr_cpus_allowed(p) != 1 &&
+ if (p->nr_cpus_allowed != 1 &&
cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
return;
@@ -1178,7 +1178,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
{
update_curr_dl(rq);
- if (on_dl_rq(&p->dl) && tsk_nr_cpus_allowed(p) > 1)
+ if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
}
@@ -1235,7 +1235,7 @@ static void set_curr_task_dl(struct rq *rq)
static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ cpumask_test_cpu(cpu, &p->cpus_allowed))
return 1;
return 0;
}
@@ -1279,7 +1279,7 @@ static int find_later_rq(struct task_struct *task)
if (unlikely(!later_mask))
return -1;
- if (tsk_nr_cpus_allowed(task) == 1)
+ if (task->nr_cpus_allowed == 1)
return -1;
/*
@@ -1384,8 +1384,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
if (unlikely(task_rq(task) != rq ||
- !cpumask_test_cpu(later_rq->cpu,
- tsk_cpus_allowed(task)) ||
+ !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) ||
task_running(rq, task) ||
!dl_task(task) ||
!task_on_rq_queued(task))) {
@@ -1425,7 +1424,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
BUG_ON(rq->cpu != task_cpu(p));
BUG_ON(task_current(rq, p));
- BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
+ BUG_ON(p->nr_cpus_allowed <= 1);
BUG_ON(!task_on_rq_queued(p));
BUG_ON(!dl_task(p));
@@ -1464,7 +1463,7 @@ retry:
*/
if (dl_task(rq->curr) &&
dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
- tsk_nr_cpus_allowed(rq->curr) > 1) {
+ rq->curr->nr_cpus_allowed > 1) {
resched_curr(rq);
return 0;
}
@@ -1611,9 +1610,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
{
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
- tsk_nr_cpus_allowed(p) > 1 &&
+ p->nr_cpus_allowed > 1 &&
dl_task(rq->curr) &&
- (tsk_nr_cpus_allowed(rq->curr) < 2 ||
+ (rq->curr->nr_cpus_allowed < 2 ||
!dl_entity_preempt(&p->dl, &rq->curr->dl))) {
push_dl_tasks(rq);
}
@@ -1727,7 +1726,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
if (rq->curr != p) {
#ifdef CONFIG_SMP
- if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
+ if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
queue_push_tasks(rq);
#endif
if (dl_task(rq->curr))
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 109adc0e9cb9..38f019324f1a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -11,7 +11,8 @@
*/
#include <linux/proc_fs.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/task.h>
#include <linux/seq_file.h>
#include <linux/kallsyms.h>
#include <linux/utsname.h>
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 274c747a01ce..3e88b35ac157 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,7 +20,9 @@
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*/
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/topology.h>
+
#include <linux/latencytop.h>
#include <linux/cpumask.h>
#include <linux/cpuidle.h>
@@ -1551,7 +1553,7 @@ static void task_numa_compare(struct task_numa_env *env,
*/
if (cur) {
/* Skip this swap candidate if cannot move to the source cpu */
- if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
+ if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
goto unlock;
/*
@@ -1661,7 +1663,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
/* Skip this CPU if the source task cannot migrate */
- if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
+ if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
continue;
env->dst_cpu = cpu;
@@ -5458,7 +5460,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
/* Skip over this group if it has no CPUs allowed */
if (!cpumask_intersects(sched_group_cpus(group),
- tsk_cpus_allowed(p)))
+ &p->cpus_allowed))
continue;
local_group = cpumask_test_cpu(this_cpu,
@@ -5578,7 +5580,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
return cpumask_first(sched_group_cpus(group));
/* Traverse only the allowed CPUs */
- for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
+ for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
if (idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
@@ -5717,7 +5719,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
if (!test_idle_cores(target, false))
return -1;
- cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
+ cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
for_each_cpu_wrap(core, cpus, target, wrap) {
bool idle = true;
@@ -5751,7 +5753,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
return -1;
for_each_cpu(cpu, cpu_smt_mask(target)) {
- if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
continue;
if (idle_cpu(cpu))
return cpu;
@@ -5803,7 +5805,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
time = local_clock();
for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
- if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
continue;
if (idle_cpu(cpu))
break;
@@ -5958,7 +5960,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
- && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ && cpumask_test_cpu(cpu, &p->cpus_allowed);
}
rcu_read_lock();
@@ -6698,7 +6700,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;
- if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+ if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
int cpu;
schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
@@ -6718,7 +6720,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/* Prevent to re-select dst_cpu via env's cpus */
for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
- if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+ if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
env->flags |= LBF_DST_PINNED;
env->new_dst_cpu = cpu;
break;
@@ -7252,7 +7254,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
/*
* Group imbalance indicates (and tries to solve) the problem where balancing
- * groups is inadequate due to tsk_cpus_allowed() constraints.
+ * groups is inadequate due to ->cpus_allowed constraints.
*
* Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
* cpumask covering 1 cpu of the first group and 3 cpus of the second group.
@@ -8211,8 +8213,7 @@ more_balance:
* if the curr task on busiest cpu can't be
* moved to this_cpu
*/
- if (!cpumask_test_cpu(this_cpu,
- tsk_cpus_allowed(busiest->curr))) {
+ if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
raw_spin_unlock_irqrestore(&busiest->lock,
flags);
env.flags |= LBF_ALL_PINNED;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 6a4bae0a649d..ac6d5176463d 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -2,6 +2,7 @@
* Generic entry point for the idle threads
*/
#include <linux/sched.h>
+#include <linux/sched/idle.h>
#include <linux/cpu.h>
#include <linux/cpuidle.h>
#include <linux/cpuhotplug.h>
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index a2d6eb71f06b..7296b7308eca 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -7,6 +7,7 @@
*/
#include <linux/export.h>
+#include <linux/sched/loadavg.h>
#include "sched.h"
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e8836cfc4cdb..9f3e40226dec 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -335,7 +335,7 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq = &rq_of_rt_rq(rt_rq)->rt;
rt_rq->rt_nr_total++;
- if (tsk_nr_cpus_allowed(p) > 1)
+ if (p->nr_cpus_allowed > 1)
rt_rq->rt_nr_migratory++;
update_rt_migration(rt_rq);
@@ -352,7 +352,7 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq = &rq_of_rt_rq(rt_rq)->rt;
rt_rq->rt_nr_total--;
- if (tsk_nr_cpus_allowed(p) > 1)
+ if (p->nr_cpus_allowed > 1)
rt_rq->rt_nr_migratory--;
update_rt_migration(rt_rq);
@@ -1324,7 +1324,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
enqueue_rt_entity(rt_se, flags);
- if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
+ if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
}
@@ -1413,7 +1413,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
* will have to sort it out.
*/
if (curr && unlikely(rt_task(curr)) &&
- (tsk_nr_cpus_allowed(curr) < 2 ||
+ (curr->nr_cpus_allowed < 2 ||
curr->prio <= p->prio)) {
int target = find_lowest_rq(p);
@@ -1437,7 +1437,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
* Current can't be migrated, useless to reschedule,
* let's hope p can move out.
*/
- if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
+ if (rq->curr->nr_cpus_allowed == 1 ||
!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
return;
@@ -1445,7 +1445,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
* p is migratable, so let's not schedule it and
* see if it is pushed or pulled somewhere else.
*/
- if (tsk_nr_cpus_allowed(p) != 1
+ if (p->nr_cpus_allowed != 1
&& cpupri_find(&rq->rd->cpupri, p, NULL))
return;
@@ -1579,7 +1579,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
* The previous task needs to be made eligible for pushing
* if it is still active
*/
- if (on_rt_rq(&p->rt) && tsk_nr_cpus_allowed(p) > 1)
+ if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
}
@@ -1591,7 +1591,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ cpumask_test_cpu(cpu, &p->cpus_allowed))
return 1;
return 0;
}
@@ -1629,7 +1629,7 @@ static int find_lowest_rq(struct task_struct *task)
if (unlikely(!lowest_mask))
return -1;
- if (tsk_nr_cpus_allowed(task) == 1)
+ if (task->nr_cpus_allowed == 1)
return -1; /* No other targets possible */
if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
@@ -1726,8 +1726,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
* Also make sure that it wasn't scheduled on its rq.
*/
if (unlikely(task_rq(task) != rq ||
- !cpumask_test_cpu(lowest_rq->cpu,
- tsk_cpus_allowed(task)) ||
+ !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
task_running(rq, task) ||
!rt_task(task) ||
!task_on_rq_queued(task))) {
@@ -1762,7 +1761,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
BUG_ON(rq->cpu != task_cpu(p));
BUG_ON(task_current(rq, p));
- BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
+ BUG_ON(p->nr_cpus_allowed <= 1);
BUG_ON(!task_on_rq_queued(p));
BUG_ON(!rt_task(p));
@@ -2122,9 +2121,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
- tsk_nr_cpus_allowed(p) > 1 &&
+ p->nr_cpus_allowed > 1 &&
(dl_task(rq->curr) || rt_task(rq->curr)) &&
- (tsk_nr_cpus_allowed(rq->curr) < 2 ||
+ (rq->curr->nr_cpus_allowed < 2 ||
rq->curr->prio <= p->prio))
push_rt_tasks(rq);
}
@@ -2197,7 +2196,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
*/
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
- if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded)
+ if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
queue_push_tasks(rq);
#endif /* CONFIG_SMP */
if (p->prio < rq->curr->prio)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 71b10a9b73cf..5cbf92214ad8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,9 +1,26 @@
#include <linux/sched.h>
+#include <linux/sched/autogroup.h>
#include <linux/sched/sysctl.h>
+#include <linux/sched/topology.h>
#include <linux/sched/rt.h>
-#include <linux/u64_stats_sync.h>
#include <linux/sched/deadline.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/wake_q.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/numa_balancing.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/cpufreq.h>
+#include <linux/sched/stat.h>
+#include <linux/sched/nohz.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/init.h>
+
+#include <linux/u64_stats_sync.h>
#include <linux/kernel_stat.h>
#include <linux/binfmts.h>
#include <linux/mutex.h>
@@ -13,6 +30,10 @@
#include <linux/tick.h>
#include <linux/slab.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
+
#include "cpupri.h"
#include "cpudeadline.h"
#include "cpuacct.h"
@@ -1817,7 +1838,6 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
extern void print_dl_stats(struct seq_file *m, int cpu);
extern void
print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
-
#ifdef CONFIG_NUMA_BALANCING
extern void
show_numa_stats(struct task_struct *p, struct seq_file *m);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index bf0da0aa0a14..d5710651043b 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -164,114 +164,3 @@ sched_info_switch(struct rq *rq,
#define sched_info_arrive(rq, next) do { } while (0)
#define sched_info_switch(rq, t, next) do { } while (0)
#endif /* CONFIG_SCHED_INFO */
-
-/*
- * The following are functions that support scheduler-internal time accounting.
- * These functions are generally called at the timer tick. None of this depends
- * on CONFIG_SCHEDSTATS.
- */
-
-/**
- * get_running_cputimer - return &tsk->signal->cputimer if cputimer is running
- *
- * @tsk: Pointer to target task.
- */
-#ifdef CONFIG_POSIX_TIMERS
-static inline
-struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk)
-{
- struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
-
- /* Check if cputimer isn't running. This is accessed without locking. */
- if (!READ_ONCE(cputimer->running))
- return NULL;
-
- /*
- * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime
- * in __exit_signal(), we won't account to the signal struct further
- * cputime consumed by that task, even though the task can still be
- * ticking after __exit_signal().
- *
- * In order to keep a consistent behaviour between thread group cputime
- * and thread group cputimer accounting, lets also ignore the cputime
- * elapsing after __exit_signal() in any thread group timer running.
- *
- * This makes sure that POSIX CPU clocks and timers are synchronized, so
- * that a POSIX CPU timer won't expire while the corresponding POSIX CPU
- * clock delta is behind the expiring timer value.
- */
- if (unlikely(!tsk->sighand))
- return NULL;
-
- return cputimer;
-}
-#else
-static inline
-struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk)
-{
- return NULL;
-}
-#endif
-
-/**
- * account_group_user_time - Maintain utime for a thread group.
- *
- * @tsk: Pointer to task structure.
- * @cputime: Time value by which to increment the utime field of the
- * thread_group_cputime structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the utime field there.
- */
-static inline void account_group_user_time(struct task_struct *tsk,
- u64 cputime)
-{
- struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
-
- if (!cputimer)
- return;
-
- atomic64_add(cputime, &cputimer->cputime_atomic.utime);
-}
-
-/**
- * account_group_system_time - Maintain stime for a thread group.
- *
- * @tsk: Pointer to task structure.
- * @cputime: Time value by which to increment the stime field of the
- * thread_group_cputime structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the stime field there.
- */
-static inline void account_group_system_time(struct task_struct *tsk,
- u64 cputime)
-{
- struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
-
- if (!cputimer)
- return;
-
- atomic64_add(cputime, &cputimer->cputime_atomic.stime);
-}
-
-/**
- * account_group_exec_runtime - Maintain exec runtime for a thread group.
- *
- * @tsk: Pointer to task structure.
- * @ns: Time value by which to increment the sum_exec_runtime field
- * of the thread_group_cputime structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the sum_exec_runtime field there.
- */
-static inline void account_group_exec_runtime(struct task_struct *tsk,
- unsigned long long ns)
-{
- struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
-
- if (!cputimer)
- return;
-
- atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
-}
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 82f0dff90030..3d5610dcce11 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -1,4 +1,4 @@
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/swait.h>
void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 9453efe9b25a..4d2ea6f25568 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -5,7 +5,8 @@
*/
#include <linux/init.h>
#include <linux/export.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
#include <linux/mm.h>
#include <linux/wait.h>
#include <linux/hash.h>
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index e15185c28de5..65f61077ad50 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -18,6 +18,7 @@
#include <linux/compat.h>
#include <linux/coredump.h>
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/seccomp.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
diff --git a/kernel/signal.c b/kernel/signal.c
index 13f9def8b24a..7e59ebc2c25e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -13,7 +13,12 @@
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/init.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/user.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/sched/cputime.h>
#include <linux/fs.h>
#include <linux/tty.h>
#include <linux/binfmts.h>
@@ -2395,11 +2400,11 @@ void exit_signals(struct task_struct *tsk)
* @tsk is about to have PF_EXITING set - lock out users which
* expect stable threadgroup.
*/
- threadgroup_change_begin(tsk);
+ cgroup_threadgroup_change_begin(tsk);
if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
tsk->flags |= PF_EXITING;
- threadgroup_change_end(tsk);
+ cgroup_threadgroup_change_end(tsk);
return;
}
@@ -2410,7 +2415,7 @@ void exit_signals(struct task_struct *tsk)
*/
tsk->flags |= PF_EXITING;
- threadgroup_change_end(tsk);
+ cgroup_threadgroup_change_end(tsk);
if (!signal_pending(tsk))
goto out;
@@ -3239,10 +3244,17 @@ int compat_restore_altstack(const compat_stack_t __user *uss)
int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
{
+ int err;
struct task_struct *t = current;
- return __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) |
- __put_user(sas_ss_flags(sp), &uss->ss_flags) |
+ err = __put_user(ptr_to_compat((void __user *)t->sas_ss_sp),
+ &uss->ss_sp) |
+ __put_user(t->sas_ss_flags, &uss->ss_flags) |
__put_user(t->sas_ss_size, &uss->ss_size);
+ if (err)
+ return err;
+ if (t->sas_ss_flags & SS_AUTODISARM)
+ sas_ss_reset(t);
+ return 0;
}
#endif
diff --git a/kernel/smp.c b/kernel/smp.c
index 77fcdb9f2775..a817769b53c0 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -17,6 +17,7 @@
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/sched.h>
+#include <linux/sched/idle.h>
#include <linux/hypervisor.h>
#include "smpboot.h"
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 4a5c6e73ecd4..1d71c051a951 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -9,6 +9,7 @@
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/sched/task.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
diff --git a/kernel/sys.c b/kernel/sys.c
index 7d4a9a6df956..7ff6d1b10cec 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -49,6 +49,13 @@
#include <linux/binfmts.h>
#include <linux/sched.h>
+#include <linux/sched/autogroup.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/stat.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
+#include <linux/sched/task.h>
+#include <linux/sched/cputime.h>
#include <linux/rcupdate.h>
#include <linux/uidgid.h>
#include <linux/cred.h>
@@ -2063,6 +2070,24 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
}
#endif
+static int propagate_has_child_subreaper(struct task_struct *p, void *data)
+{
+ /*
+ * If task has has_child_subreaper - all its decendants
+ * already have these flag too and new decendants will
+ * inherit it on fork, skip them.
+ *
+ * If we've found child_reaper - skip descendants in
+ * it's subtree as they will never get out pidns.
+ */
+ if (p->signal->has_child_subreaper ||
+ is_child_reaper(task_pid(p)))
+ return 0;
+
+ p->signal->has_child_subreaper = 1;
+ return 1;
+}
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -2214,6 +2239,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
break;
case PR_SET_CHILD_SUBREAPER:
me->signal->is_child_subreaper = !!arg2;
+ if (!arg2)
+ break;
+
+ walk_process_tree(me, propagate_has_child_subreaper, NULL);
break;
case PR_GET_CHILD_SUBREAPER:
error = put_user(me->signal->is_child_subreaper,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index bb260ceb3718..acf0a5a06da7 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -63,6 +63,7 @@
#include <linux/capability.h>
#include <linux/binfmts.h>
#include <linux/sched/sysctl.h>
+#include <linux/sched/coredump.h>
#include <linux/kexec.h>
#include <linux/bpf.h>
#include <linux/mount.h>
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index e6dc9a538efa..ce3a31e8eb36 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -19,6 +19,8 @@
#include <linux/hrtimer.h>
#include <linux/timerqueue.h>
#include <linux/rtc.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
#include <linux/alarmtimer.h>
#include <linux/mutex.h>
#include <linux/platform_device.h>
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 8e11d8d9f419..ec08f527d7ee 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -43,10 +43,12 @@
#include <linux/seq_file.h>
#include <linux/err.h>
#include <linux/debugobjects.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/rt.h>
#include <linux/sched/deadline.h>
+#include <linux/sched/nohz.h>
+#include <linux/sched/debug.h>
#include <linux/timer.h>
#include <linux/freezer.h>
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index a95f13c31464..087d6a1279b8 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -10,6 +10,8 @@
#include <linux/interrupt.h>
#include <linux/syscalls.h>
#include <linux/time.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/cputime.h>
#include <linux/posix-timers.h>
#include <linux/hrtimer.h>
#include <trace/events/timer.h>
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index b4377a5e4269..4513ad16a253 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -2,7 +2,8 @@
* Implement CPU time clocks for the POSIX clock interface.
*/
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/cputime.h>
#include <linux/posix-timers.h>
#include <linux/errno.h>
#include <linux/math64.h>
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 1e6623d76750..50a6a47020de 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -35,6 +35,7 @@
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/mutex.h>
+#include <linux/sched/task.h>
#include <linux/uaccess.h>
#include <linux/list.h>
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index a26036d37a38..ea6b610c4c57 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -13,6 +13,7 @@
#include <linux/kernel.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>
+#include <linux/sched/clock.h>
#include <linux/syscore_ops.h>
#include <linux/hrtimer.h>
#include <linux/sched_clock.h>
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2c115fdab397..7fe53be86077 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -17,8 +17,12 @@
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/percpu.h>
+#include <linux/nmi.h>
#include <linux/profile.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/stat.h>
+#include <linux/sched/nohz.h>
#include <linux/module.h>
#include <linux/irq_work.h>
#include <linux/posix-timers.h>
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 95b258dd75db..5b63a2102c29 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -14,7 +14,9 @@
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mm.h>
+#include <linux/nmi.h>
#include <linux/sched.h>
+#include <linux/sched/loadavg.h>
#include <linux/syscore_ops.h>
#include <linux/clocksource.h>
#include <linux/jiffies.h>
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 82a6bfa0c307..1dc0256bfb6e 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -38,8 +38,10 @@
#include <linux/tick.h>
#include <linux/kallsyms.h>
#include <linux/irq_work.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/sched/sysctl.h>
+#include <linux/sched/nohz.h>
+#include <linux/sched/debug.h>
#include <linux/slab.h>
#include <linux/compat.h>
diff --git a/kernel/torture.c b/kernel/torture.c
index 0d887eb62856..55de96529287 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -30,6 +30,7 @@
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
+#include <linux/sched/clock.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/completion.h>
@@ -311,7 +312,7 @@ EXPORT_SYMBOL_GPL(torture_random);
/*
* Variables for shuffling. The idea is to ensure that each CPU stays
* idle for an extended period to test interactions with dyntick idle,
- * as well as interactions with any per-CPU varibles.
+ * as well as interactions with any per-CPU variables.
*/
struct shuffle_task {
struct list_head st_l;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index eb230f06ba41..0d1597c9ee30 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -15,6 +15,7 @@
#include <linux/stop_machine.h>
#include <linux/clocksource.h>
+#include <linux/sched/task.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
#include <linux/suspend.h>
@@ -1110,13 +1111,6 @@ struct ftrace_func_entry {
unsigned long ip;
};
-struct ftrace_hash {
- unsigned long size_bits;
- struct hlist_head *buckets;
- unsigned long count;
- struct rcu_head rcu;
-};
-
/*
* We make these constant because no one should touch them,
* but they are used as the default "empty hash", to avoid allocating
@@ -1192,26 +1186,24 @@ struct ftrace_page {
static struct ftrace_page *ftrace_pages_start;
static struct ftrace_page *ftrace_pages;
-static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash)
+static __always_inline unsigned long
+ftrace_hash_key(struct ftrace_hash *hash, unsigned long ip)
{
- return !hash || !hash->count;
+ if (hash->size_bits > 0)
+ return hash_long(ip, hash->size_bits);
+
+ return 0;
}
-static struct ftrace_func_entry *
-ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
+/* Only use this function if ftrace_hash_empty() has already been tested */
+static __always_inline struct ftrace_func_entry *
+__ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
{
unsigned long key;
struct ftrace_func_entry *entry;
struct hlist_head *hhd;
- if (ftrace_hash_empty(hash))
- return NULL;
-
- if (hash->size_bits > 0)
- key = hash_long(ip, hash->size_bits);
- else
- key = 0;
-
+ key = ftrace_hash_key(hash, ip);
hhd = &hash->buckets[key];
hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) {
@@ -1221,17 +1213,32 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
return NULL;
}
+/**
+ * ftrace_lookup_ip - Test to see if an ip exists in an ftrace_hash
+ * @hash: The hash to look at
+ * @ip: The instruction pointer to test
+ *
+ * Search a given @hash to see if a given instruction pointer (@ip)
+ * exists in it.
+ *
+ * Returns the entry that holds the @ip if found. NULL otherwise.
+ */
+struct ftrace_func_entry *
+ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
+{
+ if (ftrace_hash_empty(hash))
+ return NULL;
+
+ return __ftrace_lookup_ip(hash, ip);
+}
+
static void __add_hash_entry(struct ftrace_hash *hash,
struct ftrace_func_entry *entry)
{
struct hlist_head *hhd;
unsigned long key;
- if (hash->size_bits)
- key = hash_long(entry->ip, hash->size_bits);
- else
- key = 0;
-
+ key = ftrace_hash_key(hash, entry->ip);
hhd = &hash->buckets[key];
hlist_add_head(&entry->hlist, hhd);
hash->count++;
@@ -1383,9 +1390,8 @@ ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash);
static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
struct ftrace_hash *new_hash);
-static int
-ftrace_hash_move(struct ftrace_ops *ops, int enable,
- struct ftrace_hash **dst, struct ftrace_hash *src)
+static struct ftrace_hash *
+__ftrace_hash_move(struct ftrace_hash *src)
{
struct ftrace_func_entry *entry;
struct hlist_node *tn;
@@ -1393,21 +1399,13 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
struct ftrace_hash *new_hash;
int size = src->count;
int bits = 0;
- int ret;
int i;
- /* Reject setting notrace hash on IPMODIFY ftrace_ops */
- if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable)
- return -EINVAL;
-
/*
- * If the new source is empty, just free dst and assign it
- * the empty_hash.
+ * If the new source is empty, just return the empty_hash.
*/
- if (!src->count) {
- new_hash = EMPTY_HASH;
- goto update;
- }
+ if (!src->count)
+ return EMPTY_HASH;
/*
* Make the hash size about 1/2 the # found
@@ -1421,7 +1419,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
new_hash = alloc_ftrace_hash(bits);
if (!new_hash)
- return -ENOMEM;
+ return NULL;
size = 1 << src->size_bits;
for (i = 0; i < size; i++) {
@@ -1432,7 +1430,24 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
}
}
-update:
+ return new_hash;
+}
+
+static int
+ftrace_hash_move(struct ftrace_ops *ops, int enable,
+ struct ftrace_hash **dst, struct ftrace_hash *src)
+{
+ struct ftrace_hash *new_hash;
+ int ret;
+
+ /* Reject setting notrace hash on IPMODIFY ftrace_ops */
+ if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable)
+ return -EINVAL;
+
+ new_hash = __ftrace_hash_move(src);
+ if (!new_hash)
+ return -ENOMEM;
+
/* Make sure this can be applied if it is IPMODIFY ftrace_ops */
if (enable) {
/* IPMODIFY should be updated only when filter_hash updating */
@@ -1466,9 +1481,9 @@ static bool hash_contains_ip(unsigned long ip,
* notrace hash is considered not in the notrace hash.
*/
return (ftrace_hash_empty(hash->filter_hash) ||
- ftrace_lookup_ip(hash->filter_hash, ip)) &&
+ __ftrace_lookup_ip(hash->filter_hash, ip)) &&
(ftrace_hash_empty(hash->notrace_hash) ||
- !ftrace_lookup_ip(hash->notrace_hash, ip));
+ !__ftrace_lookup_ip(hash->notrace_hash, ip));
}
/*
@@ -2880,7 +2895,7 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
/* The function must be in the filter */
if (!ftrace_hash_empty(ops->func_hash->filter_hash) &&
- !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))
+ !__ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))
return 0;
/* If in notrace hash, we ignore it too */
@@ -4382,7 +4397,7 @@ __setup("ftrace_filter=", set_ftrace_filter);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
-static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
+static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer);
static unsigned long save_global_trampoline;
static unsigned long save_global_flags;
@@ -4405,18 +4420,17 @@ static void __init set_ftrace_early_graph(char *buf, int enable)
{
int ret;
char *func;
- unsigned long *table = ftrace_graph_funcs;
- int *count = &ftrace_graph_count;
+ struct ftrace_hash *hash;
- if (!enable) {
- table = ftrace_graph_notrace_funcs;
- count = &ftrace_graph_notrace_count;
- }
+ if (enable)
+ hash = ftrace_graph_hash;
+ else
+ hash = ftrace_graph_notrace_hash;
while (buf) {
func = strsep(&buf, ",");
/* we allow only one expression at a time */
- ret = ftrace_set_func(table, count, FTRACE_GRAPH_MAX_FUNCS, func);
+ ret = ftrace_graph_set_hash(hash, func);
if (ret)
printk(KERN_DEBUG "ftrace: function %s not "
"traceable\n", func);
@@ -4540,26 +4554,55 @@ static const struct file_operations ftrace_notrace_fops = {
static DEFINE_MUTEX(graph_lock);
-int ftrace_graph_count;
-int ftrace_graph_notrace_count;
-unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
-unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
+struct ftrace_hash *ftrace_graph_hash = EMPTY_HASH;
+struct ftrace_hash *ftrace_graph_notrace_hash = EMPTY_HASH;
+
+enum graph_filter_type {
+ GRAPH_FILTER_NOTRACE = 0,
+ GRAPH_FILTER_FUNCTION,
+};
+
+#define FTRACE_GRAPH_EMPTY ((void *)1)
struct ftrace_graph_data {
- unsigned long *table;
- size_t size;
- int *count;
- const struct seq_operations *seq_ops;
+ struct ftrace_hash *hash;
+ struct ftrace_func_entry *entry;
+ int idx; /* for hash table iteration */
+ enum graph_filter_type type;
+ struct ftrace_hash *new_hash;
+ const struct seq_operations *seq_ops;
+ struct trace_parser parser;
};
static void *
__g_next(struct seq_file *m, loff_t *pos)
{
struct ftrace_graph_data *fgd = m->private;
+ struct ftrace_func_entry *entry = fgd->entry;
+ struct hlist_head *head;
+ int i, idx = fgd->idx;
- if (*pos >= *fgd->count)
+ if (*pos >= fgd->hash->count)
return NULL;
- return &fgd->table[*pos];
+
+ if (entry) {
+ hlist_for_each_entry_continue(entry, hlist) {
+ fgd->entry = entry;
+ return entry;
+ }
+
+ idx++;
+ }
+
+ for (i = idx; i < 1 << fgd->hash->size_bits; i++) {
+ head = &fgd->hash->buckets[i];
+ hlist_for_each_entry(entry, head, hlist) {
+ fgd->entry = entry;
+ fgd->idx = i;
+ return entry;
+ }
+ }
+ return NULL;
}
static void *
@@ -4575,10 +4618,19 @@ static void *g_start(struct seq_file *m, loff_t *pos)
mutex_lock(&graph_lock);
+ if (fgd->type == GRAPH_FILTER_FUNCTION)
+ fgd->hash = rcu_dereference_protected(ftrace_graph_hash,
+ lockdep_is_held(&graph_lock));
+ else
+ fgd->hash = rcu_dereference_protected(ftrace_graph_notrace_hash,
+ lockdep_is_held(&graph_lock));
+
/* Nothing, tell g_show to print all functions are enabled */
- if (!*fgd->count && !*pos)
- return (void *)1;
+ if (ftrace_hash_empty(fgd->hash) && !*pos)
+ return FTRACE_GRAPH_EMPTY;
+ fgd->idx = 0;
+ fgd->entry = NULL;
return __g_next(m, pos);
}
@@ -4589,22 +4641,22 @@ static void g_stop(struct seq_file *m, void *p)
static int g_show(struct seq_file *m, void *v)
{
- unsigned long *ptr = v;
+ struct ftrace_func_entry *entry = v;
- if (!ptr)
+ if (!entry)
return 0;
- if (ptr == (unsigned long *)1) {
+ if (entry == FTRACE_GRAPH_EMPTY) {
struct ftrace_graph_data *fgd = m->private;
- if (fgd->table == ftrace_graph_funcs)
+ if (fgd->type == GRAPH_FILTER_FUNCTION)
seq_puts(m, "#### all functions enabled ####\n");
else
seq_puts(m, "#### no functions disabled ####\n");
return 0;
}
- seq_printf(m, "%ps\n", (void *)*ptr);
+ seq_printf(m, "%ps\n", (void *)entry->ip);
return 0;
}
@@ -4621,24 +4673,51 @@ __ftrace_graph_open(struct inode *inode, struct file *file,
struct ftrace_graph_data *fgd)
{
int ret = 0;
+ struct ftrace_hash *new_hash = NULL;
- mutex_lock(&graph_lock);
- if ((file->f_mode & FMODE_WRITE) &&
- (file->f_flags & O_TRUNC)) {
- *fgd->count = 0;
- memset(fgd->table, 0, fgd->size * sizeof(*fgd->table));
+ if (file->f_mode & FMODE_WRITE) {
+ const int size_bits = FTRACE_HASH_DEFAULT_BITS;
+
+ if (trace_parser_get_init(&fgd->parser, FTRACE_BUFF_MAX))
+ return -ENOMEM;
+
+ if (file->f_flags & O_TRUNC)
+ new_hash = alloc_ftrace_hash(size_bits);
+ else
+ new_hash = alloc_and_copy_ftrace_hash(size_bits,
+ fgd->hash);
+ if (!new_hash) {
+ ret = -ENOMEM;
+ goto out;
+ }
}
- mutex_unlock(&graph_lock);
if (file->f_mode & FMODE_READ) {
- ret = seq_open(file, fgd->seq_ops);
+ ret = seq_open(file, &ftrace_graph_seq_ops);
if (!ret) {
struct seq_file *m = file->private_data;
m->private = fgd;
+ } else {
+ /* Failed */
+ free_ftrace_hash(new_hash);
+ new_hash = NULL;
}
} else
file->private_data = fgd;
+out:
+ if (ret < 0 && file->f_mode & FMODE_WRITE)
+ trace_parser_put(&fgd->parser);
+
+ fgd->new_hash = new_hash;
+
+ /*
+ * All uses of fgd->hash must be taken with the graph_lock
+ * held. The graph_lock is going to be released, so force
+ * fgd->hash to be reinitialized when it is taken again.
+ */
+ fgd->hash = NULL;
+
return ret;
}
@@ -4646,6 +4725,7 @@ static int
ftrace_graph_open(struct inode *inode, struct file *file)
{
struct ftrace_graph_data *fgd;
+ int ret;
if (unlikely(ftrace_disabled))
return -ENODEV;
@@ -4654,18 +4734,26 @@ ftrace_graph_open(struct inode *inode, struct file *file)
if (fgd == NULL)
return -ENOMEM;
- fgd->table = ftrace_graph_funcs;
- fgd->size = FTRACE_GRAPH_MAX_FUNCS;
- fgd->count = &ftrace_graph_count;
+ mutex_lock(&graph_lock);
+
+ fgd->hash = rcu_dereference_protected(ftrace_graph_hash,
+ lockdep_is_held(&graph_lock));
+ fgd->type = GRAPH_FILTER_FUNCTION;
fgd->seq_ops = &ftrace_graph_seq_ops;
- return __ftrace_graph_open(inode, file, fgd);
+ ret = __ftrace_graph_open(inode, file, fgd);
+ if (ret < 0)
+ kfree(fgd);
+
+ mutex_unlock(&graph_lock);
+ return ret;
}
static int
ftrace_graph_notrace_open(struct inode *inode, struct file *file)
{
struct ftrace_graph_data *fgd;
+ int ret;
if (unlikely(ftrace_disabled))
return -ENODEV;
@@ -4674,45 +4762,97 @@ ftrace_graph_notrace_open(struct inode *inode, struct file *file)
if (fgd == NULL)
return -ENOMEM;
- fgd->table = ftrace_graph_notrace_funcs;
- fgd->size = FTRACE_GRAPH_MAX_FUNCS;
- fgd->count = &ftrace_graph_notrace_count;
+ mutex_lock(&graph_lock);
+
+ fgd->hash = rcu_dereference_protected(ftrace_graph_notrace_hash,
+ lockdep_is_held(&graph_lock));
+ fgd->type = GRAPH_FILTER_NOTRACE;
fgd->seq_ops = &ftrace_graph_seq_ops;
- return __ftrace_graph_open(inode, file, fgd);
+ ret = __ftrace_graph_open(inode, file, fgd);
+ if (ret < 0)
+ kfree(fgd);
+
+ mutex_unlock(&graph_lock);
+ return ret;
}
static int
ftrace_graph_release(struct inode *inode, struct file *file)
{
+ struct ftrace_graph_data *fgd;
+ struct ftrace_hash *old_hash, *new_hash;
+ struct trace_parser *parser;
+ int ret = 0;
+
if (file->f_mode & FMODE_READ) {
struct seq_file *m = file->private_data;
- kfree(m->private);
+ fgd = m->private;
seq_release(inode, file);
} else {
- kfree(file->private_data);
+ fgd = file->private_data;
}
- return 0;
+
+ if (file->f_mode & FMODE_WRITE) {
+
+ parser = &fgd->parser;
+
+ if (trace_parser_loaded((parser))) {
+ parser->buffer[parser->idx] = 0;
+ ret = ftrace_graph_set_hash(fgd->new_hash,
+ parser->buffer);
+ }
+
+ trace_parser_put(parser);
+
+ new_hash = __ftrace_hash_move(fgd->new_hash);
+ if (!new_hash) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ mutex_lock(&graph_lock);
+
+ if (fgd->type == GRAPH_FILTER_FUNCTION) {
+ old_hash = rcu_dereference_protected(ftrace_graph_hash,
+ lockdep_is_held(&graph_lock));
+ rcu_assign_pointer(ftrace_graph_hash, new_hash);
+ } else {
+ old_hash = rcu_dereference_protected(ftrace_graph_notrace_hash,
+ lockdep_is_held(&graph_lock));
+ rcu_assign_pointer(ftrace_graph_notrace_hash, new_hash);
+ }
+
+ mutex_unlock(&graph_lock);
+
+ /* Wait till all users are no longer using the old hash */
+ synchronize_sched();
+
+ free_ftrace_hash(old_hash);
+ }
+
+ out:
+ kfree(fgd->new_hash);
+ kfree(fgd);
+
+ return ret;
}
static int
-ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
+ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer)
{
struct ftrace_glob func_g;
struct dyn_ftrace *rec;
struct ftrace_page *pg;
+ struct ftrace_func_entry *entry;
int fail = 1;
int not;
- bool exists;
- int i;
/* decode regex */
func_g.type = filter_parse_regex(buffer, strlen(buffer),
&func_g.search, &not);
- if (!not && *idx >= size)
- return -EBUSY;
func_g.len = strlen(func_g.search);
@@ -4729,26 +4869,18 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
continue;
if (ftrace_match_record(rec, &func_g, NULL, 0)) {
- /* if it is in the array */
- exists = false;
- for (i = 0; i < *idx; i++) {
- if (array[i] == rec->ip) {
- exists = true;
- break;
- }
- }
+ entry = ftrace_lookup_ip(hash, rec->ip);
if (!not) {
fail = 0;
- if (!exists) {
- array[(*idx)++] = rec->ip;
- if (*idx >= size)
- goto out;
- }
+
+ if (entry)
+ continue;
+ if (add_hash_entry(hash, rec->ip) < 0)
+ goto out;
} else {
- if (exists) {
- array[i] = array[--(*idx)];
- array[*idx] = 0;
+ if (entry) {
+ free_hash_entry(hash, entry);
fail = 0;
}
}
@@ -4767,35 +4899,34 @@ static ssize_t
ftrace_graph_write(struct file *file, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- struct trace_parser parser;
ssize_t read, ret = 0;
struct ftrace_graph_data *fgd = file->private_data;
+ struct trace_parser *parser;
if (!cnt)
return 0;
- if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX))
- return -ENOMEM;
-
- read = trace_get_user(&parser, ubuf, cnt, ppos);
+ /* Read mode uses seq functions */
+ if (file->f_mode & FMODE_READ) {
+ struct seq_file *m = file->private_data;
+ fgd = m->private;
+ }
- if (read >= 0 && trace_parser_loaded((&parser))) {
- parser.buffer[parser.idx] = 0;
+ parser = &fgd->parser;
- mutex_lock(&graph_lock);
+ read = trace_get_user(parser, ubuf, cnt, ppos);
- /* we allow only one expression at a time */
- ret = ftrace_set_func(fgd->table, fgd->count, fgd->size,
- parser.buffer);
+ if (read >= 0 && trace_parser_loaded(parser) &&
+ !trace_parser_cont(parser)) {
- mutex_unlock(&graph_lock);
+ ret = ftrace_graph_set_hash(fgd->new_hash,
+ parser->buffer);
+ trace_parser_clear(parser);
}
if (!ret)
ret = read;
- trace_parser_put(&parser);
-
return ret;
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a85739efcc30..96fc3c043ad6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -6,6 +6,7 @@
#include <linux/trace_events.h>
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
+#include <linux/sched/clock.h>
#include <linux/trace_seq.h>
#include <linux/spinlock.h>
#include <linux/irq_work.h>
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 6df9a83e20d7..c190a4d5013c 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -6,6 +6,7 @@
#include <linux/ring_buffer.h>
#include <linux/completion.h>
#include <linux/kthread.h>
+#include <uapi/linux/sched/types.h>
#include <linux/module.h>
#include <linux/ktime.h>
#include <asm/local.h>
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d7449783987a..707445ceb7ef 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -260,16 +260,8 @@ unsigned long long ns2usecs(u64 nsec)
TRACE_ITER_EVENT_FORK
/*
- * The global_trace is the descriptor that holds the tracing
- * buffers for the live tracing. For each CPU, it contains
- * a link list of pages that will store trace entries. The
- * page descriptor of the pages in the memory is used to hold
- * the link list by linking the lru item in the page descriptor
- * to each of the pages in the buffer per CPU.
- *
- * For each active CPU there is a data field that holds the
- * pages for the buffer for that CPU. Each CPU has the same number
- * of pages allocated for its buffer.
+ * The global_trace is the descriptor that holds the top-level tracing
+ * buffers for the live tracing.
*/
static struct trace_array global_trace = {
.trace_flags = TRACE_DEFAULT_FLAGS,
@@ -1193,6 +1185,7 @@ int trace_parser_get_init(struct trace_parser *parser, int size)
void trace_parser_put(struct trace_parser *parser)
{
kfree(parser->buffer);
+ parser->buffer = NULL;
}
/*
@@ -7503,7 +7496,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
ftrace_init_tracefs(tr, d_tracer);
}
-static struct vfsmount *trace_automount(void *ingore)
+static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
{
struct vfsmount *mnt;
struct file_system_type *type;
@@ -7516,7 +7509,7 @@ static struct vfsmount *trace_automount(void *ingore)
type = get_fs_type("tracefs");
if (!type)
return NULL;
- mnt = vfs_kern_mount(type, 0, "tracefs", NULL);
+ mnt = vfs_submount(mntpt, type, "tracefs", NULL);
put_filesystem(type);
if (IS_ERR(mnt))
return NULL;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1ea51ab53edf..ae1cce91fead 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -753,6 +753,21 @@ enum print_line_t print_trace_line(struct trace_iterator *iter);
extern char trace_find_mark(unsigned long long duration);
+struct ftrace_hash {
+ unsigned long size_bits;
+ struct hlist_head *buckets;
+ unsigned long count;
+ struct rcu_head rcu;
+};
+
+struct ftrace_func_entry *
+ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip);
+
+static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash)
+{
+ return !hash || !hash->count;
+}
+
/* Standard output formatting function used for function return traces */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -787,53 +802,50 @@ extern void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret *trace,
unsigned long flags, int pc);
-
#ifdef CONFIG_DYNAMIC_FTRACE
-/* TODO: make this variable */
-#define FTRACE_GRAPH_MAX_FUNCS 32
-extern int ftrace_graph_count;
-extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
-extern int ftrace_graph_notrace_count;
-extern unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS];
+extern struct ftrace_hash *ftrace_graph_hash;
+extern struct ftrace_hash *ftrace_graph_notrace_hash;
static inline int ftrace_graph_addr(unsigned long addr)
{
- int i;
-
- if (!ftrace_graph_count)
- return 1;
-
- for (i = 0; i < ftrace_graph_count; i++) {
- if (addr == ftrace_graph_funcs[i]) {
- /*
- * If no irqs are to be traced, but a set_graph_function
- * is set, and called by an interrupt handler, we still
- * want to trace it.
- */
- if (in_irq())
- trace_recursion_set(TRACE_IRQ_BIT);
- else
- trace_recursion_clear(TRACE_IRQ_BIT);
- return 1;
- }
+ int ret = 0;
+
+ preempt_disable_notrace();
+
+ if (ftrace_hash_empty(ftrace_graph_hash)) {
+ ret = 1;
+ goto out;
}
- return 0;
+ if (ftrace_lookup_ip(ftrace_graph_hash, addr)) {
+ /*
+ * If no irqs are to be traced, but a set_graph_function
+ * is set, and called by an interrupt handler, we still
+ * want to trace it.
+ */
+ if (in_irq())
+ trace_recursion_set(TRACE_IRQ_BIT);
+ else
+ trace_recursion_clear(TRACE_IRQ_BIT);
+ ret = 1;
+ }
+
+out:
+ preempt_enable_notrace();
+ return ret;
}
static inline int ftrace_graph_notrace_addr(unsigned long addr)
{
- int i;
+ int ret = 0;
- if (!ftrace_graph_notrace_count)
- return 0;
+ preempt_disable_notrace();
- for (i = 0; i < ftrace_graph_notrace_count; i++) {
- if (addr == ftrace_graph_notrace_funcs[i])
- return 1;
- }
+ if (ftrace_lookup_ip(ftrace_graph_notrace_hash, addr))
+ ret = 1;
- return 0;
+ preempt_enable_notrace();
+ return ret;
}
#else
static inline int ftrace_graph_addr(unsigned long addr)
@@ -1300,7 +1312,8 @@ static inline bool is_string_field(struct ftrace_event_field *field)
{
return field->filter_type == FILTER_DYN_STRING ||
field->filter_type == FILTER_STATIC_STRING ||
- field->filter_type == FILTER_PTR_STRING;
+ field->filter_type == FILTER_PTR_STRING ||
+ field->filter_type == FILTER_COMM;
}
static inline bool is_function_field(struct ftrace_event_field *field)
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index e3b488825ae3..e49fbe901cfc 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -175,9 +175,9 @@ int trace_benchmark_reg(void)
bm_event_thread = kthread_run(benchmark_event_kthread,
NULL, "event_benchmark");
- if (!bm_event_thread) {
+ if (IS_ERR(bm_event_thread)) {
pr_warning("trace benchmark failed to create kernel thread\n");
- return -ENOMEM;
+ return PTR_ERR(bm_event_thread);
}
return 0;
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 75489de546b6..4d8fdf3184dc 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -27,7 +27,7 @@ static DEFINE_MUTEX(branch_tracing_mutex);
static struct trace_array *branch_tracer;
static void
-probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
+probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
{
struct trace_event_call *call = &event_branch;
struct trace_array *tr = branch_tracer;
@@ -68,16 +68,17 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
entry = ring_buffer_event_data(event);
/* Strip off the path, only save the file */
- p = f->file + strlen(f->file);
- while (p >= f->file && *p != '/')
+ p = f->data.file + strlen(f->data.file);
+ while (p >= f->data.file && *p != '/')
p--;
p++;
- strncpy(entry->func, f->func, TRACE_FUNC_SIZE);
+ strncpy(entry->func, f->data.func, TRACE_FUNC_SIZE);
strncpy(entry->file, p, TRACE_FILE_SIZE);
entry->func[TRACE_FUNC_SIZE] = 0;
entry->file[TRACE_FILE_SIZE] = 0;
- entry->line = f->line;
+ entry->constant = f->constant;
+ entry->line = f->data.line;
entry->correct = val == expect;
if (!call_filter_check_discard(call, entry, buffer, event))
@@ -89,7 +90,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
}
static inline
-void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
+void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
{
if (!branch_tracing_enabled)
return;
@@ -195,13 +196,19 @@ core_initcall(init_branch_tracer);
#else
static inline
-void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
+void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
{
}
#endif /* CONFIG_BRANCH_TRACER */
-void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
+void ftrace_likely_update(struct ftrace_likely_data *f, int val,
+ int expect, int is_constant)
{
+ /* A constant is always correct */
+ if (is_constant) {
+ f->constant++;
+ val = expect;
+ }
/*
* I would love to have a trace point here instead, but the
* trace point code is so inundated with unlikely and likely
@@ -212,9 +219,9 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
/* FIXME: Make this atomic! */
if (val == expect)
- f->correct++;
+ f->data.correct++;
else
- f->incorrect++;
+ f->data.incorrect++;
}
EXPORT_SYMBOL(ftrace_likely_update);
@@ -245,29 +252,60 @@ static inline long get_incorrect_percent(struct ftrace_branch_data *p)
return percent;
}
-static int branch_stat_show(struct seq_file *m, void *v)
+static const char *branch_stat_process_file(struct ftrace_branch_data *p)
{
- struct ftrace_branch_data *p = v;
const char *f;
- long percent;
/* Only print the file, not the path */
f = p->file + strlen(p->file);
while (f >= p->file && *f != '/')
f--;
- f++;
+ return ++f;
+}
+
+static void branch_stat_show(struct seq_file *m,
+ struct ftrace_branch_data *p, const char *f)
+{
+ long percent;
/*
* The miss is overlayed on correct, and hit on incorrect.
*/
percent = get_incorrect_percent(p);
- seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
if (percent < 0)
seq_puts(m, " X ");
else
seq_printf(m, "%3ld ", percent);
+
seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
+}
+
+static int branch_stat_show_normal(struct seq_file *m,
+ struct ftrace_branch_data *p, const char *f)
+{
+ seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
+ branch_stat_show(m, p, f);
+ return 0;
+}
+
+static int annotate_branch_stat_show(struct seq_file *m, void *v)
+{
+ struct ftrace_likely_data *p = v;
+ const char *f;
+ int l;
+
+ f = branch_stat_process_file(&p->data);
+
+ if (!p->constant)
+ return branch_stat_show_normal(m, &p->data, f);
+
+ l = snprintf(NULL, 0, "/%lu", p->constant);
+ l = l > 8 ? 0 : 8 - l;
+
+ seq_printf(m, "%8lu/%lu %*lu ",
+ p->data.correct, p->constant, l, p->data.incorrect);
+ branch_stat_show(m, &p->data, f);
return 0;
}
@@ -279,7 +317,7 @@ static void *annotated_branch_stat_start(struct tracer_stat *trace)
static void *
annotated_branch_stat_next(void *v, int idx)
{
- struct ftrace_branch_data *p = v;
+ struct ftrace_likely_data *p = v;
++p;
@@ -328,7 +366,7 @@ static struct tracer_stat annotated_branch_stats = {
.stat_next = annotated_branch_stat_next,
.stat_cmp = annotated_branch_stat_cmp,
.stat_headers = annotated_branch_stat_headers,
- .stat_show = branch_stat_show
+ .stat_show = annotate_branch_stat_show
};
__init static int init_annotated_branch_stats(void)
@@ -379,12 +417,21 @@ all_branch_stat_next(void *v, int idx)
return p;
}
+static int all_branch_stat_show(struct seq_file *m, void *v)
+{
+ struct ftrace_branch_data *p = v;
+ const char *f;
+
+ f = branch_stat_process_file(p);
+ return branch_stat_show_normal(m, p, f);
+}
+
static struct tracer_stat all_branch_stats = {
.name = "branch_all",
.stat_start = all_branch_stat_start,
.stat_next = all_branch_stat_next,
.stat_headers = all_branch_stat_headers,
- .stat_show = branch_stat_show
+ .stat_show = all_branch_stat_show
};
__init static int all_annotated_branch_stats(void)
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 0f06532a755b..5fdc779f411d 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -18,6 +18,7 @@
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/sched.h>
+#include <linux/sched/clock.h>
#include <linux/ktime.h>
#include <linux/trace_clock.h>
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index eb7396b7e7c3..c203ac4df791 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -328,11 +328,13 @@ FTRACE_ENTRY(branch, trace_branch,
__array( char, func, TRACE_FUNC_SIZE+1 )
__array( char, file, TRACE_FILE_SIZE+1 )
__field( char, correct )
+ __field( char, constant )
),
- F_printk("%u:%s:%s (%u)",
+ F_printk("%u:%s:%s (%u)%s",
__entry->line,
- __entry->func, __entry->file, __entry->correct),
+ __entry->func, __entry->file, __entry->correct,
+ __entry->constant ? " CONSTANT" : ""),
FILTER_OTHER
);
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index f3a960ed75a1..1c21d0e2a145 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -19,6 +19,7 @@
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/stacktrace.h>
+#include <linux/rculist.h>
#include "tracing_map.h"
#include "trace.h"
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 6721a1e89f39..f2ac9d44f6c4 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -22,6 +22,7 @@
#include <linux/ctype.h>
#include <linux/mutex.h>
#include <linux/slab.h>
+#include <linux/rculist.h>
#include "trace.h"
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index af344a1bf0d0..21ea6ae77d93 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -44,6 +44,7 @@
#include <linux/uaccess.h>
#include <linux/cpumask.h>
#include <linux/delay.h>
+#include <linux/sched/clock.h>
#include "trace.h"
static struct trace_array *hwlat_trace;
@@ -266,24 +267,13 @@ out:
static struct cpumask save_cpumask;
static bool disable_migrate;
-static void move_to_next_cpu(bool initmask)
+static void move_to_next_cpu(void)
{
- static struct cpumask *current_mask;
+ struct cpumask *current_mask = &save_cpumask;
int next_cpu;
if (disable_migrate)
return;
-
- /* Just pick the first CPU on first iteration */
- if (initmask) {
- current_mask = &save_cpumask;
- get_online_cpus();
- cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
- put_online_cpus();
- next_cpu = cpumask_first(current_mask);
- goto set_affinity;
- }
-
/*
* If for some reason the user modifies the CPU affinity
* of this thread, than stop migrating for the duration
@@ -300,7 +290,6 @@ static void move_to_next_cpu(bool initmask)
if (next_cpu >= nr_cpu_ids)
next_cpu = cpumask_first(current_mask);
- set_affinity:
if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */
goto disable;
@@ -322,20 +311,15 @@ static void move_to_next_cpu(bool initmask)
* need to ensure nothing else might be running (and thus preempting).
* Obviously this should never be used in production environments.
*
- * Currently this runs on which ever CPU it was scheduled on, but most
- * real-world hardware latency situations occur across several CPUs,
- * but we might later generalize this if we find there are any actualy
- * systems with alternate SMI delivery or other hardware latencies.
+ * Executes one loop interaction on each CPU in tracing_cpumask sysfs file.
*/
static int kthread_fn(void *data)
{
u64 interval;
- bool initmask = true;
while (!kthread_should_stop()) {
- move_to_next_cpu(initmask);
- initmask = false;
+ move_to_next_cpu();
local_irq_disable();
get_sample();
@@ -366,13 +350,27 @@ static int kthread_fn(void *data)
*/
static int start_kthread(struct trace_array *tr)
{
+ struct cpumask *current_mask = &save_cpumask;
struct task_struct *kthread;
+ int next_cpu;
+
+ /* Just pick the first CPU on first iteration */
+ current_mask = &save_cpumask;
+ get_online_cpus();
+ cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
+ put_online_cpus();
+ next_cpu = cpumask_first(current_mask);
kthread = kthread_create(kthread_fn, NULL, "hwlatd");
if (IS_ERR(kthread)) {
pr_err(BANNER "could not start sampling thread\n");
return -ENOMEM;
}
+
+ cpumask_clear(current_mask);
+ cpumask_set_cpu(next_cpu, current_mask);
+ sched_setaffinity(kthread->pid, current_mask);
+
hwlat_kthread = kthread;
wake_up_process(kthread);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 7ad9e53ad174..5f688cc724f0 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -16,9 +16,11 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#define pr_fmt(fmt) "trace_kprobe: " fmt
#include <linux/module.h>
#include <linux/uaccess.h>
+#include <linux/rculist.h>
#include "trace_probe.h"
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index aea6a1218c7d..02a4aeb22c47 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -8,6 +8,8 @@
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/ftrace.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/mm.h>
#include "trace_output.h"
@@ -124,6 +126,44 @@ EXPORT_SYMBOL(trace_print_symbols_seq);
#if BITS_PER_LONG == 32
const char *
+trace_print_flags_seq_u64(struct trace_seq *p, const char *delim,
+ unsigned long long flags,
+ const struct trace_print_flags_u64 *flag_array)
+{
+ unsigned long long mask;
+ const char *str;
+ const char *ret = trace_seq_buffer_ptr(p);
+ int i, first = 1;
+
+ for (i = 0; flag_array[i].name && flags; i++) {
+
+ mask = flag_array[i].mask;
+ if ((flags & mask) != mask)
+ continue;
+
+ str = flag_array[i].name;
+ flags &= ~mask;
+ if (!first && delim)
+ trace_seq_puts(p, delim);
+ else
+ first = 0;
+ trace_seq_puts(p, str);
+ }
+
+ /* check for left over flags */
+ if (flags) {
+ if (!first && delim)
+ trace_seq_puts(p, delim);
+ trace_seq_printf(p, "0x%llx", flags);
+ }
+
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+EXPORT_SYMBOL(trace_print_flags_seq_u64);
+
+const char *
trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
const struct trace_print_flags_u64 *symbol_array)
{
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 8c0553d9afd3..52478f033f88 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -21,6 +21,7 @@
* Copyright (C) IBM Corporation, 2010-2011
* Author: Srikar Dronamraju
*/
+#define pr_fmt(fmt) "trace_probe: " fmt
#include "trace_probe.h"
@@ -647,7 +648,7 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos,
int (*createfn)(int, char **))
{
- char *kbuf, *tmp;
+ char *kbuf, *buf, *tmp;
int ret = 0;
size_t done = 0;
size_t size;
@@ -667,27 +668,38 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
goto out;
}
kbuf[size] = '\0';
- tmp = strchr(kbuf, '\n');
+ buf = kbuf;
+ do {
+ tmp = strchr(buf, '\n');
+ if (tmp) {
+ *tmp = '\0';
+ size = tmp - buf + 1;
+ } else {
+ size = strlen(buf);
+ if (done + size < count) {
+ if (buf != kbuf)
+ break;
+ /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
+ pr_warn("Line length is too long: Should be less than %d\n",
+ WRITE_BUFSIZE - 2);
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ done += size;
- if (tmp) {
- *tmp = '\0';
- size = tmp - kbuf + 1;
- } else if (done + size < count) {
- pr_warn("Line length is too long: Should be less than %d\n",
- WRITE_BUFSIZE);
- ret = -EINVAL;
- goto out;
- }
- done += size;
- /* Remove comments */
- tmp = strchr(kbuf, '#');
+ /* Remove comments */
+ tmp = strchr(buf, '#');
- if (tmp)
- *tmp = '\0';
+ if (tmp)
+ *tmp = '\0';
- ret = traceprobe_command(kbuf, createfn);
- if (ret)
- goto out;
+ ret = traceprobe_command(buf, createfn);
+ if (ret)
+ goto out;
+ buf += size;
+
+ } while (done < count);
}
ret = done;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index b0f86ea77881..cb917cebae29 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1,5 +1,6 @@
/* Include in trace.c */
+#include <uapi/linux/sched/types.h>
#include <linux/stringify.h>
#include <linux/kthread.h>
#include <linux/delay.h>
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 2a1abbaca10e..1d68b5b7ad41 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -2,6 +2,7 @@
* Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
*
*/
+#include <linux/sched/task_stack.h>
#include <linux/stacktrace.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 0913693caf6e..a7581fec9681 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -17,12 +17,14 @@
* Copyright (C) IBM Corporation, 2010-2012
* Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
*/
+#define pr_fmt(fmt) "trace_kprobe: " fmt
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/uprobes.h>
#include <linux/namei.h>
#include <linux/string.h>
+#include <linux/rculist.h>
#include "trace_probe.h"
@@ -431,7 +433,8 @@ static int create_trace_uprobe(int argc, char **argv)
pr_info("Probe point is not specified.\n");
return -EINVAL;
}
- arg = strchr(argv[1], ':');
+ /* Find the last occurrence, in case the path contains ':' too. */
+ arg = strrchr(argv[1], ':');
if (!arg) {
ret = -EINVAL;
goto fail_address_parse;
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 1f9a31f934a4..685c50ae6300 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -24,7 +24,8 @@
#include <linux/tracepoint.h>
#include <linux/err.h>
#include <linux/slab.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
#include <linux/static_key.h>
extern struct tracepoint * const __start___tracepoints_ptrs[];
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 5c21f0535056..370724b45391 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -17,7 +17,9 @@
*/
#include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/cputime.h>
#include <linux/tsacct_kern.h>
#include <linux/acct.h>
#include <linux/jiffies.h>
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 95c6336fc2b3..62630a40ab3a 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -8,6 +8,7 @@
#include <linux/stat.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
+#include <linux/cred.h>
#include <linux/hash.h>
#include <linux/user_namespace.h>
@@ -57,7 +58,7 @@ static struct ctl_table_root set_root = {
static int zero = 0;
static int int_max = INT_MAX;
-#define UCOUNT_ENTRY(name) \
+#define UCOUNT_ENTRY(name) \
{ \
.procname = name, \
.maxlen = sizeof(int), \
@@ -74,6 +75,10 @@ static struct ctl_table user_table[] = {
UCOUNT_ENTRY("max_net_namespaces"),
UCOUNT_ENTRY("max_mnt_namespaces"),
UCOUNT_ENTRY("max_cgroup_namespaces"),
+#ifdef CONFIG_INOTIFY_USER
+ UCOUNT_ENTRY("max_inotify_instances"),
+ UCOUNT_ENTRY("max_inotify_watches"),
+#endif
{ }
};
#endif /* CONFIG_SYSCTL */
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 71645ae9303a..5c2dc5b2bf4f 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -12,6 +12,7 @@
#include <linux/init.h>
#include <linux/highuid.h>
#include <linux/security.h>
+#include <linux/cred.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>
diff --git a/kernel/user.c b/kernel/user.c
index b069ccbfb0b0..00281add65b2 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -13,6 +13,7 @@
#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/key.h>
+#include <linux/sched/user.h>
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/user_namespace.h>
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 86b7854fec8e..2f735cbe05e8 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -8,6 +8,7 @@
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
+#include <linux/sched/signal.h>
#include <linux/user_namespace.h>
#include <linux/proc_ns.h>
#include <linux/highuid.h>
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 6976cd47dcf6..913fe4336d2b 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,8 +14,10 @@
#include <linux/utsname.h>
#include <linux/err.h>
#include <linux/slab.h>
+#include <linux/cred.h>
#include <linux/user_namespace.h>
#include <linux/proc_ns.h>
+#include <linux/sched/task.h>
static struct ucounts *inc_uts_namespaces(struct user_namespace *ns)
{
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index c8eac43267e9..233cd8fc6910 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -14,6 +14,7 @@
#include <linux/utsname.h>
#include <linux/sysctl.h>
#include <linux/wait.h>
+#include <linux/rwsem.h>
#ifdef CONFIG_PROC_SYSCTL
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 63177be0159e..03e0b69bb5bf 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -19,8 +19,11 @@
#include <linux/sysctl.h>
#include <linux/smpboot.h>
#include <linux/sched/rt.h>
+#include <uapi/linux/sched/types.h>
#include <linux/tick.h>
#include <linux/workqueue.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/debug.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 12b8dd640786..54a427d1f344 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -13,6 +13,8 @@
#include <linux/nmi.h>
#include <linux/module.h>
+#include <linux/sched/debug.h>
+
#include <asm/irq_regs.h>
#include <linux/perf_event.h>
@@ -137,12 +139,14 @@ static void watchdog_overflow_callback(struct perf_event *event,
* Reduce the watchdog noise by only printing messages
* that are different from what cpu0 displayed.
*/
-static unsigned long cpu0_err;
+static unsigned long firstcpu_err;
+static atomic_t watchdog_cpus;
int watchdog_nmi_enable(unsigned int cpu)
{
struct perf_event_attr *wd_attr;
struct perf_event *event = per_cpu(watchdog_ev, cpu);
+ int firstcpu = 0;
/* nothing to do if the hard lockup detector is disabled */
if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
@@ -156,19 +160,22 @@ int watchdog_nmi_enable(unsigned int cpu)
if (event != NULL)
goto out_enable;
+ if (atomic_inc_return(&watchdog_cpus) == 1)
+ firstcpu = 1;
+
wd_attr = &wd_hw_attr;
wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
/* Try to register using hardware perf events */
event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
- /* save cpu0 error for future comparision */
- if (cpu == 0 && IS_ERR(event))
- cpu0_err = PTR_ERR(event);
+ /* save the first cpu's error for future comparision */
+ if (firstcpu && IS_ERR(event))
+ firstcpu_err = PTR_ERR(event);
if (!IS_ERR(event)) {
- /* only print for cpu0 or different than cpu0 */
- if (cpu == 0 || cpu0_err)
+ /* only print for the first cpu initialized */
+ if (firstcpu || firstcpu_err)
pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
goto out_save;
}
@@ -186,7 +193,7 @@ int watchdog_nmi_enable(unsigned int cpu)
smp_mb__after_atomic();
/* skip displaying the same error again */
- if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
+ if (!firstcpu && (PTR_ERR(event) == firstcpu_err))
return PTR_ERR(event);
/* vary the KERN level based on the returned errno */
@@ -222,9 +229,9 @@ void watchdog_nmi_disable(unsigned int cpu)
/* should be in cleanup, but blocks oprofile */
perf_event_release_kernel(event);
- }
- if (cpu == 0) {
+
/* watchdog_nmi_enable() expects this to be zero initially. */
- cpu0_err = 0;
+ if (atomic_dec_and_test(&watchdog_cpus))
+ firstcpu_err = 0;
}
}
OpenPOWER on IntegriCloud