From 7c397d01e43493dd087f9cd926cd1fcf508a8019 Mon Sep 17 00:00:00 2001
From: Steve Grubb
Date: Wed, 14 Dec 2016 15:59:46 -0500
Subject: audit: Make AUDIT_KERNEL event conform to the specification

The AUDIT_KERNEL event is not following the name=value format, which
causes some information to get lost. The event has been reformatted to
follow the convention. Additionally, the audit_enabled value was added
for troubleshooting purposes. The following is an example of the new
event:

type=KERNEL audit(1480621249.833:1): state=initialized audit_enabled=0 res=1

Signed-off-by: Steve Grubb
[PM: commit tweaks to make checkpatch.pl happy]
Signed-off-by: Paul Moore
---
 kernel/audit.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 41017685f9f2..57acf2541fdd 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1344,7 +1344,9 @@ static int __init audit_init(void)
 		panic("audit: failed to start the kauditd thread (%d)\n", err);
 	}
 
-	audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
+	audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL,
+		"state=initialized audit_enabled=%u res=1",
+		audit_enabled);
 
 	return 0;
 }
--
cgit v1.2.3
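Audit userspace recovers fields by scanning records for name=value pairs,
so the benefit of the new format is easy to demonstrate from the consumer
side. The following standalone sketch is illustrative only (the helper is
not part of any audit library; the record string is the example from the
commit message):

#include <stdio.h>
#include <string.h>

/* Illustrative helper: find "name=" in a record, copy out the value. */
static const char *audit_get_field(const char *record, const char *name,
				   char *buf, size_t len)
{
	size_t nlen = strlen(name);
	const char *p = record;

	while ((p = strstr(p, name)) != NULL) {
		/* accept only a whole "name=" token */
		if ((p == record || p[-1] == ' ') && p[nlen] == '=') {
			const char *val = p + nlen + 1;
			size_t vlen = strcspn(val, " \n");

			if (vlen >= len)
				vlen = len - 1;
			memcpy(buf, val, vlen);
			buf[vlen] = '\0';
			return buf;
		}
		p += nlen;
	}
	return NULL;
}

int main(void)
{
	const char *rec = "type=KERNEL audit(1480621249.833:1): "
			  "state=initialized audit_enabled=0 res=1";
	char val[32];

	if (audit_get_field(rec, "audit_enabled", val, sizeof(val)))
		printf("audit_enabled=%s\n", val);	/* prints 0 */
	if (audit_get_field(rec, "res", val, sizeof(val)))
		printf("res=%s\n", val);		/* prints 1 */
	return 0;
}

With the old free-form "initialized" payload there was no res field to
extract, so any search keyed on the result value came up empty.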
From 89670affa2a62c4868a2dd8a4195a1a2ec58cb27 Mon Sep 17 00:00:00 2001
From: Steve Grubb
Date: Wed, 14 Dec 2016 16:00:13 -0500
Subject: audit: Make AUDIT_ANOM_ABEND event normalized

The audit event specification asks for certain fields to exist in all
events. Because AUDIT_ANOM_ABEND lacks a result field, running
'ausearch -m anom_abend -sv yes' returns no events. This patch adds the
result field so that the AUDIT_ANOM_ABEND event conforms to the rules.

Signed-off-by: Steve Grubb
Signed-off-by: Paul Moore
---
 kernel/auditsc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index f78cb1b3fa74..bb5f504592c6 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2411,7 +2411,7 @@ void audit_core_dumps(long signr)
 	if (unlikely(!ab))
 		return;
 	audit_log_task(ab);
-	audit_log_format(ab, " sig=%ld", signr);
+	audit_log_format(ab, " sig=%ld res=1", signr);
 	audit_log_end(ab);
 }
--
cgit v1.2.3
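ausearch's -sv (success value) option filters on the record's result
field, treating res=1 as success. A toy version of that check, with
abbreviated made-up record strings (real ausearch parses records far more
carefully), shows why the pre-patch record could never match:

#include <stdio.h>
#include <string.h>

/* crude stand-in for 'ausearch -sv yes': match only records that
 * carry a res=1 field */
static int matches_sv_yes(const char *rec)
{
	return strstr(rec, " res=1") != NULL;
}

int main(void)
{
	/* shapes of the record before and after this patch (abbreviated) */
	const char *before = "type=ANOM_ABEND ... sig=6";
	const char *after  = "type=ANOM_ABEND ... sig=6 res=1";

	printf("before: %s\n", matches_sv_yes(before) ? "match" : "no events");
	printf("after:  %s\n", matches_sv_yes(after)  ? "match" : "no events");
	return 0;
}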
From e90cbebc3fa5caea4c8bfeb0d0157a0cee53efc7 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 27 Dec 2016 14:49:03 -0500
Subject: cgroup: add cftype->open/release() callbacks

Pipe the newly added kernfs->open/release() callbacks through cftype.
While at it, as cleanup operations can now be performed from ->release()
instead of ->seq_stop(), make the latter optional.

Signed-off-by: Tejun Heo
Acked-by: Zefan Li
---
 include/linux/cgroup-defs.h |  3 +++
 kernel/cgroup.c             | 24 +++++++++++++++++++++++-
 2 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 861b4677fc5b..8a916dc96e84 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -388,6 +388,9 @@ struct cftype {
 	struct list_head node;		/* anchored at ss->cfts */
 	struct kernfs_ops *kf_ops;
 
+	int (*open)(struct kernfs_open_file *of);
+	void (*release)(struct kernfs_open_file *of);
+
 	/*
 	 * read_u64() is a shortcut for the common case of returning a
 	 * single integer. Use it in place of read()
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2ee9ec3051b2..87167e40c4fa 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3503,6 +3503,23 @@ static int cgroup_events_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
+static int cgroup_file_open(struct kernfs_open_file *of)
+{
+	struct cftype *cft = of->kn->priv;
+
+	if (cft->open)
+		return cft->open(of);
+	return 0;
+}
+
+static void cgroup_file_release(struct kernfs_open_file *of)
+{
+	struct cftype *cft = of->kn->priv;
+
+	if (cft->release)
+		cft->release(of);
+}
+
 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
 				 size_t nbytes, loff_t off)
 {
@@ -3553,7 +3570,8 @@ static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
 
 static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
 {
-	seq_cft(seq)->seq_stop(seq, v);
+	if (seq_cft(seq)->seq_stop)
+		seq_cft(seq)->seq_stop(seq, v);
 }
 
 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
@@ -3575,12 +3593,16 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
 
 static struct kernfs_ops cgroup_kf_single_ops = {
 	.atomic_write_len	= PAGE_SIZE,
+	.open			= cgroup_file_open,
+	.release		= cgroup_file_release,
 	.write			= cgroup_file_write,
 	.seq_show		= cgroup_seqfile_show,
};

static struct kernfs_ops cgroup_kf_ops = {
	.atomic_write_len	= PAGE_SIZE,
+	.open			= cgroup_file_open,
+	.release		= cgroup_file_release,
	.write			= cgroup_file_write,
	.seq_start		= cgroup_seqfile_start,
	.seq_next		= cgroup_seqfile_next,
--
cgit v1.2.3
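As a usage sketch, a controller file could use the new hooks to keep
per-open-file state in of->priv, allocated in ->open() and freed in
->release(). The foo_* names are hypothetical; the cftype and
kernfs_open_file usage follows the interfaces shown in the diff above,
and the snippet is illustrative kernel-style code, not from the patch:

/* hypothetical per-open-file state */
struct foo_state {
	int snapshot;
};

static int foo_open(struct kernfs_open_file *of)
{
	struct foo_state *st = kzalloc(sizeof(*st), GFP_KERNEL);

	if (!st)
		return -ENOMEM;
	of->priv = st;			/* freed in ->release() */
	return 0;
}

static void foo_release(struct kernfs_open_file *of)
{
	kfree(of->priv);
}

static int foo_show(struct seq_file *seq, void *v)
{
	struct kernfs_open_file *of = seq->private;
	struct foo_state *st = of->priv;

	seq_printf(seq, "%d\n", st->snapshot);
	return 0;
}

static struct cftype foo_files[] = {
	{
		.name = "foo.snapshot",
		.open = foo_open,
		.release = foo_release,
		.seq_show = foo_show,
	},
	{ }	/* terminating entry */
};

This is the same pattern the next patch applies to "cgroup.procs", where
of->priv carries a css_task_iter.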
From b4b90a8e86f2539a028d68077b45e8511dd2adb0 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 27 Dec 2016 14:49:04 -0500
Subject: cgroup: reimplement reading "cgroup.procs" on cgroup v2

On v1, "tasks" and "cgroup.procs" are expected to be sorted, which makes
the implementation expensive and unnecessarily complicated, involving
result cache management.

v2 doesn't have the sorting requirement, so it can just iterate and
print processes one by one. seq_files are either read sequentially or
reset to position zero, so the implementation doesn't even need to worry
about seeking.

The css_task_iter is kept across multiple read(2) calls, and because
migrations append new processes to the tail of the task list, the
iteration won't miss processes which are migrated in before a later
read(2).

Signed-off-by: Tejun Heo
Acked-by: Zefan Li
---
 kernel/cgroup.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 58 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 87167e40c4fa..fd684bfd154d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4426,6 +4426,60 @@ out_err:
 	return ret;
 }
 
+static void cgroup_procs_release(struct kernfs_open_file *of)
+{
+	if (of->priv) {
+		css_task_iter_end(of->priv);
+		kfree(of->priv);
+	}
+}
+
+static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct kernfs_open_file *of = s->private;
+	struct css_task_iter *it = of->priv;
+	struct task_struct *task;
+
+	do {
+		task = css_task_iter_next(it);
+	} while (task && !thread_group_leader(task));
+
+	return task;
+}
+
+static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
+{
+	struct kernfs_open_file *of = s->private;
+	struct cgroup *cgrp = seq_css(s)->cgroup;
+	struct css_task_iter *it = of->priv;
+
+	/*
+	 * When a seq_file is seeked, it's always traversed sequentially
+	 * from position 0, so we can simply keep iterating on !0 *pos.
+	 */
+	if (!it) {
+		if (WARN_ON_ONCE((*pos)++))
+			return ERR_PTR(-EINVAL);
+
+		it = kzalloc(sizeof(*it), GFP_KERNEL);
+		if (!it)
+			return ERR_PTR(-ENOMEM);
+		of->priv = it;
+		css_task_iter_start(&cgrp->self, it);
+	} else if (!(*pos)++) {
+		css_task_iter_end(it);
+		css_task_iter_start(&cgrp->self, it);
+	}
+
+	return cgroup_procs_next(s, NULL, NULL);
+}
+
+static int cgroup_procs_show(struct seq_file *s, void *v)
+{
+	seq_printf(s, "%d\n", task_tgid_vnr(v));
+	return 0;
+}
+
 /*
  * Stuff for reading the 'tasks'/'procs' files.
 *
@@ -4914,11 +4968,10 @@ static struct cftype cgroup_dfl_base_files[] = {
 	{
 		.name = "cgroup.procs",
 		.file_offset = offsetof(struct cgroup, procs_file),
-		.seq_start = cgroup_pidlist_start,
-		.seq_next = cgroup_pidlist_next,
-		.seq_stop = cgroup_pidlist_stop,
-		.seq_show = cgroup_pidlist_show,
-		.private = CGROUP_FILE_PROCS,
+		.release = cgroup_procs_release,
+		.seq_start = cgroup_procs_start,
+		.seq_next = cgroup_procs_next,
+		.seq_show = cgroup_procs_show,
 		.write = cgroup_procs_write,
 	},
 	{
--
cgit v1.2.3
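The seq_file behavior the message relies on can be illustrated with a
userspace analogue: one cursor is kept across reads and restarted only
when the position is rewound to zero. In the sketch below, a plain array
stands in for the css_task_iter machinery, under the assumption (matching
css_set_move_task()'s list_add_tail() shown later in this series) that
migrated-in tasks are appended at the tail, so a partially read listing
still picks up a late arrival:

#include <stdio.h>

static int pids[16] = { 1, 12, 7 };
static int nr_pids = 3;
static int cursor;			/* persists across simulated reads */

/* one simulated read(2): emit up to @count entries from the cursor */
static void read_procs(long *pos, int count)
{
	if (*pos == 0)			/* seq_file rewound to 0: restart */
		cursor = 0;
	while (count-- > 0 && cursor < nr_pids) {
		printf("%d\n", pids[cursor++]);
		(*pos)++;
	}
}

int main(void)
{
	long pos = 0;

	read_procs(&pos, 2);		/* first read: prints 1, 12 */
	pids[nr_pids++] = 99;		/* a task migrates in: appended at tail */
	read_procs(&pos, 16);		/* next read resumes: prints 7, 99 */
	return 0;
}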
From 2fae98634334e256b67241970526ddfadc77db2b Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 27 Dec 2016 14:49:05 -0500
Subject: cgroup: remove cgroup_pid_fry() and friends

cgroup_pid_fry() was added to mangle the cgroup.procs pid listing order
on v2 to make it clear that the output is not sorted. Now that v2 uses a
separate "cgroup.procs" read method, this is no longer used. Remove it.

Signed-off-by: Tejun Heo
Acked-by: Zefan Li
---
 kernel/cgroup.c | 43 +++++--------------------------------------
 1 file changed, 5 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index fd684bfd154d..b9e2d85d59b1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4618,42 +4618,12 @@ after:
  * sorted by task pointer. As pidlists can be fairly large, allocating one
  * per open file is dangerous, so cgroup had to implement shared pool of
  * pidlists keyed by cgroup and namespace.
- *
- * All this extra complexity was caused by the original implementation
- * committing to an entirely unnecessary property. In the long term, we
- * want to do away with it. Explicitly scramble sort order if on the
- * default hierarchy so that no such expectation exists in the new
- * interface.
- *
- * Scrambling is done by swapping every two consecutive bits, which is
- * non-identity one-to-one mapping which disturbs sort order sufficiently.
 */
-static pid_t pid_fry(pid_t pid)
-{
-	unsigned a = pid & 0x55555555;
-	unsigned b = pid & 0xAAAAAAAA;
-
-	return (a << 1) | (b >> 1);
-}
-
-static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
-{
-	if (cgroup_on_dfl(cgrp))
-		return pid_fry(pid);
-	else
-		return pid;
-}
-
 static int cmppid(const void *a, const void *b)
 {
 	return *(pid_t *)a - *(pid_t *)b;
 }
 
-static int fried_cmppid(const void *a, const void *b)
-{
-	return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
-}
-
 static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
						  enum cgroup_filetype type)
 {
@@ -4741,10 +4711,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
 	css_task_iter_end(&it);
 	length = n;
 	/* now sort & (if procs) strip out duplicates */
-	if (cgroup_on_dfl(cgrp))
-		sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
-	else
-		sort(array, length, sizeof(pid_t), cmppid, NULL);
+	sort(array, length, sizeof(pid_t), cmppid, NULL);
 	if (type == CGROUP_FILE_PROCS)
 		length = pidlist_uniq(array, length);
 
@@ -4876,10 +4843,10 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
 		while (index < end) {
 			int mid = (index + end) / 2;
-			if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
+			if (l->list[mid] == pid) {
 				index = mid;
 				break;
-			} else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
+			} else if (l->list[mid] <= pid)
 				index = mid + 1;
 			else
 				end = mid;
@@ -4890,7 +4857,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
 		return NULL;
 	/* Update the abstract position to be the actual pid that we found */
 	iter = l->list + index;
-	*pos = cgroup_pid_fry(cgrp, *iter);
+	*pos = *iter;
 	return iter;
 }
@@ -4919,7 +4886,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
 	if (p >= end) {
 		return NULL;
 	} else {
-		*pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
+		*pos = *p;
 		return p;
 	}
 }
--
cgit v1.2.3
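For reference, the "swap every two consecutive bits" scrambling performed
by the removed helper is easy to see on small pids. This standalone
snippet reproduces the deleted pid_fry() and prints the mapping:

#include <stdio.h>

/* the removed scrambler: swap each pair of adjacent bits */
static int pid_fry(int pid)
{
	unsigned a = pid & 0x55555555;	/* even bits */
	unsigned b = pid & 0xAAAAAAAA;	/* odd bits */

	return (a << 1) | (b >> 1);
}

int main(void)
{
	int pid;

	for (pid = 1; pid <= 5; pid++)
		printf("%d -> %d\n", pid, pid_fry(pid));
	/* prints 1->2, 2->1, 3->3, 4->8, 5->10: a sorted input comes
	 * out visibly unsorted, yet the mapping stays one-to-one */
	return 0;
}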
From 5f617ebbdf10abd49312a89e3b894b927c7367f5 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 27 Dec 2016 14:49:05 -0500
Subject: cgroup: reorder css_set fields

Reorder css_set fields so that they're roughly in the order of how hot
they are. The rough order is:

1. the actual csses
2. reference counter and the default cgroup pointer
3. task lists and iterations
4. fields used during merge including css_set lookup
5. the rest

Signed-off-by: Tejun Heo
Acked-by: Zefan Li
---
 include/linux/cgroup-defs.h | 54 ++++++++++++++++++++++-----------------------
 kernel/cgroup.c             | 13 ++++++-----
 2 files changed, 35 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 8a916dc96e84..3c02404cfce9 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -148,14 +148,18 @@ struct cgroup_subsys_state {
  * set for a task.
  */
 struct css_set {
-	/* Reference count */
-	atomic_t refcount;
-
 	/*
-	 * List running through all cgroup groups in the same hash
-	 * slot. Protected by css_set_lock
+	 * Set of subsystem states, one for each subsystem. This array is
+	 * immutable after creation apart from the init_css_set during
+	 * subsystem registration (at boot time).
 	 */
-	struct hlist_node hlist;
+	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
+
+	/* reference count */
+	atomic_t refcount;
+
+	/* the default cgroup associated with this css_set */
+	struct cgroup *dfl_cgrp;
 
 	/*
 	 * Lists running through all tasks using this cgroup group.
@@ -167,21 +171,29 @@ struct css_set {
 	struct list_head tasks;
 	struct list_head mg_tasks;
 
+	/* all css_task_iters currently walking this cset */
+	struct list_head task_iters;
+
 	/*
-	 * List of cgrp_cset_links pointing at cgroups referenced from this
-	 * css_set. Protected by css_set_lock.
+	 * On the default hierarhcy, ->subsys[ssid] may point to a css
+	 * attached to an ancestor instead of the cgroup this css_set is
+	 * associated with. The following node is anchored at
+	 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
+	 * iterate through all css's attached to a given cgroup.
 	 */
-	struct list_head cgrp_links;
+	struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
 
-	/* the default cgroup associated with this css_set */
-	struct cgroup *dfl_cgrp;
+	/*
+	 * List running through all cgroup groups in the same hash
+	 * slot. Protected by css_set_lock
+	 */
+	struct hlist_node hlist;
 
 	/*
-	 * Set of subsystem states, one for each subsystem. This array is
-	 * immutable after creation apart from the init_css_set during
-	 * subsystem registration (at boot time).
+	 * List of cgrp_cset_links pointing at cgroups referenced from this
+	 * css_set. Protected by css_set_lock.
 	 */
-	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
+	struct list_head cgrp_links;
 
 	/*
 	 * List of csets participating in the on-going migration either as
@@ -201,18 +213,6 @@ struct css_set {
 	struct cgroup *mg_dst_cgrp;
 	struct css_set *mg_dst_cset;
 
-	/*
-	 * On the default hierarhcy, ->subsys[ssid] may point to a css
-	 * attached to an ancestor instead of the cgroup this css_set is
-	 * associated with. The following node is anchored at
-	 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
-	 * iterate through all css's attached to a given cgroup.
-	 */
-	struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
-
-	/* all css_task_iters currently walking this cset */
-	struct list_head task_iters;
-
 	/* dead and being drained, ignore for migration */
 	bool dead;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b9e2d85d59b1..1a815f275849 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -647,12 +647,12 @@ struct cgrp_cset_link {
 */
 struct css_set init_css_set = {
 	.refcount		= ATOMIC_INIT(1),
-	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
 	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
 	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
+	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
+	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
 	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
 	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
-	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
 };
 
 static int css_set_count	= 1;	/* 1 for init_css_set */
@@ -1095,13 +1095,13 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 	}
 
 	atomic_set(&cset->refcount, 1);
-	INIT_LIST_HEAD(&cset->cgrp_links);
 	INIT_LIST_HEAD(&cset->tasks);
 	INIT_LIST_HEAD(&cset->mg_tasks);
-	INIT_LIST_HEAD(&cset->mg_preload_node);
-	INIT_LIST_HEAD(&cset->mg_node);
 	INIT_LIST_HEAD(&cset->task_iters);
 	INIT_HLIST_NODE(&cset->hlist);
+	INIT_LIST_HEAD(&cset->cgrp_links);
+	INIT_LIST_HEAD(&cset->mg_preload_node);
+	INIT_LIST_HEAD(&cset->mg_node);
 
 	/* Copy the set of subsystem state objects generated in
 	 * find_existing_css_set() */
@@ -4384,6 +4384,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 	struct task_struct *task;
 	int ret;
 
+	if (cgroup_on_dfl(to))
+		return -EINVAL;
+
 	if (!cgroup_may_migrate_to(to))
 		return -EBUSY;
--
cgit v1.2.3
From 201af4c0fab02876ef0311e7f7b4083aa138930c Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 27 Dec 2016 14:49:05 -0500
Subject: cgroup: move cgroup files under kernel/cgroup/

They're growing to be too many and are planned to get split further.
Move them under their own directory.

 kernel/cgroup.c         -> kernel/cgroup/cgroup.c
 kernel/cgroup_freezer.c -> kernel/cgroup/freezer.c
 kernel/cgroup_pids.c    -> kernel/cgroup/pids.c
 kernel/cpuset.c         -> kernel/cgroup/cpuset.c

Signed-off-by: Tejun Heo
Acked-by: Zefan Li
---
 kernel/Makefile         |    5 +-
 kernel/cgroup.c         | 6705 -----------------------------------------------
 kernel/cgroup/Makefile  |    5 +
 kernel/cgroup/cgroup.c  | 6705 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/cgroup/cpuset.c  | 2752 +++++++++++++++++++
 kernel/cgroup/freezer.c |  481 ++++
 kernel/cgroup/pids.c    |  348 +++
 kernel/cgroup_freezer.c |  481 ----
 kernel/cgroup_pids.c    |  348 ---
 kernel/cpuset.c         | 2752 -------------------
 10 files changed, 10292 insertions(+), 10290 deletions(-)
 delete mode 100644 kernel/cgroup.c
 create mode 100644 kernel/cgroup/Makefile
 create mode 100644 kernel/cgroup/cgroup.c
 create mode 100644 kernel/cgroup/cpuset.c
 create mode 100644 kernel/cgroup/freezer.c
 create mode 100644 kernel/cgroup/pids.c
 delete mode 100644 kernel/cgroup_freezer.c
 delete mode 100644 kernel/cgroup_pids.c
 delete mode 100644 kernel/cpuset.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 12c679f769c6..b302b4731d16 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -64,10 +64,7 @@ obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
 obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
-obj-$(CONFIG_CGROUPS) += cgroup.o
-obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
-obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
-obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CGROUPS) += cgroup/
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_USER_NS) += user_namespace.o
 obj-$(CONFIG_PID_NS) += pid_namespace.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
deleted file mode 100644
index 1a815f275849..000000000000
--- a/kernel/cgroup.c
+++ /dev/null
@@ -1,6705 +0,0 @@
-/*
- * Generic process-grouping system.
- *
- * Based originally on the cpuset system, extracted by Paul Menage
- * Copyright (C) 2006 Google, Inc
- *
- * Notifications support
- * Copyright (C) 2009 Nokia Corporation
- * Author: Kirill A. Shutemov
- *
- * Copyright notices from the original cpuset code:
- * --------------------------------------------------
- * Copyright (C) 2003 BULL SA.
- * Copyright (C) 2004-2006 Silicon Graphics, Inc.
- *
- * Portions derived from Patrick Mochel's sysfs code.
- * sysfs is Copyright (c) 2001-3 Patrick Mochel
- *
- * 2003-10-10 Written by Simon Derr.
- * 2003-10-22 Updates by Stephen Hemminger.
- * 2004 May-July Rework by Paul Jackson.
- * ---------------------------------------------------
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file COPYING in the main directory of the Linux
- * distribution for more details.
- */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* TODO: replace with more sophisticated array */ -#include -#include -#include -#include -#include -#include -#include -#include - -#define CREATE_TRACE_POINTS -#include - -/* - * pidlists linger the following amount before being destroyed. The goal - * is avoiding frequent destruction in the middle of consecutive read calls - * Expiring in the middle is a performance problem not a correctness one. - * 1 sec should be enough. - */ -#define CGROUP_PIDLIST_DESTROY_DELAY HZ - -#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ - MAX_CFTYPE_NAME + 2) - -/* - * cgroup_mutex is the master lock. Any modification to cgroup or its - * hierarchy must be performed while holding it. - * - * css_set_lock protects task->cgroups pointer, the list of css_set - * objects, and the chain of tasks off each css_set. - * - * These locks are exported if CONFIG_PROVE_RCU so that accessors in - * cgroup.h can use them for lockdep annotations. - */ -#ifdef CONFIG_PROVE_RCU -DEFINE_MUTEX(cgroup_mutex); -DEFINE_SPINLOCK(css_set_lock); -EXPORT_SYMBOL_GPL(cgroup_mutex); -EXPORT_SYMBOL_GPL(css_set_lock); -#else -static DEFINE_MUTEX(cgroup_mutex); -static DEFINE_SPINLOCK(css_set_lock); -#endif - -/* - * Protects cgroup_idr and css_idr so that IDs can be released without - * grabbing cgroup_mutex. - */ -static DEFINE_SPINLOCK(cgroup_idr_lock); - -/* - * Protects cgroup_file->kn for !self csses. It synchronizes notifications - * against file removal/re-creation across css hiding. - */ -static DEFINE_SPINLOCK(cgroup_file_kn_lock); - -/* - * Protects cgroup_subsys->release_agent_path. Modifying it also requires - * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. - */ -static DEFINE_SPINLOCK(release_agent_path_lock); - -struct percpu_rw_semaphore cgroup_threadgroup_rwsem; - -#define cgroup_assert_mutex_or_rcu_locked() \ - RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ - !lockdep_is_held(&cgroup_mutex), \ - "cgroup_mutex or RCU read lock required"); - -/* - * cgroup destruction makes heavy use of work items and there can be a lot - * of concurrent destructions. Use a separate workqueue so that cgroup - * destruction work items don't end up filling up max_active of system_wq - * which may lead to deadlock. - */ -static struct workqueue_struct *cgroup_destroy_wq; - -/* - * pidlist destructions need to be flushed on cgroup destruction. Use a - * separate workqueue as flush domain. 
- */ -static struct workqueue_struct *cgroup_pidlist_destroy_wq; - -/* generate an array of cgroup subsystem pointers */ -#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, -static struct cgroup_subsys *cgroup_subsys[] = { -#include -}; -#undef SUBSYS - -/* array of cgroup subsystem names */ -#define SUBSYS(_x) [_x ## _cgrp_id] = #_x, -static const char *cgroup_subsys_name[] = { -#include -}; -#undef SUBSYS - -/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */ -#define SUBSYS(_x) \ - DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \ - DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \ - EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \ - EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key); -#include -#undef SUBSYS - -#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key, -static struct static_key_true *cgroup_subsys_enabled_key[] = { -#include -}; -#undef SUBSYS - -#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key, -static struct static_key_true *cgroup_subsys_on_dfl_key[] = { -#include -}; -#undef SUBSYS - -/* - * The default hierarchy, reserved for the subsystems that are otherwise - * unattached - it never has more than a single cgroup, and all tasks are - * part of that cgroup. - */ -struct cgroup_root cgrp_dfl_root; -EXPORT_SYMBOL_GPL(cgrp_dfl_root); - -/* - * The default hierarchy always exists but is hidden until mounted for the - * first time. This is for backward compatibility. - */ -static bool cgrp_dfl_visible; - -/* Controllers blocked by the commandline in v1 */ -static u16 cgroup_no_v1_mask; - -/* some controllers are not supported in the default hierarchy */ -static u16 cgrp_dfl_inhibit_ss_mask; - -/* some controllers are implicitly enabled on the default hierarchy */ -static unsigned long cgrp_dfl_implicit_ss_mask; - -/* The list of hierarchy roots */ - -static LIST_HEAD(cgroup_roots); -static int cgroup_root_count; - -/* hierarchy ID allocation and mapping, protected by cgroup_mutex */ -static DEFINE_IDR(cgroup_hierarchy_idr); - -/* - * Assign a monotonically increasing serial number to csses. It guarantees - * cgroups with bigger numbers are newer than those with smaller numbers. - * Also, as csses are always appended to the parent's ->children list, it - * guarantees that sibling csses are always sorted in the ascending serial - * number order on the list. Protected by cgroup_mutex. - */ -static u64 css_serial_nr_next = 1; - -/* - * These bitmask flags indicate whether tasks in the fork and exit paths have - * fork/exit handlers to call. This avoids us having to do extra work in the - * fork/exit path to check which subsystems have fork/exit callbacks. - */ -static u16 have_fork_callback __read_mostly; -static u16 have_exit_callback __read_mostly; -static u16 have_free_callback __read_mostly; - -/* cgroup namespace for init task */ -struct cgroup_namespace init_cgroup_ns = { - .count = { .counter = 2, }, - .user_ns = &init_user_ns, - .ns.ops = &cgroupns_operations, - .ns.inum = PROC_CGROUP_INIT_INO, - .root_cset = &init_css_set, -}; - -/* Ditto for the can_fork callback. 
*/ -static u16 have_canfork_callback __read_mostly; - -static struct file_system_type cgroup2_fs_type; -static struct cftype cgroup_dfl_base_files[]; -static struct cftype cgroup_legacy_base_files[]; - -static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); -static void cgroup_lock_and_drain_offline(struct cgroup *cgrp); -static int cgroup_apply_control(struct cgroup *cgrp); -static void cgroup_finalize_control(struct cgroup *cgrp, int ret); -static void css_task_iter_advance(struct css_task_iter *it); -static int cgroup_destroy_locked(struct cgroup *cgrp); -static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, - struct cgroup_subsys *ss); -static void css_release(struct percpu_ref *ref); -static void kill_css(struct cgroup_subsys_state *css); -static int cgroup_addrm_files(struct cgroup_subsys_state *css, - struct cgroup *cgrp, struct cftype cfts[], - bool is_add); - -/** - * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID - * @ssid: subsys ID of interest - * - * cgroup_subsys_enabled() can only be used with literal subsys names which - * is fine for individual subsystems but unsuitable for cgroup core. This - * is slower static_key_enabled() based test indexed by @ssid. - */ -static bool cgroup_ssid_enabled(int ssid) -{ - if (CGROUP_SUBSYS_COUNT == 0) - return false; - - return static_key_enabled(cgroup_subsys_enabled_key[ssid]); -} - -static bool cgroup_ssid_no_v1(int ssid) -{ - return cgroup_no_v1_mask & (1 << ssid); -} - -/** - * cgroup_on_dfl - test whether a cgroup is on the default hierarchy - * @cgrp: the cgroup of interest - * - * The default hierarchy is the v2 interface of cgroup and this function - * can be used to test whether a cgroup is on the default hierarchy for - * cases where a subsystem should behave differnetly depending on the - * interface version. - * - * The set of behaviors which change on the default hierarchy are still - * being determined and the mount option is prefixed with __DEVEL__. - * - * List of changed behaviors: - * - * - Mount options "noprefix", "xattr", "clone_children", "release_agent" - * and "name" are disallowed. - * - * - When mounting an existing superblock, mount options should match. - * - * - Remount is disallowed. - * - * - rename(2) is disallowed. - * - * - "tasks" is removed. Everything should be at process granularity. Use - * "cgroup.procs" instead. - * - * - "cgroup.procs" is not sorted. pids will be unique unless they got - * recycled inbetween reads. - * - * - "release_agent" and "notify_on_release" are removed. Replacement - * notification mechanism will be implemented. - * - * - "cgroup.clone_children" is removed. - * - * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup - * and its descendants contain no task; otherwise, 1. The file also - * generates kernfs notification which can be monitored through poll and - * [di]notify when the value of the file changes. - * - * - cpuset: tasks will be kept in empty cpusets when hotplug happens and - * take masks of ancestors with non-empty cpus/mems, instead of being - * moved to an ancestor. - * - * - cpuset: a task can be moved into an empty cpuset, and again it takes - * masks of ancestors. - * - * - memcg: use_hierarchy is on by default and the cgroup file for the flag - * is not created. - * - * - blkcg: blk-throttle becomes properly hierarchical. - * - * - debug: disallowed on the default hierarchy. 
- */ -static bool cgroup_on_dfl(const struct cgroup *cgrp) -{ - return cgrp->root == &cgrp_dfl_root; -} - -/* IDR wrappers which synchronize using cgroup_idr_lock */ -static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, - gfp_t gfp_mask) -{ - int ret; - - idr_preload(gfp_mask); - spin_lock_bh(&cgroup_idr_lock); - ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM); - spin_unlock_bh(&cgroup_idr_lock); - idr_preload_end(); - return ret; -} - -static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id) -{ - void *ret; - - spin_lock_bh(&cgroup_idr_lock); - ret = idr_replace(idr, ptr, id); - spin_unlock_bh(&cgroup_idr_lock); - return ret; -} - -static void cgroup_idr_remove(struct idr *idr, int id) -{ - spin_lock_bh(&cgroup_idr_lock); - idr_remove(idr, id); - spin_unlock_bh(&cgroup_idr_lock); -} - -static struct cgroup *cgroup_parent(struct cgroup *cgrp) -{ - struct cgroup_subsys_state *parent_css = cgrp->self.parent; - - if (parent_css) - return container_of(parent_css, struct cgroup, self); - return NULL; -} - -/* subsystems visibly enabled on a cgroup */ -static u16 cgroup_control(struct cgroup *cgrp) -{ - struct cgroup *parent = cgroup_parent(cgrp); - u16 root_ss_mask = cgrp->root->subsys_mask; - - if (parent) - return parent->subtree_control; - - if (cgroup_on_dfl(cgrp)) - root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask | - cgrp_dfl_implicit_ss_mask); - return root_ss_mask; -} - -/* subsystems enabled on a cgroup */ -static u16 cgroup_ss_mask(struct cgroup *cgrp) -{ - struct cgroup *parent = cgroup_parent(cgrp); - - if (parent) - return parent->subtree_ss_mask; - - return cgrp->root->subsys_mask; -} - -/** - * cgroup_css - obtain a cgroup's css for the specified subsystem - * @cgrp: the cgroup of interest - * @ss: the subsystem of interest (%NULL returns @cgrp->self) - * - * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This - * function must be called either under cgroup_mutex or rcu_read_lock() and - * the caller is responsible for pinning the returned css if it wants to - * keep accessing it outside the said locks. This function may return - * %NULL if @cgrp doesn't have @subsys_id enabled. - */ -static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, - struct cgroup_subsys *ss) -{ - if (ss) - return rcu_dereference_check(cgrp->subsys[ss->id], - lockdep_is_held(&cgroup_mutex)); - else - return &cgrp->self; -} - -/** - * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem - * @cgrp: the cgroup of interest - * @ss: the subsystem of interest (%NULL returns @cgrp->self) - * - * Similar to cgroup_css() but returns the effective css, which is defined - * as the matching css of the nearest ancestor including self which has @ss - * enabled. If @ss is associated with the hierarchy @cgrp is on, this - * function is guaranteed to return non-NULL css. - */ -static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, - struct cgroup_subsys *ss) -{ - lockdep_assert_held(&cgroup_mutex); - - if (!ss) - return &cgrp->self; - - /* - * This function is used while updating css associations and thus - * can't test the csses directly. Test ss_mask. 
- */ - while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) { - cgrp = cgroup_parent(cgrp); - if (!cgrp) - return NULL; - } - - return cgroup_css(cgrp, ss); -} - -/** - * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem - * @cgrp: the cgroup of interest - * @ss: the subsystem of interest - * - * Find and get the effective css of @cgrp for @ss. The effective css is - * defined as the matching css of the nearest ancestor including self which - * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, - * the root css is returned, so this function always returns a valid css. - * The returned css must be put using css_put(). - */ -struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp, - struct cgroup_subsys *ss) -{ - struct cgroup_subsys_state *css; - - rcu_read_lock(); - - do { - css = cgroup_css(cgrp, ss); - - if (css && css_tryget_online(css)) - goto out_unlock; - cgrp = cgroup_parent(cgrp); - } while (cgrp); - - css = init_css_set.subsys[ss->id]; - css_get(css); -out_unlock: - rcu_read_unlock(); - return css; -} - -/* convenient tests for these bits */ -static inline bool cgroup_is_dead(const struct cgroup *cgrp) -{ - return !(cgrp->self.flags & CSS_ONLINE); -} - -static void cgroup_get(struct cgroup *cgrp) -{ - WARN_ON_ONCE(cgroup_is_dead(cgrp)); - css_get(&cgrp->self); -} - -static bool cgroup_tryget(struct cgroup *cgrp) -{ - return css_tryget(&cgrp->self); -} - -struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) -{ - struct cgroup *cgrp = of->kn->parent->priv; - struct cftype *cft = of_cft(of); - - /* - * This is open and unprotected implementation of cgroup_css(). - * seq_css() is only called from a kernfs file operation which has - * an active reference on the file. Because all the subsystem - * files are drained before a css is disassociated with a cgroup, - * the matching css from the cgroup's subsys table is guaranteed to - * be and stay valid until the enclosing operation is complete. - */ - if (cft->ss) - return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); - else - return &cgrp->self; -} -EXPORT_SYMBOL_GPL(of_css); - -static int notify_on_release(const struct cgroup *cgrp) -{ - return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); -} - -/** - * for_each_css - iterate all css's of a cgroup - * @css: the iteration cursor - * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end - * @cgrp: the target cgroup to iterate css's of - * - * Should be called under cgroup_[tree_]mutex. - */ -#define for_each_css(css, ssid, cgrp) \ - for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ - if (!((css) = rcu_dereference_check( \ - (cgrp)->subsys[(ssid)], \ - lockdep_is_held(&cgroup_mutex)))) { } \ - else - -/** - * for_each_e_css - iterate all effective css's of a cgroup - * @css: the iteration cursor - * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end - * @cgrp: the target cgroup to iterate css's of - * - * Should be called under cgroup_[tree_]mutex. 
- */ -#define for_each_e_css(css, ssid, cgrp) \ - for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ - if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \ - ; \ - else - -/** - * for_each_subsys - iterate all enabled cgroup subsystems - * @ss: the iteration cursor - * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end - */ -#define for_each_subsys(ss, ssid) \ - for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ - (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) - -/** - * do_each_subsys_mask - filter for_each_subsys with a bitmask - * @ss: the iteration cursor - * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end - * @ss_mask: the bitmask - * - * The block will only run for cases where the ssid-th bit (1 << ssid) of - * @ss_mask is set. - */ -#define do_each_subsys_mask(ss, ssid, ss_mask) do { \ - unsigned long __ss_mask = (ss_mask); \ - if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \ - (ssid) = 0; \ - break; \ - } \ - for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \ - (ss) = cgroup_subsys[ssid]; \ - { - -#define while_each_subsys_mask() \ - } \ - } \ -} while (false) - -/* iterate across the hierarchies */ -#define for_each_root(root) \ - list_for_each_entry((root), &cgroup_roots, root_list) - -/* iterate over child cgrps, lock should be held throughout iteration */ -#define cgroup_for_each_live_child(child, cgrp) \ - list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ - if (({ lockdep_assert_held(&cgroup_mutex); \ - cgroup_is_dead(child); })) \ - ; \ - else - -/* walk live descendants in preorder */ -#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ - css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ - if (({ lockdep_assert_held(&cgroup_mutex); \ - (dsct) = (d_css)->cgroup; \ - cgroup_is_dead(dsct); })) \ - ; \ - else - -/* walk live descendants in postorder */ -#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \ - css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \ - if (({ lockdep_assert_held(&cgroup_mutex); \ - (dsct) = (d_css)->cgroup; \ - cgroup_is_dead(dsct); })) \ - ; \ - else - -static void cgroup_release_agent(struct work_struct *work); -static void check_for_release(struct cgroup *cgrp); - -/* - * A cgroup can be associated with multiple css_sets as different tasks may - * belong to different cgroups on different hierarchies. In the other - * direction, a css_set is naturally associated with multiple cgroups. - * This M:N relationship is represented by the following link structure - * which exists for each association and allows traversing the associations - * from both sides. - */ -struct cgrp_cset_link { - /* the cgroup and css_set this link associates */ - struct cgroup *cgrp; - struct css_set *cset; - - /* list of cgrp_cset_links anchored at cgrp->cset_links */ - struct list_head cset_link; - - /* list of cgrp_cset_links anchored at css_set->cgrp_links */ - struct list_head cgrp_link; -}; - -/* - * The default css_set - used by init and its children prior to any - * hierarchies being mounted. It contains a pointer to the root state - * for each subsystem. Also used to anchor the list of css_sets. Not - * reference-counted, to improve performance when child cgroups - * haven't been created. 
- */ -struct css_set init_css_set = { - .refcount = ATOMIC_INIT(1), - .tasks = LIST_HEAD_INIT(init_css_set.tasks), - .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), - .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), - .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), - .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), - .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), -}; - -static int css_set_count = 1; /* 1 for init_css_set */ - -/** - * css_set_populated - does a css_set contain any tasks? - * @cset: target css_set - */ -static bool css_set_populated(struct css_set *cset) -{ - lockdep_assert_held(&css_set_lock); - - return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks); -} - -/** - * cgroup_update_populated - updated populated count of a cgroup - * @cgrp: the target cgroup - * @populated: inc or dec populated count - * - * One of the css_sets associated with @cgrp is either getting its first - * task or losing the last. Update @cgrp->populated_cnt accordingly. The - * count is propagated towards root so that a given cgroup's populated_cnt - * is zero iff the cgroup and all its descendants don't contain any tasks. - * - * @cgrp's interface file "cgroup.populated" is zero if - * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt - * changes from or to zero, userland is notified that the content of the - * interface file has changed. This can be used to detect when @cgrp and - * its descendants become populated or empty. - */ -static void cgroup_update_populated(struct cgroup *cgrp, bool populated) -{ - lockdep_assert_held(&css_set_lock); - - do { - bool trigger; - - if (populated) - trigger = !cgrp->populated_cnt++; - else - trigger = !--cgrp->populated_cnt; - - if (!trigger) - break; - - check_for_release(cgrp); - cgroup_file_notify(&cgrp->events_file); - - cgrp = cgroup_parent(cgrp); - } while (cgrp); -} - -/** - * css_set_update_populated - update populated state of a css_set - * @cset: target css_set - * @populated: whether @cset is populated or depopulated - * - * @cset is either getting the first task or losing the last. Update the - * ->populated_cnt of all associated cgroups accordingly. - */ -static void css_set_update_populated(struct css_set *cset, bool populated) -{ - struct cgrp_cset_link *link; - - lockdep_assert_held(&css_set_lock); - - list_for_each_entry(link, &cset->cgrp_links, cgrp_link) - cgroup_update_populated(link->cgrp, populated); -} - -/** - * css_set_move_task - move a task from one css_set to another - * @task: task being moved - * @from_cset: css_set @task currently belongs to (may be NULL) - * @to_cset: new css_set @task is being moved to (may be NULL) - * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks - * - * Move @task from @from_cset to @to_cset. If @task didn't belong to any - * css_set, @from_cset can be NULL. If @task is being disassociated - * instead of moved, @to_cset can be NULL. - * - * This function automatically handles populated_cnt updates and - * css_task_iter adjustments but the caller is responsible for managing - * @from_cset and @to_cset's reference counts. 
- */ -static void css_set_move_task(struct task_struct *task, - struct css_set *from_cset, struct css_set *to_cset, - bool use_mg_tasks) -{ - lockdep_assert_held(&css_set_lock); - - if (to_cset && !css_set_populated(to_cset)) - css_set_update_populated(to_cset, true); - - if (from_cset) { - struct css_task_iter *it, *pos; - - WARN_ON_ONCE(list_empty(&task->cg_list)); - - /* - * @task is leaving, advance task iterators which are - * pointing to it so that they can resume at the next - * position. Advancing an iterator might remove it from - * the list, use safe walk. See css_task_iter_advance*() - * for details. - */ - list_for_each_entry_safe(it, pos, &from_cset->task_iters, - iters_node) - if (it->task_pos == &task->cg_list) - css_task_iter_advance(it); - - list_del_init(&task->cg_list); - if (!css_set_populated(from_cset)) - css_set_update_populated(from_cset, false); - } else { - WARN_ON_ONCE(!list_empty(&task->cg_list)); - } - - if (to_cset) { - /* - * We are synchronized through cgroup_threadgroup_rwsem - * against PF_EXITING setting such that we can't race - * against cgroup_exit() changing the css_set to - * init_css_set and dropping the old one. - */ - WARN_ON_ONCE(task->flags & PF_EXITING); - - rcu_assign_pointer(task->cgroups, to_cset); - list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : - &to_cset->tasks); - } -} - -/* - * hash table for cgroup groups. This improves the performance to find - * an existing css_set. This hash doesn't (currently) take into - * account cgroups in empty hierarchies. - */ -#define CSS_SET_HASH_BITS 7 -static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); - -static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) -{ - unsigned long key = 0UL; - struct cgroup_subsys *ss; - int i; - - for_each_subsys(ss, i) - key += (unsigned long)css[i]; - key = (key >> 16) ^ key; - - return key; -} - -static void put_css_set_locked(struct css_set *cset) -{ - struct cgrp_cset_link *link, *tmp_link; - struct cgroup_subsys *ss; - int ssid; - - lockdep_assert_held(&css_set_lock); - - if (!atomic_dec_and_test(&cset->refcount)) - return; - - /* This css_set is dead. unlink it and release cgroup and css refs */ - for_each_subsys(ss, ssid) { - list_del(&cset->e_cset_node[ssid]); - css_put(cset->subsys[ssid]); - } - hash_del(&cset->hlist); - css_set_count--; - - list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { - list_del(&link->cset_link); - list_del(&link->cgrp_link); - if (cgroup_parent(link->cgrp)) - cgroup_put(link->cgrp); - kfree(link); - } - - kfree_rcu(cset, rcu_head); -} - -static void put_css_set(struct css_set *cset) -{ - unsigned long flags; - - /* - * Ensure that the refcount doesn't hit zero while any readers - * can see it. Similar to atomic_dec_and_lock(), but for an - * rwlock - */ - if (atomic_add_unless(&cset->refcount, -1, 1)) - return; - - spin_lock_irqsave(&css_set_lock, flags); - put_css_set_locked(cset); - spin_unlock_irqrestore(&css_set_lock, flags); -} - -/* - * refcounted get/put for css_set objects - */ -static inline void get_css_set(struct css_set *cset) -{ - atomic_inc(&cset->refcount); -} - -/** - * compare_css_sets - helper function for find_existing_css_set(). 
- * @cset: candidate css_set being tested - * @old_cset: existing css_set for a task - * @new_cgrp: cgroup that's being entered by the task - * @template: desired set of css pointers in css_set (pre-calculated) - * - * Returns true if "cset" matches "old_cset" except for the hierarchy - * which "new_cgrp" belongs to, for which it should match "new_cgrp". - */ -static bool compare_css_sets(struct css_set *cset, - struct css_set *old_cset, - struct cgroup *new_cgrp, - struct cgroup_subsys_state *template[]) -{ - struct list_head *l1, *l2; - - /* - * On the default hierarchy, there can be csets which are - * associated with the same set of cgroups but different csses. - * Let's first ensure that csses match. - */ - if (memcmp(template, cset->subsys, sizeof(cset->subsys))) - return false; - - /* - * Compare cgroup pointers in order to distinguish between - * different cgroups in hierarchies. As different cgroups may - * share the same effective css, this comparison is always - * necessary. - */ - l1 = &cset->cgrp_links; - l2 = &old_cset->cgrp_links; - while (1) { - struct cgrp_cset_link *link1, *link2; - struct cgroup *cgrp1, *cgrp2; - - l1 = l1->next; - l2 = l2->next; - /* See if we reached the end - both lists are equal length. */ - if (l1 == &cset->cgrp_links) { - BUG_ON(l2 != &old_cset->cgrp_links); - break; - } else { - BUG_ON(l2 == &old_cset->cgrp_links); - } - /* Locate the cgroups associated with these links. */ - link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link); - link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link); - cgrp1 = link1->cgrp; - cgrp2 = link2->cgrp; - /* Hierarchies should be linked in the same order. */ - BUG_ON(cgrp1->root != cgrp2->root); - - /* - * If this hierarchy is the hierarchy of the cgroup - * that's changing, then we need to check that this - * css_set points to the new cgroup; if it's any other - * hierarchy, then this css_set should point to the - * same cgroup as the old css_set. - */ - if (cgrp1->root == new_cgrp->root) { - if (cgrp1 != new_cgrp) - return false; - } else { - if (cgrp1 != cgrp2) - return false; - } - } - return true; -} - -/** - * find_existing_css_set - init css array and find the matching css_set - * @old_cset: the css_set that we're using before the cgroup transition - * @cgrp: the cgroup that we're moving into - * @template: out param for the new set of csses, should be clear on entry - */ -static struct css_set *find_existing_css_set(struct css_set *old_cset, - struct cgroup *cgrp, - struct cgroup_subsys_state *template[]) -{ - struct cgroup_root *root = cgrp->root; - struct cgroup_subsys *ss; - struct css_set *cset; - unsigned long key; - int i; - - /* - * Build the set of subsystem state objects that we want to see in the - * new css_set. while subsystems can change globally, the entries here - * won't change, so no need for locking. - */ - for_each_subsys(ss, i) { - if (root->subsys_mask & (1UL << i)) { - /* - * @ss is in this hierarchy, so we want the - * effective css from @cgrp. - */ - template[i] = cgroup_e_css(cgrp, ss); - } else { - /* - * @ss is not in this hierarchy, so we don't want - * to change the css. 
- */ - template[i] = old_cset->subsys[i]; - } - } - - key = css_set_hash(template); - hash_for_each_possible(css_set_table, cset, hlist, key) { - if (!compare_css_sets(cset, old_cset, cgrp, template)) - continue; - - /* This css_set matches what we need */ - return cset; - } - - /* No existing cgroup group matched */ - return NULL; -} - -static void free_cgrp_cset_links(struct list_head *links_to_free) -{ - struct cgrp_cset_link *link, *tmp_link; - - list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) { - list_del(&link->cset_link); - kfree(link); - } -} - -/** - * allocate_cgrp_cset_links - allocate cgrp_cset_links - * @count: the number of links to allocate - * @tmp_links: list_head the allocated links are put on - * - * Allocate @count cgrp_cset_link structures and chain them on @tmp_links - * through ->cset_link. Returns 0 on success or -errno. - */ -static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links) -{ - struct cgrp_cset_link *link; - int i; - - INIT_LIST_HEAD(tmp_links); - - for (i = 0; i < count; i++) { - link = kzalloc(sizeof(*link), GFP_KERNEL); - if (!link) { - free_cgrp_cset_links(tmp_links); - return -ENOMEM; - } - list_add(&link->cset_link, tmp_links); - } - return 0; -} - -/** - * link_css_set - a helper function to link a css_set to a cgroup - * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links() - * @cset: the css_set to be linked - * @cgrp: the destination cgroup - */ -static void link_css_set(struct list_head *tmp_links, struct css_set *cset, - struct cgroup *cgrp) -{ - struct cgrp_cset_link *link; - - BUG_ON(list_empty(tmp_links)); - - if (cgroup_on_dfl(cgrp)) - cset->dfl_cgrp = cgrp; - - link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); - link->cset = cset; - link->cgrp = cgrp; - - /* - * Always add links to the tail of the lists so that the lists are - * in choronological order. - */ - list_move_tail(&link->cset_link, &cgrp->cset_links); - list_add_tail(&link->cgrp_link, &cset->cgrp_links); - - if (cgroup_parent(cgrp)) - cgroup_get(cgrp); -} - -/** - * find_css_set - return a new css_set with one cgroup updated - * @old_cset: the baseline css_set - * @cgrp: the cgroup to be updated - * - * Return a new css_set that's equivalent to @old_cset, but with @cgrp - * substituted into the appropriate hierarchy. 
- */ -static struct css_set *find_css_set(struct css_set *old_cset, - struct cgroup *cgrp) -{ - struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { }; - struct css_set *cset; - struct list_head tmp_links; - struct cgrp_cset_link *link; - struct cgroup_subsys *ss; - unsigned long key; - int ssid; - - lockdep_assert_held(&cgroup_mutex); - - /* First see if we already have a cgroup group that matches - * the desired set */ - spin_lock_irq(&css_set_lock); - cset = find_existing_css_set(old_cset, cgrp, template); - if (cset) - get_css_set(cset); - spin_unlock_irq(&css_set_lock); - - if (cset) - return cset; - - cset = kzalloc(sizeof(*cset), GFP_KERNEL); - if (!cset) - return NULL; - - /* Allocate all the cgrp_cset_link objects that we'll need */ - if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) { - kfree(cset); - return NULL; - } - - atomic_set(&cset->refcount, 1); - INIT_LIST_HEAD(&cset->tasks); - INIT_LIST_HEAD(&cset->mg_tasks); - INIT_LIST_HEAD(&cset->task_iters); - INIT_HLIST_NODE(&cset->hlist); - INIT_LIST_HEAD(&cset->cgrp_links); - INIT_LIST_HEAD(&cset->mg_preload_node); - INIT_LIST_HEAD(&cset->mg_node); - - /* Copy the set of subsystem state objects generated in - * find_existing_css_set() */ - memcpy(cset->subsys, template, sizeof(cset->subsys)); - - spin_lock_irq(&css_set_lock); - /* Add reference counts and links from the new css_set. */ - list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { - struct cgroup *c = link->cgrp; - - if (c->root == cgrp->root) - c = cgrp; - link_css_set(&tmp_links, cset, c); - } - - BUG_ON(!list_empty(&tmp_links)); - - css_set_count++; - - /* Add @cset to the hash table */ - key = css_set_hash(cset->subsys); - hash_add(css_set_table, &cset->hlist, key); - - for_each_subsys(ss, ssid) { - struct cgroup_subsys_state *css = cset->subsys[ssid]; - - list_add_tail(&cset->e_cset_node[ssid], - &css->cgroup->e_csets[ssid]); - css_get(css); - } - - spin_unlock_irq(&css_set_lock); - - return cset; -} - -static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) -{ - struct cgroup *root_cgrp = kf_root->kn->priv; - - return root_cgrp->root; -} - -static int cgroup_init_root_id(struct cgroup_root *root) -{ - int id; - - lockdep_assert_held(&cgroup_mutex); - - id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL); - if (id < 0) - return id; - - root->hierarchy_id = id; - return 0; -} - -static void cgroup_exit_root_id(struct cgroup_root *root) -{ - lockdep_assert_held(&cgroup_mutex); - - idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); -} - -static void cgroup_free_root(struct cgroup_root *root) -{ - if (root) { - idr_destroy(&root->cgroup_idr); - kfree(root); - } -} - -static void cgroup_destroy_root(struct cgroup_root *root) -{ - struct cgroup *cgrp = &root->cgrp; - struct cgrp_cset_link *link, *tmp_link; - - trace_cgroup_destroy_root(root); - - cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); - - BUG_ON(atomic_read(&root->nr_cgrps)); - BUG_ON(!list_empty(&cgrp->self.children)); - - /* Rebind all subsystems back to the default hierarchy */ - WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask)); - - /* - * Release all the links from cset_links to this hierarchy's - * root cgroup - */ - spin_lock_irq(&css_set_lock); - - list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { - list_del(&link->cset_link); - list_del(&link->cgrp_link); - kfree(link); - } - - spin_unlock_irq(&css_set_lock); - - if (!list_empty(&root->root_list)) { - list_del(&root->root_list); - 
cgroup_root_count--; - } - - cgroup_exit_root_id(root); - - mutex_unlock(&cgroup_mutex); - - kernfs_destroy_root(root->kf_root); - cgroup_free_root(root); -} - -/* - * look up cgroup associated with current task's cgroup namespace on the - * specified hierarchy - */ -static struct cgroup * -current_cgns_cgroup_from_root(struct cgroup_root *root) -{ - struct cgroup *res = NULL; - struct css_set *cset; - - lockdep_assert_held(&css_set_lock); - - rcu_read_lock(); - - cset = current->nsproxy->cgroup_ns->root_cset; - if (cset == &init_css_set) { - res = &root->cgrp; - } else { - struct cgrp_cset_link *link; - - list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { - struct cgroup *c = link->cgrp; - - if (c->root == root) { - res = c; - break; - } - } - } - rcu_read_unlock(); - - BUG_ON(!res); - return res; -} - -/* look up cgroup associated with given css_set on the specified hierarchy */ -static struct cgroup *cset_cgroup_from_root(struct css_set *cset, - struct cgroup_root *root) -{ - struct cgroup *res = NULL; - - lockdep_assert_held(&cgroup_mutex); - lockdep_assert_held(&css_set_lock); - - if (cset == &init_css_set) { - res = &root->cgrp; - } else { - struct cgrp_cset_link *link; - - list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { - struct cgroup *c = link->cgrp; - - if (c->root == root) { - res = c; - break; - } - } - } - - BUG_ON(!res); - return res; -} - -/* - * Return the cgroup for "task" from the given hierarchy. Must be - * called with cgroup_mutex and css_set_lock held. - */ -static struct cgroup *task_cgroup_from_root(struct task_struct *task, - struct cgroup_root *root) -{ - /* - * No need to lock the task - since we hold cgroup_mutex the - * task can't change groups, so the only thing that can happen - * is that it exits and its css is set back to init_css_set. - */ - return cset_cgroup_from_root(task_css_set(task), root); -} - -/* - * A task must hold cgroup_mutex to modify cgroups. - * - * Any task can increment and decrement the count field without lock. - * So in general, code holding cgroup_mutex can't rely on the count - * field not changing. However, if the count goes to zero, then only - * cgroup_attach_task() can increment it again. Because a count of zero - * means that no tasks are currently attached, therefore there is no - * way a task attached to that cgroup can fork (the other way to - * increment the count). So code holding cgroup_mutex can safely - * assume that if the count is zero, it will stay zero. Similarly, if - * a task holds cgroup_mutex on a cgroup with zero count, it - * knows that the cgroup won't be removed, as cgroup_rmdir() - * needs that mutex. - * - * A cgroup can only be deleted if both its 'count' of using tasks - * is zero, and its list of 'children' cgroups is empty. Since all - * tasks in the system use _some_ cgroup, and since there is always at - * least one task in the system (init, pid == 1), therefore, root cgroup - * always has either children cgroups and/or using tasks. So we don't - * need a special hack to ensure that root cgroup cannot be deleted. - * - * P.S. One more locking exception. 
RCU is used to guard the - * update of a tasks cgroup pointer by cgroup_attach_task() - */ - -static struct kernfs_syscall_ops cgroup_kf_syscall_ops; -static const struct file_operations proc_cgroupstats_operations; - -static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, - char *buf) -{ - struct cgroup_subsys *ss = cft->ss; - - if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && - !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) - snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s", - cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, - cft->name); - else - strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); - return buf; -} - -/** - * cgroup_file_mode - deduce file mode of a control file - * @cft: the control file in question - * - * S_IRUGO for read, S_IWUSR for write. - */ -static umode_t cgroup_file_mode(const struct cftype *cft) -{ - umode_t mode = 0; - - if (cft->read_u64 || cft->read_s64 || cft->seq_show) - mode |= S_IRUGO; - - if (cft->write_u64 || cft->write_s64 || cft->write) { - if (cft->flags & CFTYPE_WORLD_WRITABLE) - mode |= S_IWUGO; - else - mode |= S_IWUSR; - } - - return mode; -} - -/** - * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask - * @subtree_control: the new subtree_control mask to consider - * @this_ss_mask: available subsystems - * - * On the default hierarchy, a subsystem may request other subsystems to be - * enabled together through its ->depends_on mask. In such cases, more - * subsystems than specified in "cgroup.subtree_control" may be enabled. - * - * This function calculates which subsystems need to be enabled if - * @subtree_control is to be applied while restricted to @this_ss_mask. - */ -static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) -{ - u16 cur_ss_mask = subtree_control; - struct cgroup_subsys *ss; - int ssid; - - lockdep_assert_held(&cgroup_mutex); - - cur_ss_mask |= cgrp_dfl_implicit_ss_mask; - - while (true) { - u16 new_ss_mask = cur_ss_mask; - - do_each_subsys_mask(ss, ssid, cur_ss_mask) { - new_ss_mask |= ss->depends_on; - } while_each_subsys_mask(); - - /* - * Mask out subsystems which aren't available. This can - * happen only if some depended-upon subsystems were bound - * to non-default hierarchies. - */ - new_ss_mask &= this_ss_mask; - - if (new_ss_mask == cur_ss_mask) - break; - cur_ss_mask = new_ss_mask; - } - - return cur_ss_mask; -} - -/** - * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods - * @kn: the kernfs_node being serviced - * - * This helper undoes cgroup_kn_lock_live() and should be invoked before - * the method finishes if locking succeeded. Note that once this function - * returns the cgroup returned by cgroup_kn_lock_live() may become - * inaccessible any time. If the caller intends to continue to access the - * cgroup, it should pin it before invoking this function. - */ -static void cgroup_kn_unlock(struct kernfs_node *kn) -{ - struct cgroup *cgrp; - - if (kernfs_type(kn) == KERNFS_DIR) - cgrp = kn->priv; - else - cgrp = kn->parent->priv; - - mutex_unlock(&cgroup_mutex); - - kernfs_unbreak_active_protection(kn); - cgroup_put(cgrp); -} - -/** - * cgroup_kn_lock_live - locking helper for cgroup kernfs methods - * @kn: the kernfs_node being serviced - * @drain_offline: perform offline draining on the cgroup - * - * This helper is to be used by a cgroup kernfs method currently servicing - * @kn. It breaks the active protection, performs cgroup locking and - * verifies that the associated cgroup is alive. Returns the cgroup if - * alive; otherwise, %NULL. 
A successful return should be undone by a - * matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the - * cgroup is drained of offlining csses before return. - * - * Any cgroup kernfs method implementation which requires locking the - * associated cgroup should use this helper. It avoids nesting cgroup - * locking under kernfs active protection and allows all kernfs operations - * including self-removal. - */ -static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, - bool drain_offline) -{ - struct cgroup *cgrp; - - if (kernfs_type(kn) == KERNFS_DIR) - cgrp = kn->priv; - else - cgrp = kn->parent->priv; - - /* - * We're gonna grab cgroup_mutex which nests outside kernfs - * active_ref. cgroup liveliness check alone provides enough - * protection against removal. Ensure @cgrp stays accessible and - * break the active_ref protection. - */ - if (!cgroup_tryget(cgrp)) - return NULL; - kernfs_break_active_protection(kn); - - if (drain_offline) - cgroup_lock_and_drain_offline(cgrp); - else - mutex_lock(&cgroup_mutex); - - if (!cgroup_is_dead(cgrp)) - return cgrp; - - cgroup_kn_unlock(kn); - return NULL; -} - -static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) -{ - char name[CGROUP_FILE_NAME_MAX]; - - lockdep_assert_held(&cgroup_mutex); - - if (cft->file_offset) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss); - struct cgroup_file *cfile = (void *)css + cft->file_offset; - - spin_lock_irq(&cgroup_file_kn_lock); - cfile->kn = NULL; - spin_unlock_irq(&cgroup_file_kn_lock); - } - - kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); -} - -/** - * css_clear_dir - remove subsys files in a cgroup directory - * @css: taget css - */ -static void css_clear_dir(struct cgroup_subsys_state *css) -{ - struct cgroup *cgrp = css->cgroup; - struct cftype *cfts; - - if (!(css->flags & CSS_VISIBLE)) - return; - - css->flags &= ~CSS_VISIBLE; - - list_for_each_entry(cfts, &css->ss->cfts, node) - cgroup_addrm_files(css, cgrp, cfts, false); -} - -/** - * css_populate_dir - create subsys files in a cgroup directory - * @css: target css - * - * On failure, no file is added. - */ -static int css_populate_dir(struct cgroup_subsys_state *css) -{ - struct cgroup *cgrp = css->cgroup; - struct cftype *cfts, *failed_cfts; - int ret; - - if ((css->flags & CSS_VISIBLE) || !cgrp->kn) - return 0; - - if (!css->ss) { - if (cgroup_on_dfl(cgrp)) - cfts = cgroup_dfl_base_files; - else - cfts = cgroup_legacy_base_files; - - return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); - } - - list_for_each_entry(cfts, &css->ss->cfts, node) { - ret = cgroup_addrm_files(css, cgrp, cfts, true); - if (ret < 0) { - failed_cfts = cfts; - goto err; - } - } - - css->flags |= CSS_VISIBLE; - - return 0; -err: - list_for_each_entry(cfts, &css->ss->cfts, node) { - if (cfts == failed_cfts) - break; - cgroup_addrm_files(css, cgrp, cfts, false); - } - return ret; -} - -static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) -{ - struct cgroup *dcgrp = &dst_root->cgrp; - struct cgroup_subsys *ss; - int ssid, i, ret; - - lockdep_assert_held(&cgroup_mutex); - - do_each_subsys_mask(ss, ssid, ss_mask) { - /* - * If @ss has non-root csses attached to it, can't move. - * If @ss is an implicit controller, it is exempt from this - * rule and can be stolen. 
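css_populate_dir() above uses a common add-then-unwind shape: remember which cftype set failed and remove only the sets added before that point. The same shape in a standalone stub, with printfs standing in for the cftype operations:

	#include <stdio.h>

	#define NR_SETS 4

	static int add_files(int i)  { return i == 2 ? -1 : 0; }	/* 3rd set fails */
	static void rm_files(int i)  { printf("unwind set %d\n", i); }

	static int populate(void)
	{
		int i, failed, ret = 0;

		for (i = 0; i < NR_SETS; i++) {
			ret = add_files(i);
			if (ret < 0) {
				failed = i;
				goto err;
			}
		}
		return 0;
	err:
		for (i = 0; i < failed; i++)	/* only the sets added so far */
			rm_files(i);
		return ret;
	}

	int main(void)
	{
		return populate() ? 1 : 0;
	}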
- */ - if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) && - !ss->implicit_on_dfl) - return -EBUSY; - - /* can't move between two non-dummy roots either */ - if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) - return -EBUSY; - } while_each_subsys_mask(); - - do_each_subsys_mask(ss, ssid, ss_mask) { - struct cgroup_root *src_root = ss->root; - struct cgroup *scgrp = &src_root->cgrp; - struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); - struct css_set *cset; - - WARN_ON(!css || cgroup_css(dcgrp, ss)); - - /* disable from the source */ - src_root->subsys_mask &= ~(1 << ssid); - WARN_ON(cgroup_apply_control(scgrp)); - cgroup_finalize_control(scgrp, 0); - - /* rebind */ - RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); - rcu_assign_pointer(dcgrp->subsys[ssid], css); - ss->root = dst_root; - css->cgroup = dcgrp; - - spin_lock_irq(&css_set_lock); - hash_for_each(css_set_table, i, cset, hlist) - list_move_tail(&cset->e_cset_node[ss->id], - &dcgrp->e_csets[ss->id]); - spin_unlock_irq(&css_set_lock); - - /* default hierarchy doesn't enable controllers by default */ - dst_root->subsys_mask |= 1 << ssid; - if (dst_root == &cgrp_dfl_root) { - static_branch_enable(cgroup_subsys_on_dfl_key[ssid]); - } else { - dcgrp->subtree_control |= 1 << ssid; - static_branch_disable(cgroup_subsys_on_dfl_key[ssid]); - } - - ret = cgroup_apply_control(dcgrp); - if (ret) - pr_warn("partial failure to rebind %s controller (err=%d)\n", - ss->name, ret); - - if (ss->bind) - ss->bind(css); - } while_each_subsys_mask(); - - kernfs_activate(dcgrp->kn); - return 0; -} - -static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, - struct kernfs_root *kf_root) -{ - int len = 0; - char *buf = NULL; - struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root); - struct cgroup *ns_cgroup; - - buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - spin_lock_irq(&css_set_lock); - ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot); - len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); - spin_unlock_irq(&css_set_lock); - - if (len >= PATH_MAX) - len = -ERANGE; - else if (len > 0) { - seq_escape(sf, buf, " \t\n\\"); - len = 0; - } - kfree(buf); - return len; -} - -static int cgroup_show_options(struct seq_file *seq, - struct kernfs_root *kf_root) -{ - struct cgroup_root *root = cgroup_root_from_kf(kf_root); - struct cgroup_subsys *ss; - int ssid; - - if (root != &cgrp_dfl_root) - for_each_subsys(ss, ssid) - if (root->subsys_mask & (1 << ssid)) - seq_show_option(seq, ss->legacy_name, NULL); - if (root->flags & CGRP_ROOT_NOPREFIX) - seq_puts(seq, ",noprefix"); - if (root->flags & CGRP_ROOT_XATTR) - seq_puts(seq, ",xattr"); - - spin_lock(&release_agent_path_lock); - if (strlen(root->release_agent_path)) - seq_show_option(seq, "release_agent", - root->release_agent_path); - spin_unlock(&release_agent_path_lock); - - if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) - seq_puts(seq, ",clone_children"); - if (strlen(root->name)) - seq_show_option(seq, "name", root->name); - return 0; -} - -struct cgroup_sb_opts { - u16 subsys_mask; - unsigned int flags; - char *release_agent; - bool cpuset_clone_children; - char *name; - /* User explicitly requested empty subsystem */ - bool none; -}; - -static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) -{ - char *token, *o = data; - bool all_ss = false, one_ss = false; - u16 mask = U16_MAX; - struct cgroup_subsys *ss; - int nr_opts = 0; - int i; - -#ifdef CONFIG_CPUSETS - mask = ~((u16)1 << 
cpuset_cgrp_id); -#endif - - memset(opts, 0, sizeof(*opts)); - - while ((token = strsep(&o, ",")) != NULL) { - nr_opts++; - - if (!*token) - return -EINVAL; - if (!strcmp(token, "none")) { - /* Explicitly have no subsystems */ - opts->none = true; - continue; - } - if (!strcmp(token, "all")) { - /* Mutually exclusive option 'all' + subsystem name */ - if (one_ss) - return -EINVAL; - all_ss = true; - continue; - } - if (!strcmp(token, "noprefix")) { - opts->flags |= CGRP_ROOT_NOPREFIX; - continue; - } - if (!strcmp(token, "clone_children")) { - opts->cpuset_clone_children = true; - continue; - } - if (!strcmp(token, "xattr")) { - opts->flags |= CGRP_ROOT_XATTR; - continue; - } - if (!strncmp(token, "release_agent=", 14)) { - /* Specifying two release agents is forbidden */ - if (opts->release_agent) - return -EINVAL; - opts->release_agent = - kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); - if (!opts->release_agent) - return -ENOMEM; - continue; - } - if (!strncmp(token, "name=", 5)) { - const char *name = token + 5; - /* Can't specify an empty name */ - if (!strlen(name)) - return -EINVAL; - /* Must match [\w.-]+ */ - for (i = 0; i < strlen(name); i++) { - char c = name[i]; - if (isalnum(c)) - continue; - if ((c == '.') || (c == '-') || (c == '_')) - continue; - return -EINVAL; - } - /* Specifying two names is forbidden */ - if (opts->name) - return -EINVAL; - opts->name = kstrndup(name, - MAX_CGROUP_ROOT_NAMELEN - 1, - GFP_KERNEL); - if (!opts->name) - return -ENOMEM; - - continue; - } - - for_each_subsys(ss, i) { - if (strcmp(token, ss->legacy_name)) - continue; - if (!cgroup_ssid_enabled(i)) - continue; - if (cgroup_ssid_no_v1(i)) - continue; - - /* Mutually exclusive option 'all' + subsystem name */ - if (all_ss) - return -EINVAL; - opts->subsys_mask |= (1 << i); - one_ss = true; - - break; - } - if (i == CGROUP_SUBSYS_COUNT) - return -ENOENT; - } - - /* - * If the 'all' option was specified select all the subsystems, - * otherwise if 'none', 'name=' and a subsystem name options were - * not specified, let's default to 'all' - */ - if (all_ss || (!one_ss && !opts->none && !opts->name)) - for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i)) - opts->subsys_mask |= (1 << i); - - /* - * We either have to specify by name or by subsystems. (So all - * empty hierarchies must have a name). - */ - if (!opts->subsys_mask && !opts->name) - return -EINVAL; - - /* - * Option noprefix was introduced just for backward compatibility - * with the old cpuset, so we allow noprefix only if mounting just - * the cpuset subsystem. 
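parse_cgroupfs_options() above is a plain strsep() tokenizer over the comma-separated mount data. A runnable userspace reduction that recognizes just a couple of the options, purely for illustration (strsep() is the same BSD/GNU helper the kernel provides):

	#include <stdio.h>
	#include <string.h>
	#include <stdbool.h>

	int main(void)
	{
		char data[] = "noprefix,name=mygrp,cpu";
		char *o = data, *token;
		bool noprefix = false;

		while ((token = strsep(&o, ",")) != NULL) {
			if (!*token)
				return 1;	/* empty option: the kernel returns -EINVAL */
			if (!strcmp(token, "noprefix")) {
				noprefix = true;
			} else if (!strncmp(token, "name=", 5)) {
				printf("name=%s\n", token + 5);
			} else {
				/* anything else is matched against ss->legacy_name */
				printf("subsys candidate: %s\n", token);
			}
		}
		printf("noprefix=%d\n", noprefix);
		return 0;
	}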
- */ - if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) - return -EINVAL; - - /* Can't specify "none" and some subsystems */ - if (opts->subsys_mask && opts->none) - return -EINVAL; - - return 0; -} - -static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) -{ - int ret = 0; - struct cgroup_root *root = cgroup_root_from_kf(kf_root); - struct cgroup_sb_opts opts; - u16 added_mask, removed_mask; - - if (root == &cgrp_dfl_root) { - pr_err("remount is not allowed\n"); - return -EINVAL; - } - - cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); - - /* See what subsystems are wanted */ - ret = parse_cgroupfs_options(data, &opts); - if (ret) - goto out_unlock; - - if (opts.subsys_mask != root->subsys_mask || opts.release_agent) - pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", - task_tgid_nr(current), current->comm); - - added_mask = opts.subsys_mask & ~root->subsys_mask; - removed_mask = root->subsys_mask & ~opts.subsys_mask; - - /* Don't allow flags or name to change at remount */ - if ((opts.flags ^ root->flags) || - (opts.name && strcmp(opts.name, root->name))) { - pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", - opts.flags, opts.name ?: "", root->flags, root->name); - ret = -EINVAL; - goto out_unlock; - } - - /* remounting is not allowed for populated hierarchies */ - if (!list_empty(&root->cgrp.self.children)) { - ret = -EBUSY; - goto out_unlock; - } - - ret = rebind_subsystems(root, added_mask); - if (ret) - goto out_unlock; - - WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); - - if (opts.release_agent) { - spin_lock(&release_agent_path_lock); - strcpy(root->release_agent_path, opts.release_agent); - spin_unlock(&release_agent_path_lock); - } - - trace_cgroup_remount(root); - - out_unlock: - kfree(opts.release_agent); - kfree(opts.name); - mutex_unlock(&cgroup_mutex); - return ret; -} - -/* - * To reduce the fork() overhead for systems that are not actually using - * their cgroups capability, we don't maintain the lists running through - * each css_set to its tasks until we see the list actually used - in other - * words after the first mount. - */ -static bool use_task_css_set_links __read_mostly; - -static void cgroup_enable_task_cg_lists(void) -{ - struct task_struct *p, *g; - - spin_lock_irq(&css_set_lock); - - if (use_task_css_set_links) - goto out_unlock; - - use_task_css_set_links = true; - - /* - * We need tasklist_lock because RCU is not safe against - * while_each_thread(). Besides, a forking task that has passed - * cgroup_post_fork() without seeing use_task_css_set_links = 1 - * is not guaranteed to have its child immediately visible in the - * tasklist if we walk through it with RCU. - */ - read_lock(&tasklist_lock); - do_each_thread(g, p) { - WARN_ON_ONCE(!list_empty(&p->cg_list) || - task_css_set(p) != &init_css_set); - - /* - * We should check if the process is exiting, otherwise - * it will race with cgroup_exit() in that the list - * entry won't be deleted though the process has exited. - * Do it while holding siglock so that we don't end up - * racing against cgroup_exit(). - * - * Interrupts were already disabled while acquiring - * the css_set_lock, so we do not need to disable it - * again when acquiring the sighand->siglock here. 
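The added_mask/removed_mask computation in cgroup_remount() above is simple set arithmetic on the subsystem bitmasks; spelled out as a runnable check:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint16_t old_mask = 0x5;	/* currently bound subsystems */
		uint16_t new_mask = 0x6;	/* requested by the remount   */
		uint16_t added    = new_mask & ~old_mask;	/* 0x2: bind here         */
		uint16_t removed  = old_mask & ~new_mask;	/* 0x1: back to dfl root  */

		printf("added=%#x removed=%#x\n", added, removed);
		return 0;
	}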
- */ - spin_lock(&p->sighand->siglock); - if (!(p->flags & PF_EXITING)) { - struct css_set *cset = task_css_set(p); - - if (!css_set_populated(cset)) - css_set_update_populated(cset, true); - list_add_tail(&p->cg_list, &cset->tasks); - get_css_set(cset); - } - spin_unlock(&p->sighand->siglock); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); -out_unlock: - spin_unlock_irq(&css_set_lock); -} - -static void init_cgroup_housekeeping(struct cgroup *cgrp) -{ - struct cgroup_subsys *ss; - int ssid; - - INIT_LIST_HEAD(&cgrp->self.sibling); - INIT_LIST_HEAD(&cgrp->self.children); - INIT_LIST_HEAD(&cgrp->cset_links); - INIT_LIST_HEAD(&cgrp->pidlists); - mutex_init(&cgrp->pidlist_mutex); - cgrp->self.cgroup = cgrp; - cgrp->self.flags |= CSS_ONLINE; - - for_each_subsys(ss, ssid) - INIT_LIST_HEAD(&cgrp->e_csets[ssid]); - - init_waitqueue_head(&cgrp->offline_waitq); - INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent); -} - -static void init_cgroup_root(struct cgroup_root *root, - struct cgroup_sb_opts *opts) -{ - struct cgroup *cgrp = &root->cgrp; - - INIT_LIST_HEAD(&root->root_list); - atomic_set(&root->nr_cgrps, 1); - cgrp->root = root; - init_cgroup_housekeeping(cgrp); - idr_init(&root->cgroup_idr); - - root->flags = opts->flags; - if (opts->release_agent) - strcpy(root->release_agent_path, opts->release_agent); - if (opts->name) - strcpy(root->name, opts->name); - if (opts->cpuset_clone_children) - set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); -} - -static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) -{ - LIST_HEAD(tmp_links); - struct cgroup *root_cgrp = &root->cgrp; - struct css_set *cset; - int i, ret; - - lockdep_assert_held(&cgroup_mutex); - - ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL); - if (ret < 0) - goto out; - root_cgrp->id = ret; - root_cgrp->ancestor_ids[0] = ret; - - ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, - GFP_KERNEL); - if (ret) - goto out; - - /* - * We're accessing css_set_count without locking css_set_lock here, - * but that's OK - it can only be increased by someone holding - * cgroup_lock, and that's us. Later rebinding may disable - * controllers on the default hierarchy and thus create new csets, - * which can't be more than the existing ones. Allocate 2x. - */ - ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links); - if (ret) - goto cancel_ref; - - ret = cgroup_init_root_id(root); - if (ret) - goto cancel_ref; - - root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, - KERNFS_ROOT_CREATE_DEACTIVATED, - root_cgrp); - if (IS_ERR(root->kf_root)) { - ret = PTR_ERR(root->kf_root); - goto exit_root_id; - } - root_cgrp->kn = root->kf_root->kn; - - ret = css_populate_dir(&root_cgrp->self); - if (ret) - goto destroy_root; - - ret = rebind_subsystems(root, ss_mask); - if (ret) - goto destroy_root; - - trace_cgroup_setup_root(root); - - /* - * There must be no failure case after here, since rebinding takes - * care of subsystems' refcounts, which are explicitly dropped in - * the failure exit path. - */ - list_add(&root->root_list, &cgroup_roots); - cgroup_root_count++; - - /* - * Link the root cgroup in this hierarchy into all the css_set - * objects. 
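cgroup_setup_root() above unwinds with the classic goto ladder: each label undoes exactly the steps that completed before the failure. A stub driver of the same shape (step names are placeholders, not the kernel calls):

	#include <stdio.h>

	static int step(const char *name, int fail)
	{
		printf("do %s\n", name);
		return fail ? -1 : 0;
	}

	static void undo(const char *name)
	{
		printf("undo %s\n", name);
	}

	static int setup(void)
	{
		int ret;

		ret = step("idr", 0);
		if (ret)
			goto out;
		ret = step("percpu_ref", 0);
		if (ret)
			goto out;
		ret = step("root_id", 0);
		if (ret)
			goto cancel_ref;
		ret = step("kf_root", 1);	/* simulate kernfs_create_root() failing */
		if (ret)
			goto exit_root_id;
		return 0;

	exit_root_id:
		undo("root_id");
	cancel_ref:
		undo("percpu_ref");
	out:
		return ret;
	}

	int main(void)
	{
		return setup() ? 1 : 0;
	}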
- */ - spin_lock_irq(&css_set_lock); - hash_for_each(css_set_table, i, cset, hlist) { - link_css_set(&tmp_links, cset, root_cgrp); - if (css_set_populated(cset)) - cgroup_update_populated(root_cgrp, true); - } - spin_unlock_irq(&css_set_lock); - - BUG_ON(!list_empty(&root_cgrp->self.children)); - BUG_ON(atomic_read(&root->nr_cgrps) != 1); - - kernfs_activate(root_cgrp->kn); - ret = 0; - goto out; - -destroy_root: - kernfs_destroy_root(root->kf_root); - root->kf_root = NULL; -exit_root_id: - cgroup_exit_root_id(root); -cancel_ref: - percpu_ref_exit(&root_cgrp->self.refcnt); -out: - free_cgrp_cset_links(&tmp_links); - return ret; -} - -static struct dentry *cgroup_mount(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, - void *data) -{ - bool is_v2 = fs_type == &cgroup2_fs_type; - struct super_block *pinned_sb = NULL; - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; - struct cgroup_subsys *ss; - struct cgroup_root *root; - struct cgroup_sb_opts opts; - struct dentry *dentry; - int ret; - int i; - bool new_sb; - - get_cgroup_ns(ns); - - /* Check if the caller has permission to mount. */ - if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) { - put_cgroup_ns(ns); - return ERR_PTR(-EPERM); - } - - /* - * The first time anyone tries to mount a cgroup, enable the list - * linking each css_set to its tasks and fix up all existing tasks. - */ - if (!use_task_css_set_links) - cgroup_enable_task_cg_lists(); - - if (is_v2) { - if (data) { - pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); - put_cgroup_ns(ns); - return ERR_PTR(-EINVAL); - } - cgrp_dfl_visible = true; - root = &cgrp_dfl_root; - cgroup_get(&root->cgrp); - goto out_mount; - } - - cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); - - /* First find the desired set of subsystems */ - ret = parse_cgroupfs_options(data, &opts); - if (ret) - goto out_unlock; - - /* - * Destruction of cgroup root is asynchronous, so subsystems may - * still be dying after the previous unmount. Let's drain the - * dying subsystems. We just need to ensure that the ones - * unmounted previously finish dying and don't care about new ones - * starting. Testing ref liveliness is good enough. - */ - for_each_subsys(ss, i) { - if (!(opts.subsys_mask & (1 << i)) || - ss->root == &cgrp_dfl_root) - continue; - - if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - msleep(10); - ret = restart_syscall(); - goto out_free; - } - cgroup_put(&ss->root->cgrp); - } - - for_each_root(root) { - bool name_match = false; - - if (root == &cgrp_dfl_root) - continue; - - /* - * If we asked for a name then it must match. Also, if - * name matches but sybsys_mask doesn't, we should fail. - * Remember whether name matched. - */ - if (opts.name) { - if (strcmp(opts.name, root->name)) - continue; - name_match = true; - } - - /* - * If we asked for subsystems (or explicitly for no - * subsystems) then they must match. - */ - if ((opts.subsys_mask || opts.none) && - (opts.subsys_mask != root->subsys_mask)) { - if (!name_match) - continue; - ret = -EBUSY; - goto out_unlock; - } - - if (root->flags ^ opts.flags) - pr_warn("new mount options do not match the existing superblock, will be ignored\n"); - - /* - * We want to reuse @root whose lifetime is governed by its - * ->cgrp. Let's check whether @root is alive and keep it - * that way. As cgroup_kill_sb() can happen anytime, we - * want to block it by pinning the sb so that @root doesn't - * get killed before mount is complete. 
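The for_each_root() loop above decides between skipping a root, reusing it, and failing with -EBUSY, based on whether the name and the subsystem mask match. That decision in isolation, as a runnable model (1 = reuse, 0 = keep looking, -1 = -EBUSY):

	#include <stdio.h>
	#include <string.h>
	#include <stdbool.h>

	static int match_root(const char *opt_name, unsigned opt_mask, bool opt_none,
			      const char *root_name, unsigned root_mask)
	{
		bool name_match = false;

		if (opt_name) {
			if (strcmp(opt_name, root_name))
				return 0;	/* asked for a name, not this one */
			name_match = true;
		}
		if ((opt_mask || opt_none) && opt_mask != root_mask)
			return name_match ? -1 : 0;	/* name fits, subsystems don't */
		return 1;
	}

	int main(void)
	{
		printf("%d\n", match_root("a", 0x1, false, "a", 0x2));	/* -1: -EBUSY */
		printf("%d\n", match_root(NULL, 0x1, false, "a", 0x1));	/*  1: reuse  */
		return 0;
	}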
- * - * With the sb pinned, tryget_live can reliably indicate - * whether @root can be reused. If it's being killed, - * drain it. We can use wait_queue for the wait but this - * path is super cold. Let's just sleep a bit and retry. - */ - pinned_sb = kernfs_pin_sb(root->kf_root, NULL); - if (IS_ERR(pinned_sb) || - !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - if (!IS_ERR_OR_NULL(pinned_sb)) - deactivate_super(pinned_sb); - msleep(10); - ret = restart_syscall(); - goto out_free; - } - - ret = 0; - goto out_unlock; - } - - /* - * No such thing, create a new one. name= matching without subsys - * specification is allowed for already existing hierarchies but we - * can't create new one without subsys specification. - */ - if (!opts.subsys_mask && !opts.none) { - ret = -EINVAL; - goto out_unlock; - } - - /* Hierarchies may only be created in the initial cgroup namespace. */ - if (ns != &init_cgroup_ns) { - ret = -EPERM; - goto out_unlock; - } - - root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) { - ret = -ENOMEM; - goto out_unlock; - } - - init_cgroup_root(root, &opts); - - ret = cgroup_setup_root(root, opts.subsys_mask); - if (ret) - cgroup_free_root(root); - -out_unlock: - mutex_unlock(&cgroup_mutex); -out_free: - kfree(opts.release_agent); - kfree(opts.name); - - if (ret) { - put_cgroup_ns(ns); - return ERR_PTR(ret); - } -out_mount: - dentry = kernfs_mount(fs_type, flags, root->kf_root, - is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC, - &new_sb); - - /* - * In non-init cgroup namespace, instead of root cgroup's - * dentry, we return the dentry corresponding to the - * cgroupns->root_cgrp. - */ - if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { - struct dentry *nsdentry; - struct cgroup *cgrp; - - mutex_lock(&cgroup_mutex); - spin_lock_irq(&css_set_lock); - - cgrp = cset_cgroup_from_root(ns->root_cset, root); - - spin_unlock_irq(&css_set_lock); - mutex_unlock(&cgroup_mutex); - - nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); - dput(dentry); - dentry = nsdentry; - } - - if (IS_ERR(dentry) || !new_sb) - cgroup_put(&root->cgrp); - - /* - * If @pinned_sb, we're reusing an existing root and holding an - * extra ref on its sb. Mount is complete. Put the extra ref. - */ - if (pinned_sb) { - WARN_ON(new_sb); - deactivate_super(pinned_sb); - } - - put_cgroup_ns(ns); - return dentry; -} - -static void cgroup_kill_sb(struct super_block *sb) -{ - struct kernfs_root *kf_root = kernfs_root_from_sb(sb); - struct cgroup_root *root = cgroup_root_from_kf(kf_root); - - /* - * If @root doesn't have any mounts or children, start killing it. - * This prevents new mounts by disabling percpu_ref_tryget_live(). - * cgroup_mount() may wait for @root's release. - * - * And don't kill the default root. 
- */ - if (!list_empty(&root->cgrp.self.children) || - root == &cgrp_dfl_root) - cgroup_put(&root->cgrp); - else - percpu_ref_kill(&root->cgrp.self.refcnt); - - kernfs_kill_sb(sb); -} - -static struct file_system_type cgroup_fs_type = { - .name = "cgroup", - .mount = cgroup_mount, - .kill_sb = cgroup_kill_sb, - .fs_flags = FS_USERNS_MOUNT, -}; - -static struct file_system_type cgroup2_fs_type = { - .name = "cgroup2", - .mount = cgroup_mount, - .kill_sb = cgroup_kill_sb, - .fs_flags = FS_USERNS_MOUNT, -}; - -static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns) -{ - struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); - - return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); -} - -int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns) -{ - int ret; - - mutex_lock(&cgroup_mutex); - spin_lock_irq(&css_set_lock); - - ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); - - spin_unlock_irq(&css_set_lock); - mutex_unlock(&cgroup_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(cgroup_path_ns); - -/** - * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy - * @task: target task - * @buf: the buffer to write the path into - * @buflen: the length of the buffer - * - * Determine @task's cgroup on the first (the one with the lowest non-zero - * hierarchy_id) cgroup hierarchy and copy its path into @buf. This - * function grabs cgroup_mutex and shouldn't be used inside locks used by - * cgroup controller callbacks. - * - * Return value is the same as kernfs_path(). - */ -int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) -{ - struct cgroup_root *root; - struct cgroup *cgrp; - int hierarchy_id = 1; - int ret; - - mutex_lock(&cgroup_mutex); - spin_lock_irq(&css_set_lock); - - root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); - - if (root) { - cgrp = task_cgroup_from_root(task, root); - ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); - } else { - /* if no hierarchy exists, everyone is in "/" */ - ret = strlcpy(buf, "/", buflen); - } - - spin_unlock_irq(&css_set_lock); - mutex_unlock(&cgroup_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(task_cgroup_path); - -/* used to track tasks and other necessary states during migration */ -struct cgroup_taskset { - /* the src and dst cset list running through cset->mg_node */ - struct list_head src_csets; - struct list_head dst_csets; - - /* the subsys currently being processed */ - int ssid; - - /* - * Fields for cgroup_taskset_*() iteration. - * - * Before migration is committed, the target migration tasks are on - * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of - * the csets on ->dst_csets. ->csets point to either ->src_csets - * or ->dst_csets depending on whether migration is committed. - * - * ->cur_csets and ->cur_task point to the current task position - * during iteration. - */ - struct list_head *csets; - struct css_set *cur_cset; - struct task_struct *cur_task; -}; - -#define CGROUP_TASKSET_INIT(tset) (struct cgroup_taskset){ \ - .src_csets = LIST_HEAD_INIT(tset.src_csets), \ - .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \ - .csets = &tset.src_csets, \ -} - -/** - * cgroup_taskset_add - try to add a migration target task to a taskset - * @task: target task - * @tset: target taskset - * - * Add @task, which is a migration target, to @tset. This function becomes - * noop if @task doesn't need to be migrated. 
@task's css_set should have - * been added as a migration source and @task->cg_list will be moved from - * the css_set's tasks list to mg_tasks one. - */ -static void cgroup_taskset_add(struct task_struct *task, - struct cgroup_taskset *tset) -{ - struct css_set *cset; - - lockdep_assert_held(&css_set_lock); - - /* @task either already exited or can't exit until the end */ - if (task->flags & PF_EXITING) - return; - - /* leave @task alone if post_fork() hasn't linked it yet */ - if (list_empty(&task->cg_list)) - return; - - cset = task_css_set(task); - if (!cset->mg_src_cgrp) - return; - - list_move_tail(&task->cg_list, &cset->mg_tasks); - if (list_empty(&cset->mg_node)) - list_add_tail(&cset->mg_node, &tset->src_csets); - if (list_empty(&cset->mg_dst_cset->mg_node)) - list_move_tail(&cset->mg_dst_cset->mg_node, - &tset->dst_csets); -} - -/** - * cgroup_taskset_first - reset taskset and return the first task - * @tset: taskset of interest - * @dst_cssp: output variable for the destination css - * - * @tset iteration is initialized and the first task is returned. - */ -struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, - struct cgroup_subsys_state **dst_cssp) -{ - tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node); - tset->cur_task = NULL; - - return cgroup_taskset_next(tset, dst_cssp); -} - -/** - * cgroup_taskset_next - iterate to the next task in taskset - * @tset: taskset of interest - * @dst_cssp: output variable for the destination css - * - * Return the next task in @tset. Iteration must have been initialized - * with cgroup_taskset_first(). - */ -struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, - struct cgroup_subsys_state **dst_cssp) -{ - struct css_set *cset = tset->cur_cset; - struct task_struct *task = tset->cur_task; - - while (&cset->mg_node != tset->csets) { - if (!task) - task = list_first_entry(&cset->mg_tasks, - struct task_struct, cg_list); - else - task = list_next_entry(task, cg_list); - - if (&task->cg_list != &cset->mg_tasks) { - tset->cur_cset = cset; - tset->cur_task = task; - - /* - * This function may be called both before and - * after cgroup_taskset_migrate(). The two cases - * can be distinguished by looking at whether @cset - * has its ->mg_dst_cset set. - */ - if (cset->mg_dst_cset) - *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid]; - else - *dst_cssp = cset->subsys[tset->ssid]; - - return task; - } - - cset = list_next_entry(cset, mg_node); - task = NULL; - } - - return NULL; -} - -/** - * cgroup_taskset_migrate - migrate a taskset - * @tset: taget taskset - * @root: cgroup root the migration is taking place on - * - * Migrate tasks in @tset as setup by migration preparation functions. - * This function fails iff one of the ->can_attach callbacks fails and - * guarantees that either all or none of the tasks in @tset are migrated. - * @tset is consumed regardless of success. 
- */ -static int cgroup_taskset_migrate(struct cgroup_taskset *tset, - struct cgroup_root *root) -{ - struct cgroup_subsys *ss; - struct task_struct *task, *tmp_task; - struct css_set *cset, *tmp_cset; - int ssid, failed_ssid, ret; - - /* methods shouldn't be called if no task is actually migrating */ - if (list_empty(&tset->src_csets)) - return 0; - - /* check that we can legitimately attach to the cgroup */ - do_each_subsys_mask(ss, ssid, root->subsys_mask) { - if (ss->can_attach) { - tset->ssid = ssid; - ret = ss->can_attach(tset); - if (ret) { - failed_ssid = ssid; - goto out_cancel_attach; - } - } - } while_each_subsys_mask(); - - /* - * Now that we're guaranteed success, proceed to move all tasks to - * the new cgroup. There are no failure cases after here, so this - * is the commit point. - */ - spin_lock_irq(&css_set_lock); - list_for_each_entry(cset, &tset->src_csets, mg_node) { - list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) { - struct css_set *from_cset = task_css_set(task); - struct css_set *to_cset = cset->mg_dst_cset; - - get_css_set(to_cset); - css_set_move_task(task, from_cset, to_cset, true); - put_css_set_locked(from_cset); - } - } - spin_unlock_irq(&css_set_lock); - - /* - * Migration is committed, all target tasks are now on dst_csets. - * Nothing is sensitive to fork() after this point. Notify - * controllers that migration is complete. - */ - tset->csets = &tset->dst_csets; - - do_each_subsys_mask(ss, ssid, root->subsys_mask) { - if (ss->attach) { - tset->ssid = ssid; - ss->attach(tset); - } - } while_each_subsys_mask(); - - ret = 0; - goto out_release_tset; - -out_cancel_attach: - do_each_subsys_mask(ss, ssid, root->subsys_mask) { - if (ssid == failed_ssid) - break; - if (ss->cancel_attach) { - tset->ssid = ssid; - ss->cancel_attach(tset); - } - } while_each_subsys_mask(); -out_release_tset: - spin_lock_irq(&css_set_lock); - list_splice_init(&tset->dst_csets, &tset->src_csets); - list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) { - list_splice_tail_init(&cset->mg_tasks, &cset->tasks); - list_del_init(&cset->mg_node); - } - spin_unlock_irq(&css_set_lock); - return ret; -} - -/** - * cgroup_may_migrate_to - verify whether a cgroup can be migration destination - * @dst_cgrp: destination cgroup to test - * - * On the default hierarchy, except for the root, subtree_control must be - * zero for migration destination cgroups with tasks so that child cgroups - * don't compete against tasks. - */ -static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) -{ - return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) || - !dst_cgrp->subtree_control; -} - -/** - * cgroup_migrate_finish - cleanup after attach - * @preloaded_csets: list of preloaded css_sets - * - * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See - * those functions for details. 
- */ -static void cgroup_migrate_finish(struct list_head *preloaded_csets) -{ - struct css_set *cset, *tmp_cset; - - lockdep_assert_held(&cgroup_mutex); - - spin_lock_irq(&css_set_lock); - list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { - cset->mg_src_cgrp = NULL; - cset->mg_dst_cgrp = NULL; - cset->mg_dst_cset = NULL; - list_del_init(&cset->mg_preload_node); - put_css_set_locked(cset); - } - spin_unlock_irq(&css_set_lock); -} - -/** - * cgroup_migrate_add_src - add a migration source css_set - * @src_cset: the source css_set to add - * @dst_cgrp: the destination cgroup - * @preloaded_csets: list of preloaded css_sets - * - * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin - * @src_cset and add it to @preloaded_csets, which should later be cleaned - * up by cgroup_migrate_finish(). - * - * This function may be called without holding cgroup_threadgroup_rwsem - * even if the target is a process. Threads may be created and destroyed - * but as long as cgroup_mutex is not dropped, no new css_set can be put - * into play and the preloaded css_sets are guaranteed to cover all - * migrations. - */ -static void cgroup_migrate_add_src(struct css_set *src_cset, - struct cgroup *dst_cgrp, - struct list_head *preloaded_csets) -{ - struct cgroup *src_cgrp; - - lockdep_assert_held(&cgroup_mutex); - lockdep_assert_held(&css_set_lock); - - /* - * If ->dead, @src_set is associated with one or more dead cgroups - * and doesn't contain any migratable tasks. Ignore it early so - * that the rest of migration path doesn't get confused by it. - */ - if (src_cset->dead) - return; - - src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); - - if (!list_empty(&src_cset->mg_preload_node)) - return; - - WARN_ON(src_cset->mg_src_cgrp); - WARN_ON(src_cset->mg_dst_cgrp); - WARN_ON(!list_empty(&src_cset->mg_tasks)); - WARN_ON(!list_empty(&src_cset->mg_node)); - - src_cset->mg_src_cgrp = src_cgrp; - src_cset->mg_dst_cgrp = dst_cgrp; - get_css_set(src_cset); - list_add(&src_cset->mg_preload_node, preloaded_csets); -} - -/** - * cgroup_migrate_prepare_dst - prepare destination css_sets for migration - * @preloaded_csets: list of preloaded source css_sets - * - * Tasks are about to be moved and all the source css_sets have been - * preloaded to @preloaded_csets. This function looks up and pins all - * destination css_sets, links each to its source, and append them to - * @preloaded_csets. - * - * This function must be called after cgroup_migrate_add_src() has been - * called on each migration source css_set. After migration is performed - * using cgroup_migrate(), cgroup_migrate_finish() must be called on - * @preloaded_csets. - */ -static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) -{ - LIST_HEAD(csets); - struct css_set *src_cset, *tmp_cset; - - lockdep_assert_held(&cgroup_mutex); - - /* look up the dst cset for each src cset and link it to src */ - list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { - struct css_set *dst_cset; - - dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); - if (!dst_cset) - goto err; - - WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); - - /* - * If src cset equals dst, it's noop. Drop the src. - * cgroup_migrate() will skip the cset too. Note that we - * can't handle src == dst as some nodes are used by both. 
- */ - if (src_cset == dst_cset) { - src_cset->mg_src_cgrp = NULL; - src_cset->mg_dst_cgrp = NULL; - list_del_init(&src_cset->mg_preload_node); - put_css_set(src_cset); - put_css_set(dst_cset); - continue; - } - - src_cset->mg_dst_cset = dst_cset; - - if (list_empty(&dst_cset->mg_preload_node)) - list_add(&dst_cset->mg_preload_node, &csets); - else - put_css_set(dst_cset); - } - - list_splice_tail(&csets, preloaded_csets); - return 0; -err: - cgroup_migrate_finish(&csets); - return -ENOMEM; -} - -/** - * cgroup_migrate - migrate a process or task to a cgroup - * @leader: the leader of the process or the task to migrate - * @threadgroup: whether @leader points to the whole process or a single task - * @root: cgroup root migration is taking place on - * - * Migrate a process or task denoted by @leader. If migrating a process, - * the caller must be holding cgroup_threadgroup_rwsem. The caller is also - * responsible for invoking cgroup_migrate_add_src() and - * cgroup_migrate_prepare_dst() on the targets before invoking this - * function and following up with cgroup_migrate_finish(). - * - * As long as a controller's ->can_attach() doesn't fail, this function is - * guaranteed to succeed. This means that, excluding ->can_attach() - * failure, when migrating multiple targets, the success or failure can be - * decided for all targets by invoking group_migrate_prepare_dst() before - * actually starting migrating. - */ -static int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup_root *root) -{ - struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); - struct task_struct *task; - - /* - * Prevent freeing of tasks while we take a snapshot. Tasks that are - * already PF_EXITING could be freed from underneath us unless we - * take an rcu_read_lock. - */ - spin_lock_irq(&css_set_lock); - rcu_read_lock(); - task = leader; - do { - cgroup_taskset_add(task, &tset); - if (!threadgroup) - break; - } while_each_thread(leader, task); - rcu_read_unlock(); - spin_unlock_irq(&css_set_lock); - - return cgroup_taskset_migrate(&tset, root); -} - -/** - * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup - * @dst_cgrp: the cgroup to attach to - * @leader: the task or the leader of the threadgroup to be attached - * @threadgroup: attach the whole threadgroup? - * - * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. 
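cgroup_migrate() above (and cgroup_attach_task() below) walk the target with the same do/while_each_thread() pattern: visit the leader, then keep going around the thread list only if @threadgroup is set. Modeled with a circular list in userspace:

	#include <stdio.h>
	#include <stdbool.h>

	struct task { const char *comm; struct task *next; };

	static void walk(struct task *leader, bool threadgroup)
	{
		struct task *t = leader;

		do {
			printf("visit %s\n", t->comm);
			if (!threadgroup)
				break;
			t = t->next;
		} while (t != leader);
	}

	int main(void)
	{
		struct task t3 = { "t3", NULL }, t2 = { "t2", &t3 }, l = { "leader", &t2 };

		t3.next = &l;		/* threads form a circle headed by the leader */
		walk(&l, true);		/* leader, t2, t3 */
		walk(&l, false);	/* leader only */
		return 0;
	}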
- */ -static int cgroup_attach_task(struct cgroup *dst_cgrp, - struct task_struct *leader, bool threadgroup) -{ - LIST_HEAD(preloaded_csets); - struct task_struct *task; - int ret; - - if (!cgroup_may_migrate_to(dst_cgrp)) - return -EBUSY; - - /* look up all src csets */ - spin_lock_irq(&css_set_lock); - rcu_read_lock(); - task = leader; - do { - cgroup_migrate_add_src(task_css_set(task), dst_cgrp, - &preloaded_csets); - if (!threadgroup) - break; - } while_each_thread(leader, task); - rcu_read_unlock(); - spin_unlock_irq(&css_set_lock); - - /* prepare dst csets and commit */ - ret = cgroup_migrate_prepare_dst(&preloaded_csets); - if (!ret) - ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); - - cgroup_migrate_finish(&preloaded_csets); - - if (!ret) - trace_cgroup_attach_task(dst_cgrp, leader, threadgroup); - - return ret; -} - -static int cgroup_procs_write_permission(struct task_struct *task, - struct cgroup *dst_cgrp, - struct kernfs_open_file *of) -{ - const struct cred *cred = current_cred(); - const struct cred *tcred = get_task_cred(task); - int ret = 0; - - /* - * even if we're attaching all tasks in the thread group, we only - * need to check permissions on one of them. - */ - if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && - !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) - ret = -EACCES; - - if (!ret && cgroup_on_dfl(dst_cgrp)) { - struct super_block *sb = of->file->f_path.dentry->d_sb; - struct cgroup *cgrp; - struct inode *inode; - - spin_lock_irq(&css_set_lock); - cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); - spin_unlock_irq(&css_set_lock); - - while (!cgroup_is_descendant(dst_cgrp, cgrp)) - cgrp = cgroup_parent(cgrp); - - ret = -ENOMEM; - inode = kernfs_get_inode(sb, cgrp->procs_file.kn); - if (inode) { - ret = inode_permission(inode, MAY_WRITE); - iput(inode); - } - } - - put_cred(tcred); - return ret; -} - -/* - * Find the task_struct of the task to attach by vpid and pass it along to the - * function to attach either it or all tasks in its threadgroup. Will lock - * cgroup_mutex and threadgroup. - */ -static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off, bool threadgroup) -{ - struct task_struct *tsk; - struct cgroup_subsys *ss; - struct cgroup *cgrp; - pid_t pid; - int ssid, ret; - - if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) - return -EINVAL; - - cgrp = cgroup_kn_lock_live(of->kn, false); - if (!cgrp) - return -ENODEV; - - percpu_down_write(&cgroup_threadgroup_rwsem); - rcu_read_lock(); - if (pid) { - tsk = find_task_by_vpid(pid); - if (!tsk) { - ret = -ESRCH; - goto out_unlock_rcu; - } - } else { - tsk = current; - } - - if (threadgroup) - tsk = tsk->group_leader; - - /* - * Workqueue threads may acquire PF_NO_SETAFFINITY and become - * trapped in a cpuset, or RT worker may be born in a cgroup - * with no rt_runtime allocated. Just say no. 
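The credential test in cgroup_procs_write_permission() above boils down to: the writer's effective UID must be root or match the target task's uid or suid (on the default hierarchy it additionally demands write access to a common ancestor's cgroup.procs file). The UID rule by itself, reduced to plain integers:

	#include <stdio.h>
	#include <stdbool.h>

	static bool may_move(unsigned euid, unsigned tuid, unsigned tsuid)
	{
		return euid == 0 || euid == tuid || euid == tsuid;
	}

	int main(void)
	{
		printf("%d %d %d\n",
		       may_move(0, 1000, 1000),		/* root: yes       */
		       may_move(1000, 1000, 1000),	/* same user: yes  */
		       may_move(1000, 2000, 2000));	/* other user: no  */
		return 0;
	}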
- */ - if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { - ret = -EINVAL; - goto out_unlock_rcu; - } - - get_task_struct(tsk); - rcu_read_unlock(); - - ret = cgroup_procs_write_permission(tsk, cgrp, of); - if (!ret) - ret = cgroup_attach_task(cgrp, tsk, threadgroup); - - put_task_struct(tsk); - goto out_unlock_threadgroup; - -out_unlock_rcu: - rcu_read_unlock(); -out_unlock_threadgroup: - percpu_up_write(&cgroup_threadgroup_rwsem); - for_each_subsys(ss, ssid) - if (ss->post_attach) - ss->post_attach(); - cgroup_kn_unlock(of->kn); - return ret ?: nbytes; -} - -/** - * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' - * @from: attach to all cgroups of a given task - * @tsk: the task to be attached - */ -int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) -{ - struct cgroup_root *root; - int retval = 0; - - mutex_lock(&cgroup_mutex); - percpu_down_write(&cgroup_threadgroup_rwsem); - for_each_root(root) { - struct cgroup *from_cgrp; - - if (root == &cgrp_dfl_root) - continue; - - spin_lock_irq(&css_set_lock); - from_cgrp = task_cgroup_from_root(from, root); - spin_unlock_irq(&css_set_lock); - - retval = cgroup_attach_task(from_cgrp, tsk, false); - if (retval) - break; - } - percpu_up_write(&cgroup_threadgroup_rwsem); - mutex_unlock(&cgroup_mutex); - - return retval; -} -EXPORT_SYMBOL_GPL(cgroup_attach_task_all); - -static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - return __cgroup_procs_write(of, buf, nbytes, off, false); -} - -static ssize_t cgroup_procs_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - return __cgroup_procs_write(of, buf, nbytes, off, true); -} - -static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct cgroup *cgrp; - - BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); - - cgrp = cgroup_kn_lock_live(of->kn, false); - if (!cgrp) - return -ENODEV; - spin_lock(&release_agent_path_lock); - strlcpy(cgrp->root->release_agent_path, strstrip(buf), - sizeof(cgrp->root->release_agent_path)); - spin_unlock(&release_agent_path_lock); - cgroup_kn_unlock(of->kn); - return nbytes; -} - -static int cgroup_release_agent_show(struct seq_file *seq, void *v) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - - spin_lock(&release_agent_path_lock); - seq_puts(seq, cgrp->root->release_agent_path); - spin_unlock(&release_agent_path_lock); - seq_putc(seq, '\n'); - return 0; -} - -static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) -{ - seq_puts(seq, "0\n"); - return 0; -} - -static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) -{ - struct cgroup_subsys *ss; - bool printed = false; - int ssid; - - do_each_subsys_mask(ss, ssid, ss_mask) { - if (printed) - seq_putc(seq, ' '); - seq_printf(seq, "%s", ss->name); - printed = true; - } while_each_subsys_mask(); - if (printed) - seq_putc(seq, '\n'); -} - -/* show controllers which are enabled from the parent */ -static int cgroup_controllers_show(struct seq_file *seq, void *v) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - - cgroup_print_ss_mask(seq, cgroup_control(cgrp)); - return 0; -} - -/* show controllers which are enabled for a given cgroup's children */ -static int cgroup_subtree_control_show(struct seq_file *seq, void *v) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - - cgroup_print_ss_mask(seq, cgrp->subtree_control); - return 0; -} - -/** - * cgroup_update_dfl_csses - update css 
assoc of a subtree in default hierarchy - * @cgrp: root of the subtree to update csses for - * - * @cgrp's control masks have changed and its subtree's css associations - * need to be updated accordingly. This function looks up all css_sets - * which are attached to the subtree, creates the matching updated css_sets - * and migrates the tasks to the new ones. - */ -static int cgroup_update_dfl_csses(struct cgroup *cgrp) -{ - LIST_HEAD(preloaded_csets); - struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); - struct cgroup_subsys_state *d_css; - struct cgroup *dsct; - struct css_set *src_cset; - int ret; - - lockdep_assert_held(&cgroup_mutex); - - percpu_down_write(&cgroup_threadgroup_rwsem); - - /* look up all csses currently attached to @cgrp's subtree */ - spin_lock_irq(&css_set_lock); - cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { - struct cgrp_cset_link *link; - - list_for_each_entry(link, &dsct->cset_links, cset_link) - cgroup_migrate_add_src(link->cset, dsct, - &preloaded_csets); - } - spin_unlock_irq(&css_set_lock); - - /* NULL dst indicates self on default hierarchy */ - ret = cgroup_migrate_prepare_dst(&preloaded_csets); - if (ret) - goto out_finish; - - spin_lock_irq(&css_set_lock); - list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { - struct task_struct *task, *ntask; - - /* src_csets precede dst_csets, break on the first dst_cset */ - if (!src_cset->mg_src_cgrp) - break; - - /* all tasks in src_csets need to be migrated */ - list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) - cgroup_taskset_add(task, &tset); - } - spin_unlock_irq(&css_set_lock); - - ret = cgroup_taskset_migrate(&tset, cgrp->root); -out_finish: - cgroup_migrate_finish(&preloaded_csets); - percpu_up_write(&cgroup_threadgroup_rwsem); - return ret; -} - -/** - * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses - * @cgrp: root of the target subtree - * - * Because css offlining is asynchronous, userland may try to re-enable a - * controller while the previous css is still around. This function grabs - * cgroup_mutex and drains the previous css instances of @cgrp's subtree. - */ -static void cgroup_lock_and_drain_offline(struct cgroup *cgrp) - __acquires(&cgroup_mutex) -{ - struct cgroup *dsct; - struct cgroup_subsys_state *d_css; - struct cgroup_subsys *ss; - int ssid; - -restart: - mutex_lock(&cgroup_mutex); - - cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { - for_each_subsys(ss, ssid) { - struct cgroup_subsys_state *css = cgroup_css(dsct, ss); - DEFINE_WAIT(wait); - - if (!css || !percpu_ref_is_dying(&css->refcnt)) - continue; - - cgroup_get(dsct); - prepare_to_wait(&dsct->offline_waitq, &wait, - TASK_UNINTERRUPTIBLE); - - mutex_unlock(&cgroup_mutex); - schedule(); - finish_wait(&dsct->offline_waitq, &wait); - - cgroup_put(dsct); - goto restart; - } - } -} - -/** - * cgroup_save_control - save control masks of a subtree - * @cgrp: root of the target subtree - * - * Save ->subtree_control and ->subtree_ss_mask to the respective old_ - * prefixed fields for @cgrp's subtree including @cgrp itself. 
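The save/restore pair described above (and its restore counterpart further down) just stashes two fields per cgroup so a failed control-mask update can be rolled back; reduced to a single node:

	#include <stdint.h>
	#include <stdio.h>

	struct cg {
		uint16_t subtree_control, subtree_ss_mask;
		uint16_t old_subtree_control, old_subtree_ss_mask;
	};

	static void save_control(struct cg *c)
	{
		c->old_subtree_control = c->subtree_control;
		c->old_subtree_ss_mask = c->subtree_ss_mask;
	}

	static void restore_control(struct cg *c)
	{
		c->subtree_control = c->old_subtree_control;
		c->subtree_ss_mask = c->old_subtree_ss_mask;
	}

	int main(void)
	{
		struct cg c = { .subtree_control = 0x3, .subtree_ss_mask = 0x7 };

		save_control(&c);
		c.subtree_control |= 0x8;	/* attempted update...       */
		restore_control(&c);		/* ...rolled back on failure */
		printf("%#x\n", c.subtree_control);	/* 0x3 */
		return 0;
	}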
- */ -static void cgroup_save_control(struct cgroup *cgrp) -{ - struct cgroup *dsct; - struct cgroup_subsys_state *d_css; - - cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { - dsct->old_subtree_control = dsct->subtree_control; - dsct->old_subtree_ss_mask = dsct->subtree_ss_mask; - } -} - -/** - * cgroup_propagate_control - refresh control masks of a subtree - * @cgrp: root of the target subtree - * - * For @cgrp and its subtree, ensure ->subtree_ss_mask matches - * ->subtree_control and propagate controller availability through the - * subtree so that descendants don't have unavailable controllers enabled. - */ -static void cgroup_propagate_control(struct cgroup *cgrp) -{ - struct cgroup *dsct; - struct cgroup_subsys_state *d_css; - - cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { - dsct->subtree_control &= cgroup_control(dsct); - dsct->subtree_ss_mask = - cgroup_calc_subtree_ss_mask(dsct->subtree_control, - cgroup_ss_mask(dsct)); - } -} - -/** - * cgroup_restore_control - restore control masks of a subtree - * @cgrp: root of the target subtree - * - * Restore ->subtree_control and ->subtree_ss_mask from the respective old_ - * prefixed fields for @cgrp's subtree including @cgrp itself. - */ -static void cgroup_restore_control(struct cgroup *cgrp) -{ - struct cgroup *dsct; - struct cgroup_subsys_state *d_css; - - cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { - dsct->subtree_control = dsct->old_subtree_control; - dsct->subtree_ss_mask = dsct->old_subtree_ss_mask; - } -} - -static bool css_visible(struct cgroup_subsys_state *css) -{ - struct cgroup_subsys *ss = css->ss; - struct cgroup *cgrp = css->cgroup; - - if (cgroup_control(cgrp) & (1 << ss->id)) - return true; - if (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) - return false; - return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl; -} - -/** - * cgroup_apply_control_enable - enable or show csses according to control - * @cgrp: root of the target subtree - * - * Walk @cgrp's subtree and create new csses or make the existing ones - * visible. A css is created invisible if it's being implicitly enabled - * through dependency. An invisible css is made visible when the userland - * explicitly enables it. - * - * Returns 0 on success, -errno on failure. On failure, csses which have - * been processed already aren't cleaned up. The caller is responsible for - * cleaning up with cgroup_apply_control_disble(). - */ -static int cgroup_apply_control_enable(struct cgroup *cgrp) -{ - struct cgroup *dsct; - struct cgroup_subsys_state *d_css; - struct cgroup_subsys *ss; - int ssid, ret; - - cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { - for_each_subsys(ss, ssid) { - struct cgroup_subsys_state *css = cgroup_css(dsct, ss); - - WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt)); - - if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) - continue; - - if (!css) { - css = css_create(dsct, ss); - if (IS_ERR(css)) - return PTR_ERR(css); - } - - if (css_visible(css)) { - ret = css_populate_dir(css); - if (ret) - return ret; - } - } - } - - return 0; -} - -/** - * cgroup_apply_control_disable - kill or hide csses according to control - * @cgrp: root of the target subtree - * - * Walk @cgrp's subtree and kill and hide csses so that they match - * cgroup_ss_mask() and cgroup_visible_mask(). - * - * A css is hidden when the userland requests it to be disabled while other - * subsystems are still depending on it. The css must not actively control - * resources and be in the vanilla state if it's made visible again later. 
- * Controllers which may be depended upon should provide ->css_reset() for - * this purpose. - */ -static void cgroup_apply_control_disable(struct cgroup *cgrp) -{ - struct cgroup *dsct; - struct cgroup_subsys_state *d_css; - struct cgroup_subsys *ss; - int ssid; - - cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { - for_each_subsys(ss, ssid) { - struct cgroup_subsys_state *css = cgroup_css(dsct, ss); - - WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt)); - - if (!css) - continue; - - if (css->parent && - !(cgroup_ss_mask(dsct) & (1 << ss->id))) { - kill_css(css); - } else if (!css_visible(css)) { - css_clear_dir(css); - if (ss->css_reset) - ss->css_reset(css); - } - } - } -} - -/** - * cgroup_apply_control - apply control mask updates to the subtree - * @cgrp: root of the target subtree - * - * subsystems can be enabled and disabled in a subtree using the following - * steps. - * - * 1. Call cgroup_save_control() to stash the current state. - * 2. Update ->subtree_control masks in the subtree as desired. - * 3. Call cgroup_apply_control() to apply the changes. - * 4. Optionally perform other related operations. - * 5. Call cgroup_finalize_control() to finish up. - * - * This function implements step 3 and propagates the mask changes - * throughout @cgrp's subtree, updates csses accordingly and perform - * process migrations. - */ -static int cgroup_apply_control(struct cgroup *cgrp) -{ - int ret; - - cgroup_propagate_control(cgrp); - - ret = cgroup_apply_control_enable(cgrp); - if (ret) - return ret; - - /* - * At this point, cgroup_e_css() results reflect the new csses - * making the following cgroup_update_dfl_csses() properly update - * css associations of all tasks in the subtree. - */ - ret = cgroup_update_dfl_csses(cgrp); - if (ret) - return ret; - - return 0; -} - -/** - * cgroup_finalize_control - finalize control mask update - * @cgrp: root of the target subtree - * @ret: the result of the update - * - * Finalize control mask update. See cgroup_apply_control() for more info. - */ -static void cgroup_finalize_control(struct cgroup *cgrp, int ret) -{ - if (ret) { - cgroup_restore_control(cgrp); - cgroup_propagate_control(cgrp); - } - - cgroup_apply_control_disable(cgrp); -} - -/* change the enabled child controllers for a cgroup in the default hierarchy */ -static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, - loff_t off) -{ - u16 enable = 0, disable = 0; - struct cgroup *cgrp, *child; - struct cgroup_subsys *ss; - char *tok; - int ssid, ret; - - /* - * Parse input - space separated list of subsystem names prefixed - * with either + or -. 
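Steps 1-5 from the cgroup_apply_control() comment above map one-to-one onto the body of cgroup_subtree_control_write() once parsing is done. Condensed into a kernel-style sketch (not a complete handler: locking, validation, and kernfs_activate() are elided):

	/* condensed restatement of steps 1-5 above, using the helpers
	 * defined in this file; illustrative, not a drop-in function */
	static int update_subtree_control(struct cgroup *cgrp, u16 enable, u16 disable)
	{
		int ret;

		cgroup_save_control(cgrp);		/* 1. stash the current state */
		cgrp->subtree_control |= enable;	/* 2. update the masks        */
		cgrp->subtree_control &= ~disable;
		ret = cgroup_apply_control(cgrp);	/* 3. apply the changes       */
							/* 4. related operations here */
		cgroup_finalize_control(cgrp, ret);	/* 5. finish up / roll back   */
		return ret;
	}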
- */ - buf = strstrip(buf); - while ((tok = strsep(&buf, " "))) { - if (tok[0] == '\0') - continue; - do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) { - if (!cgroup_ssid_enabled(ssid) || - strcmp(tok + 1, ss->name)) - continue; - - if (*tok == '+') { - enable |= 1 << ssid; - disable &= ~(1 << ssid); - } else if (*tok == '-') { - disable |= 1 << ssid; - enable &= ~(1 << ssid); - } else { - return -EINVAL; - } - break; - } while_each_subsys_mask(); - if (ssid == CGROUP_SUBSYS_COUNT) - return -EINVAL; - } - - cgrp = cgroup_kn_lock_live(of->kn, true); - if (!cgrp) - return -ENODEV; - - for_each_subsys(ss, ssid) { - if (enable & (1 << ssid)) { - if (cgrp->subtree_control & (1 << ssid)) { - enable &= ~(1 << ssid); - continue; - } - - if (!(cgroup_control(cgrp) & (1 << ssid))) { - ret = -ENOENT; - goto out_unlock; - } - } else if (disable & (1 << ssid)) { - if (!(cgrp->subtree_control & (1 << ssid))) { - disable &= ~(1 << ssid); - continue; - } - - /* a child has it enabled? */ - cgroup_for_each_live_child(child, cgrp) { - if (child->subtree_control & (1 << ssid)) { - ret = -EBUSY; - goto out_unlock; - } - } - } - } - - if (!enable && !disable) { - ret = 0; - goto out_unlock; - } - - /* - * Except for the root, subtree_control must be zero for a cgroup - * with tasks so that child cgroups don't compete against tasks. - */ - if (enable && cgroup_parent(cgrp)) { - struct cgrp_cset_link *link; - - /* - * Because namespaces pin csets too, @cgrp->cset_links - * might not be empty even when @cgrp is empty. Walk and - * verify each cset. - */ - spin_lock_irq(&css_set_lock); - - ret = 0; - list_for_each_entry(link, &cgrp->cset_links, cset_link) { - if (css_set_populated(link->cset)) { - ret = -EBUSY; - break; - } - } - - spin_unlock_irq(&css_set_lock); - - if (ret) - goto out_unlock; - } - - /* save and update control masks and prepare csses */ - cgroup_save_control(cgrp); - - cgrp->subtree_control |= enable; - cgrp->subtree_control &= ~disable; - - ret = cgroup_apply_control(cgrp); - - cgroup_finalize_control(cgrp, ret); - - kernfs_activate(cgrp->kn); - ret = 0; -out_unlock: - cgroup_kn_unlock(of->kn); - return ret ?: nbytes; -} - -static int cgroup_events_show(struct seq_file *seq, void *v) -{ - seq_printf(seq, "populated %d\n", - cgroup_is_populated(seq_css(seq)->cgroup)); - return 0; -} - -static int cgroup_file_open(struct kernfs_open_file *of) -{ - struct cftype *cft = of->kn->priv; - - if (cft->open) - return cft->open(of); - return 0; -} - -static void cgroup_file_release(struct kernfs_open_file *of) -{ - struct cftype *cft = of->kn->priv; - - if (cft->release) - cft->release(of); -} - -static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off) -{ - struct cgroup *cgrp = of->kn->parent->priv; - struct cftype *cft = of->kn->priv; - struct cgroup_subsys_state *css; - int ret; - - if (cft->write) - return cft->write(of, buf, nbytes, off); - - /* - * kernfs guarantees that a file isn't deleted with operations in - * flight, which means that the matching css is and stays alive and - * doesn't need to be pinned. The RCU locking is not necessary - * either. It's just for the convenience of using cgroup_css(). 
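cgroup_file_open()/cgroup_file_release() above simply forward to the new cftype hooks (see the cftype additions earlier in this series). A hypothetical controller-side user in kernel style, showing per-open state hung off of->priv; every name here is invented for illustration and assumes the usual kernel includes:

	struct dummy_state { int cursor; };	/* invented per-open state */

	static int dummy_open(struct kernfs_open_file *of)
	{
		of->priv = kzalloc(sizeof(struct dummy_state), GFP_KERNEL);
		return of->priv ? 0 : -ENOMEM;
	}

	static void dummy_release(struct kernfs_open_file *of)
	{
		kfree(of->priv);	/* paired teardown, even on early close */
	}

	static int dummy_show(struct seq_file *m, void *v)
	{
		struct kernfs_open_file *of = m->private;
		struct dummy_state *st = of->priv;

		seq_printf(m, "%d\n", st->cursor);
		return 0;
	}

	static struct cftype dummy_files[] = {
		{
			.name = "dummy.state",
			.open = dummy_open,
			.release = dummy_release,
			.seq_show = dummy_show,
		},
		{ }	/* zero-length name terminates the array */
	};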
- */ - rcu_read_lock(); - css = cgroup_css(cgrp, cft->ss); - rcu_read_unlock(); - - if (cft->write_u64) { - unsigned long long v; - ret = kstrtoull(buf, 0, &v); - if (!ret) - ret = cft->write_u64(css, cft, v); - } else if (cft->write_s64) { - long long v; - ret = kstrtoll(buf, 0, &v); - if (!ret) - ret = cft->write_s64(css, cft, v); - } else { - ret = -EINVAL; - } - - return ret ?: nbytes; -} - -static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) -{ - return seq_cft(seq)->seq_start(seq, ppos); -} - -static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) -{ - return seq_cft(seq)->seq_next(seq, v, ppos); -} - -static void cgroup_seqfile_stop(struct seq_file *seq, void *v) -{ - if (seq_cft(seq)->seq_stop) - seq_cft(seq)->seq_stop(seq, v); -} - -static int cgroup_seqfile_show(struct seq_file *m, void *arg) -{ - struct cftype *cft = seq_cft(m); - struct cgroup_subsys_state *css = seq_css(m); - - if (cft->seq_show) - return cft->seq_show(m, arg); - - if (cft->read_u64) - seq_printf(m, "%llu\n", cft->read_u64(css, cft)); - else if (cft->read_s64) - seq_printf(m, "%lld\n", cft->read_s64(css, cft)); - else - return -EINVAL; - return 0; -} - -static struct kernfs_ops cgroup_kf_single_ops = { - .atomic_write_len = PAGE_SIZE, - .open = cgroup_file_open, - .release = cgroup_file_release, - .write = cgroup_file_write, - .seq_show = cgroup_seqfile_show, -}; - -static struct kernfs_ops cgroup_kf_ops = { - .atomic_write_len = PAGE_SIZE, - .open = cgroup_file_open, - .release = cgroup_file_release, - .write = cgroup_file_write, - .seq_start = cgroup_seqfile_start, - .seq_next = cgroup_seqfile_next, - .seq_stop = cgroup_seqfile_stop, - .seq_show = cgroup_seqfile_show, -}; - -/* - * cgroup_rename - Only allow simple rename of directories in place. - */ -static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, - const char *new_name_str) -{ - struct cgroup *cgrp = kn->priv; - int ret; - - if (kernfs_type(kn) != KERNFS_DIR) - return -ENOTDIR; - if (kn->parent != new_parent) - return -EIO; - - /* - * This isn't a proper migration and its usefulness is very - * limited. Disallow on the default hierarchy. - */ - if (cgroup_on_dfl(cgrp)) - return -EPERM; - - /* - * We're gonna grab cgroup_mutex which nests outside kernfs - * active_ref. kernfs_rename() doesn't require active_ref - * protection. Break them before grabbing cgroup_mutex. 
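The write_u64/write_s64 fallback in cgroup_file_write() above is a parse-then-dispatch on the value type. A userspace model of the unsigned path, using strtoull in place of kstrtoull:

	#include <stdio.h>
	#include <stdlib.h>
	#include <errno.h>

	static int write_u64(unsigned long long v)
	{
		printf("u64=%llu\n", v);
		return 0;
	}

	static int file_write(const char *buf)
	{
		unsigned long long v;
		char *end;

		errno = 0;
		v = strtoull(buf, &end, 0);
		if (errno || *end)
			return -EINVAL;		/* mirrors kstrtoull() rejecting junk */
		return write_u64(v);
	}

	int main(void)
	{
		return file_write("42");
	}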
- */ - kernfs_break_active_protection(new_parent); - kernfs_break_active_protection(kn); - - mutex_lock(&cgroup_mutex); - - ret = kernfs_rename(kn, new_parent, new_name_str); - if (!ret) - trace_cgroup_rename(cgrp); - - mutex_unlock(&cgroup_mutex); - - kernfs_unbreak_active_protection(kn); - kernfs_unbreak_active_protection(new_parent); - return ret; -} - -/* set uid and gid of cgroup dirs and files to that of the creator */ -static int cgroup_kn_set_ugid(struct kernfs_node *kn) -{ - struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, - .ia_uid = current_fsuid(), - .ia_gid = current_fsgid(), }; - - if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && - gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) - return 0; - - return kernfs_setattr(kn, &iattr); -} - -static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, - struct cftype *cft) -{ - char name[CGROUP_FILE_NAME_MAX]; - struct kernfs_node *kn; - struct lock_class_key *key = NULL; - int ret; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - key = &cft->lockdep_key; -#endif - kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), - cgroup_file_mode(cft), 0, cft->kf_ops, cft, - NULL, key); - if (IS_ERR(kn)) - return PTR_ERR(kn); - - ret = cgroup_kn_set_ugid(kn); - if (ret) { - kernfs_remove(kn); - return ret; - } - - if (cft->file_offset) { - struct cgroup_file *cfile = (void *)css + cft->file_offset; - - spin_lock_irq(&cgroup_file_kn_lock); - cfile->kn = kn; - spin_unlock_irq(&cgroup_file_kn_lock); - } - - return 0; -} - -/** - * cgroup_addrm_files - add or remove files to a cgroup directory - * @css: the target css - * @cgrp: the target cgroup (usually css->cgroup) - * @cfts: array of cftypes to be added - * @is_add: whether to add or remove - * - * Depending on @is_add, add or remove files defined by @cfts on @cgrp. - * For removals, this function never fails. - */ -static int cgroup_addrm_files(struct cgroup_subsys_state *css, - struct cgroup *cgrp, struct cftype cfts[], - bool is_add) -{ - struct cftype *cft, *cft_end = NULL; - int ret = 0; - - lockdep_assert_held(&cgroup_mutex); - -restart: - for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) { - /* does cft->flags tell us to skip this file on @cgrp? 
*/ - if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) - continue; - if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp)) - continue; - if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) - continue; - if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) - continue; - - if (is_add) { - ret = cgroup_add_file(css, cgrp, cft); - if (ret) { - pr_warn("%s: failed to add %s, err=%d\n", - __func__, cft->name, ret); - cft_end = cft; - is_add = false; - goto restart; - } - } else { - cgroup_rm_file(cgrp, cft); - } - } - return ret; -} - -static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) -{ - LIST_HEAD(pending); - struct cgroup_subsys *ss = cfts[0].ss; - struct cgroup *root = &ss->root->cgrp; - struct cgroup_subsys_state *css; - int ret = 0; - - lockdep_assert_held(&cgroup_mutex); - - /* add/rm files for all cgroups created before */ - css_for_each_descendant_pre(css, cgroup_css(root, ss)) { - struct cgroup *cgrp = css->cgroup; - - if (!(css->flags & CSS_VISIBLE)) - continue; - - ret = cgroup_addrm_files(css, cgrp, cfts, is_add); - if (ret) - break; - } - - if (is_add && !ret) - kernfs_activate(root->kn); - return ret; -} - -static void cgroup_exit_cftypes(struct cftype *cfts) -{ - struct cftype *cft; - - for (cft = cfts; cft->name[0] != '\0'; cft++) { - /* free copy for custom atomic_write_len, see init_cftypes() */ - if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) - kfree(cft->kf_ops); - cft->kf_ops = NULL; - cft->ss = NULL; - - /* revert flags set by cgroup core while adding @cfts */ - cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL); - } -} - -static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) -{ - struct cftype *cft; - - for (cft = cfts; cft->name[0] != '\0'; cft++) { - struct kernfs_ops *kf_ops; - - WARN_ON(cft->ss || cft->kf_ops); - - if (cft->seq_start) - kf_ops = &cgroup_kf_ops; - else - kf_ops = &cgroup_kf_single_ops; - - /* - * Ugh... if @cft wants a custom max_write_len, we need to - * make a copy of kf_ops to set its atomic_write_len. - */ - if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) { - kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL); - if (!kf_ops) { - cgroup_exit_cftypes(cfts); - return -ENOMEM; - } - kf_ops->atomic_write_len = cft->max_write_len; - } - - cft->kf_ops = kf_ops; - cft->ss = ss; - } - - return 0; -} - -static int cgroup_rm_cftypes_locked(struct cftype *cfts) -{ - lockdep_assert_held(&cgroup_mutex); - - if (!cfts || !cfts[0].ss) - return -ENOENT; - - list_del(&cfts->node); - cgroup_apply_cftypes(cfts, false); - cgroup_exit_cftypes(cfts); - return 0; -} - -/** - * cgroup_rm_cftypes - remove an array of cftypes from a subsystem - * @cfts: zero-length name terminated array of cftypes - * - * Unregister @cfts. Files described by @cfts are removed from all - * existing cgroups and all future cgroups won't have them either. This - * function can be called anytime whether @cfts' subsys is attached or not. - * - * Returns 0 on successful unregistration, -ENOENT if @cfts is not - * registered. - */ -int cgroup_rm_cftypes(struct cftype *cfts) -{ - int ret; - - mutex_lock(&cgroup_mutex); - ret = cgroup_rm_cftypes_locked(cfts); - mutex_unlock(&cgroup_mutex); - return ret; -} - -/** - * cgroup_add_cftypes - add an array of cftypes to a subsystem - * @ss: target cgroup subsystem - * @cfts: zero-length name terminated array of cftypes - * - * Register @cfts to @ss. 
Files described by @cfts are created for all - * existing cgroups to which @ss is attached and all future cgroups will - * have them too. This function can be called anytime whether @ss is - * attached or not. - * - * Returns 0 on successful registration, -errno on failure. Note that this - * function currently returns 0 as long as @cfts registration is successful - * even if some file creation attempts on existing cgroups fail. - */ -static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) -{ - int ret; - - if (!cgroup_ssid_enabled(ss->id)) - return 0; - - if (!cfts || cfts[0].name[0] == '\0') - return 0; - - ret = cgroup_init_cftypes(ss, cfts); - if (ret) - return ret; - - mutex_lock(&cgroup_mutex); - - list_add_tail(&cfts->node, &ss->cfts); - ret = cgroup_apply_cftypes(cfts, true); - if (ret) - cgroup_rm_cftypes_locked(cfts); - - mutex_unlock(&cgroup_mutex); - return ret; -} - -/** - * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy - * @ss: target cgroup subsystem - * @cfts: zero-length name terminated array of cftypes - * - * Similar to cgroup_add_cftypes() but the added files are only used for - * the default hierarchy. - */ -int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) -{ - struct cftype *cft; - - for (cft = cfts; cft && cft->name[0] != '\0'; cft++) - cft->flags |= __CFTYPE_ONLY_ON_DFL; - return cgroup_add_cftypes(ss, cfts); -} - -/** - * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies - * @ss: target cgroup subsystem - * @cfts: zero-length name terminated array of cftypes - * - * Similar to cgroup_add_cftypes() but the added files are only used for - * the legacy hierarchies. - */ -int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) -{ - struct cftype *cft; - - for (cft = cfts; cft && cft->name[0] != '\0'; cft++) - cft->flags |= __CFTYPE_NOT_ON_DFL; - return cgroup_add_cftypes(ss, cfts); -} - -/** - * cgroup_file_notify - generate a file modified event for a cgroup_file - * @cfile: target cgroup_file - * - * @cfile must have been obtained by setting cftype->file_offset. - */ -void cgroup_file_notify(struct cgroup_file *cfile) -{ - unsigned long flags; - - spin_lock_irqsave(&cgroup_file_kn_lock, flags); - if (cfile->kn) - kernfs_notify(cfile->kn); - spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); -} - -/** - * cgroup_task_count - count the number of tasks in a cgroup. - * @cgrp: the cgroup in question - * - * Return the number of tasks in the cgroup. The returned number can be - * higher than the actual number of tasks due to css_set references from - * namespace roots and temporary usages. - */ -static int cgroup_task_count(const struct cgroup *cgrp) -{ - int count = 0; - struct cgrp_cset_link *link; - - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &cgrp->cset_links, cset_link) - count += atomic_read(&link->cset->refcount); - spin_unlock_irq(&css_set_lock); - return count; -} - -/** - * css_next_child - find the next child of a given css - * @pos: the current position (%NULL to initiate traversal) - * @parent: css whose children to walk - * - * This function returns the next child of @parent and should be called - * under either cgroup_mutex or RCU read lock. The only requirement is - * that @parent and @pos are accessible. The next sibling is guaranteed to - * be returned regardless of their states. 
- * - * If a subsystem synchronizes ->css_online() and the start of iteration, a - * css which finished ->css_online() is guaranteed to be visible in the - * future iterations and will stay visible until the last reference is put. - * A css which hasn't finished ->css_online() or already finished - * ->css_offline() may show up during traversal. It's each subsystem's - * responsibility to synchronize against on/offlining. - */ -struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, - struct cgroup_subsys_state *parent) -{ - struct cgroup_subsys_state *next; - - cgroup_assert_mutex_or_rcu_locked(); - - /* - * @pos could already have been unlinked from the sibling list. - * Once a cgroup is removed, its ->sibling.next is no longer - * updated when its next sibling changes. CSS_RELEASED is set when - * @pos is taken off list, at which time its next pointer is valid, - * and, as releases are serialized, the one pointed to by the next - * pointer is guaranteed to not have started release yet. This - * implies that if we observe !CSS_RELEASED on @pos in this RCU - * critical section, the one pointed to by its next pointer is - * guaranteed to not have finished its RCU grace period even if we - * have dropped rcu_read_lock() inbetween iterations. - * - * If @pos has CSS_RELEASED set, its next pointer can't be - * dereferenced; however, as each css is given a monotonically - * increasing unique serial number and always appended to the - * sibling list, the next one can be found by walking the parent's - * children until the first css with higher serial number than - * @pos's. While this path can be slower, it happens iff iteration - * races against release and the race window is very small. - */ - if (!pos) { - next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling); - } else if (likely(!(pos->flags & CSS_RELEASED))) { - next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling); - } else { - list_for_each_entry_rcu(next, &parent->children, sibling) - if (next->serial_nr > pos->serial_nr) - break; - } - - /* - * @next, if not pointing to the head, can be dereferenced and is - * the next sibling. - */ - if (&next->sibling != &parent->children) - return next; - return NULL; -} - -/** - * css_next_descendant_pre - find the next descendant for pre-order walk - * @pos: the current position (%NULL to initiate traversal) - * @root: css whose descendants to walk - * - * To be used by css_for_each_descendant_pre(). Find the next descendant - * to visit for pre-order traversal of @root's descendants. @root is - * included in the iteration and the first node to be visited. - * - * While this function requires cgroup_mutex or RCU read locking, it - * doesn't require the whole traversal to be contained in a single critical - * section. This function will return the correct next descendant as long - * as both @pos and @root are accessible and @pos is a descendant of @root. - * - * If a subsystem synchronizes ->css_online() and the start of iteration, a - * css which finished ->css_online() is guaranteed to be visible in the - * future iterations and will stay visible until the last reference is put. - * A css which hasn't finished ->css_online() or already finished - * ->css_offline() may show up during traversal. It's each subsystem's - * responsibility to synchronize against on/offlining. 
- */ -struct cgroup_subsys_state * -css_next_descendant_pre(struct cgroup_subsys_state *pos, - struct cgroup_subsys_state *root) -{ - struct cgroup_subsys_state *next; - - cgroup_assert_mutex_or_rcu_locked(); - - /* if first iteration, visit @root */ - if (!pos) - return root; - - /* visit the first child if exists */ - next = css_next_child(NULL, pos); - if (next) - return next; - - /* no child, visit my or the closest ancestor's next sibling */ - while (pos != root) { - next = css_next_child(pos, pos->parent); - if (next) - return next; - pos = pos->parent; - } - - return NULL; -} - -/** - * css_rightmost_descendant - return the rightmost descendant of a css - * @pos: css of interest - * - * Return the rightmost descendant of @pos. If there's no descendant, @pos - * is returned. This can be used during pre-order traversal to skip - * subtree of @pos. - * - * While this function requires cgroup_mutex or RCU read locking, it - * doesn't require the whole traversal to be contained in a single critical - * section. This function will return the correct rightmost descendant as - * long as @pos is accessible. - */ -struct cgroup_subsys_state * -css_rightmost_descendant(struct cgroup_subsys_state *pos) -{ - struct cgroup_subsys_state *last, *tmp; - - cgroup_assert_mutex_or_rcu_locked(); - - do { - last = pos; - /* ->prev isn't RCU safe, walk ->next till the end */ - pos = NULL; - css_for_each_child(tmp, last) - pos = tmp; - } while (pos); - - return last; -} - -static struct cgroup_subsys_state * -css_leftmost_descendant(struct cgroup_subsys_state *pos) -{ - struct cgroup_subsys_state *last; - - do { - last = pos; - pos = css_next_child(NULL, pos); - } while (pos); - - return last; -} - -/** - * css_next_descendant_post - find the next descendant for post-order walk - * @pos: the current position (%NULL to initiate traversal) - * @root: css whose descendants to walk - * - * To be used by css_for_each_descendant_post(). Find the next descendant - * to visit for post-order traversal of @root's descendants. @root is - * included in the iteration and the last node to be visited. - * - * While this function requires cgroup_mutex or RCU read locking, it - * doesn't require the whole traversal to be contained in a single critical - * section. This function will return the correct next descendant as long - * as both @pos and @cgroup are accessible and @pos is a descendant of - * @cgroup. - * - * If a subsystem synchronizes ->css_online() and the start of iteration, a - * css which finished ->css_online() is guaranteed to be visible in the - * future iterations and will stay visible until the last reference is put. - * A css which hasn't finished ->css_online() or already finished - * ->css_offline() may show up during traversal. It's each subsystem's - * responsibility to synchronize against on/offlining. 
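
As a usage sketch (not part of this patch): walking a subtree with css_for_each_descendant_pre(), which is built on the pre-order iterator above and must run under cgroup_mutex or the RCU read lock; demo_walk_subtree is an invented name:

static void demo_walk_subtree(struct cgroup_subsys_state *root)
{
	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, root) {
		/* @root is visited first; per the comment above, a css
		 * seen here may not have finished ->css_online() yet */
	}
	rcu_read_unlock();
}
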
- */
-struct cgroup_subsys_state *
-css_next_descendant_post(struct cgroup_subsys_state *pos,
-			 struct cgroup_subsys_state *root)
-{
-	struct cgroup_subsys_state *next;
-
-	cgroup_assert_mutex_or_rcu_locked();
-
-	/* if first iteration, visit leftmost descendant which may be @root */
-	if (!pos)
-		return css_leftmost_descendant(root);
-
-	/* if we visited @root, we're done */
-	if (pos == root)
-		return NULL;
-
-	/* if there's an unvisited sibling, visit its leftmost descendant */
-	next = css_next_child(pos, pos->parent);
-	if (next)
-		return css_leftmost_descendant(next);
-
-	/* no sibling left, visit parent */
-	return pos->parent;
-}
-
-/**
- * css_has_online_children - does a css have online children
- * @css: the target css
- *
- * Returns %true if @css has any online children; otherwise, %false. This
- * function can be called from any context but the caller is responsible
- * for synchronizing against on/offlining as necessary.
- */
-bool css_has_online_children(struct cgroup_subsys_state *css)
-{
-	struct cgroup_subsys_state *child;
-	bool ret = false;
-
-	rcu_read_lock();
-	css_for_each_child(child, css) {
-		if (child->flags & CSS_ONLINE) {
-			ret = true;
-			break;
-		}
-	}
-	rcu_read_unlock();
-	return ret;
-}
-
-/**
- * css_task_iter_advance_css_set - advance a task iterator to the next css_set
- * @it: the iterator to advance
- *
- * Advance @it to the next css_set to walk.
- */
-static void css_task_iter_advance_css_set(struct css_task_iter *it)
-{
-	struct list_head *l = it->cset_pos;
-	struct cgrp_cset_link *link;
-	struct css_set *cset;
-
-	lockdep_assert_held(&css_set_lock);
-
-	/* Advance to the next non-empty css_set */
-	do {
-		l = l->next;
-		if (l == it->cset_head) {
-			it->cset_pos = NULL;
-			it->task_pos = NULL;
-			return;
-		}
-
-		if (it->ss) {
-			cset = container_of(l, struct css_set,
-					    e_cset_node[it->ss->id]);
-		} else {
-			link = list_entry(l, struct cgrp_cset_link, cset_link);
-			cset = link->cset;
-		}
-	} while (!css_set_populated(cset));
-
-	it->cset_pos = l;
-
-	if (!list_empty(&cset->tasks))
-		it->task_pos = cset->tasks.next;
-	else
-		it->task_pos = cset->mg_tasks.next;
-
-	it->tasks_head = &cset->tasks;
-	it->mg_tasks_head = &cset->mg_tasks;
-
-	/*
-	 * We don't keep css_sets locked across iteration steps and thus
-	 * need to take steps to ensure that iteration can be resumed after
-	 * the lock is re-acquired. Iteration is performed at two levels -
-	 * css_sets and tasks in them.
-	 *
-	 * Once created, a css_set never leaves its cgroup lists, so a
-	 * pinned css_set is guaranteed to stay put and we can resume
-	 * iteration afterwards.
-	 *
-	 * Tasks may leave @cset across iteration steps. This is resolved
-	 * by registering each iterator with the css_set currently being
-	 * walked and making css_set_move_task() advance iterators whose
-	 * next task is leaving.
-	 */
-	if (it->cur_cset) {
-		list_del(&it->iters_node);
-		put_css_set_locked(it->cur_cset);
-	}
-	get_css_set(cset);
-	it->cur_cset = cset;
-	list_add(&it->iters_node, &cset->task_iters);
-}
-
-static void css_task_iter_advance(struct css_task_iter *it)
-{
-	struct list_head *l = it->task_pos;
-
-	lockdep_assert_held(&css_set_lock);
-	WARN_ON_ONCE(!l);
-
-	/*
-	 * Advance iterator to find next entry. cset->tasks is consumed
-	 * first and then ->mg_tasks. After ->mg_tasks, we move onto the
-	 * next cset.
- */
-	l = l->next;
-
-	if (l == it->tasks_head)
-		l = it->mg_tasks_head->next;
-
-	if (l == it->mg_tasks_head)
-		css_task_iter_advance_css_set(it);
-	else
-		it->task_pos = l;
-}
-
-/**
- * css_task_iter_start - initiate task iteration
- * @css: the css to walk tasks of
- * @it: the task iterator to use
- *
- * Initiate iteration through the tasks of @css. The caller can call
- * css_task_iter_next() to walk through the tasks until the function
- * returns NULL. On completion of iteration, css_task_iter_end() must be
- * called.
- */
-void css_task_iter_start(struct cgroup_subsys_state *css,
-			 struct css_task_iter *it)
-{
-	/* no one should try to iterate before mounting cgroups */
-	WARN_ON_ONCE(!use_task_css_set_links);
-
-	memset(it, 0, sizeof(*it));
-
-	spin_lock_irq(&css_set_lock);
-
-	it->ss = css->ss;
-
-	if (it->ss)
-		it->cset_pos = &css->cgroup->e_csets[css->ss->id];
-	else
-		it->cset_pos = &css->cgroup->cset_links;
-
-	it->cset_head = it->cset_pos;
-
-	css_task_iter_advance_css_set(it);
-
-	spin_unlock_irq(&css_set_lock);
-}
-
-/**
- * css_task_iter_next - return the next task for the iterator
- * @it: the task iterator being iterated
- *
- * The "next" function for task iteration. @it should have been
- * initialized via css_task_iter_start(). Returns NULL when the iteration
- * reaches the end.
- */
-struct task_struct *css_task_iter_next(struct css_task_iter *it)
-{
-	if (it->cur_task) {
-		put_task_struct(it->cur_task);
-		it->cur_task = NULL;
-	}
-
-	spin_lock_irq(&css_set_lock);
-
-	if (it->task_pos) {
-		it->cur_task = list_entry(it->task_pos, struct task_struct,
-					  cg_list);
-		get_task_struct(it->cur_task);
-		css_task_iter_advance(it);
-	}
-
-	spin_unlock_irq(&css_set_lock);
-
-	return it->cur_task;
-}
-
-/**
- * css_task_iter_end - finish task iteration
- * @it: the task iterator to finish
- *
- * Finish task iteration started by css_task_iter_start().
- */
-void css_task_iter_end(struct css_task_iter *it)
-{
-	if (it->cur_cset) {
-		spin_lock_irq(&css_set_lock);
-		list_del(&it->iters_node);
-		put_css_set_locked(it->cur_cset);
-		spin_unlock_irq(&css_set_lock);
-	}
-
-	if (it->cur_task)
-		put_task_struct(it->cur_task);
-}
-
-/**
- * cgroup_transfer_tasks - move tasks from one cgroup to another
- * @to: cgroup to which the tasks will be moved
- * @from: cgroup in which the tasks currently reside
- *
- * Locking rules between cgroup_post_fork() and the migration path
- * guarantee that, if a task is forking while being migrated, the new child
- * is guaranteed to be either visible in the source cgroup after the
- * parent's migration is complete or put into the target cgroup. No task
- * can slip out of migration through forking.
- */
-int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
-{
-	LIST_HEAD(preloaded_csets);
-	struct cgrp_cset_link *link;
-	struct css_task_iter it;
-	struct task_struct *task;
-	int ret;
-
-	if (cgroup_on_dfl(to))
-		return -EINVAL;
-
-	if (!cgroup_may_migrate_to(to))
-		return -EBUSY;
-
-	mutex_lock(&cgroup_mutex);
-
-	percpu_down_write(&cgroup_threadgroup_rwsem);
-
-	/* all tasks in @from are being moved, all csets are source */
-	spin_lock_irq(&css_set_lock);
-	list_for_each_entry(link, &from->cset_links, cset_link)
-		cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
-	spin_unlock_irq(&css_set_lock);
-
-	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
-	if (ret)
-		goto out_err;
-
-	/*
-	 * Migrate tasks one-by-one until @from is empty. This fails iff
-	 * ->can_attach() fails.
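
The start/next/end protocol implemented above is consumed as follows; a minimal sketch, not from this patch, with an invented name:

static int demo_count_tasks(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *task;
	int n = 0;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it)))
		n++;	/* the task's reference is dropped on the next call */
	css_task_iter_end(&it);
	return n;
}
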
- */ - do { - css_task_iter_start(&from->self, &it); - task = css_task_iter_next(&it); - if (task) - get_task_struct(task); - css_task_iter_end(&it); - - if (task) { - ret = cgroup_migrate(task, false, to->root); - if (!ret) - trace_cgroup_transfer_tasks(to, task, false); - put_task_struct(task); - } - } while (task && !ret); -out_err: - cgroup_migrate_finish(&preloaded_csets); - percpu_up_write(&cgroup_threadgroup_rwsem); - mutex_unlock(&cgroup_mutex); - return ret; -} - -static void cgroup_procs_release(struct kernfs_open_file *of) -{ - if (of->priv) { - css_task_iter_end(of->priv); - kfree(of->priv); - } -} - -static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) -{ - struct kernfs_open_file *of = s->private; - struct css_task_iter *it = of->priv; - struct task_struct *task; - - do { - task = css_task_iter_next(it); - } while (task && !thread_group_leader(task)); - - return task; -} - -static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) -{ - struct kernfs_open_file *of = s->private; - struct cgroup *cgrp = seq_css(s)->cgroup; - struct css_task_iter *it = of->priv; - - /* - * When a seq_file is seeked, it's always traversed sequentially - * from position 0, so we can simply keep iterating on !0 *pos. - */ - if (!it) { - if (WARN_ON_ONCE((*pos)++)) - return ERR_PTR(-EINVAL); - - it = kzalloc(sizeof(*it), GFP_KERNEL); - if (!it) - return ERR_PTR(-ENOMEM); - of->priv = it; - css_task_iter_start(&cgrp->self, it); - } else if (!(*pos)++) { - css_task_iter_end(it); - css_task_iter_start(&cgrp->self, it); - } - - return cgroup_procs_next(s, NULL, NULL); -} - -static int cgroup_procs_show(struct seq_file *s, void *v) -{ - seq_printf(s, "%d\n", task_tgid_vnr(v)); - return 0; -} - -/* - * Stuff for reading the 'tasks'/'procs' files. - * - * Reading this file can return large amounts of data if a cgroup has - * *lots* of attached tasks. So it may need several calls to read(), - * but we cannot guarantee that the information we produce is correct - * unless we produce it entirely atomically. - * - */ - -/* which pidlist file are we talking about? */ -enum cgroup_filetype { - CGROUP_FILE_PROCS, - CGROUP_FILE_TASKS, -}; - -/* - * A pidlist is a list of pids that virtually represents the contents of one - * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, - * a pair (one each for procs, tasks) for each pid namespace that's relevant - * to the cgroup. - */ -struct cgroup_pidlist { - /* - * used to find which pidlist is wanted. doesn't change as long as - * this particular list stays in the list. - */ - struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; - /* array of xids */ - pid_t *list; - /* how many elements the above list has */ - int length; - /* each of these stored in a list by its cgroup */ - struct list_head links; - /* pointer to the cgroup we belong to, for list removal purposes */ - struct cgroup *owner; - /* for delayed destruction */ - struct delayed_work destroy_dwork; -}; - -/* - * The following two functions "fix" the issue where there are more pids - * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. 
- * TODO: replace with a kernel-wide solution to this problem - */ -#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) -static void *pidlist_allocate(int count) -{ - if (PIDLIST_TOO_LARGE(count)) - return vmalloc(count * sizeof(pid_t)); - else - return kmalloc(count * sizeof(pid_t), GFP_KERNEL); -} - -static void pidlist_free(void *p) -{ - kvfree(p); -} - -/* - * Used to destroy all pidlists lingering waiting for destroy timer. None - * should be left afterwards. - */ -static void cgroup_pidlist_destroy_all(struct cgroup *cgrp) -{ - struct cgroup_pidlist *l, *tmp_l; - - mutex_lock(&cgrp->pidlist_mutex); - list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) - mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); - mutex_unlock(&cgrp->pidlist_mutex); - - flush_workqueue(cgroup_pidlist_destroy_wq); - BUG_ON(!list_empty(&cgrp->pidlists)); -} - -static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) -{ - struct delayed_work *dwork = to_delayed_work(work); - struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, - destroy_dwork); - struct cgroup_pidlist *tofree = NULL; - - mutex_lock(&l->owner->pidlist_mutex); - - /* - * Destroy iff we didn't get queued again. The state won't change - * as destroy_dwork can only be queued while locked. - */ - if (!delayed_work_pending(dwork)) { - list_del(&l->links); - pidlist_free(l->list); - put_pid_ns(l->key.ns); - tofree = l; - } - - mutex_unlock(&l->owner->pidlist_mutex); - kfree(tofree); -} - -/* - * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries - * Returns the number of unique elements. - */ -static int pidlist_uniq(pid_t *list, int length) -{ - int src, dest = 1; - - /* - * we presume the 0th element is unique, so i starts at 1. trivial - * edge cases first; no work needs to be done for either - */ - if (length == 0 || length == 1) - return length; - /* src and dest walk down the list; dest counts unique elements */ - for (src = 1; src < length; src++) { - /* find next unique element */ - while (list[src] == list[src-1]) { - src++; - if (src == length) - goto after; - } - /* dest always points to where the next unique element goes */ - list[dest] = list[src]; - dest++; - } -after: - return dest; -} - -/* - * The two pid files - task and cgroup.procs - guaranteed that the result - * is sorted, which forced this whole pidlist fiasco. As pid order is - * different per namespace, each namespace needs differently sorted list, - * making it impossible to use, for example, single rbtree of member tasks - * sorted by task pointer. As pidlists can be fairly large, allocating one - * per open file is dangerous, so cgroup had to implement shared pool of - * pidlists keyed by cgroup and namespace. - */ -static int cmppid(const void *a, const void *b) -{ - return *(pid_t *)a - *(pid_t *)b; -} - -static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, - enum cgroup_filetype type) -{ - struct cgroup_pidlist *l; - /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = task_active_pid_ns(current); - - lockdep_assert_held(&cgrp->pidlist_mutex); - - list_for_each_entry(l, &cgrp->pidlists, links) - if (l->key.type == type && l->key.ns == ns) - return l; - return NULL; -} - -/* - * find the appropriate pidlist for our purpose (given procs vs tasks) - * returns with the lock on that pidlist already held, and takes care - * of the use count, or returns NULL with no locks held if we're out of - * memory. 
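
The TODO above was later addressed by the generic kvmalloc helpers; on a kernel that provides kvmalloc_array(), pidlist_allocate() collapses to a sketch like the following (illustrative, not part of this patch):

static void *pidlist_allocate(int count)
{
	/* falls back from kmalloc to vmalloc internally, like the
	 * open-coded version above; freed with kvfree() either way */
	return kvmalloc_array(count, sizeof(pid_t), GFP_KERNEL);
}
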
- */ -static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, - enum cgroup_filetype type) -{ - struct cgroup_pidlist *l; - - lockdep_assert_held(&cgrp->pidlist_mutex); - - l = cgroup_pidlist_find(cgrp, type); - if (l) - return l; - - /* entry not found; create a new one */ - l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); - if (!l) - return l; - - INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); - l->key.type = type; - /* don't need task_nsproxy() if we're looking at ourself */ - l->key.ns = get_pid_ns(task_active_pid_ns(current)); - l->owner = cgrp; - list_add(&l->links, &cgrp->pidlists); - return l; -} - -/* - * Load a cgroup's pidarray with either procs' tgids or tasks' pids - */ -static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, - struct cgroup_pidlist **lp) -{ - pid_t *array; - int length; - int pid, n = 0; /* used for populating the array */ - struct css_task_iter it; - struct task_struct *tsk; - struct cgroup_pidlist *l; - - lockdep_assert_held(&cgrp->pidlist_mutex); - - /* - * If cgroup gets more users after we read count, we won't have - * enough space - tough. This race is indistinguishable to the - * caller from the case that the additional cgroup users didn't - * show up until sometime later on. - */ - length = cgroup_task_count(cgrp); - array = pidlist_allocate(length); - if (!array) - return -ENOMEM; - /* now, populate the array */ - css_task_iter_start(&cgrp->self, &it); - while ((tsk = css_task_iter_next(&it))) { - if (unlikely(n == length)) - break; - /* get tgid or pid for procs or tasks file respectively */ - if (type == CGROUP_FILE_PROCS) - pid = task_tgid_vnr(tsk); - else - pid = task_pid_vnr(tsk); - if (pid > 0) /* make sure to only use valid results */ - array[n++] = pid; - } - css_task_iter_end(&it); - length = n; - /* now sort & (if procs) strip out duplicates */ - sort(array, length, sizeof(pid_t), cmppid, NULL); - if (type == CGROUP_FILE_PROCS) - length = pidlist_uniq(array, length); - - l = cgroup_pidlist_find_create(cgrp, type); - if (!l) { - pidlist_free(array); - return -ENOMEM; - } - - /* store array, freeing old if necessary */ - pidlist_free(l->list); - l->list = array; - l->length = length; - *lp = l; - return 0; -} - -/** - * cgroupstats_build - build and fill cgroupstats - * @stats: cgroupstats to fill information into - * @dentry: A dentry entry belonging to the cgroup for which stats have - * been requested. - * - * Build and fill cgroupstats so that taskstats can export it to user - * space. - */ -int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) -{ - struct kernfs_node *kn = kernfs_node_from_dentry(dentry); - struct cgroup *cgrp; - struct css_task_iter it; - struct task_struct *tsk; - - /* it should be kernfs_node belonging to cgroupfs and is a directory */ - if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || - kernfs_type(kn) != KERNFS_DIR) - return -EINVAL; - - mutex_lock(&cgroup_mutex); - - /* - * We aren't being called from kernfs and there's no guarantee on - * @kn->priv's validity. For this and css_tryget_online_from_dir(), - * @kn->priv is RCU safe. Let's do the RCU dancing. 
- */ - rcu_read_lock(); - cgrp = rcu_dereference(kn->priv); - if (!cgrp || cgroup_is_dead(cgrp)) { - rcu_read_unlock(); - mutex_unlock(&cgroup_mutex); - return -ENOENT; - } - rcu_read_unlock(); - - css_task_iter_start(&cgrp->self, &it); - while ((tsk = css_task_iter_next(&it))) { - switch (tsk->state) { - case TASK_RUNNING: - stats->nr_running++; - break; - case TASK_INTERRUPTIBLE: - stats->nr_sleeping++; - break; - case TASK_UNINTERRUPTIBLE: - stats->nr_uninterruptible++; - break; - case TASK_STOPPED: - stats->nr_stopped++; - break; - default: - if (delayacct_is_task_waiting_on_io(tsk)) - stats->nr_io_wait++; - break; - } - } - css_task_iter_end(&it); - - mutex_unlock(&cgroup_mutex); - return 0; -} - - -/* - * seq_file methods for the tasks/procs files. The seq_file position is the - * next pid to display; the seq_file iterator is a pointer to the pid - * in the cgroup->l->list array. - */ - -static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) -{ - /* - * Initially we receive a position value that corresponds to - * one more than the last pid shown (or 0 on the first call or - * after a seek to the start). Use a binary-search to find the - * next pid to display, if any - */ - struct kernfs_open_file *of = s->private; - struct cgroup *cgrp = seq_css(s)->cgroup; - struct cgroup_pidlist *l; - enum cgroup_filetype type = seq_cft(s)->private; - int index = 0, pid = *pos; - int *iter, ret; - - mutex_lock(&cgrp->pidlist_mutex); - - /* - * !NULL @of->priv indicates that this isn't the first start() - * after open. If the matching pidlist is around, we can use that. - * Look for it. Note that @of->priv can't be used directly. It - * could already have been destroyed. - */ - if (of->priv) - of->priv = cgroup_pidlist_find(cgrp, type); - - /* - * Either this is the first start() after open or the matching - * pidlist has been destroyed inbetween. Create a new one. - */ - if (!of->priv) { - ret = pidlist_array_load(cgrp, type, - (struct cgroup_pidlist **)&of->priv); - if (ret) - return ERR_PTR(ret); - } - l = of->priv; - - if (pid) { - int end = l->length; - - while (index < end) { - int mid = (index + end) / 2; - if (l->list[mid] == pid) { - index = mid; - break; - } else if (l->list[mid] <= pid) - index = mid + 1; - else - end = mid; - } - } - /* If we're off the end of the array, we're done */ - if (index >= l->length) - return NULL; - /* Update the abstract position to be the actual pid that we found */ - iter = l->list + index; - *pos = *iter; - return iter; -} - -static void cgroup_pidlist_stop(struct seq_file *s, void *v) -{ - struct kernfs_open_file *of = s->private; - struct cgroup_pidlist *l = of->priv; - - if (l) - mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, - CGROUP_PIDLIST_DESTROY_DELAY); - mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); -} - -static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) -{ - struct kernfs_open_file *of = s->private; - struct cgroup_pidlist *l = of->priv; - pid_t *p = v; - pid_t *end = l->list + l->length; - /* - * Advance to the next pid in the array. 
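
Restated in a self-contained form (illustrative, not from this patch): the binary search in cgroup_pidlist_start() above returns the index of the requested pid when present, and otherwise the index of the first larger pid, assuming a sorted array:

static int pidlist_lower_bound(const pid_t *list, int len, pid_t pid)
{
	int index = 0, end = len;

	while (index < end) {
		int mid = (index + end) / 2;

		if (list[mid] == pid)
			return mid;
		else if (list[mid] <= pid)
			index = mid + 1;
		else
			end = mid;
	}
	return index;	/* == len when every entry is smaller */
}
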
If this goes off the - * end, we're done - */ - p++; - if (p >= end) { - return NULL; - } else { - *pos = *p; - return p; - } -} - -static int cgroup_pidlist_show(struct seq_file *s, void *v) -{ - seq_printf(s, "%d\n", *(int *)v); - - return 0; -} - -static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return notify_on_release(css->cgroup); -} - -static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, - struct cftype *cft, u64 val) -{ - if (val) - set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); - else - clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); - return 0; -} - -static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); -} - -static int cgroup_clone_children_write(struct cgroup_subsys_state *css, - struct cftype *cft, u64 val) -{ - if (val) - set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); - else - clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); - return 0; -} - -/* cgroup core interface files for the default hierarchy */ -static struct cftype cgroup_dfl_base_files[] = { - { - .name = "cgroup.procs", - .file_offset = offsetof(struct cgroup, procs_file), - .release = cgroup_procs_release, - .seq_start = cgroup_procs_start, - .seq_next = cgroup_procs_next, - .seq_show = cgroup_procs_show, - .write = cgroup_procs_write, - }, - { - .name = "cgroup.controllers", - .seq_show = cgroup_controllers_show, - }, - { - .name = "cgroup.subtree_control", - .seq_show = cgroup_subtree_control_show, - .write = cgroup_subtree_control_write, - }, - { - .name = "cgroup.events", - .flags = CFTYPE_NOT_ON_ROOT, - .file_offset = offsetof(struct cgroup, events_file), - .seq_show = cgroup_events_show, - }, - { } /* terminate */ -}; - -/* cgroup core interface files for the legacy hierarchies */ -static struct cftype cgroup_legacy_base_files[] = { - { - .name = "cgroup.procs", - .seq_start = cgroup_pidlist_start, - .seq_next = cgroup_pidlist_next, - .seq_stop = cgroup_pidlist_stop, - .seq_show = cgroup_pidlist_show, - .private = CGROUP_FILE_PROCS, - .write = cgroup_procs_write, - }, - { - .name = "cgroup.clone_children", - .read_u64 = cgroup_clone_children_read, - .write_u64 = cgroup_clone_children_write, - }, - { - .name = "cgroup.sane_behavior", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cgroup_sane_behavior_show, - }, - { - .name = "tasks", - .seq_start = cgroup_pidlist_start, - .seq_next = cgroup_pidlist_next, - .seq_stop = cgroup_pidlist_stop, - .seq_show = cgroup_pidlist_show, - .private = CGROUP_FILE_TASKS, - .write = cgroup_tasks_write, - }, - { - .name = "notify_on_release", - .read_u64 = cgroup_read_notify_on_release, - .write_u64 = cgroup_write_notify_on_release, - }, - { - .name = "release_agent", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cgroup_release_agent_show, - .write = cgroup_release_agent_write, - .max_write_len = PATH_MAX - 1, - }, - { } /* terminate */ -}; - -/* - * css destruction is four-stage process. - * - * 1. Destruction starts. Killing of the percpu_ref is initiated. - * Implemented in kill_css(). - * - * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs - * and thus css_tryget_online() is guaranteed to fail, the css can be - * offlined by invoking offline_css(). After offlining, the base ref is - * put. Implemented in css_killed_work_fn(). - * - * 3. 
When the percpu_ref reaches zero, the only possible remaining - * accessors are inside RCU read sections. css_release() schedules the - * RCU callback. - * - * 4. After the grace period, the css can be freed. Implemented in - * css_free_work_fn(). - * - * It is actually hairier because both step 2 and 4 require process context - * and thus involve punting to css->destroy_work adding two additional - * steps to the already complex sequence. - */ -static void css_free_work_fn(struct work_struct *work) -{ - struct cgroup_subsys_state *css = - container_of(work, struct cgroup_subsys_state, destroy_work); - struct cgroup_subsys *ss = css->ss; - struct cgroup *cgrp = css->cgroup; - - percpu_ref_exit(&css->refcnt); - - if (ss) { - /* css free path */ - struct cgroup_subsys_state *parent = css->parent; - int id = css->id; - - ss->css_free(css); - cgroup_idr_remove(&ss->css_idr, id); - cgroup_put(cgrp); - - if (parent) - css_put(parent); - } else { - /* cgroup free path */ - atomic_dec(&cgrp->root->nr_cgrps); - cgroup_pidlist_destroy_all(cgrp); - cancel_work_sync(&cgrp->release_agent_work); - - if (cgroup_parent(cgrp)) { - /* - * We get a ref to the parent, and put the ref when - * this cgroup is being freed, so it's guaranteed - * that the parent won't be destroyed before its - * children. - */ - cgroup_put(cgroup_parent(cgrp)); - kernfs_put(cgrp->kn); - kfree(cgrp); - } else { - /* - * This is root cgroup's refcnt reaching zero, - * which indicates that the root should be - * released. - */ - cgroup_destroy_root(cgrp->root); - } - } -} - -static void css_free_rcu_fn(struct rcu_head *rcu_head) -{ - struct cgroup_subsys_state *css = - container_of(rcu_head, struct cgroup_subsys_state, rcu_head); - - INIT_WORK(&css->destroy_work, css_free_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); -} - -static void css_release_work_fn(struct work_struct *work) -{ - struct cgroup_subsys_state *css = - container_of(work, struct cgroup_subsys_state, destroy_work); - struct cgroup_subsys *ss = css->ss; - struct cgroup *cgrp = css->cgroup; - - mutex_lock(&cgroup_mutex); - - css->flags |= CSS_RELEASED; - list_del_rcu(&css->sibling); - - if (ss) { - /* css release path */ - cgroup_idr_replace(&ss->css_idr, NULL, css->id); - if (ss->css_released) - ss->css_released(css); - } else { - /* cgroup release path */ - trace_cgroup_release(cgrp); - - cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); - cgrp->id = -1; - - /* - * There are two control paths which try to determine - * cgroup from dentry without going through kernfs - - * cgroupstats_build() and css_tryget_online_from_dir(). - * Those are supported by RCU protecting clearing of - * cgrp->kn->priv backpointer. 
- */ - if (cgrp->kn) - RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, - NULL); - - cgroup_bpf_put(cgrp); - } - - mutex_unlock(&cgroup_mutex); - - call_rcu(&css->rcu_head, css_free_rcu_fn); -} - -static void css_release(struct percpu_ref *ref) -{ - struct cgroup_subsys_state *css = - container_of(ref, struct cgroup_subsys_state, refcnt); - - INIT_WORK(&css->destroy_work, css_release_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); -} - -static void init_and_link_css(struct cgroup_subsys_state *css, - struct cgroup_subsys *ss, struct cgroup *cgrp) -{ - lockdep_assert_held(&cgroup_mutex); - - cgroup_get(cgrp); - - memset(css, 0, sizeof(*css)); - css->cgroup = cgrp; - css->ss = ss; - css->id = -1; - INIT_LIST_HEAD(&css->sibling); - INIT_LIST_HEAD(&css->children); - css->serial_nr = css_serial_nr_next++; - atomic_set(&css->online_cnt, 0); - - if (cgroup_parent(cgrp)) { - css->parent = cgroup_css(cgroup_parent(cgrp), ss); - css_get(css->parent); - } - - BUG_ON(cgroup_css(cgrp, ss)); -} - -/* invoke ->css_online() on a new CSS and mark it online if successful */ -static int online_css(struct cgroup_subsys_state *css) -{ - struct cgroup_subsys *ss = css->ss; - int ret = 0; - - lockdep_assert_held(&cgroup_mutex); - - if (ss->css_online) - ret = ss->css_online(css); - if (!ret) { - css->flags |= CSS_ONLINE; - rcu_assign_pointer(css->cgroup->subsys[ss->id], css); - - atomic_inc(&css->online_cnt); - if (css->parent) - atomic_inc(&css->parent->online_cnt); - } - return ret; -} - -/* if the CSS is online, invoke ->css_offline() on it and mark it offline */ -static void offline_css(struct cgroup_subsys_state *css) -{ - struct cgroup_subsys *ss = css->ss; - - lockdep_assert_held(&cgroup_mutex); - - if (!(css->flags & CSS_ONLINE)) - return; - - if (ss->css_reset) - ss->css_reset(css); - - if (ss->css_offline) - ss->css_offline(css); - - css->flags &= ~CSS_ONLINE; - RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); - - wake_up_all(&css->cgroup->offline_waitq); -} - -/** - * css_create - create a cgroup_subsys_state - * @cgrp: the cgroup new css will be associated with - * @ss: the subsys of new css - * - * Create a new css associated with @cgrp - @ss pair. On success, the new - * css is online and installed in @cgrp. This function doesn't create the - * interface files. Returns 0 on success, -errno on failure. - */ -static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, - struct cgroup_subsys *ss) -{ - struct cgroup *parent = cgroup_parent(cgrp); - struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); - struct cgroup_subsys_state *css; - int err; - - lockdep_assert_held(&cgroup_mutex); - - css = ss->css_alloc(parent_css); - if (!css) - css = ERR_PTR(-ENOMEM); - if (IS_ERR(css)) - return css; - - init_and_link_css(css, ss, cgrp); - - err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL); - if (err) - goto err_free_css; - - err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL); - if (err < 0) - goto err_free_css; - css->id = err; - - /* @css is ready to be brought online now, make it visible */ - list_add_tail_rcu(&css->sibling, &parent_css->children); - cgroup_idr_replace(&ss->css_idr, css, css->id); - - err = online_css(css); - if (err) - goto err_list_del; - - if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && - cgroup_parent(parent)) { - pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. 
Nested cgroups may change behavior in the future.\n", - current->comm, current->pid, ss->name); - if (!strcmp(ss->name, "memory")) - pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n"); - ss->warned_broken_hierarchy = true; - } - - return css; - -err_list_del: - list_del_rcu(&css->sibling); -err_free_css: - call_rcu(&css->rcu_head, css_free_rcu_fn); - return ERR_PTR(err); -} - -static struct cgroup *cgroup_create(struct cgroup *parent) -{ - struct cgroup_root *root = parent->root; - struct cgroup *cgrp, *tcgrp; - int level = parent->level + 1; - int ret; - - /* allocate the cgroup and its ID, 0 is reserved for the root */ - cgrp = kzalloc(sizeof(*cgrp) + - sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL); - if (!cgrp) - return ERR_PTR(-ENOMEM); - - ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); - if (ret) - goto out_free_cgrp; - - /* - * Temporarily set the pointer to NULL, so idr_find() won't return - * a half-baked cgroup. - */ - cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); - if (cgrp->id < 0) { - ret = -ENOMEM; - goto out_cancel_ref; - } - - init_cgroup_housekeeping(cgrp); - - cgrp->self.parent = &parent->self; - cgrp->root = root; - cgrp->level = level; - - for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) - cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; - - if (notify_on_release(parent)) - set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); - - if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) - set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); - - cgrp->self.serial_nr = css_serial_nr_next++; - - /* allocation complete, commit to creation */ - list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); - atomic_inc(&root->nr_cgrps); - cgroup_get(parent); - - /* - * @cgrp is now fully operational. If something fails after this - * point, it'll be released via the normal destruction path. - */ - cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - - /* - * On the default hierarchy, a child doesn't automatically inherit - * subtree_control from the parent. Each is configured manually. - */ - if (!cgroup_on_dfl(cgrp)) - cgrp->subtree_control = cgroup_control(cgrp); - - if (parent) - cgroup_bpf_inherit(cgrp, parent); - - cgroup_propagate_control(cgrp); - - /* @cgrp doesn't have dir yet so the following will only create csses */ - ret = cgroup_apply_control_enable(cgrp); - if (ret) - goto out_destroy; - - return cgrp; - -out_cancel_ref: - percpu_ref_exit(&cgrp->self.refcnt); -out_free_cgrp: - kfree(cgrp); - return ERR_PTR(ret); -out_destroy: - cgroup_destroy_locked(cgrp); - return ERR_PTR(ret); -} - -static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) -{ - struct cgroup *parent, *cgrp; - struct kernfs_node *kn; - int ret; - - /* do not accept '\n' to prevent making /proc//cgroup unparsable */ - if (strchr(name, '\n')) - return -EINVAL; - - parent = cgroup_kn_lock_live(parent_kn, false); - if (!parent) - return -ENODEV; - - cgrp = cgroup_create(parent); - if (IS_ERR(cgrp)) { - ret = PTR_ERR(cgrp); - goto out_unlock; - } - - /* create the directory */ - kn = kernfs_create_dir(parent->kn, name, mode, cgrp); - if (IS_ERR(kn)) { - ret = PTR_ERR(kn); - goto out_destroy; - } - cgrp->kn = kn; - - /* - * This extra ref will be put in cgroup_free_fn() and guarantees - * that @cgrp->kn is always accessible. 
- */
-	kernfs_get(kn);
-
-	ret = cgroup_kn_set_ugid(kn);
-	if (ret)
-		goto out_destroy;
-
-	ret = css_populate_dir(&cgrp->self);
-	if (ret)
-		goto out_destroy;
-
-	ret = cgroup_apply_control_enable(cgrp);
-	if (ret)
-		goto out_destroy;
-
-	trace_cgroup_mkdir(cgrp);
-
-	/* let's create and online css's */
-	kernfs_activate(kn);
-
-	ret = 0;
-	goto out_unlock;
-
-out_destroy:
-	cgroup_destroy_locked(cgrp);
-out_unlock:
-	cgroup_kn_unlock(parent_kn);
-	return ret;
-}
-
-/*
- * This is called when the refcnt of a css is confirmed to be killed.
- * css_tryget_online() is now guaranteed to fail. Tell the subsystem to
- * initiate destruction and put the css ref from kill_css().
- */
-static void css_killed_work_fn(struct work_struct *work)
-{
-	struct cgroup_subsys_state *css =
-		container_of(work, struct cgroup_subsys_state, destroy_work);
-
-	mutex_lock(&cgroup_mutex);
-
-	do {
-		offline_css(css);
-		css_put(css);
-		/* @css can't go away while we're holding cgroup_mutex */
-		css = css->parent;
-	} while (css && atomic_dec_and_test(&css->online_cnt));
-
-	mutex_unlock(&cgroup_mutex);
-}
-
-/* css kill confirmation processing requires process context, bounce */
-static void css_killed_ref_fn(struct percpu_ref *ref)
-{
-	struct cgroup_subsys_state *css =
-		container_of(ref, struct cgroup_subsys_state, refcnt);
-
-	if (atomic_dec_and_test(&css->online_cnt)) {
-		INIT_WORK(&css->destroy_work, css_killed_work_fn);
-		queue_work(cgroup_destroy_wq, &css->destroy_work);
-	}
-}
-
-/**
- * kill_css - destroy a css
- * @css: css to destroy
- *
- * This function initiates destruction of @css by removing cgroup interface
- * files and putting its base reference. ->css_offline() will be invoked
- * asynchronously once css_tryget_online() is guaranteed to fail and when
- * the reference count reaches zero, @css will be released.
- */
-static void kill_css(struct cgroup_subsys_state *css)
-{
-	lockdep_assert_held(&cgroup_mutex);
-
-	/*
-	 * This must happen before css is disassociated with its cgroup.
-	 * See seq_css() for details.
-	 */
-	css_clear_dir(css);
-
-	/*
-	 * Killing would put the base ref, but we need to keep it alive
-	 * until after ->css_offline().
-	 */
-	css_get(css);
-
-	/*
-	 * cgroup core guarantees that, by the time ->css_offline() is
-	 * invoked, no new css reference will be given out via
-	 * css_tryget_online(). We can't simply call percpu_ref_kill() and
-	 * proceed to offlining css's because percpu_ref_kill() doesn't
-	 * guarantee that the ref is seen as killed on all CPUs on return.
-	 *
-	 * Use percpu_ref_kill_and_confirm() to get notifications as each
-	 * css is confirmed to be seen as killed on all CPUs.
-	 */
-	percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
-}
-
-/**
- * cgroup_destroy_locked - the first stage of cgroup destruction
- * @cgrp: cgroup to be destroyed
- *
- * css's make use of percpu refcnts whose killing latency shouldn't be
- * exposed to userland and are RCU protected. Also, cgroup core needs to
- * guarantee that css_tryget_online() won't succeed by the time
- * ->css_offline() is invoked. To satisfy all the requirements,
- * destruction is implemented in the following two steps.
- *
- * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
- *     userland visible parts and start killing the percpu refcnts of
- *     css's. Set up so that the next stage will be kicked off once all
- *     the percpu refcnts are confirmed to be killed.
- *
- * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
- *     rest of destruction.
Once all cgroup references are gone, the - * cgroup is RCU-freed. - * - * This function implements s1. After this step, @cgrp is gone as far as - * the userland is concerned and a new cgroup with the same name may be - * created. As cgroup doesn't care about the names internally, this - * doesn't cause any problem. - */ -static int cgroup_destroy_locked(struct cgroup *cgrp) - __releases(&cgroup_mutex) __acquires(&cgroup_mutex) -{ - struct cgroup_subsys_state *css; - struct cgrp_cset_link *link; - int ssid; - - lockdep_assert_held(&cgroup_mutex); - - /* - * Only migration can raise populated from zero and we're already - * holding cgroup_mutex. - */ - if (cgroup_is_populated(cgrp)) - return -EBUSY; - - /* - * Make sure there's no live children. We can't test emptiness of - * ->self.children as dead children linger on it while being - * drained; otherwise, "rmdir parent/child parent" may fail. - */ - if (css_has_online_children(&cgrp->self)) - return -EBUSY; - - /* - * Mark @cgrp and the associated csets dead. The former prevents - * further task migration and child creation by disabling - * cgroup_lock_live_group(). The latter makes the csets ignored by - * the migration path. - */ - cgrp->self.flags &= ~CSS_ONLINE; - - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &cgrp->cset_links, cset_link) - link->cset->dead = true; - spin_unlock_irq(&css_set_lock); - - /* initiate massacre of all css's */ - for_each_css(css, ssid, cgrp) - kill_css(css); - - /* - * Remove @cgrp directory along with the base files. @cgrp has an - * extra ref on its kn. - */ - kernfs_remove(cgrp->kn); - - check_for_release(cgroup_parent(cgrp)); - - /* put the base reference */ - percpu_ref_kill(&cgrp->self.refcnt); - - return 0; -}; - -static int cgroup_rmdir(struct kernfs_node *kn) -{ - struct cgroup *cgrp; - int ret = 0; - - cgrp = cgroup_kn_lock_live(kn, false); - if (!cgrp) - return 0; - - ret = cgroup_destroy_locked(cgrp); - - if (!ret) - trace_cgroup_rmdir(cgrp); - - cgroup_kn_unlock(kn); - return ret; -} - -static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { - .remount_fs = cgroup_remount, - .show_options = cgroup_show_options, - .mkdir = cgroup_mkdir, - .rmdir = cgroup_rmdir, - .rename = cgroup_rename, - .show_path = cgroup_show_path, -}; - -static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) -{ - struct cgroup_subsys_state *css; - - pr_debug("Initializing cgroup subsys %s\n", ss->name); - - mutex_lock(&cgroup_mutex); - - idr_init(&ss->css_idr); - INIT_LIST_HEAD(&ss->cfts); - - /* Create the root cgroup state for this subsystem */ - ss->root = &cgrp_dfl_root; - css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); - /* We don't handle early failures gracefully */ - BUG_ON(IS_ERR(css)); - init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); - - /* - * Root csses are never destroyed and we can't initialize - * percpu_ref during early init. Disable refcnting. - */ - css->flags |= CSS_NO_REF; - - if (early) { - /* allocation can't be done safely during early init */ - css->id = 1; - } else { - css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); - BUG_ON(css->id < 0); - } - - /* Update the init_css_set to contain a subsys - * pointer to this state - since the subsystem is - * newly registered, all tasks and hence the - * init_css_set is in the subsystem's root cgroup. 
*/ - init_css_set.subsys[ss->id] = css; - - have_fork_callback |= (bool)ss->fork << ss->id; - have_exit_callback |= (bool)ss->exit << ss->id; - have_free_callback |= (bool)ss->free << ss->id; - have_canfork_callback |= (bool)ss->can_fork << ss->id; - - /* At system boot, before all subsystems have been - * registered, no tasks have been forked, so we don't - * need to invoke fork callbacks here. */ - BUG_ON(!list_empty(&init_task.tasks)); - - BUG_ON(online_css(css)); - - mutex_unlock(&cgroup_mutex); -} - -/** - * cgroup_init_early - cgroup initialization at system boot - * - * Initialize cgroups at system boot, and initialize any - * subsystems that request early init. - */ -int __init cgroup_init_early(void) -{ - static struct cgroup_sb_opts __initdata opts; - struct cgroup_subsys *ss; - int i; - - init_cgroup_root(&cgrp_dfl_root, &opts); - cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; - - RCU_INIT_POINTER(init_task.cgroups, &init_css_set); - - for_each_subsys(ss, i) { - WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id, - "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n", - i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, - ss->id, ss->name); - WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, - "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]); - - ss->id = i; - ss->name = cgroup_subsys_name[i]; - if (!ss->legacy_name) - ss->legacy_name = cgroup_subsys_name[i]; - - if (ss->early_init) - cgroup_init_subsys(ss, true); - } - return 0; -} - -static u16 cgroup_disable_mask __initdata; - -/** - * cgroup_init - cgroup initialization - * - * Register cgroup filesystem and /proc file, and initialize - * any subsystems that didn't request early init. - */ -int __init cgroup_init(void) -{ - struct cgroup_subsys *ss; - int ssid; - - BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); - BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); - BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); - BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); - - /* - * The latency of the synchronize_sched() is too high for cgroups, - * avoid it at the cost of forcing all readers into the slow path. - */ - rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); - - get_user_ns(init_cgroup_ns.user_ns); - - mutex_lock(&cgroup_mutex); - - /* - * Add init_css_set to the hash table so that dfl_root can link to - * it during init. - */ - hash_add(css_set_table, &init_css_set.hlist, - css_set_hash(init_css_set.subsys)); - - BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); - - mutex_unlock(&cgroup_mutex); - - for_each_subsys(ss, ssid) { - if (ss->early_init) { - struct cgroup_subsys_state *css = - init_css_set.subsys[ss->id]; - - css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, - GFP_KERNEL); - BUG_ON(css->id < 0); - } else { - cgroup_init_subsys(ss, false); - } - - list_add_tail(&init_css_set.e_cset_node[ssid], - &cgrp_dfl_root.cgrp.e_csets[ssid]); - - /* - * Setting dfl_root subsys_mask needs to consider the - * disabled flag and cftype registration needs kmalloc, - * both of which aren't available during early_init. 
- */ - if (cgroup_disable_mask & (1 << ssid)) { - static_branch_disable(cgroup_subsys_enabled_key[ssid]); - printk(KERN_INFO "Disabling %s control group subsystem\n", - ss->name); - continue; - } - - if (cgroup_ssid_no_v1(ssid)) - printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n", - ss->name); - - cgrp_dfl_root.subsys_mask |= 1 << ss->id; - - if (ss->implicit_on_dfl) - cgrp_dfl_implicit_ss_mask |= 1 << ss->id; - else if (!ss->dfl_cftypes) - cgrp_dfl_inhibit_ss_mask |= 1 << ss->id; - - if (ss->dfl_cftypes == ss->legacy_cftypes) { - WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); - } else { - WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); - WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); - } - - if (ss->bind) - ss->bind(init_css_set.subsys[ssid]); - } - - /* init_css_set.subsys[] has been updated, re-hash */ - hash_del(&init_css_set.hlist); - hash_add(css_set_table, &init_css_set.hlist, - css_set_hash(init_css_set.subsys)); - - WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); - WARN_ON(register_filesystem(&cgroup_fs_type)); - WARN_ON(register_filesystem(&cgroup2_fs_type)); - WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations)); - - return 0; -} - -static int __init cgroup_wq_init(void) -{ - /* - * There isn't much point in executing destruction path in - * parallel. Good chunk is serialized with cgroup_mutex anyway. - * Use 1 for @max_active. - * - * We would prefer to do this in cgroup_init() above, but that - * is called before init_workqueues(): so leave this until after. - */ - cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); - BUG_ON(!cgroup_destroy_wq); - - /* - * Used to destroy pidlists and separate to serve as flush domain. - * Cap @max_active to 1 too. - */ - cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", - 0, 1); - BUG_ON(!cgroup_pidlist_destroy_wq); - - return 0; -} -core_initcall(cgroup_wq_init); - -/* - * proc_cgroup_show() - * - Print task's cgroup paths into seq_file, one line for each hierarchy - * - Used for /proc/<pid>/cgroup. - */ -int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *tsk) -{ - char *buf; - int retval; - struct cgroup_root *root; - - retval = -ENOMEM; - buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!buf) - goto out; - - mutex_lock(&cgroup_mutex); - spin_lock_irq(&css_set_lock); - - for_each_root(root) { - struct cgroup_subsys *ss; - struct cgroup *cgrp; - int ssid, count = 0; - - if (root == &cgrp_dfl_root && !cgrp_dfl_visible) - continue; - - seq_printf(m, "%d:", root->hierarchy_id); - if (root != &cgrp_dfl_root) - for_each_subsys(ss, ssid) - if (root->subsys_mask & (1 << ssid)) - seq_printf(m, "%s%s", count++ ? "," : "", - ss->legacy_name); - if (strlen(root->name)) - seq_printf(m, "%sname=%s", count ? "," : "", - root->name); - seq_putc(m, ':'); - - cgrp = task_cgroup_from_root(tsk, root); - - /* - * On traditional hierarchies, all zombie tasks show up as - * belonging to the root cgroup. On the default hierarchy, - * while a zombie doesn't show up in "cgroup.procs" and - * thus can't be migrated, its /proc/PID/cgroup keeps - * reporting the cgroup it belonged to before exiting. If - * the cgroup is removed before the zombie is reaped, - * " (deleted)" is appended to the cgroup path. 
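For orientation, proc_cgroup_show() emits one "hierarchy-id:controller-list:path" line per hierarchy; the v2 line uses hierarchy id 0 with an empty controller list, and named v1 hierarchies show up as "name=". On a hypothetical machine with a couple of v1 mounts plus a visible v2 mount, /proc/<pid>/cgroup might read roughly:

    8:cpuset:/
    2:cpu,cpuacct:/user/test
    1:name=systemd:/user.slice/user-1000.slice/session-1.scope
    0::/user.slice
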
- */ - if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { - retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, - current->nsproxy->cgroup_ns); - if (retval >= PATH_MAX) - retval = -ENAMETOOLONG; - if (retval < 0) - goto out_unlock; - - seq_puts(m, buf); - } else { - seq_puts(m, "/"); - } - - if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) - seq_puts(m, " (deleted)\n"); - else - seq_putc(m, '\n'); - } - - retval = 0; -out_unlock: - spin_unlock_irq(&css_set_lock); - mutex_unlock(&cgroup_mutex); - kfree(buf); -out: - return retval; -} - -/* Display information about each subsystem and each hierarchy */ -static int proc_cgroupstats_show(struct seq_file *m, void *v) -{ - struct cgroup_subsys *ss; - int i; - - seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); - /* - * ideally we don't want subsystems moving around while we do this. - * cgroup_mutex is also necessary to guarantee an atomic snapshot of - * subsys/hierarchy state. - */ - mutex_lock(&cgroup_mutex); - - for_each_subsys(ss, i) - seq_printf(m, "%s\t%d\t%d\t%d\n", - ss->legacy_name, ss->root->hierarchy_id, - atomic_read(&ss->root->nr_cgrps), - cgroup_ssid_enabled(i)); - - mutex_unlock(&cgroup_mutex); - return 0; -} - -static int cgroupstats_open(struct inode *inode, struct file *file) -{ - return single_open(file, proc_cgroupstats_show, NULL); -} - -static const struct file_operations proc_cgroupstats_operations = { - .open = cgroupstats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -/** - * cgroup_fork - initialize cgroup related fields during copy_process() - * @child: pointer to task_struct of forking parent process. - * - * A task is associated with the init_css_set until cgroup_post_fork() - * attaches it to the parent's css_set. Empty cg_list indicates that - * @child isn't holding a reference to its css_set. - */ -void cgroup_fork(struct task_struct *child) -{ - RCU_INIT_POINTER(child->cgroups, &init_css_set); - INIT_LIST_HEAD(&child->cg_list); -} - -/** - * cgroup_can_fork - called on a new task before the process is exposed - * @child: the task in question. - * - * This calls the subsystem can_fork() callbacks. If the can_fork() callback - * returns an error, the fork aborts with that error code. This allows for - * a cgroup subsystem to conditionally allow or deny new forks. - */ -int cgroup_can_fork(struct task_struct *child) -{ - struct cgroup_subsys *ss; - int i, j, ret; - - do_each_subsys_mask(ss, i, have_canfork_callback) { - ret = ss->can_fork(child); - if (ret) - goto out_revert; - } while_each_subsys_mask(); - - return 0; - -out_revert: - for_each_subsys(ss, j) { - if (j >= i) - break; - if (ss->cancel_fork) - ss->cancel_fork(child); - } - - return ret; -} - -/** - * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() - * @child: the task in question - * - * This calls the cancel_fork() callbacks if a fork failed *after* - * cgroup_can_fork() succeeded. - */ -void cgroup_cancel_fork(struct task_struct *child) -{ - struct cgroup_subsys *ss; - int i; - - for_each_subsys(ss, i) - if (ss->cancel_fork) - ss->cancel_fork(child); -} - -/** - * cgroup_post_fork - called on a new task after adding it to the task list - * @child: the task in question - * - * Adds the task to the list running through its css_set if necessary and - * calls the subsystem fork() callbacks. Has to be after the task is - * visible on the task list in case we race with the first call to - * cgroup_task_iter_start() - to guarantee that the new task ends up on its - * list. 
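A controller opts into this veto point by filling .can_fork and .cancel_fork in its cgroup_subsys; the pids controller is the in-tree user. A hypothetical, deliberately minimal gate, to show the charge/uncharge contract only (demo_* names are invented, and a real subsystem would also need css_alloc/css_free and SUBSYS registration):

    #include <linux/atomic.h>
    #include <linux/cgroup.h>
    #include <linux/errno.h>
    #include <linux/sched.h>

    static atomic_t demo_inflight = ATOMIC_INIT(0); /* hypothetical global budget */

    static int demo_can_fork(struct task_struct *task)
    {
    	if (atomic_inc_return(&demo_inflight) > 128) {  /* arbitrary limit */
    		atomic_dec(&demo_inflight);
    		return -EAGAIN;         /* fork(2) fails with this error */
    	}
    	return 0;
    }

    static void demo_cancel_fork(struct task_struct *task)
    {
    	atomic_dec(&demo_inflight); /* undo the charge if fork aborts later */
    }

    struct cgroup_subsys demo_cgrp_subsys = {
    	.can_fork	= demo_can_fork,
    	.cancel_fork	= demo_cancel_fork,
    };
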
- */ -void cgroup_post_fork(struct task_struct *child) -{ - struct cgroup_subsys *ss; - int i; - - /* - * This may race against cgroup_enable_task_cg_lists(). As that - * function sets use_task_css_set_links before grabbing - * tasklist_lock and we just went through tasklist_lock to add - * @child, it's guaranteed that either we see the set - * use_task_css_set_links or cgroup_enable_task_cg_lists() sees - * @child during its iteration. - * - * If we won the race, @child is associated with %current's - * css_set. Grabbing css_set_lock guarantees both that the - * association is stable, and, on completion of the parent's - * migration, @child is visible in the source of migration or - * already in the destination cgroup. This guarantee is necessary - * when implementing operations which need to migrate all tasks of - * a cgroup to another. - * - * Note that if we lose to cgroup_enable_task_cg_lists(), @child - * will remain in init_css_set. This is safe because all tasks are - * in the init_css_set before cg_links is enabled and there's no - * operation which transfers all tasks out of init_css_set. - */ - if (use_task_css_set_links) { - struct css_set *cset; - - spin_lock_irq(&css_set_lock); - cset = task_css_set(current); - if (list_empty(&child->cg_list)) { - get_css_set(cset); - css_set_move_task(child, NULL, cset, false); - } - spin_unlock_irq(&css_set_lock); - } - - /* - * Call ss->fork(). This must happen after @child is linked on - * css_set; otherwise, @child might change state between ->fork() - * and addition to css_set. - */ - do_each_subsys_mask(ss, i, have_fork_callback) { - ss->fork(child); - } while_each_subsys_mask(); -} - -/** - * cgroup_exit - detach cgroup from exiting task - * @tsk: pointer to task_struct of exiting process - * - * Description: Detach cgroup from @tsk and release it. - * - * Note that cgroups marked notify_on_release force every task in - * them to take the global cgroup_mutex mutex when exiting. - * This could impact scaling on very large systems. Be reluctant to - * use notify_on_release cgroups where very high task exit scaling - * is required on large systems. - * - * We set the exiting task's cgroup to the root cgroup (top_cgroup). We - * call cgroup_exit() while the task is still competent to handle - * notify_on_release(), then leave the task attached to the root cgroup in - * each hierarchy for the remainder of its exit. No need to bother with - * init_css_set refcnting. init_css_set never goes away and we can't race - * with migration path - PF_EXITING is visible to migration path. - */ -void cgroup_exit(struct task_struct *tsk) -{ - struct cgroup_subsys *ss; - struct css_set *cset; - int i; - - /* - * Unlink @tsk from its css_set. As migration path can't race - * with us, we can check css_set and cg_list without synchronization. 
- */ - cset = task_css_set(tsk); - - if (!list_empty(&tsk->cg_list)) { - spin_lock_irq(&css_set_lock); - css_set_move_task(tsk, cset, NULL, false); - spin_unlock_irq(&css_set_lock); - } else { - get_css_set(cset); - } - - /* see cgroup_post_fork() for details */ - do_each_subsys_mask(ss, i, have_exit_callback) { - ss->exit(tsk); - } while_each_subsys_mask(); -} - -void cgroup_free(struct task_struct *task) -{ - struct css_set *cset = task_css_set(task); - struct cgroup_subsys *ss; - int ssid; - - do_each_subsys_mask(ss, ssid, have_free_callback) { - ss->free(task); - } while_each_subsys_mask(); - - put_css_set(cset); -} - -static void check_for_release(struct cgroup *cgrp) -{ - if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && - !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) - schedule_work(&cgrp->release_agent_work); -} - -/* - * Notify userspace when a cgroup is released, by running the - * configured release agent with the name of the cgroup (path - * relative to the root of cgroup file system) as the argument. - * - * Most likely, this user command will try to rmdir this cgroup. - * - * This races with the possibility that some other task will be - * attached to this cgroup before it is removed, or that some other - * user task will 'mkdir' a child cgroup of this cgroup. That's ok. - * The presumed 'rmdir' will fail quietly if this cgroup is no longer - * unused, and this cgroup will be reprieved from its death sentence, - * to continue to serve a useful existence. Next time it's released, - * we will get notified again, if it still has 'notify_on_release' set. - * - * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which - * means only wait until the task is successfully execve()'d. The - * separate release agent task is forked by call_usermodehelper(), - * then control in this thread returns here, without waiting for the - * release agent task. We don't bother to wait because the caller of - * this routine has no use for the exit status of the release agent - * task, so no sense holding our caller up for that. 
- */ -static void cgroup_release_agent(struct work_struct *work) -{ - struct cgroup *cgrp = - container_of(work, struct cgroup, release_agent_work); - char *pathbuf = NULL, *agentbuf = NULL; - char *argv[3], *envp[3]; - int ret; - - mutex_lock(&cgroup_mutex); - - pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); - agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); - if (!pathbuf || !agentbuf) - goto out; - - spin_lock_irq(&css_set_lock); - ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); - spin_unlock_irq(&css_set_lock); - if (ret < 0 || ret >= PATH_MAX) - goto out; - - argv[0] = agentbuf; - argv[1] = pathbuf; - argv[2] = NULL; - - /* minimal command environment */ - envp[0] = "HOME=/"; - envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; - envp[2] = NULL; - - mutex_unlock(&cgroup_mutex); - call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - goto out_free; -out: - mutex_unlock(&cgroup_mutex); -out_free: - kfree(agentbuf); - kfree(pathbuf); -} - -static int __init cgroup_disable(char *str) -{ - struct cgroup_subsys *ss; - char *token; - int i; - - while ((token = strsep(&str, ",")) != NULL) { - if (!*token) - continue; - - for_each_subsys(ss, i) { - if (strcmp(token, ss->name) && - strcmp(token, ss->legacy_name)) - continue; - cgroup_disable_mask |= 1 << i; - } - } - return 1; -} -__setup("cgroup_disable=", cgroup_disable); - -static int __init cgroup_no_v1(char *str) -{ - struct cgroup_subsys *ss; - char *token; - int i; - - while ((token = strsep(&str, ",")) != NULL) { - if (!*token) - continue; - - if (!strcmp(token, "all")) { - cgroup_no_v1_mask = U16_MAX; - break; - } - - for_each_subsys(ss, i) { - if (strcmp(token, ss->name) && - strcmp(token, ss->legacy_name)) - continue; - - cgroup_no_v1_mask |= 1 << i; - } - } - return 1; -} -__setup("cgroup_no_v1=", cgroup_no_v1); - -/** - * css_tryget_online_from_dir - get corresponding css from a cgroup dentry - * @dentry: directory dentry of interest - * @ss: subsystem of interest - * - * If @dentry is a directory for a cgroup which has @ss enabled on it, try - * to get the corresponding css and return it. If such css doesn't exist - * or can't be pinned, an ERR_PTR value is returned. - */ -struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, - struct cgroup_subsys *ss) -{ - struct kernfs_node *kn = kernfs_node_from_dentry(dentry); - struct file_system_type *s_type = dentry->d_sb->s_type; - struct cgroup_subsys_state *css = NULL; - struct cgroup *cgrp; - - /* is @dentry a cgroup dir? */ - if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) || - !kn || kernfs_type(kn) != KERNFS_DIR) - return ERR_PTR(-EBADF); - - rcu_read_lock(); - - /* - * This path doesn't originate from kernfs and @kn could already - * have been or be removed at any point. @kn->priv is RCU - * protected for this access. See css_release_work_fn() for details. - */ - cgrp = rcu_dereference(kn->priv); - if (cgrp) - css = cgroup_css(cgrp, ss); - - if (!css || !css_tryget_online(css)) - css = ERR_PTR(-ENOENT); - - rcu_read_unlock(); - return css; -} - -/** - * css_from_id - lookup css by id - * @id: the cgroup id - * @ss: cgroup subsys to be looked into - * - * Returns the css if there's a valid one with @id, otherwise returns NULL. - * Should be called under rcu_read_lock(). 
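Given the rcu_read_lock() requirement spelled out above, a caller that wants to keep using the css after unlocking has to pin it inside the read-side section. A sketch, assuming the memory controller for concreteness:

    #include <linux/cgroup.h>
    #include <linux/rcupdate.h>

    static struct cgroup_subsys_state *demo_lookup_css(int id)
    {
    	struct cgroup_subsys_state *css;

    	rcu_read_lock();
    	css = css_from_id(id, &memory_cgrp_subsys);
    	if (css && !css_tryget_online(css))     /* pin before unlocking */
    		css = NULL;
    	rcu_read_unlock();

    	return css;     /* caller does css_put() when done */
    }
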
- */ -struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return idr_find(&ss->css_idr, id); -} - -/** - * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path - * @path: path on the default hierarchy - * - * Find the cgroup at @path on the default hierarchy, increment its - * reference count and return it. Returns pointer to the found cgroup on - * success, ERR_PTR(-ENOENT) if @path doesn't exist and ERR_PTR(-ENOTDIR) - * if @path points to a non-directory. - */ -struct cgroup *cgroup_get_from_path(const char *path) -{ - struct kernfs_node *kn; - struct cgroup *cgrp; - - mutex_lock(&cgroup_mutex); - - kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path); - if (kn) { - if (kernfs_type(kn) == KERNFS_DIR) { - cgrp = kn->priv; - cgroup_get(cgrp); - } else { - cgrp = ERR_PTR(-ENOTDIR); - } - kernfs_put(kn); - } else { - cgrp = ERR_PTR(-ENOENT); - } - - mutex_unlock(&cgroup_mutex); - return cgrp; -} -EXPORT_SYMBOL_GPL(cgroup_get_from_path); - -/** - * cgroup_get_from_fd - get a cgroup pointer from a fd - * @fd: fd obtained by open(cgroup2_dir) - * - * Find the cgroup from a fd which should be obtained - * by opening a cgroup directory. Returns a pointer to the - * cgroup on success. ERR_PTR is returned if the cgroup - * cannot be found. - */ -struct cgroup *cgroup_get_from_fd(int fd) -{ - struct cgroup_subsys_state *css; - struct cgroup *cgrp; - struct file *f; - - f = fget_raw(fd); - if (!f) - return ERR_PTR(-EBADF); - - css = css_tryget_online_from_dir(f->f_path.dentry, NULL); - fput(f); - if (IS_ERR(css)) - return ERR_CAST(css); - - cgrp = css->cgroup; - if (!cgroup_on_dfl(cgrp)) { - cgroup_put(cgrp); - return ERR_PTR(-EBADF); - } - - return cgrp; -} -EXPORT_SYMBOL_GPL(cgroup_get_from_fd); - -/* - * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data - * definition in cgroup-defs.h. 
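Both exported helpers above hand back a reference the caller must drop; since cgroup_put() is file-local here, external code can release it through the embedded self css, which is what cgroup_put() amounts to. A usage sketch (the path is made up):

    #include <linux/cgroup.h>
    #include <linux/err.h>

    static int demo_pin_cgroup(void)
    {
    	struct cgroup *cgrp;

    	cgrp = cgroup_get_from_path("/my.slice");   /* hypothetical v2 path */
    	if (IS_ERR(cgrp))
    		return PTR_ERR(cgrp);   /* -ENOENT or -ENOTDIR as documented */

    	/* ... use cgrp ... */

    	css_put(&cgrp->self);   /* drop the reference taken above */
    	return 0;
    }
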
- */ -#ifdef CONFIG_SOCK_CGROUP_DATA - -#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) - -DEFINE_SPINLOCK(cgroup_sk_update_lock); -static bool cgroup_sk_alloc_disabled __read_mostly; - -void cgroup_sk_alloc_disable(void) -{ - if (cgroup_sk_alloc_disabled) - return; - pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n"); - cgroup_sk_alloc_disabled = true; -} - -#else - -#define cgroup_sk_alloc_disabled false - -#endif - -void cgroup_sk_alloc(struct sock_cgroup_data *skcd) -{ - if (cgroup_sk_alloc_disabled) - return; - - /* Socket clone path */ - if (skcd->val) { - cgroup_get(sock_cgroup_ptr(skcd)); - return; - } - - rcu_read_lock(); - - while (true) { - struct css_set *cset; - - cset = task_css_set(current); - if (likely(cgroup_tryget(cset->dfl_cgrp))) { - skcd->val = (unsigned long)cset->dfl_cgrp; - break; - } - cpu_relax(); - } - - rcu_read_unlock(); -} - -void cgroup_sk_free(struct sock_cgroup_data *skcd) -{ - cgroup_put(sock_cgroup_ptr(skcd)); -} - -#endif /* CONFIG_SOCK_CGROUP_DATA */ - -/* cgroup namespaces */ - -static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) -{ - return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); -} - -static void dec_cgroup_namespaces(struct ucounts *ucounts) -{ - dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); -} - -static struct cgroup_namespace *alloc_cgroup_ns(void) -{ - struct cgroup_namespace *new_ns; - int ret; - - new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); - if (!new_ns) - return ERR_PTR(-ENOMEM); - ret = ns_alloc_inum(&new_ns->ns); - if (ret) { - kfree(new_ns); - return ERR_PTR(ret); - } - atomic_set(&new_ns->count, 1); - new_ns->ns.ops = &cgroupns_operations; - return new_ns; -} - -void free_cgroup_ns(struct cgroup_namespace *ns) -{ - put_css_set(ns->root_cset); - dec_cgroup_namespaces(ns->ucounts); - put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); - kfree(ns); -} -EXPORT_SYMBOL(free_cgroup_ns); - -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, - struct user_namespace *user_ns, - struct cgroup_namespace *old_ns) -{ - struct cgroup_namespace *new_ns; - struct ucounts *ucounts; - struct css_set *cset; - - BUG_ON(!old_ns); - - if (!(flags & CLONE_NEWCGROUP)) { - get_cgroup_ns(old_ns); - return old_ns; - } - - /* Allow only sysadmin to create cgroup namespace. */ - if (!ns_capable(user_ns, CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - - ucounts = inc_cgroup_namespaces(user_ns); - if (!ucounts) - return ERR_PTR(-ENOSPC); - - /* It is not safe to take cgroup_mutex here */ - spin_lock_irq(&css_set_lock); - cset = task_css_set(current); - get_css_set(cset); - spin_unlock_irq(&css_set_lock); - - new_ns = alloc_cgroup_ns(); - if (IS_ERR(new_ns)) { - put_css_set(cset); - dec_cgroup_namespaces(ucounts); - return new_ns; - } - - new_ns->user_ns = get_user_ns(user_ns); - new_ns->ucounts = ucounts; - new_ns->root_cset = cset; - - return new_ns; -} - -static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) -{ - return container_of(ns, struct cgroup_namespace, ns); -} - -static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns) -{ - struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); - - if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) || - !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) - return -EPERM; - - /* Don't need to do anything if we are attaching to our own cgroupns. 
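From userspace, the CAP_SYS_ADMIN check and root_cset capture in copy_cgroup_ns() are exercised through unshare(2). A small demonstration program (assumes a libc that exposes CLONE_NEWCGROUP):

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
    	/* requires CAP_SYS_ADMIN, as enforced by copy_cgroup_ns() */
    	if (unshare(CLONE_NEWCGROUP) < 0) {
    		perror("unshare(CLONE_NEWCGROUP)");
    		return 1;
    	}
    	/* paths in /proc/self/cgroup are now reported relative to the
    	   css_set this process had at unshare time, so they read "/" */
    	execlp("cat", "cat", "/proc/self/cgroup", (char *)NULL);
    	perror("execlp");
    	return 1;
    }
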
*/ - if (cgroup_ns == nsproxy->cgroup_ns) - return 0; - - get_cgroup_ns(cgroup_ns); - put_cgroup_ns(nsproxy->cgroup_ns); - nsproxy->cgroup_ns = cgroup_ns; - - return 0; -} - -static struct ns_common *cgroupns_get(struct task_struct *task) -{ - struct cgroup_namespace *ns = NULL; - struct nsproxy *nsproxy; - - task_lock(task); - nsproxy = task->nsproxy; - if (nsproxy) { - ns = nsproxy->cgroup_ns; - get_cgroup_ns(ns); - } - task_unlock(task); - - return ns ? &ns->ns : NULL; -} - -static void cgroupns_put(struct ns_common *ns) -{ - put_cgroup_ns(to_cg_ns(ns)); -} - -static struct user_namespace *cgroupns_owner(struct ns_common *ns) -{ - return to_cg_ns(ns)->user_ns; -} - -const struct proc_ns_operations cgroupns_operations = { - .name = "cgroup", - .type = CLONE_NEWCGROUP, - .get = cgroupns_get, - .put = cgroupns_put, - .install = cgroupns_install, - .owner = cgroupns_owner, -}; - -static __init int cgroup_namespaces_init(void) -{ - return 0; -} -subsys_initcall(cgroup_namespaces_init); - -#ifdef CONFIG_CGROUP_BPF -void cgroup_bpf_update(struct cgroup *cgrp, - struct bpf_prog *prog, - enum bpf_attach_type type) -{ - struct cgroup *parent = cgroup_parent(cgrp); - - mutex_lock(&cgroup_mutex); - __cgroup_bpf_update(cgrp, parent, prog, type); - mutex_unlock(&cgroup_mutex); -} -#endif /* CONFIG_CGROUP_BPF */ - -#ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state * -debug_css_alloc(struct cgroup_subsys_state *parent_css) -{ - struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); - - if (!css) - return ERR_PTR(-ENOMEM); - - return css; -} - -static void debug_css_free(struct cgroup_subsys_state *css) -{ - kfree(css); -} - -static u64 debug_taskcount_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return cgroup_task_count(css->cgroup); -} - -static u64 current_css_set_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return (u64)(unsigned long)current->cgroups; -} - -static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - u64 count; - - rcu_read_lock(); - count = atomic_read(&task_css_set(current)->refcount); - rcu_read_unlock(); - return count; -} - -static int current_css_set_cg_links_read(struct seq_file *seq, void *v) -{ - struct cgrp_cset_link *link; - struct css_set *cset; - char *name_buf; - - name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); - if (!name_buf) - return -ENOMEM; - - spin_lock_irq(&css_set_lock); - rcu_read_lock(); - cset = rcu_dereference(current->cgroups); - list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { - struct cgroup *c = link->cgrp; - - cgroup_name(c, name_buf, NAME_MAX + 1); - seq_printf(seq, "Root %d group %s\n", - c->root->hierarchy_id, name_buf); - } - rcu_read_unlock(); - spin_unlock_irq(&css_set_lock); - kfree(name_buf); - return 0; -} - -#define MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct seq_file *seq, void *v) -{ - struct cgroup_subsys_state *css = seq_css(seq); - struct cgrp_cset_link *link; - - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { - struct css_set *cset = link->cset; - struct task_struct *task; - int count = 0; - - seq_printf(seq, "css_set %p\n", cset); - - list_for_each_entry(task, &cset->tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) - goto overflow; - seq_printf(seq, " task %d\n", task_pid_vnr(task)); - } - - list_for_each_entry(task, &cset->mg_tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) - goto overflow; - seq_printf(seq, " task %d\n", 
task_pid_vnr(task)); - } - continue; - overflow: - seq_puts(seq, " ...\n"); - } - spin_unlock_irq(&css_set_lock); - return 0; -} - -static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) -{ - return (!cgroup_is_populated(css->cgroup) && - !css_has_online_children(&css->cgroup->self)); -} - -static struct cftype debug_files[] = { - { - .name = "taskcount", - .read_u64 = debug_taskcount_read, - }, - - { - .name = "current_css_set", - .read_u64 = current_css_set_read, - }, - - { - .name = "current_css_set_refcount", - .read_u64 = current_css_set_refcount_read, - }, - - { - .name = "current_css_set_cg_links", - .seq_show = current_css_set_cg_links_read, - }, - - { - .name = "cgroup_css_links", - .seq_show = cgroup_css_links_read, - }, - - { - .name = "releasable", - .read_u64 = releasable_read, - }, - - { } /* terminate */ -}; - -struct cgroup_subsys debug_cgrp_subsys = { - .css_alloc = debug_css_alloc, - .css_free = debug_css_free, - .legacy_cftypes = debug_files, -}; -#endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile new file mode 100644 index 000000000000..4d561a50a5ac --- /dev/null +++ b/kernel/cgroup/Makefile @@ -0,0 +1,5 @@ +obj-y := cgroup.o + +obj-$(CONFIG_CGROUP_FREEZER) += freezer.o +obj-$(CONFIG_CGROUP_PIDS) += pids.o +obj-$(CONFIG_CPUSETS) += cpuset.o diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c new file mode 100644 index 000000000000..1a815f275849 --- /dev/null +++ b/kernel/cgroup/cgroup.c @@ -0,0 +1,6705 @@ +/* + * Generic process-grouping system. + * + * Based originally on the cpuset system, extracted by Paul Menage + * Copyright (C) 2006 Google, Inc + * + * Notifications support + * Copyright (C) 2009 Nokia Corporation + * Author: Kirill A. Shutemov + * + * Copyright notices from the original cpuset code: + * -------------------------------------------------- + * Copyright (C) 2003 BULL SA. + * Copyright (C) 2004-2006 Silicon Graphics, Inc. + * + * Portions derived from Patrick Mochel's sysfs code. + * sysfs is Copyright (c) 2001-3 Patrick Mochel + * + * 2003-10-10 Written by Simon Derr. + * 2003-10-22 Updates by Stephen Hemminger. + * 2004 May-July Rework by Paul Jackson. + * --------------------------------------------------- + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* TODO: replace with more sophisticated array */ +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +/* + * pidlists linger the following amount before being destroyed. The goal + * is avoiding frequent destruction in the middle of consecutive read calls + * Expiring in the middle is a performance problem not a correctness one. + * 1 sec should be enough. + */ +#define CGROUP_PIDLIST_DESTROY_DELAY HZ + +#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ + MAX_CFTYPE_NAME + 2) + +/* + * cgroup_mutex is the master lock. Any modification to cgroup or its + * hierarchy must be performed while holding it. 
+ * + * css_set_lock protects task->cgroups pointer, the list of css_set + * objects, and the chain of tasks off each css_set. + * + * These locks are exported if CONFIG_PROVE_RCU so that accessors in + * cgroup.h can use them for lockdep annotations. + */ +#ifdef CONFIG_PROVE_RCU +DEFINE_MUTEX(cgroup_mutex); +DEFINE_SPINLOCK(css_set_lock); +EXPORT_SYMBOL_GPL(cgroup_mutex); +EXPORT_SYMBOL_GPL(css_set_lock); +#else +static DEFINE_MUTEX(cgroup_mutex); +static DEFINE_SPINLOCK(css_set_lock); +#endif + +/* + * Protects cgroup_idr and css_idr so that IDs can be released without + * grabbing cgroup_mutex. + */ +static DEFINE_SPINLOCK(cgroup_idr_lock); + +/* + * Protects cgroup_file->kn for !self csses. It synchronizes notifications + * against file removal/re-creation across css hiding. + */ +static DEFINE_SPINLOCK(cgroup_file_kn_lock); + +/* + * Protects cgroup_root->release_agent_path. Modifying it also requires + * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. + */ +static DEFINE_SPINLOCK(release_agent_path_lock); + +struct percpu_rw_semaphore cgroup_threadgroup_rwsem; + +#define cgroup_assert_mutex_or_rcu_locked() \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ + !lockdep_is_held(&cgroup_mutex), \ + "cgroup_mutex or RCU read lock required"); + +/* + * cgroup destruction makes heavy use of work items and there can be a lot + * of concurrent destructions. Use a separate workqueue so that cgroup + * destruction work items don't end up filling up max_active of system_wq + * which may lead to deadlock. + */ +static struct workqueue_struct *cgroup_destroy_wq; + +/* + * pidlist destructions need to be flushed on cgroup destruction. Use a + * separate workqueue as flush domain. + */ +static struct workqueue_struct *cgroup_pidlist_destroy_wq; + +/* generate an array of cgroup subsystem pointers */ +#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, +static struct cgroup_subsys *cgroup_subsys[] = { +#include +}; +#undef SUBSYS + +/* array of cgroup subsystem names */ +#define SUBSYS(_x) [_x ## _cgrp_id] = #_x, +static const char *cgroup_subsys_name[] = { +#include +}; +#undef SUBSYS + +/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */ +#define SUBSYS(_x) \ + DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \ + DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \ + EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \ + EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key); +#include +#undef SUBSYS + +#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key, +static struct static_key_true *cgroup_subsys_enabled_key[] = { +#include +}; +#undef SUBSYS + +#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key, +static struct static_key_true *cgroup_subsys_on_dfl_key[] = { +#include +}; +#undef SUBSYS + +/* + * The default hierarchy, reserved for the subsystems that are otherwise + * unattached - it never has more than a single cgroup, and all tasks are + * part of that cgroup. + */ +struct cgroup_root cgrp_dfl_root; +EXPORT_SYMBOL_GPL(cgrp_dfl_root); + +/* + * The default hierarchy always exists but is hidden until mounted for the + * first time. This is for backward compatibility. 
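The CONFIG_PROVE_RCU export above exists so that header-level accessors can encode the "holding cgroup_mutex or inside rcu_read_lock()" rule for lockdep via rcu_dereference_check(). The pattern in isolation, with hypothetical names and a local lock to keep it self-contained:

    #include <linux/mutex.h>
    #include <linux/rcupdate.h>

    struct foo { int val; };                    /* hypothetical */
    static DEFINE_MUTEX(demo_mutex);
    static struct foo __rcu *demo_slot;

    /* legal from either demo_mutex or an RCU read-side section; lockdep
       verifies exactly this disjunction, as the cgroup accessors do */
    static struct foo *demo_deref(void)
    {
    	return rcu_dereference_check(demo_slot,
    				     lockdep_is_held(&demo_mutex));
    }
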
+ */ +static bool cgrp_dfl_visible; + +/* Controllers blocked by the commandline in v1 */ +static u16 cgroup_no_v1_mask; + +/* some controllers are not supported in the default hierarchy */ +static u16 cgrp_dfl_inhibit_ss_mask; + +/* some controllers are implicitly enabled on the default hierarchy */ +static unsigned long cgrp_dfl_implicit_ss_mask; + +/* The list of hierarchy roots */ + +static LIST_HEAD(cgroup_roots); +static int cgroup_root_count; + +/* hierarchy ID allocation and mapping, protected by cgroup_mutex */ +static DEFINE_IDR(cgroup_hierarchy_idr); + +/* + * Assign a monotonically increasing serial number to csses. It guarantees + * cgroups with bigger numbers are newer than those with smaller numbers. + * Also, as csses are always appended to the parent's ->children list, it + * guarantees that sibling csses are always sorted in the ascending serial + * number order on the list. Protected by cgroup_mutex. + */ +static u64 css_serial_nr_next = 1; + +/* + * These bitmask flags indicate whether tasks in the fork and exit paths have + * fork/exit handlers to call. This avoids us having to do extra work in the + * fork/exit path to check which subsystems have fork/exit callbacks. + */ +static u16 have_fork_callback __read_mostly; +static u16 have_exit_callback __read_mostly; +static u16 have_free_callback __read_mostly; + +/* cgroup namespace for init task */ +struct cgroup_namespace init_cgroup_ns = { + .count = { .counter = 2, }, + .user_ns = &init_user_ns, + .ns.ops = &cgroupns_operations, + .ns.inum = PROC_CGROUP_INIT_INO, + .root_cset = &init_css_set, +}; + +/* Ditto for the can_fork callback. */ +static u16 have_canfork_callback __read_mostly; + +static struct file_system_type cgroup2_fs_type; +static struct cftype cgroup_dfl_base_files[]; +static struct cftype cgroup_legacy_base_files[]; + +static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); +static void cgroup_lock_and_drain_offline(struct cgroup *cgrp); +static int cgroup_apply_control(struct cgroup *cgrp); +static void cgroup_finalize_control(struct cgroup *cgrp, int ret); +static void css_task_iter_advance(struct css_task_iter *it); +static int cgroup_destroy_locked(struct cgroup *cgrp); +static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, + struct cgroup_subsys *ss); +static void css_release(struct percpu_ref *ref); +static void kill_css(struct cgroup_subsys_state *css); +static int cgroup_addrm_files(struct cgroup_subsys_state *css, + struct cgroup *cgrp, struct cftype cfts[], + bool is_add); + +/** + * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID + * @ssid: subsys ID of interest + * + * cgroup_subsys_enabled() can only be used with literal subsys names which + * is fine for individual subsystems but unsuitable for cgroup core. This + * is a slower static_key_enabled() based test indexed by @ssid. + */ +static bool cgroup_ssid_enabled(int ssid) +{ + if (CGROUP_SUBSYS_COUNT == 0) + return false; + + return static_key_enabled(cgroup_subsys_enabled_key[ssid]); +} + +static bool cgroup_ssid_no_v1(int ssid) +{ + return cgroup_no_v1_mask & (1 << ssid); +} + +/** + * cgroup_on_dfl - test whether a cgroup is on the default hierarchy + * @cgrp: the cgroup of interest + * + * The default hierarchy is the v2 interface of cgroup and this function + * can be used to test whether a cgroup is on the default hierarchy for + * cases where a subsystem should behave differently depending on the + * interface version. 
+ * + * The set of behaviors which change on the default hierarchy are still + * being determined and the mount option is prefixed with __DEVEL__. + * + * List of changed behaviors: + * + * - Mount options "noprefix", "xattr", "clone_children", "release_agent" + * and "name" are disallowed. + * + * - When mounting an existing superblock, mount options should match. + * + * - Remount is disallowed. + * + * - rename(2) is disallowed. + * + * - "tasks" is removed. Everything should be at process granularity. Use + * "cgroup.procs" instead. + * + * - "cgroup.procs" is not sorted. pids will be unique unless they got + * recycled inbetween reads. + * + * - "release_agent" and "notify_on_release" are removed. Replacement + * notification mechanism will be implemented. + * + * - "cgroup.clone_children" is removed. + * + * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup + * and its descendants contain no task; otherwise, 1. The file also + * generates kernfs notification which can be monitored through poll and + * [di]notify when the value of the file changes. + * + * - cpuset: tasks will be kept in empty cpusets when hotplug happens and + * take masks of ancestors with non-empty cpus/mems, instead of being + * moved to an ancestor. + * + * - cpuset: a task can be moved into an empty cpuset, and again it takes + * masks of ancestors. + * + * - memcg: use_hierarchy is on by default and the cgroup file for the flag + * is not created. + * + * - blkcg: blk-throttle becomes properly hierarchical. + * + * - debug: disallowed on the default hierarchy. + */ +static bool cgroup_on_dfl(const struct cgroup *cgrp) +{ + return cgrp->root == &cgrp_dfl_root; +} + +/* IDR wrappers which synchronize using cgroup_idr_lock */ +static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, + gfp_t gfp_mask) +{ + int ret; + + idr_preload(gfp_mask); + spin_lock_bh(&cgroup_idr_lock); + ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM); + spin_unlock_bh(&cgroup_idr_lock); + idr_preload_end(); + return ret; +} + +static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id) +{ + void *ret; + + spin_lock_bh(&cgroup_idr_lock); + ret = idr_replace(idr, ptr, id); + spin_unlock_bh(&cgroup_idr_lock); + return ret; +} + +static void cgroup_idr_remove(struct idr *idr, int id) +{ + spin_lock_bh(&cgroup_idr_lock); + idr_remove(idr, id); + spin_unlock_bh(&cgroup_idr_lock); +} + +static struct cgroup *cgroup_parent(struct cgroup *cgrp) +{ + struct cgroup_subsys_state *parent_css = cgrp->self.parent; + + if (parent_css) + return container_of(parent_css, struct cgroup, self); + return NULL; +} + +/* subsystems visibly enabled on a cgroup */ +static u16 cgroup_control(struct cgroup *cgrp) +{ + struct cgroup *parent = cgroup_parent(cgrp); + u16 root_ss_mask = cgrp->root->subsys_mask; + + if (parent) + return parent->subtree_control; + + if (cgroup_on_dfl(cgrp)) + root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask | + cgrp_dfl_implicit_ss_mask); + return root_ss_mask; +} + +/* subsystems enabled on a cgroup */ +static u16 cgroup_ss_mask(struct cgroup *cgrp) +{ + struct cgroup *parent = cgroup_parent(cgrp); + + if (parent) + return parent->subtree_ss_mask; + + return cgrp->root->subsys_mask; +} + +/** + * cgroup_css - obtain a cgroup's css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest (%NULL returns @cgrp->self) + * + * Return @cgrp's css (cgroup_subsys_state) associated with @ss. 
This + * function must be called either under cgroup_mutex or rcu_read_lock() and + * the caller is responsible for pinning the returned css if it wants to + * keep accessing it outside the said locks. This function may return + * %NULL if @cgrp doesn't have @subsys_id enabled. + */ +static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + if (ss) + return rcu_dereference_check(cgrp->subsys[ss->id], + lockdep_is_held(&cgroup_mutex)); + else + return &cgrp->self; +} + +/** + * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest (%NULL returns @cgrp->self) + * + * Similar to cgroup_css() but returns the effective css, which is defined + * as the matching css of the nearest ancestor including self which has @ss + * enabled. If @ss is associated with the hierarchy @cgrp is on, this + * function is guaranteed to return non-NULL css. + */ +static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + lockdep_assert_held(&cgroup_mutex); + + if (!ss) + return &cgrp->self; + + /* + * This function is used while updating css associations and thus + * can't test the csses directly. Test ss_mask. + */ + while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) { + cgrp = cgroup_parent(cgrp); + if (!cgrp) + return NULL; + } + + return cgroup_css(cgrp, ss); +} + +/** + * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest + * + * Find and get the effective css of @cgrp for @ss. The effective css is + * defined as the matching css of the nearest ancestor including self which + * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, + * the root css is returned, so this function always returns a valid css. + * The returned css must be put using css_put(). + */ +struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + + do { + css = cgroup_css(cgrp, ss); + + if (css && css_tryget_online(css)) + goto out_unlock; + cgrp = cgroup_parent(cgrp); + } while (cgrp); + + css = init_css_set.subsys[ss->id]; + css_get(css); +out_unlock: + rcu_read_unlock(); + return css; +} + +/* convenient tests for these bits */ +static inline bool cgroup_is_dead(const struct cgroup *cgrp) +{ + return !(cgrp->self.flags & CSS_ONLINE); +} + +static void cgroup_get(struct cgroup *cgrp) +{ + WARN_ON_ONCE(cgroup_is_dead(cgrp)); + css_get(&cgrp->self); +} + +static bool cgroup_tryget(struct cgroup *cgrp) +{ + return css_tryget(&cgrp->self); +} + +struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) +{ + struct cgroup *cgrp = of->kn->parent->priv; + struct cftype *cft = of_cft(of); + + /* + * This is open and unprotected implementation of cgroup_css(). + * seq_css() is only called from a kernfs file operation which has + * an active reference on the file. Because all the subsystem + * files are drained before a css is disassociated with a cgroup, + * the matching css from the cgroup's subsys table is guaranteed to + * be and stay valid until the enclosing operation is complete. 
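The distinction between cgroup_css() and the effective-css helpers matters to callers: cgroup_get_e_css() never returns NULL and already takes a reference, so usage collapses to get, use, put. A brief sketch, again assuming the memory controller:

    #include <linux/cgroup.h>

    /* resolve and pin @cgrp's effective memory css; falls back to the
       root css when no ancestor has the controller enabled */
    static void demo_account(struct cgroup *cgrp)
    {
    	struct cgroup_subsys_state *css;

    	css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
    	/* ... charge something against css ... */
    	css_put(css);
    }
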
+ */ + if (cft->ss) + return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); + else + return &cgrp->self; +} +EXPORT_SYMBOL_GPL(of_css); + +static int notify_on_release(const struct cgroup *cgrp) +{ + return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); +} + +/** + * for_each_css - iterate all css's of a cgroup + * @css: the iteration cursor + * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end + * @cgrp: the target cgroup to iterate css's of + * + * Should be called under cgroup_[tree_]mutex. + */ +#define for_each_css(css, ssid, cgrp) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((css) = rcu_dereference_check( \ + (cgrp)->subsys[(ssid)], \ + lockdep_is_held(&cgroup_mutex)))) { } \ + else + +/** + * for_each_e_css - iterate all effective css's of a cgroup + * @css: the iteration cursor + * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end + * @cgrp: the target cgroup to iterate css's of + * + * Should be called under cgroup_[tree_]mutex. + */ +#define for_each_e_css(css, ssid, cgrp) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \ + ; \ + else + +/** + * for_each_subsys - iterate all enabled cgroup subsystems + * @ss: the iteration cursor + * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end + */ +#define for_each_subsys(ss, ssid) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ + (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) + +/** + * do_each_subsys_mask - filter for_each_subsys with a bitmask + * @ss: the iteration cursor + * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end + * @ss_mask: the bitmask + * + * The block will only run for cases where the ssid-th bit (1 << ssid) of + * @ss_mask is set. + */ +#define do_each_subsys_mask(ss, ssid, ss_mask) do { \ + unsigned long __ss_mask = (ss_mask); \ + if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \ + (ssid) = 0; \ + break; \ + } \ + for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \ + (ss) = cgroup_subsys[ssid]; \ + { + +#define while_each_subsys_mask() \ + } \ + } \ +} while (false) + +/* iterate across the hierarchies */ +#define for_each_root(root) \ + list_for_each_entry((root), &cgroup_roots, root_list) + +/* iterate over child cgrps, lock should be held throughout iteration */ +#define cgroup_for_each_live_child(child, cgrp) \ + list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + cgroup_is_dead(child); })) \ + ; \ + else + +/* walk live descendants in preorder */ +#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ + css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + (dsct) = (d_css)->cgroup; \ + cgroup_is_dead(dsct); })) \ + ; \ + else + +/* walk live descendants in postorder */ +#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \ + css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + (dsct) = (d_css)->cgroup; \ + cgroup_is_dead(dsct); })) \ + ; \ + else + +static void cgroup_release_agent(struct work_struct *work); +static void check_for_release(struct cgroup *cgrp); + +/* + * A cgroup can be associated with multiple css_sets as different tasks may + * belong to different cgroups on different hierarchies. In the other + * direction, a css_set is naturally associated with multiple cgroups. 
+ * This M:N relationship is represented by the following link structure + * which exists for each association and allows traversing the associations + * from both sides. + */ +struct cgrp_cset_link { + /* the cgroup and css_set this link associates */ + struct cgroup *cgrp; + struct css_set *cset; + + /* list of cgrp_cset_links anchored at cgrp->cset_links */ + struct list_head cset_link; + + /* list of cgrp_cset_links anchored at css_set->cgrp_links */ + struct list_head cgrp_link; +}; + +/* + * The default css_set - used by init and its children prior to any + * hierarchies being mounted. It contains a pointer to the root state + * for each subsystem. Also used to anchor the list of css_sets. Not + * reference-counted, to improve performance when child cgroups + * haven't been created. + */ +struct css_set init_css_set = { + .refcount = ATOMIC_INIT(1), + .tasks = LIST_HEAD_INIT(init_css_set.tasks), + .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), + .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), + .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), + .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), + .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), +}; + +static int css_set_count = 1; /* 1 for init_css_set */ + +/** + * css_set_populated - does a css_set contain any tasks? + * @cset: target css_set + */ +static bool css_set_populated(struct css_set *cset) +{ + lockdep_assert_held(&css_set_lock); + + return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks); +} + +/** + * cgroup_update_populated - updated populated count of a cgroup + * @cgrp: the target cgroup + * @populated: inc or dec populated count + * + * One of the css_sets associated with @cgrp is either getting its first + * task or losing the last. Update @cgrp->populated_cnt accordingly. The + * count is propagated towards root so that a given cgroup's populated_cnt + * is zero iff the cgroup and all its descendants don't contain any tasks. + * + * @cgrp's interface file "cgroup.populated" is zero if + * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt + * changes from or to zero, userland is notified that the content of the + * interface file has changed. This can be used to detect when @cgrp and + * its descendants become populated or empty. + */ +static void cgroup_update_populated(struct cgroup *cgrp, bool populated) +{ + lockdep_assert_held(&css_set_lock); + + do { + bool trigger; + + if (populated) + trigger = !cgrp->populated_cnt++; + else + trigger = !--cgrp->populated_cnt; + + if (!trigger) + break; + + check_for_release(cgrp); + cgroup_file_notify(&cgrp->events_file); + + cgrp = cgroup_parent(cgrp); + } while (cgrp); +} + +/** + * css_set_update_populated - update populated state of a css_set + * @cset: target css_set + * @populated: whether @cset is populated or depopulated + * + * @cset is either getting the first task or losing the last. Update the + * ->populated_cnt of all associated cgroups accordingly. 
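The kernfs notification raised by cgroup_file_notify() in cgroup_update_populated() is what makes "is this subtree empty?" cheaply watchable: poll(2) on the interface file returns POLLPRI whenever the populated state flips. A hedged userspace sketch; on kernels of this vintage the v2 file is "cgroup.events" carrying a "populated 0|1" line, and the path below is hypothetical:

    #include <fcntl.h>
    #include <poll.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
    	char buf[64];
    	ssize_t n;
    	int fd = open("/sys/fs/cgroup/demo/cgroup.events", O_RDONLY);
    	struct pollfd pfd = { .fd = fd, .events = POLLPRI };

    	if (fd < 0) {
    		perror("open");
    		return 1;
    	}
    	for (;;) {
    		n = pread(fd, buf, sizeof(buf) - 1, 0); /* re-read from offset 0 */
    		if (n < 0) {
    			perror("pread");
    			return 1;
    		}
    		buf[n] = '\0';
    		fputs(buf, stdout);             /* e.g. "populated 1" */
    		if (poll(&pfd, 1, -1) < 0) {    /* wakes on kernfs_notify() */
    			perror("poll");
    			return 1;
    		}
    	}
    }
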
+ */ +static void css_set_update_populated(struct css_set *cset, bool populated) +{ + struct cgrp_cset_link *link; + + lockdep_assert_held(&css_set_lock); + + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) + cgroup_update_populated(link->cgrp, populated); +} + +/** + * css_set_move_task - move a task from one css_set to another + * @task: task being moved + * @from_cset: css_set @task currently belongs to (may be NULL) + * @to_cset: new css_set @task is being moved to (may be NULL) + * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks + * + * Move @task from @from_cset to @to_cset. If @task didn't belong to any + * css_set, @from_cset can be NULL. If @task is being disassociated + * instead of moved, @to_cset can be NULL. + * + * This function automatically handles populated_cnt updates and + * css_task_iter adjustments but the caller is responsible for managing + * @from_cset and @to_cset's reference counts. + */ +static void css_set_move_task(struct task_struct *task, + struct css_set *from_cset, struct css_set *to_cset, + bool use_mg_tasks) +{ + lockdep_assert_held(&css_set_lock); + + if (to_cset && !css_set_populated(to_cset)) + css_set_update_populated(to_cset, true); + + if (from_cset) { + struct css_task_iter *it, *pos; + + WARN_ON_ONCE(list_empty(&task->cg_list)); + + /* + * @task is leaving, advance task iterators which are + * pointing to it so that they can resume at the next + * position. Advancing an iterator might remove it from + * the list, use safe walk. See css_task_iter_advance*() + * for details. + */ + list_for_each_entry_safe(it, pos, &from_cset->task_iters, + iters_node) + if (it->task_pos == &task->cg_list) + css_task_iter_advance(it); + + list_del_init(&task->cg_list); + if (!css_set_populated(from_cset)) + css_set_update_populated(from_cset, false); + } else { + WARN_ON_ONCE(!list_empty(&task->cg_list)); + } + + if (to_cset) { + /* + * We are synchronized through cgroup_threadgroup_rwsem + * against PF_EXITING setting such that we can't race + * against cgroup_exit() changing the css_set to + * init_css_set and dropping the old one. + */ + WARN_ON_ONCE(task->flags & PF_EXITING); + + rcu_assign_pointer(task->cgroups, to_cset); + list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : + &to_cset->tasks); + } +} + +/* + * hash table for cgroup groups. This improves the performance to find + * an existing css_set. This hash doesn't (currently) take into + * account cgroups in empty hierarchies. + */ +#define CSS_SET_HASH_BITS 7 +static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); + +static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) +{ + unsigned long key = 0UL; + struct cgroup_subsys *ss; + int i; + + for_each_subsys(ss, i) + key += (unsigned long)css[i]; + key = (key >> 16) ^ key; + + return key; +} + +static void put_css_set_locked(struct css_set *cset) +{ + struct cgrp_cset_link *link, *tmp_link; + struct cgroup_subsys *ss; + int ssid; + + lockdep_assert_held(&css_set_lock); + + if (!atomic_dec_and_test(&cset->refcount)) + return; + + /* This css_set is dead. 
unlink it and release cgroup and css refs */ + for_each_subsys(ss, ssid) { + list_del(&cset->e_cset_node[ssid]); + css_put(cset->subsys[ssid]); + } + hash_del(&cset->hlist); + css_set_count--; + + list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { + list_del(&link->cset_link); + list_del(&link->cgrp_link); + if (cgroup_parent(link->cgrp)) + cgroup_put(link->cgrp); + kfree(link); + } + + kfree_rcu(cset, rcu_head); +} + +static void put_css_set(struct css_set *cset) +{ + unsigned long flags; + + /* + * Ensure that the refcount doesn't hit zero while any readers + * can see it. Similar to atomic_dec_and_lock(), but for an + * rwlock + */ + if (atomic_add_unless(&cset->refcount, -1, 1)) + return; + + spin_lock_irqsave(&css_set_lock, flags); + put_css_set_locked(cset); + spin_unlock_irqrestore(&css_set_lock, flags); +} + +/* + * refcounted get/put for css_set objects + */ +static inline void get_css_set(struct css_set *cset) +{ + atomic_inc(&cset->refcount); +} + +/** + * compare_css_sets - helper function for find_existing_css_set(). + * @cset: candidate css_set being tested + * @old_cset: existing css_set for a task + * @new_cgrp: cgroup that's being entered by the task + * @template: desired set of css pointers in css_set (pre-calculated) + * + * Returns true if "cset" matches "old_cset" except for the hierarchy + * which "new_cgrp" belongs to, for which it should match "new_cgrp". + */ +static bool compare_css_sets(struct css_set *cset, + struct css_set *old_cset, + struct cgroup *new_cgrp, + struct cgroup_subsys_state *template[]) +{ + struct list_head *l1, *l2; + + /* + * On the default hierarchy, there can be csets which are + * associated with the same set of cgroups but different csses. + * Let's first ensure that csses match. + */ + if (memcmp(template, cset->subsys, sizeof(cset->subsys))) + return false; + + /* + * Compare cgroup pointers in order to distinguish between + * different cgroups in hierarchies. As different cgroups may + * share the same effective css, this comparison is always + * necessary. + */ + l1 = &cset->cgrp_links; + l2 = &old_cset->cgrp_links; + while (1) { + struct cgrp_cset_link *link1, *link2; + struct cgroup *cgrp1, *cgrp2; + + l1 = l1->next; + l2 = l2->next; + /* See if we reached the end - both lists are equal length. */ + if (l1 == &cset->cgrp_links) { + BUG_ON(l2 != &old_cset->cgrp_links); + break; + } else { + BUG_ON(l2 == &old_cset->cgrp_links); + } + /* Locate the cgroups associated with these links. */ + link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link); + link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link); + cgrp1 = link1->cgrp; + cgrp2 = link2->cgrp; + /* Hierarchies should be linked in the same order. */ + BUG_ON(cgrp1->root != cgrp2->root); + + /* + * If this hierarchy is the hierarchy of the cgroup + * that's changing, then we need to check that this + * css_set points to the new cgroup; if it's any other + * hierarchy, then this css_set should point to the + * same cgroup as the old css_set. 
+ */ + if (cgrp1->root == new_cgrp->root) { + if (cgrp1 != new_cgrp) + return false; + } else { + if (cgrp1 != cgrp2) + return false; + } + } + return true; +} + +/** + * find_existing_css_set - init css array and find the matching css_set + * @old_cset: the css_set that we're using before the cgroup transition + * @cgrp: the cgroup that we're moving into + * @template: out param for the new set of csses, should be clear on entry + */ +static struct css_set *find_existing_css_set(struct css_set *old_cset, + struct cgroup *cgrp, + struct cgroup_subsys_state *template[]) +{ + struct cgroup_root *root = cgrp->root; + struct cgroup_subsys *ss; + struct css_set *cset; + unsigned long key; + int i; + + /* + * Build the set of subsystem state objects that we want to see in the + * new css_set. While subsystems can change globally, the entries here + * won't change, so no need for locking. + */ + for_each_subsys(ss, i) { + if (root->subsys_mask & (1UL << i)) { + /* + * @ss is in this hierarchy, so we want the + * effective css from @cgrp. + */ + template[i] = cgroup_e_css(cgrp, ss); + } else { + /* + * @ss is not in this hierarchy, so we don't want + * to change the css. + */ + template[i] = old_cset->subsys[i]; + } + } + + key = css_set_hash(template); + hash_for_each_possible(css_set_table, cset, hlist, key) { + if (!compare_css_sets(cset, old_cset, cgrp, template)) + continue; + + /* This css_set matches what we need */ + return cset; + } + + /* No existing cgroup group matched */ + return NULL; +} + +static void free_cgrp_cset_links(struct list_head *links_to_free) +{ + struct cgrp_cset_link *link, *tmp_link; + + list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) { + list_del(&link->cset_link); + kfree(link); + } +} + +/** + * allocate_cgrp_cset_links - allocate cgrp_cset_links + * @count: the number of links to allocate + * @tmp_links: list_head the allocated links are put on + * + * Allocate @count cgrp_cset_link structures and chain them on @tmp_links + * through ->cset_link. Returns 0 on success or -errno. + */ +static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links) +{ + struct cgrp_cset_link *link; + int i; + + INIT_LIST_HEAD(tmp_links); + + for (i = 0; i < count; i++) { + link = kzalloc(sizeof(*link), GFP_KERNEL); + if (!link) { + free_cgrp_cset_links(tmp_links); + return -ENOMEM; + } + list_add(&link->cset_link, tmp_links); + } + return 0; +} + +/** + * link_css_set - a helper function to link a css_set to a cgroup + * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links() + * @cset: the css_set to be linked + * @cgrp: the destination cgroup + */ +static void link_css_set(struct list_head *tmp_links, struct css_set *cset, + struct cgroup *cgrp) +{ + struct cgrp_cset_link *link; + + BUG_ON(list_empty(tmp_links)); + + if (cgroup_on_dfl(cgrp)) + cset->dfl_cgrp = cgrp; + + link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); + link->cset = cset; + link->cgrp = cgrp; + + /* + * Always add links to the tail of the lists so that the lists are + * in chronological order. + */ + list_move_tail(&link->cset_link, &cgrp->cset_links); + list_add_tail(&link->cgrp_link, &cset->cgrp_links); + + if (cgroup_parent(cgrp)) + cgroup_get(cgrp); +} + +/** + * find_css_set - return a new css_set with one cgroup updated + * @old_cset: the baseline css_set + * @cgrp: the cgroup to be updated + * + * Return a new css_set that's equivalent to @old_cset, but with @cgrp + * substituted into the appropriate hierarchy. 
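Before this allocation path runs, find_existing_css_set() above looks up candidates by hashing the css pointer array: a sum of pointer values with the high bits folded down (css_set_hash() earlier in the file). The fold, reduced to a standalone userspace form with invented names:

    #include <stdio.h>

    /* userspace rendition of the css_set_hash() fold: sum the css
       pointers, then mix the high half into the low bits */
    static unsigned long demo_hash(void *css[], int n)
    {
    	unsigned long key = 0UL;

    	for (int i = 0; i < n; i++)
    		key += (unsigned long)css[i];
    	return (key >> 16) ^ key;
    }

    int main(void)
    {
    	int a, b;
    	void *set[] = { &a, &b };

    	printf("key=%#lx\n", demo_hash(set, 2));
    	return 0;
    }
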
+ */ +static struct css_set *find_css_set(struct css_set *old_cset, + struct cgroup *cgrp) +{ + struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { }; + struct css_set *cset; + struct list_head tmp_links; + struct cgrp_cset_link *link; + struct cgroup_subsys *ss; + unsigned long key; + int ssid; + + lockdep_assert_held(&cgroup_mutex); + + /* First see if we already have a cgroup group that matches + * the desired set */ + spin_lock_irq(&css_set_lock); + cset = find_existing_css_set(old_cset, cgrp, template); + if (cset) + get_css_set(cset); + spin_unlock_irq(&css_set_lock); + + if (cset) + return cset; + + cset = kzalloc(sizeof(*cset), GFP_KERNEL); + if (!cset) + return NULL; + + /* Allocate all the cgrp_cset_link objects that we'll need */ + if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) { + kfree(cset); + return NULL; + } + + atomic_set(&cset->refcount, 1); + INIT_LIST_HEAD(&cset->tasks); + INIT_LIST_HEAD(&cset->mg_tasks); + INIT_LIST_HEAD(&cset->task_iters); + INIT_HLIST_NODE(&cset->hlist); + INIT_LIST_HEAD(&cset->cgrp_links); + INIT_LIST_HEAD(&cset->mg_preload_node); + INIT_LIST_HEAD(&cset->mg_node); + + /* Copy the set of subsystem state objects generated in + * find_existing_css_set() */ + memcpy(cset->subsys, template, sizeof(cset->subsys)); + + spin_lock_irq(&css_set_lock); + /* Add reference counts and links from the new css_set. */ + list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { + struct cgroup *c = link->cgrp; + + if (c->root == cgrp->root) + c = cgrp; + link_css_set(&tmp_links, cset, c); + } + + BUG_ON(!list_empty(&tmp_links)); + + css_set_count++; + + /* Add @cset to the hash table */ + key = css_set_hash(cset->subsys); + hash_add(css_set_table, &cset->hlist, key); + + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cset->subsys[ssid]; + + list_add_tail(&cset->e_cset_node[ssid], + &css->cgroup->e_csets[ssid]); + css_get(css); + } + + spin_unlock_irq(&css_set_lock); + + return cset; +} + +static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) +{ + struct cgroup *root_cgrp = kf_root->kn->priv; + + return root_cgrp->root; +} + +static int cgroup_init_root_id(struct cgroup_root *root) +{ + int id; + + lockdep_assert_held(&cgroup_mutex); + + id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL); + if (id < 0) + return id; + + root->hierarchy_id = id; + return 0; +} + +static void cgroup_exit_root_id(struct cgroup_root *root) +{ + lockdep_assert_held(&cgroup_mutex); + + idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); +} + +static void cgroup_free_root(struct cgroup_root *root) +{ + if (root) { + idr_destroy(&root->cgroup_idr); + kfree(root); + } +} + +static void cgroup_destroy_root(struct cgroup_root *root) +{ + struct cgroup *cgrp = &root->cgrp; + struct cgrp_cset_link *link, *tmp_link; + + trace_cgroup_destroy_root(root); + + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); + + BUG_ON(atomic_read(&root->nr_cgrps)); + BUG_ON(!list_empty(&cgrp->self.children)); + + /* Rebind all subsystems back to the default hierarchy */ + WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask)); + + /* + * Release all the links from cset_links to this hierarchy's + * root cgroup + */ + spin_lock_irq(&css_set_lock); + + list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { + list_del(&link->cset_link); + list_del(&link->cgrp_link); + kfree(link); + } + + spin_unlock_irq(&css_set_lock); + + if (!list_empty(&root->root_list)) { + list_del(&root->root_list); + 
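cgroup_init_root_id() above relies on idr_alloc_cyclic(), which hands out the lowest free ID at or after the previous allocation and wraps around to 0. A tiny illustrative allocator with the same behaviour, assuming a fixed table in place of the kernel IDR:

#include <stdbool.h>

#define MAX_IDS 64

static bool used[MAX_IDS];
static int next_hint;

static int alloc_cyclic_id(void)
{
	for (int n = 0; n < MAX_IDS; n++) {
		int id = (next_hint + n) % MAX_IDS;

		if (!used[id]) {
			used[id] = true;
			next_hint = id + 1;	/* resume after this ID */
			return id;
		}
	}
	return -1;	/* the kernel interface returns -ENOSPC here */
}

static void free_id(int id)
{
	used[id] = false;	/* idr_remove() equivalent */
}

Cyclic allocation keeps hierarchy IDs from being reused immediately, which makes stale references easier to spot.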
cgroup_root_count--; + } + + cgroup_exit_root_id(root); + + mutex_unlock(&cgroup_mutex); + + kernfs_destroy_root(root->kf_root); + cgroup_free_root(root); +} + +/* + * look up cgroup associated with current task's cgroup namespace on the + * specified hierarchy + */ +static struct cgroup * +current_cgns_cgroup_from_root(struct cgroup_root *root) +{ + struct cgroup *res = NULL; + struct css_set *cset; + + lockdep_assert_held(&css_set_lock); + + rcu_read_lock(); + + cset = current->nsproxy->cgroup_ns->root_cset; + if (cset == &init_css_set) { + res = &root->cgrp; + } else { + struct cgrp_cset_link *link; + + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + struct cgroup *c = link->cgrp; + + if (c->root == root) { + res = c; + break; + } + } + } + rcu_read_unlock(); + + BUG_ON(!res); + return res; +} + +/* look up cgroup associated with given css_set on the specified hierarchy */ +static struct cgroup *cset_cgroup_from_root(struct css_set *cset, + struct cgroup_root *root) +{ + struct cgroup *res = NULL; + + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&css_set_lock); + + if (cset == &init_css_set) { + res = &root->cgrp; + } else { + struct cgrp_cset_link *link; + + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + struct cgroup *c = link->cgrp; + + if (c->root == root) { + res = c; + break; + } + } + } + + BUG_ON(!res); + return res; +} + +/* + * Return the cgroup for "task" from the given hierarchy. Must be + * called with cgroup_mutex and css_set_lock held. + */ +static struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroup_root *root) +{ + /* + * No need to lock the task - since we hold cgroup_mutex the + * task can't change groups, so the only thing that can happen + * is that it exits and its css is set back to init_css_set. + */ + return cset_cgroup_from_root(task_css_set(task), root); +} + +/* + * A task must hold cgroup_mutex to modify cgroups. + * + * Any task can increment and decrement the count field without lock. + * So in general, code holding cgroup_mutex can't rely on the count + * field not changing. However, if the count goes to zero, then only + * cgroup_attach_task() can increment it again. Because a count of zero + * means that no tasks are currently attached, therefore there is no + * way a task attached to that cgroup can fork (the other way to + * increment the count). So code holding cgroup_mutex can safely + * assume that if the count is zero, it will stay zero. Similarly, if + * a task holds cgroup_mutex on a cgroup with zero count, it + * knows that the cgroup won't be removed, as cgroup_rmdir() + * needs that mutex. + * + * A cgroup can only be deleted if both its 'count' of using tasks + * is zero, and its list of 'children' cgroups is empty. Since all + * tasks in the system use _some_ cgroup, and since there is always at + * least one task in the system (init, pid == 1), therefore, root cgroup + * always has either children cgroups and/or using tasks. So we don't + * need a special hack to ensure that root cgroup cannot be deleted. + * + * P.S. One more locking exception. 
RCU is used to guard the + * update of a tasks cgroup pointer by cgroup_attach_task() + */ + +static struct kernfs_syscall_ops cgroup_kf_syscall_ops; +static const struct file_operations proc_cgroupstats_operations; + +static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, + char *buf) +{ + struct cgroup_subsys *ss = cft->ss; + + if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && + !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) + snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s", + cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, + cft->name); + else + strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); + return buf; +} + +/** + * cgroup_file_mode - deduce file mode of a control file + * @cft: the control file in question + * + * S_IRUGO for read, S_IWUSR for write. + */ +static umode_t cgroup_file_mode(const struct cftype *cft) +{ + umode_t mode = 0; + + if (cft->read_u64 || cft->read_s64 || cft->seq_show) + mode |= S_IRUGO; + + if (cft->write_u64 || cft->write_s64 || cft->write) { + if (cft->flags & CFTYPE_WORLD_WRITABLE) + mode |= S_IWUGO; + else + mode |= S_IWUSR; + } + + return mode; +} + +/** + * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask + * @subtree_control: the new subtree_control mask to consider + * @this_ss_mask: available subsystems + * + * On the default hierarchy, a subsystem may request other subsystems to be + * enabled together through its ->depends_on mask. In such cases, more + * subsystems than specified in "cgroup.subtree_control" may be enabled. + * + * This function calculates which subsystems need to be enabled if + * @subtree_control is to be applied while restricted to @this_ss_mask. + */ +static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) +{ + u16 cur_ss_mask = subtree_control; + struct cgroup_subsys *ss; + int ssid; + + lockdep_assert_held(&cgroup_mutex); + + cur_ss_mask |= cgrp_dfl_implicit_ss_mask; + + while (true) { + u16 new_ss_mask = cur_ss_mask; + + do_each_subsys_mask(ss, ssid, cur_ss_mask) { + new_ss_mask |= ss->depends_on; + } while_each_subsys_mask(); + + /* + * Mask out subsystems which aren't available. This can + * happen only if some depended-upon subsystems were bound + * to non-default hierarchies. + */ + new_ss_mask &= this_ss_mask; + + if (new_ss_mask == cur_ss_mask) + break; + cur_ss_mask = new_ss_mask; + } + + return cur_ss_mask; +} + +/** + * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods + * @kn: the kernfs_node being serviced + * + * This helper undoes cgroup_kn_lock_live() and should be invoked before + * the method finishes if locking succeeded. Note that once this function + * returns the cgroup returned by cgroup_kn_lock_live() may become + * inaccessible any time. If the caller intends to continue to access the + * cgroup, it should pin it before invoking this function. + */ +static void cgroup_kn_unlock(struct kernfs_node *kn) +{ + struct cgroup *cgrp; + + if (kernfs_type(kn) == KERNFS_DIR) + cgrp = kn->priv; + else + cgrp = kn->parent->priv; + + mutex_unlock(&cgroup_mutex); + + kernfs_unbreak_active_protection(kn); + cgroup_put(cgrp); +} + +/** + * cgroup_kn_lock_live - locking helper for cgroup kernfs methods + * @kn: the kernfs_node being serviced + * @drain_offline: perform offline draining on the cgroup + * + * This helper is to be used by a cgroup kernfs method currently servicing + * @kn. It breaks the active protection, performs cgroup locking and + * verifies that the associated cgroup is alive. Returns the cgroup if + * alive; otherwise, %NULL. 
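The loop in cgroup_calc_subtree_ss_mask() above computes a transitive closure over the ->depends_on masks and terminates when the mask stops growing. A runnable userspace rendering of that fixed-point iteration; the depends_on table is invented for illustration:

#include <stdint.h>
#include <stdio.h>

#define NSUBSYS 4

/* ss0 depends on ss1, ss1 depends on ss2; made up for this example */
static const uint16_t depends_on[NSUBSYS] = { 1 << 1, 1 << 2, 0, 0 };

static uint16_t calc_subtree_ss_mask(uint16_t subtree_control,
				     uint16_t this_ss_mask)
{
	uint16_t cur = subtree_control;

	for (;;) {
		uint16_t next = cur;

		for (int ssid = 0; ssid < NSUBSYS; ssid++)
			if (cur & (1 << ssid))
				next |= depends_on[ssid];

		next &= this_ss_mask;	/* mask out unavailable subsystems */
		if (next == cur)
			return cur;	/* fixed point reached */
		cur = next;
	}
}

int main(void)
{
	/* enabling ss0 transitively pulls in ss1 and ss2: prints mask=0x7 */
	printf("mask=%#x\n", (unsigned)calc_subtree_ss_mask(1 << 0, 0xf));
	return 0;
}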
A successful return should be undone by a + * matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the + * cgroup is drained of offlining csses before return. + * + * Any cgroup kernfs method implementation which requires locking the + * associated cgroup should use this helper. It avoids nesting cgroup + * locking under kernfs active protection and allows all kernfs operations + * including self-removal. + */ +static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, + bool drain_offline) +{ + struct cgroup *cgrp; + + if (kernfs_type(kn) == KERNFS_DIR) + cgrp = kn->priv; + else + cgrp = kn->parent->priv; + + /* + * We're gonna grab cgroup_mutex which nests outside kernfs + * active_ref. cgroup liveliness check alone provides enough + * protection against removal. Ensure @cgrp stays accessible and + * break the active_ref protection. + */ + if (!cgroup_tryget(cgrp)) + return NULL; + kernfs_break_active_protection(kn); + + if (drain_offline) + cgroup_lock_and_drain_offline(cgrp); + else + mutex_lock(&cgroup_mutex); + + if (!cgroup_is_dead(cgrp)) + return cgrp; + + cgroup_kn_unlock(kn); + return NULL; +} + +static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) +{ + char name[CGROUP_FILE_NAME_MAX]; + + lockdep_assert_held(&cgroup_mutex); + + if (cft->file_offset) { + struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss); + struct cgroup_file *cfile = (void *)css + cft->file_offset; + + spin_lock_irq(&cgroup_file_kn_lock); + cfile->kn = NULL; + spin_unlock_irq(&cgroup_file_kn_lock); + } + + kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); +} + +/** + * css_clear_dir - remove subsys files in a cgroup directory + * @css: taget css + */ +static void css_clear_dir(struct cgroup_subsys_state *css) +{ + struct cgroup *cgrp = css->cgroup; + struct cftype *cfts; + + if (!(css->flags & CSS_VISIBLE)) + return; + + css->flags &= ~CSS_VISIBLE; + + list_for_each_entry(cfts, &css->ss->cfts, node) + cgroup_addrm_files(css, cgrp, cfts, false); +} + +/** + * css_populate_dir - create subsys files in a cgroup directory + * @css: target css + * + * On failure, no file is added. + */ +static int css_populate_dir(struct cgroup_subsys_state *css) +{ + struct cgroup *cgrp = css->cgroup; + struct cftype *cfts, *failed_cfts; + int ret; + + if ((css->flags & CSS_VISIBLE) || !cgrp->kn) + return 0; + + if (!css->ss) { + if (cgroup_on_dfl(cgrp)) + cfts = cgroup_dfl_base_files; + else + cfts = cgroup_legacy_base_files; + + return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); + } + + list_for_each_entry(cfts, &css->ss->cfts, node) { + ret = cgroup_addrm_files(css, cgrp, cfts, true); + if (ret < 0) { + failed_cfts = cfts; + goto err; + } + } + + css->flags |= CSS_VISIBLE; + + return 0; +err: + list_for_each_entry(cfts, &css->ss->cfts, node) { + if (cfts == failed_cfts) + break; + cgroup_addrm_files(css, cgrp, cfts, false); + } + return ret; +} + +static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) +{ + struct cgroup *dcgrp = &dst_root->cgrp; + struct cgroup_subsys *ss; + int ssid, i, ret; + + lockdep_assert_held(&cgroup_mutex); + + do_each_subsys_mask(ss, ssid, ss_mask) { + /* + * If @ss has non-root csses attached to it, can't move. + * If @ss is an implicit controller, it is exempt from this + * rule and can be stolen. 
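cgroup_kn_lock_live() follows a strict order: pin the object, drop the active protection, take the big lock, then re-check liveness under it, undoing everything in reverse on failure. A reduced pthread sketch of that order; node and its helpers are illustrative, not kernel API:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct node {
	atomic_int refcnt;
	bool dead;
	pthread_mutex_t big_lock;	/* plays the role of cgroup_mutex */
};

static bool node_tryget(struct node *n)
{
	int old = atomic_load(&n->refcnt);

	while (old > 0)		/* only succeed while still referenced */
		if (atomic_compare_exchange_weak(&n->refcnt, &old, old + 1))
			return true;
	return false;
}

static void node_put(struct node *n)
{
	atomic_fetch_sub(&n->refcnt, 1);
}

/* on success, returns with big_lock held; caller unlocks and puts later */
static struct node *node_lock_live(struct node *n)
{
	if (!node_tryget(n))	/* pin before dropping any protection */
		return NULL;

	pthread_mutex_lock(&n->big_lock);
	if (!n->dead)
		return n;

	pthread_mutex_unlock(&n->big_lock);	/* undo in reverse order */
	node_put(n);
	return NULL;
}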
+ */ + if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) && + !ss->implicit_on_dfl) + return -EBUSY; + + /* can't move between two non-dummy roots either */ + if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) + return -EBUSY; + } while_each_subsys_mask(); + + do_each_subsys_mask(ss, ssid, ss_mask) { + struct cgroup_root *src_root = ss->root; + struct cgroup *scgrp = &src_root->cgrp; + struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); + struct css_set *cset; + + WARN_ON(!css || cgroup_css(dcgrp, ss)); + + /* disable from the source */ + src_root->subsys_mask &= ~(1 << ssid); + WARN_ON(cgroup_apply_control(scgrp)); + cgroup_finalize_control(scgrp, 0); + + /* rebind */ + RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); + rcu_assign_pointer(dcgrp->subsys[ssid], css); + ss->root = dst_root; + css->cgroup = dcgrp; + + spin_lock_irq(&css_set_lock); + hash_for_each(css_set_table, i, cset, hlist) + list_move_tail(&cset->e_cset_node[ss->id], + &dcgrp->e_csets[ss->id]); + spin_unlock_irq(&css_set_lock); + + /* default hierarchy doesn't enable controllers by default */ + dst_root->subsys_mask |= 1 << ssid; + if (dst_root == &cgrp_dfl_root) { + static_branch_enable(cgroup_subsys_on_dfl_key[ssid]); + } else { + dcgrp->subtree_control |= 1 << ssid; + static_branch_disable(cgroup_subsys_on_dfl_key[ssid]); + } + + ret = cgroup_apply_control(dcgrp); + if (ret) + pr_warn("partial failure to rebind %s controller (err=%d)\n", + ss->name, ret); + + if (ss->bind) + ss->bind(css); + } while_each_subsys_mask(); + + kernfs_activate(dcgrp->kn); + return 0; +} + +static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, + struct kernfs_root *kf_root) +{ + int len = 0; + char *buf = NULL; + struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root); + struct cgroup *ns_cgroup; + + buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + spin_lock_irq(&css_set_lock); + ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot); + len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); + spin_unlock_irq(&css_set_lock); + + if (len >= PATH_MAX) + len = -ERANGE; + else if (len > 0) { + seq_escape(sf, buf, " \t\n\\"); + len = 0; + } + kfree(buf); + return len; +} + +static int cgroup_show_options(struct seq_file *seq, + struct kernfs_root *kf_root) +{ + struct cgroup_root *root = cgroup_root_from_kf(kf_root); + struct cgroup_subsys *ss; + int ssid; + + if (root != &cgrp_dfl_root) + for_each_subsys(ss, ssid) + if (root->subsys_mask & (1 << ssid)) + seq_show_option(seq, ss->legacy_name, NULL); + if (root->flags & CGRP_ROOT_NOPREFIX) + seq_puts(seq, ",noprefix"); + if (root->flags & CGRP_ROOT_XATTR) + seq_puts(seq, ",xattr"); + + spin_lock(&release_agent_path_lock); + if (strlen(root->release_agent_path)) + seq_show_option(seq, "release_agent", + root->release_agent_path); + spin_unlock(&release_agent_path_lock); + + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) + seq_puts(seq, ",clone_children"); + if (strlen(root->name)) + seq_show_option(seq, "name", root->name); + return 0; +} + +struct cgroup_sb_opts { + u16 subsys_mask; + unsigned int flags; + char *release_agent; + bool cpuset_clone_children; + char *name; + /* User explicitly requested empty subsystem */ + bool none; +}; + +static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) +{ + char *token, *o = data; + bool all_ss = false, one_ss = false; + u16 mask = U16_MAX; + struct cgroup_subsys *ss; + int nr_opts = 0; + int i; + +#ifdef CONFIG_CPUSETS + mask = ~((u16)1 << 
cpuset_cgrp_id); +#endif + + memset(opts, 0, sizeof(*opts)); + + while ((token = strsep(&o, ",")) != NULL) { + nr_opts++; + + if (!*token) + return -EINVAL; + if (!strcmp(token, "none")) { + /* Explicitly have no subsystems */ + opts->none = true; + continue; + } + if (!strcmp(token, "all")) { + /* Mutually exclusive option 'all' + subsystem name */ + if (one_ss) + return -EINVAL; + all_ss = true; + continue; + } + if (!strcmp(token, "noprefix")) { + opts->flags |= CGRP_ROOT_NOPREFIX; + continue; + } + if (!strcmp(token, "clone_children")) { + opts->cpuset_clone_children = true; + continue; + } + if (!strcmp(token, "xattr")) { + opts->flags |= CGRP_ROOT_XATTR; + continue; + } + if (!strncmp(token, "release_agent=", 14)) { + /* Specifying two release agents is forbidden */ + if (opts->release_agent) + return -EINVAL; + opts->release_agent = + kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); + if (!opts->release_agent) + return -ENOMEM; + continue; + } + if (!strncmp(token, "name=", 5)) { + const char *name = token + 5; + /* Can't specify an empty name */ + if (!strlen(name)) + return -EINVAL; + /* Must match [\w.-]+ */ + for (i = 0; i < strlen(name); i++) { + char c = name[i]; + if (isalnum(c)) + continue; + if ((c == '.') || (c == '-') || (c == '_')) + continue; + return -EINVAL; + } + /* Specifying two names is forbidden */ + if (opts->name) + return -EINVAL; + opts->name = kstrndup(name, + MAX_CGROUP_ROOT_NAMELEN - 1, + GFP_KERNEL); + if (!opts->name) + return -ENOMEM; + + continue; + } + + for_each_subsys(ss, i) { + if (strcmp(token, ss->legacy_name)) + continue; + if (!cgroup_ssid_enabled(i)) + continue; + if (cgroup_ssid_no_v1(i)) + continue; + + /* Mutually exclusive option 'all' + subsystem name */ + if (all_ss) + return -EINVAL; + opts->subsys_mask |= (1 << i); + one_ss = true; + + break; + } + if (i == CGROUP_SUBSYS_COUNT) + return -ENOENT; + } + + /* + * If the 'all' option was specified select all the subsystems, + * otherwise if 'none', 'name=' and a subsystem name options were + * not specified, let's default to 'all' + */ + if (all_ss || (!one_ss && !opts->none && !opts->name)) + for_each_subsys(ss, i) + if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i)) + opts->subsys_mask |= (1 << i); + + /* + * We either have to specify by name or by subsystems. (So all + * empty hierarchies must have a name). + */ + if (!opts->subsys_mask && !opts->name) + return -EINVAL; + + /* + * Option noprefix was introduced just for backward compatibility + * with the old cpuset, so we allow noprefix only if mounting just + * the cpuset subsystem. 
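The option walk above leans on strsep() consuming the buffer in place, one comma-separated token at a time. A stand-alone sketch of the same tokenising shape, using the glibc/BSD strsep() and a few of the option names parsed above; the printf() reporting is purely illustrative:

#include <stdio.h>
#include <string.h>

static void parse_opts(char *data)
{
	char *token;

	while ((token = strsep(&data, ",")) != NULL) {
		if (!*token)
			continue;	/* the kernel returns -EINVAL here */
		if (!strcmp(token, "noprefix"))
			printf("flag: noprefix\n");
		else if (!strncmp(token, "release_agent=", 14))
			printf("release_agent: %s\n", token + 14);
		else if (!strncmp(token, "name=", 5))
			printf("name: %s\n", token + 5);
		else
			printf("subsystem?: %s\n", token);
	}
}

int main(void)
{
	char buf[] = "noprefix,name=mytree,release_agent=/sbin/agent,cpu";

	parse_opts(buf);
	return 0;
}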
+ */ + if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) + return -EINVAL; + + /* Can't specify "none" and some subsystems */ + if (opts->subsys_mask && opts->none) + return -EINVAL; + + return 0; +} + +static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) +{ + int ret = 0; + struct cgroup_root *root = cgroup_root_from_kf(kf_root); + struct cgroup_sb_opts opts; + u16 added_mask, removed_mask; + + if (root == &cgrp_dfl_root) { + pr_err("remount is not allowed\n"); + return -EINVAL; + } + + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); + + /* See what subsystems are wanted */ + ret = parse_cgroupfs_options(data, &opts); + if (ret) + goto out_unlock; + + if (opts.subsys_mask != root->subsys_mask || opts.release_agent) + pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", + task_tgid_nr(current), current->comm); + + added_mask = opts.subsys_mask & ~root->subsys_mask; + removed_mask = root->subsys_mask & ~opts.subsys_mask; + + /* Don't allow flags or name to change at remount */ + if ((opts.flags ^ root->flags) || + (opts.name && strcmp(opts.name, root->name))) { + pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", + opts.flags, opts.name ?: "", root->flags, root->name); + ret = -EINVAL; + goto out_unlock; + } + + /* remounting is not allowed for populated hierarchies */ + if (!list_empty(&root->cgrp.self.children)) { + ret = -EBUSY; + goto out_unlock; + } + + ret = rebind_subsystems(root, added_mask); + if (ret) + goto out_unlock; + + WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); + + if (opts.release_agent) { + spin_lock(&release_agent_path_lock); + strcpy(root->release_agent_path, opts.release_agent); + spin_unlock(&release_agent_path_lock); + } + + trace_cgroup_remount(root); + + out_unlock: + kfree(opts.release_agent); + kfree(opts.name); + mutex_unlock(&cgroup_mutex); + return ret; +} + +/* + * To reduce the fork() overhead for systems that are not actually using + * their cgroups capability, we don't maintain the lists running through + * each css_set to its tasks until we see the list actually used - in other + * words after the first mount. + */ +static bool use_task_css_set_links __read_mostly; + +static void cgroup_enable_task_cg_lists(void) +{ + struct task_struct *p, *g; + + spin_lock_irq(&css_set_lock); + + if (use_task_css_set_links) + goto out_unlock; + + use_task_css_set_links = true; + + /* + * We need tasklist_lock because RCU is not safe against + * while_each_thread(). Besides, a forking task that has passed + * cgroup_post_fork() without seeing use_task_css_set_links = 1 + * is not guaranteed to have its child immediately visible in the + * tasklist if we walk through it with RCU. + */ + read_lock(&tasklist_lock); + do_each_thread(g, p) { + WARN_ON_ONCE(!list_empty(&p->cg_list) || + task_css_set(p) != &init_css_set); + + /* + * We should check if the process is exiting, otherwise + * it will race with cgroup_exit() in that the list + * entry won't be deleted though the process has exited. + * Do it while holding siglock so that we don't end up + * racing against cgroup_exit(). + * + * Interrupts were already disabled while acquiring + * the css_set_lock, so we do not need to disable it + * again when acquiring the sighand->siglock here. 
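The added_mask/removed_mask computation in cgroup_remount() above is plain bit arithmetic; isolated here with made-up mask values:

#include <stdio.h>

int main(void)
{
	unsigned int old_mask = 0x6, new_mask = 0x3;	/* example values */
	unsigned int added   = new_mask & ~old_mask;	/* 0x1 */
	unsigned int removed = old_mask & ~new_mask;	/* 0x4 */

	printf("added=%#x removed=%#x\n", added, removed);
	return 0;
}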
+ */ + spin_lock(&p->sighand->siglock); + if (!(p->flags & PF_EXITING)) { + struct css_set *cset = task_css_set(p); + + if (!css_set_populated(cset)) + css_set_update_populated(cset, true); + list_add_tail(&p->cg_list, &cset->tasks); + get_css_set(cset); + } + spin_unlock(&p->sighand->siglock); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); +out_unlock: + spin_unlock_irq(&css_set_lock); +} + +static void init_cgroup_housekeeping(struct cgroup *cgrp) +{ + struct cgroup_subsys *ss; + int ssid; + + INIT_LIST_HEAD(&cgrp->self.sibling); + INIT_LIST_HEAD(&cgrp->self.children); + INIT_LIST_HEAD(&cgrp->cset_links); + INIT_LIST_HEAD(&cgrp->pidlists); + mutex_init(&cgrp->pidlist_mutex); + cgrp->self.cgroup = cgrp; + cgrp->self.flags |= CSS_ONLINE; + + for_each_subsys(ss, ssid) + INIT_LIST_HEAD(&cgrp->e_csets[ssid]); + + init_waitqueue_head(&cgrp->offline_waitq); + INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent); +} + +static void init_cgroup_root(struct cgroup_root *root, + struct cgroup_sb_opts *opts) +{ + struct cgroup *cgrp = &root->cgrp; + + INIT_LIST_HEAD(&root->root_list); + atomic_set(&root->nr_cgrps, 1); + cgrp->root = root; + init_cgroup_housekeeping(cgrp); + idr_init(&root->cgroup_idr); + + root->flags = opts->flags; + if (opts->release_agent) + strcpy(root->release_agent_path, opts->release_agent); + if (opts->name) + strcpy(root->name, opts->name); + if (opts->cpuset_clone_children) + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); +} + +static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) +{ + LIST_HEAD(tmp_links); + struct cgroup *root_cgrp = &root->cgrp; + struct css_set *cset; + int i, ret; + + lockdep_assert_held(&cgroup_mutex); + + ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL); + if (ret < 0) + goto out; + root_cgrp->id = ret; + root_cgrp->ancestor_ids[0] = ret; + + ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, + GFP_KERNEL); + if (ret) + goto out; + + /* + * We're accessing css_set_count without locking css_set_lock here, + * but that's OK - it can only be increased by someone holding + * cgroup_lock, and that's us. Later rebinding may disable + * controllers on the default hierarchy and thus create new csets, + * which can't be more than the existing ones. Allocate 2x. + */ + ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links); + if (ret) + goto cancel_ref; + + ret = cgroup_init_root_id(root); + if (ret) + goto cancel_ref; + + root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, + KERNFS_ROOT_CREATE_DEACTIVATED, + root_cgrp); + if (IS_ERR(root->kf_root)) { + ret = PTR_ERR(root->kf_root); + goto exit_root_id; + } + root_cgrp->kn = root->kf_root->kn; + + ret = css_populate_dir(&root_cgrp->self); + if (ret) + goto destroy_root; + + ret = rebind_subsystems(root, ss_mask); + if (ret) + goto destroy_root; + + trace_cgroup_setup_root(root); + + /* + * There must be no failure case after here, since rebinding takes + * care of subsystems' refcounts, which are explicitly dropped in + * the failure exit path. + */ + list_add(&root->root_list, &cgroup_roots); + cgroup_root_count++; + + /* + * Link the root cgroup in this hierarchy into all the css_set + * objects. 
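Nearly every structure above is wired together with the kernel's circular doubly linked lists, initialised by the INIT_LIST_HEAD() calls in init_cgroup_housekeeping(). For reference, those semantics modelled in plain C, where an empty head points at itself:

#include <stdbool.h>

struct list_head {
	struct list_head *next, *prev;
};

static void init_list_head(struct list_head *h)
{
	h->next = h;
	h->prev = h;
}

static bool list_is_empty(const struct list_head *h)
{
	return h->next == h;	/* self-pointing head means empty */
}

static void add_tail(struct list_head *nw, struct list_head *head)
{
	nw->prev = head->prev;
	nw->next = head;
	head->prev->next = nw;
	head->prev = nw;
}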
+ */ + spin_lock_irq(&css_set_lock); + hash_for_each(css_set_table, i, cset, hlist) { + link_css_set(&tmp_links, cset, root_cgrp); + if (css_set_populated(cset)) + cgroup_update_populated(root_cgrp, true); + } + spin_unlock_irq(&css_set_lock); + + BUG_ON(!list_empty(&root_cgrp->self.children)); + BUG_ON(atomic_read(&root->nr_cgrps) != 1); + + kernfs_activate(root_cgrp->kn); + ret = 0; + goto out; + +destroy_root: + kernfs_destroy_root(root->kf_root); + root->kf_root = NULL; +exit_root_id: + cgroup_exit_root_id(root); +cancel_ref: + percpu_ref_exit(&root_cgrp->self.refcnt); +out: + free_cgrp_cset_links(&tmp_links); + return ret; +} + +static struct dentry *cgroup_mount(struct file_system_type *fs_type, + int flags, const char *unused_dev_name, + void *data) +{ + bool is_v2 = fs_type == &cgroup2_fs_type; + struct super_block *pinned_sb = NULL; + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; + struct cgroup_subsys *ss; + struct cgroup_root *root; + struct cgroup_sb_opts opts; + struct dentry *dentry; + int ret; + int i; + bool new_sb; + + get_cgroup_ns(ns); + + /* Check if the caller has permission to mount. */ + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) { + put_cgroup_ns(ns); + return ERR_PTR(-EPERM); + } + + /* + * The first time anyone tries to mount a cgroup, enable the list + * linking each css_set to its tasks and fix up all existing tasks. + */ + if (!use_task_css_set_links) + cgroup_enable_task_cg_lists(); + + if (is_v2) { + if (data) { + pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); + put_cgroup_ns(ns); + return ERR_PTR(-EINVAL); + } + cgrp_dfl_visible = true; + root = &cgrp_dfl_root; + cgroup_get(&root->cgrp); + goto out_mount; + } + + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); + + /* First find the desired set of subsystems */ + ret = parse_cgroupfs_options(data, &opts); + if (ret) + goto out_unlock; + + /* + * Destruction of cgroup root is asynchronous, so subsystems may + * still be dying after the previous unmount. Let's drain the + * dying subsystems. We just need to ensure that the ones + * unmounted previously finish dying and don't care about new ones + * starting. Testing ref liveliness is good enough. + */ + for_each_subsys(ss, i) { + if (!(opts.subsys_mask & (1 << i)) || + ss->root == &cgrp_dfl_root) + continue; + + if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { + mutex_unlock(&cgroup_mutex); + msleep(10); + ret = restart_syscall(); + goto out_free; + } + cgroup_put(&ss->root->cgrp); + } + + for_each_root(root) { + bool name_match = false; + + if (root == &cgrp_dfl_root) + continue; + + /* + * If we asked for a name then it must match. Also, if + * name matches but sybsys_mask doesn't, we should fail. + * Remember whether name matched. + */ + if (opts.name) { + if (strcmp(opts.name, root->name)) + continue; + name_match = true; + } + + /* + * If we asked for subsystems (or explicitly for no + * subsystems) then they must match. + */ + if ((opts.subsys_mask || opts.none) && + (opts.subsys_mask != root->subsys_mask)) { + if (!name_match) + continue; + ret = -EBUSY; + goto out_unlock; + } + + if (root->flags ^ opts.flags) + pr_warn("new mount options do not match the existing superblock, will be ignored\n"); + + /* + * We want to reuse @root whose lifetime is governed by its + * ->cgrp. Let's check whether @root is alive and keep it + * that way. As cgroup_kill_sb() can happen anytime, we + * want to block it by pinning the sb so that @root doesn't + * get killed before mount is complete. 
+ * + * With the sb pinned, tryget_live can reliably indicate + * whether @root can be reused. If it's being killed, + * drain it. We can use wait_queue for the wait but this + * path is super cold. Let's just sleep a bit and retry. + */ + pinned_sb = kernfs_pin_sb(root->kf_root, NULL); + if (IS_ERR(pinned_sb) || + !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { + mutex_unlock(&cgroup_mutex); + if (!IS_ERR_OR_NULL(pinned_sb)) + deactivate_super(pinned_sb); + msleep(10); + ret = restart_syscall(); + goto out_free; + } + + ret = 0; + goto out_unlock; + } + + /* + * No such thing, create a new one. name= matching without subsys + * specification is allowed for already existing hierarchies but we + * can't create new one without subsys specification. + */ + if (!opts.subsys_mask && !opts.none) { + ret = -EINVAL; + goto out_unlock; + } + + /* Hierarchies may only be created in the initial cgroup namespace. */ + if (ns != &init_cgroup_ns) { + ret = -EPERM; + goto out_unlock; + } + + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) { + ret = -ENOMEM; + goto out_unlock; + } + + init_cgroup_root(root, &opts); + + ret = cgroup_setup_root(root, opts.subsys_mask); + if (ret) + cgroup_free_root(root); + +out_unlock: + mutex_unlock(&cgroup_mutex); +out_free: + kfree(opts.release_agent); + kfree(opts.name); + + if (ret) { + put_cgroup_ns(ns); + return ERR_PTR(ret); + } +out_mount: + dentry = kernfs_mount(fs_type, flags, root->kf_root, + is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC, + &new_sb); + + /* + * In non-init cgroup namespace, instead of root cgroup's + * dentry, we return the dentry corresponding to the + * cgroupns->root_cgrp. + */ + if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { + struct dentry *nsdentry; + struct cgroup *cgrp; + + mutex_lock(&cgroup_mutex); + spin_lock_irq(&css_set_lock); + + cgrp = cset_cgroup_from_root(ns->root_cset, root); + + spin_unlock_irq(&css_set_lock); + mutex_unlock(&cgroup_mutex); + + nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); + dput(dentry); + dentry = nsdentry; + } + + if (IS_ERR(dentry) || !new_sb) + cgroup_put(&root->cgrp); + + /* + * If @pinned_sb, we're reusing an existing root and holding an + * extra ref on its sb. Mount is complete. Put the extra ref. + */ + if (pinned_sb) { + WARN_ON(new_sb); + deactivate_super(pinned_sb); + } + + put_cgroup_ns(ns); + return dentry; +} + +static void cgroup_kill_sb(struct super_block *sb) +{ + struct kernfs_root *kf_root = kernfs_root_from_sb(sb); + struct cgroup_root *root = cgroup_root_from_kf(kf_root); + + /* + * If @root doesn't have any mounts or children, start killing it. + * This prevents new mounts by disabling percpu_ref_tryget_live(). + * cgroup_mount() may wait for @root's release. + * + * And don't kill the default root. 
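When the root being reused is mid-teardown, cgroup_mount() sleeps briefly and restarts the whole syscall rather than waiting on a queue, since this path is cold. A userspace analogue of that back-off-and-retry shape, assuming illustrative types:

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <time.h>

struct root {
	atomic_int refcnt;	/* 0 means the root is being torn down */
};

static bool tryget_live(struct root *r)
{
	int old = atomic_load(&r->refcnt);

	while (old > 0)
		if (atomic_compare_exchange_weak(&r->refcnt, &old, old + 1))
			return true;
	return false;
}

/* returns the pinned root, or NULL meaning "a sleep was taken, redo the
 * whole lookup from scratch" - the restart_syscall() shape above */
static struct root *pin_root_or_backoff(struct root *r)
{
	if (tryget_live(r))
		return r;

	nanosleep(&(struct timespec){ .tv_nsec = 10 * 1000 * 1000 }, NULL);
	return NULL;
}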
+ */ + if (!list_empty(&root->cgrp.self.children) || + root == &cgrp_dfl_root) + cgroup_put(&root->cgrp); + else + percpu_ref_kill(&root->cgrp.self.refcnt); + + kernfs_kill_sb(sb); +} + +static struct file_system_type cgroup_fs_type = { + .name = "cgroup", + .mount = cgroup_mount, + .kill_sb = cgroup_kill_sb, + .fs_flags = FS_USERNS_MOUNT, +}; + +static struct file_system_type cgroup2_fs_type = { + .name = "cgroup2", + .mount = cgroup_mount, + .kill_sb = cgroup_kill_sb, + .fs_flags = FS_USERNS_MOUNT, +}; + +static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) +{ + struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); + + return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); +} + +int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) +{ + int ret; + + mutex_lock(&cgroup_mutex); + spin_lock_irq(&css_set_lock); + + ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); + + spin_unlock_irq(&css_set_lock); + mutex_unlock(&cgroup_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(cgroup_path_ns); + +/** + * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy + * @task: target task + * @buf: the buffer to write the path into + * @buflen: the length of the buffer + * + * Determine @task's cgroup on the first (the one with the lowest non-zero + * hierarchy_id) cgroup hierarchy and copy its path into @buf. This + * function grabs cgroup_mutex and shouldn't be used inside locks used by + * cgroup controller callbacks. + * + * Return value is the same as kernfs_path(). + */ +int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) +{ + struct cgroup_root *root; + struct cgroup *cgrp; + int hierarchy_id = 1; + int ret; + + mutex_lock(&cgroup_mutex); + spin_lock_irq(&css_set_lock); + + root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); + + if (root) { + cgrp = task_cgroup_from_root(task, root); + ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); + } else { + /* if no hierarchy exists, everyone is in "/" */ + ret = strlcpy(buf, "/", buflen); + } + + spin_unlock_irq(&css_set_lock); + mutex_unlock(&cgroup_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(task_cgroup_path); + +/* used to track tasks and other necessary states during migration */ +struct cgroup_taskset { + /* the src and dst cset list running through cset->mg_node */ + struct list_head src_csets; + struct list_head dst_csets; + + /* the subsys currently being processed */ + int ssid; + + /* + * Fields for cgroup_taskset_*() iteration. + * + * Before migration is committed, the target migration tasks are on + * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of + * the csets on ->dst_csets. ->csets point to either ->src_csets + * or ->dst_csets depending on whether migration is committed. + * + * ->cur_csets and ->cur_task point to the current task position + * during iteration. + */ + struct list_head *csets; + struct css_set *cur_cset; + struct task_struct *cur_task; +}; + +#define CGROUP_TASKSET_INIT(tset) (struct cgroup_taskset){ \ + .src_csets = LIST_HEAD_INIT(tset.src_csets), \ + .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \ + .csets = &tset.src_csets, \ +} + +/** + * cgroup_taskset_add - try to add a migration target task to a taskset + * @task: target task + * @tset: target taskset + * + * Add @task, which is a migration target, to @tset. This function becomes + * noop if @task doesn't need to be migrated. 
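CGROUP_TASKSET_INIT() above is a self-referential compound-literal initializer: the macro argument names the very variable being defined so that the empty list heads can point at themselves. The trick in miniature, with all names illustrative:

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }

struct taskset {
	struct list_head src_csets;
	struct list_head *csets;
};

#define TASKSET_INIT(tset) (struct taskset){			\
	.src_csets = LIST_HEAD_INIT(tset.src_csets),		\
	.csets = &tset.src_csets,				\
}

int main(void)
{
	struct taskset ts = TASKSET_INIT(ts);

	/* the freshly initialised list is empty: prints "empty: 1" */
	printf("empty: %d\n", ts.csets->next == ts.csets);
	return 0;
}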
@task's css_set should have + been added as a migration source and @task->cg_list will be moved from + the css_set's tasks list to its mg_tasks list. + */ +static void cgroup_taskset_add(struct task_struct *task, + struct cgroup_taskset *tset) +{ + struct css_set *cset; + + lockdep_assert_held(&css_set_lock); + + /* @task either already exited or can't exit until the end */ + if (task->flags & PF_EXITING) + return; + + /* leave @task alone if post_fork() hasn't linked it yet */ + if (list_empty(&task->cg_list)) + return; + + cset = task_css_set(task); + if (!cset->mg_src_cgrp) + return; + + list_move_tail(&task->cg_list, &cset->mg_tasks); + if (list_empty(&cset->mg_node)) + list_add_tail(&cset->mg_node, &tset->src_csets); + if (list_empty(&cset->mg_dst_cset->mg_node)) + list_move_tail(&cset->mg_dst_cset->mg_node, + &tset->dst_csets); +} + +/** + * cgroup_taskset_first - reset taskset and return the first task + * @tset: taskset of interest + * @dst_cssp: output variable for the destination css + * + * @tset iteration is initialized and the first task is returned. + */ +struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, + struct cgroup_subsys_state **dst_cssp) +{ + tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node); + tset->cur_task = NULL; + + return cgroup_taskset_next(tset, dst_cssp); +} + +/** + * cgroup_taskset_next - iterate to the next task in taskset + * @tset: taskset of interest + * @dst_cssp: output variable for the destination css + * + * Return the next task in @tset. Iteration must have been initialized + * with cgroup_taskset_first(). + */ +struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, + struct cgroup_subsys_state **dst_cssp) +{ + struct css_set *cset = tset->cur_cset; + struct task_struct *task = tset->cur_task; + + while (&cset->mg_node != tset->csets) { + if (!task) + task = list_first_entry(&cset->mg_tasks, + struct task_struct, cg_list); + else + task = list_next_entry(task, cg_list); + + if (&task->cg_list != &cset->mg_tasks) { + tset->cur_cset = cset; + tset->cur_task = task; + + /* + * This function may be called both before and + * after cgroup_taskset_migrate(). The two cases + * can be distinguished by looking at whether @cset + * has its ->mg_dst_cset set. + */ + if (cset->mg_dst_cset) + *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid]; + else + *dst_cssp = cset->subsys[tset->ssid]; + + return task; + } + + cset = list_next_entry(cset, mg_node); + task = NULL; + } + + return NULL; +} + +/** + * cgroup_taskset_migrate - migrate a taskset + * @tset: target taskset + * @root: cgroup root the migration is taking place on + * + * Migrate tasks in @tset as set up by migration preparation functions. + * This function fails iff one of the ->can_attach callbacks fails and + * guarantees that either all or none of the tasks in @tset are migrated. + * @tset is consumed regardless of success.
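cgroup_taskset_next() above is, at its core, a resumable cursor that flattens a list of css_sets, each carrying a list of tasks, into one sequence. The same cursor logic in a runnable userspace form with illustrative types:

#include <stdio.h>

struct item { struct item *next; int id; };
struct group { struct group *next; struct item *items; };

struct cursor { struct group *grp; struct item *it; };

static struct item *cursor_next(struct cursor *c)
{
	while (c->grp) {
		c->it = c->it ? c->it->next : c->grp->items;
		if (c->it)
			return c->it;
		c->grp = c->grp->next;	/* group exhausted, move on */
	}
	return NULL;
}

int main(void)
{
	struct item a2 = { NULL, 2 }, a1 = { &a2, 1 }, b1 = { NULL, 3 };
	struct group gb = { NULL, &b1 }, ga = { &gb, &a1 };
	struct cursor c = { &ga, NULL };

	/* prints task 1, task 2, task 3 across both groups */
	for (struct item *i = cursor_next(&c); i; i = cursor_next(&c))
		printf("task %d\n", i->id);
	return 0;
}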
+ */ +static int cgroup_taskset_migrate(struct cgroup_taskset *tset, + struct cgroup_root *root) +{ + struct cgroup_subsys *ss; + struct task_struct *task, *tmp_task; + struct css_set *cset, *tmp_cset; + int ssid, failed_ssid, ret; + + /* methods shouldn't be called if no task is actually migrating */ + if (list_empty(&tset->src_csets)) + return 0; + + /* check that we can legitimately attach to the cgroup */ + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ss->can_attach) { + tset->ssid = ssid; + ret = ss->can_attach(tset); + if (ret) { + failed_ssid = ssid; + goto out_cancel_attach; + } + } + } while_each_subsys_mask(); + + /* + * Now that we're guaranteed success, proceed to move all tasks to + * the new cgroup. There are no failure cases after here, so this + * is the commit point. + */ + spin_lock_irq(&css_set_lock); + list_for_each_entry(cset, &tset->src_csets, mg_node) { + list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) { + struct css_set *from_cset = task_css_set(task); + struct css_set *to_cset = cset->mg_dst_cset; + + get_css_set(to_cset); + css_set_move_task(task, from_cset, to_cset, true); + put_css_set_locked(from_cset); + } + } + spin_unlock_irq(&css_set_lock); + + /* + * Migration is committed, all target tasks are now on dst_csets. + * Nothing is sensitive to fork() after this point. Notify + * controllers that migration is complete. + */ + tset->csets = &tset->dst_csets; + + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ss->attach) { + tset->ssid = ssid; + ss->attach(tset); + } + } while_each_subsys_mask(); + + ret = 0; + goto out_release_tset; + +out_cancel_attach: + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ssid == failed_ssid) + break; + if (ss->cancel_attach) { + tset->ssid = ssid; + ss->cancel_attach(tset); + } + } while_each_subsys_mask(); +out_release_tset: + spin_lock_irq(&css_set_lock); + list_splice_init(&tset->dst_csets, &tset->src_csets); + list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) { + list_splice_tail_init(&cset->mg_tasks, &cset->tasks); + list_del_init(&cset->mg_node); + } + spin_unlock_irq(&css_set_lock); + return ret; +} + +/** + * cgroup_may_migrate_to - verify whether a cgroup can be migration destination + * @dst_cgrp: destination cgroup to test + * + * On the default hierarchy, except for the root, subtree_control must be + * zero for migration destination cgroups with tasks so that child cgroups + * don't compete against tasks. + */ +static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) +{ + return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) || + !dst_cgrp->subtree_control; +} + +/** + * cgroup_migrate_finish - cleanup after attach + * @preloaded_csets: list of preloaded css_sets + * + * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See + * those functions for details. 
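cgroup_taskset_migrate() above is a two-phase commit: every controller may veto in ->can_attach(), the task moves happen only once all have agreed, and on a veto the controllers already asked are rolled back through ->cancel_attach(). The callback protocol reduced to a sketch; participant and migrate() are illustrative names:

#include <stddef.h>

struct participant {
	int (*can_attach)(void);
	void (*attach)(void);
	void (*cancel_attach)(void);
};

static int migrate(struct participant *p, int n)
{
	int i, failed = -1, ret = 0;

	for (i = 0; i < n; i++) {		/* phase 1: may fail */
		if (p[i].can_attach && (ret = p[i].can_attach())) {
			failed = i;
			goto cancel;
		}
	}

	/* commit point: no failures allowed past here */

	for (i = 0; i < n; i++)			/* phase 2: notify */
		if (p[i].attach)
			p[i].attach();
	return 0;

cancel:
	for (i = 0; i < failed; i++)		/* undo phase 1 only */
		if (p[i].cancel_attach)
			p[i].cancel_attach();
	return ret;
}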
+ */ +static void cgroup_migrate_finish(struct list_head *preloaded_csets) +{ + struct css_set *cset, *tmp_cset; + + lockdep_assert_held(&cgroup_mutex); + + spin_lock_irq(&css_set_lock); + list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { + cset->mg_src_cgrp = NULL; + cset->mg_dst_cgrp = NULL; + cset->mg_dst_cset = NULL; + list_del_init(&cset->mg_preload_node); + put_css_set_locked(cset); + } + spin_unlock_irq(&css_set_lock); +} + +/** + * cgroup_migrate_add_src - add a migration source css_set + * @src_cset: the source css_set to add + * @dst_cgrp: the destination cgroup + * @preloaded_csets: list of preloaded css_sets + * + * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin + * @src_cset and add it to @preloaded_csets, which should later be cleaned + * up by cgroup_migrate_finish(). + * + * This function may be called without holding cgroup_threadgroup_rwsem + * even if the target is a process. Threads may be created and destroyed + * but as long as cgroup_mutex is not dropped, no new css_set can be put + * into play and the preloaded css_sets are guaranteed to cover all + * migrations. + */ +static void cgroup_migrate_add_src(struct css_set *src_cset, + struct cgroup *dst_cgrp, + struct list_head *preloaded_csets) +{ + struct cgroup *src_cgrp; + + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&css_set_lock); + + /* + * If ->dead, @src_set is associated with one or more dead cgroups + * and doesn't contain any migratable tasks. Ignore it early so + * that the rest of migration path doesn't get confused by it. + */ + if (src_cset->dead) + return; + + src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); + + if (!list_empty(&src_cset->mg_preload_node)) + return; + + WARN_ON(src_cset->mg_src_cgrp); + WARN_ON(src_cset->mg_dst_cgrp); + WARN_ON(!list_empty(&src_cset->mg_tasks)); + WARN_ON(!list_empty(&src_cset->mg_node)); + + src_cset->mg_src_cgrp = src_cgrp; + src_cset->mg_dst_cgrp = dst_cgrp; + get_css_set(src_cset); + list_add(&src_cset->mg_preload_node, preloaded_csets); +} + +/** + * cgroup_migrate_prepare_dst - prepare destination css_sets for migration + * @preloaded_csets: list of preloaded source css_sets + * + * Tasks are about to be moved and all the source css_sets have been + * preloaded to @preloaded_csets. This function looks up and pins all + * destination css_sets, links each to its source, and append them to + * @preloaded_csets. + * + * This function must be called after cgroup_migrate_add_src() has been + * called on each migration source css_set. After migration is performed + * using cgroup_migrate(), cgroup_migrate_finish() must be called on + * @preloaded_csets. + */ +static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) +{ + LIST_HEAD(csets); + struct css_set *src_cset, *tmp_cset; + + lockdep_assert_held(&cgroup_mutex); + + /* look up the dst cset for each src cset and link it to src */ + list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { + struct css_set *dst_cset; + + dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); + if (!dst_cset) + goto err; + + WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); + + /* + * If src cset equals dst, it's noop. Drop the src. + * cgroup_migrate() will skip the cset too. Note that we + * can't handle src == dst as some nodes are used by both. 
+ */ + if (src_cset == dst_cset) { + src_cset->mg_src_cgrp = NULL; + src_cset->mg_dst_cgrp = NULL; + list_del_init(&src_cset->mg_preload_node); + put_css_set(src_cset); + put_css_set(dst_cset); + continue; + } + + src_cset->mg_dst_cset = dst_cset; + + if (list_empty(&dst_cset->mg_preload_node)) + list_add(&dst_cset->mg_preload_node, &csets); + else + put_css_set(dst_cset); + } + + list_splice_tail(&csets, preloaded_csets); + return 0; +err: + cgroup_migrate_finish(&csets); + return -ENOMEM; +} + +/** + * cgroup_migrate - migrate a process or task to a cgroup + * @leader: the leader of the process or the task to migrate + * @threadgroup: whether @leader points to the whole process or a single task + * @root: cgroup root migration is taking place on + * + * Migrate a process or task denoted by @leader. If migrating a process, + * the caller must be holding cgroup_threadgroup_rwsem. The caller is also + * responsible for invoking cgroup_migrate_add_src() and + * cgroup_migrate_prepare_dst() on the targets before invoking this + * function and following up with cgroup_migrate_finish(). + * + * As long as a controller's ->can_attach() doesn't fail, this function is + * guaranteed to succeed. This means that, excluding ->can_attach() + * failure, when migrating multiple targets, the success or failure can be + * decided for all targets by invoking cgroup_migrate_prepare_dst() before + * actually starting to migrate. + */ +static int cgroup_migrate(struct task_struct *leader, bool threadgroup, + struct cgroup_root *root) +{ + struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); + struct task_struct *task; + + /* + * Prevent freeing of tasks while we take a snapshot. Tasks that are + * already PF_EXITING could be freed from underneath us unless we + * take an rcu_read_lock. + */ + spin_lock_irq(&css_set_lock); + rcu_read_lock(); + task = leader; + do { + cgroup_taskset_add(task, &tset); + if (!threadgroup) + break; + } while_each_thread(leader, task); + rcu_read_unlock(); + spin_unlock_irq(&css_set_lock); + + return cgroup_taskset_migrate(&tset, root); +} + +/** + * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup + * @dst_cgrp: the cgroup to attach to + * @leader: the task or the leader of the threadgroup to be attached + * @threadgroup: attach the whole threadgroup? + * + * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
+ */ +static int cgroup_attach_task(struct cgroup *dst_cgrp, + struct task_struct *leader, bool threadgroup) +{ + LIST_HEAD(preloaded_csets); + struct task_struct *task; + int ret; + + if (!cgroup_may_migrate_to(dst_cgrp)) + return -EBUSY; + + /* look up all src csets */ + spin_lock_irq(&css_set_lock); + rcu_read_lock(); + task = leader; + do { + cgroup_migrate_add_src(task_css_set(task), dst_cgrp, + &preloaded_csets); + if (!threadgroup) + break; + } while_each_thread(leader, task); + rcu_read_unlock(); + spin_unlock_irq(&css_set_lock); + + /* prepare dst csets and commit */ + ret = cgroup_migrate_prepare_dst(&preloaded_csets); + if (!ret) + ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); + + cgroup_migrate_finish(&preloaded_csets); + + if (!ret) + trace_cgroup_attach_task(dst_cgrp, leader, threadgroup); + + return ret; +} + +static int cgroup_procs_write_permission(struct task_struct *task, + struct cgroup *dst_cgrp, + struct kernfs_open_file *of) +{ + const struct cred *cred = current_cred(); + const struct cred *tcred = get_task_cred(task); + int ret = 0; + + /* + * even if we're attaching all tasks in the thread group, we only + * need to check permissions on one of them. + */ + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && + !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid)) + ret = -EACCES; + + if (!ret && cgroup_on_dfl(dst_cgrp)) { + struct super_block *sb = of->file->f_path.dentry->d_sb; + struct cgroup *cgrp; + struct inode *inode; + + spin_lock_irq(&css_set_lock); + cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); + spin_unlock_irq(&css_set_lock); + + while (!cgroup_is_descendant(dst_cgrp, cgrp)) + cgrp = cgroup_parent(cgrp); + + ret = -ENOMEM; + inode = kernfs_get_inode(sb, cgrp->procs_file.kn); + if (inode) { + ret = inode_permission(inode, MAY_WRITE); + iput(inode); + } + } + + put_cred(tcred); + return ret; +} + +/* + * Find the task_struct of the task to attach by vpid and pass it along to the + * function to attach either it or all tasks in its threadgroup. Will lock + * cgroup_mutex and threadgroup. + */ +static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off, bool threadgroup) +{ + struct task_struct *tsk; + struct cgroup_subsys *ss; + struct cgroup *cgrp; + pid_t pid; + int ssid, ret; + + if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) + return -EINVAL; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + + percpu_down_write(&cgroup_threadgroup_rwsem); + rcu_read_lock(); + if (pid) { + tsk = find_task_by_vpid(pid); + if (!tsk) { + ret = -ESRCH; + goto out_unlock_rcu; + } + } else { + tsk = current; + } + + if (threadgroup) + tsk = tsk->group_leader; + + /* + * Workqueue threads may acquire PF_NO_SETAFFINITY and become + * trapped in a cpuset, or RT worker may be born in a cgroup + * with no rt_runtime allocated. Just say no. 
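All of this machinery backs a one-line userspace interface: writing a PID to cgroup.procs migrates that process. A small illustrative program; the mount path and PID are examples, and any attach error surfaces when the buffered write is flushed at fclose():

#include <stdio.h>

int main(void)
{
	/* example v1/v2 mount point; adjust to the local setup */
	const char *procs = "/sys/fs/cgroup/mygrp/cgroup.procs";
	FILE *f = fopen(procs, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "%d\n", 1234);	/* attach PID 1234; cgroup.procs
					 * moves its whole thread group */
	if (fclose(f) != 0) {
		perror("fclose");	/* attach errors reported here */
		return 1;
	}
	return 0;
}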
+ */ + if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { + ret = -EINVAL; + goto out_unlock_rcu; + } + + get_task_struct(tsk); + rcu_read_unlock(); + + ret = cgroup_procs_write_permission(tsk, cgrp, of); + if (!ret) + ret = cgroup_attach_task(cgrp, tsk, threadgroup); + + put_task_struct(tsk); + goto out_unlock_threadgroup; + +out_unlock_rcu: + rcu_read_unlock(); +out_unlock_threadgroup: + percpu_up_write(&cgroup_threadgroup_rwsem); + for_each_subsys(ss, ssid) + if (ss->post_attach) + ss->post_attach(); + cgroup_kn_unlock(of->kn); + return ret ?: nbytes; +} + +/** + * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' + * @from: attach to all cgroups of a given task + * @tsk: the task to be attached + */ +int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) +{ + struct cgroup_root *root; + int retval = 0; + + mutex_lock(&cgroup_mutex); + percpu_down_write(&cgroup_threadgroup_rwsem); + for_each_root(root) { + struct cgroup *from_cgrp; + + if (root == &cgrp_dfl_root) + continue; + + spin_lock_irq(&css_set_lock); + from_cgrp = task_cgroup_from_root(from, root); + spin_unlock_irq(&css_set_lock); + + retval = cgroup_attach_task(from_cgrp, tsk, false); + if (retval) + break; + } + percpu_up_write(&cgroup_threadgroup_rwsem); + mutex_unlock(&cgroup_mutex); + + return retval; +} +EXPORT_SYMBOL_GPL(cgroup_attach_task_all); + +static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return __cgroup_procs_write(of, buf, nbytes, off, false); +} + +static ssize_t cgroup_procs_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return __cgroup_procs_write(of, buf, nbytes, off, true); +} + +static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + + BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + spin_lock(&release_agent_path_lock); + strlcpy(cgrp->root->release_agent_path, strstrip(buf), + sizeof(cgrp->root->release_agent_path)); + spin_unlock(&release_agent_path_lock); + cgroup_kn_unlock(of->kn); + return nbytes; +} + +static int cgroup_release_agent_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + spin_lock(&release_agent_path_lock); + seq_puts(seq, cgrp->root->release_agent_path); + spin_unlock(&release_agent_path_lock); + seq_putc(seq, '\n'); + return 0; +} + +static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) +{ + seq_puts(seq, "0\n"); + return 0; +} + +static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) +{ + struct cgroup_subsys *ss; + bool printed = false; + int ssid; + + do_each_subsys_mask(ss, ssid, ss_mask) { + if (printed) + seq_putc(seq, ' '); + seq_printf(seq, "%s", ss->name); + printed = true; + } while_each_subsys_mask(); + if (printed) + seq_putc(seq, '\n'); +} + +/* show controllers which are enabled from the parent */ +static int cgroup_controllers_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + cgroup_print_ss_mask(seq, cgroup_control(cgrp)); + return 0; +} + +/* show controllers which are enabled for a given cgroup's children */ +static int cgroup_subtree_control_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + cgroup_print_ss_mask(seq, cgrp->subtree_control); + return 0; +} + +/** + * cgroup_update_dfl_csses - update css 
assoc of a subtree in default hierarchy + * @cgrp: root of the subtree to update csses for + * + * @cgrp's control masks have changed and its subtree's css associations + * need to be updated accordingly. This function looks up all css_sets + * which are attached to the subtree, creates the matching updated css_sets + * and migrates the tasks to the new ones. + */ +static int cgroup_update_dfl_csses(struct cgroup *cgrp) +{ + LIST_HEAD(preloaded_csets); + struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); + struct cgroup_subsys_state *d_css; + struct cgroup *dsct; + struct css_set *src_cset; + int ret; + + lockdep_assert_held(&cgroup_mutex); + + percpu_down_write(&cgroup_threadgroup_rwsem); + + /* look up all csses currently attached to @cgrp's subtree */ + spin_lock_irq(&css_set_lock); + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { + struct cgrp_cset_link *link; + + list_for_each_entry(link, &dsct->cset_links, cset_link) + cgroup_migrate_add_src(link->cset, dsct, + &preloaded_csets); + } + spin_unlock_irq(&css_set_lock); + + /* NULL dst indicates self on default hierarchy */ + ret = cgroup_migrate_prepare_dst(&preloaded_csets); + if (ret) + goto out_finish; + + spin_lock_irq(&css_set_lock); + list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { + struct task_struct *task, *ntask; + + /* src_csets precede dst_csets, break on the first dst_cset */ + if (!src_cset->mg_src_cgrp) + break; + + /* all tasks in src_csets need to be migrated */ + list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) + cgroup_taskset_add(task, &tset); + } + spin_unlock_irq(&css_set_lock); + + ret = cgroup_taskset_migrate(&tset, cgrp->root); +out_finish: + cgroup_migrate_finish(&preloaded_csets); + percpu_up_write(&cgroup_threadgroup_rwsem); + return ret; +} + +/** + * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses + * @cgrp: root of the target subtree + * + * Because css offlining is asynchronous, userland may try to re-enable a + * controller while the previous css is still around. This function grabs + * cgroup_mutex and drains the previous css instances of @cgrp's subtree. + */ +static void cgroup_lock_and_drain_offline(struct cgroup *cgrp) + __acquires(&cgroup_mutex) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + struct cgroup_subsys *ss; + int ssid; + +restart: + mutex_lock(&cgroup_mutex); + + cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + DEFINE_WAIT(wait); + + if (!css || !percpu_ref_is_dying(&css->refcnt)) + continue; + + cgroup_get(dsct); + prepare_to_wait(&dsct->offline_waitq, &wait, + TASK_UNINTERRUPTIBLE); + + mutex_unlock(&cgroup_mutex); + schedule(); + finish_wait(&dsct->offline_waitq, &wait); + + cgroup_put(dsct); + goto restart; + } + } +} + +/** + * cgroup_save_control - save control masks of a subtree + * @cgrp: root of the target subtree + * + * Save ->subtree_control and ->subtree_ss_mask to the respective old_ + * prefixed fields for @cgrp's subtree including @cgrp itself. 
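The drain loop in cgroup_lock_and_drain_offline() above sleeps for one dying css at a time and then restarts the whole scan, because the subtree may have changed while it slept. The same shape with pthreads, assuming entries outlive the wait (the kernel pins the cgroup with cgroup_get() before sleeping; all types here are illustrative):

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct entry {
	struct entry *next;
	bool dying;
	pthread_cond_t gone;	/* signalled when the entry finishes dying */
};

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;

static void lock_and_drain(struct entry *head)
{
restart:
	pthread_mutex_lock(&big_lock);
	for (struct entry *e = head; e; e = e->next) {
		if (!e->dying)
			continue;
		/* the wait drops big_lock, like the schedule() above */
		pthread_cond_wait(&e->gone, &big_lock);
		pthread_mutex_unlock(&big_lock);
		goto restart;	/* re-scan, the set may have changed */
	}
	/* returns with big_lock held and no dying entries in sight */
}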
+ */
+static void cgroup_save_control(struct cgroup *cgrp)
+{
+ struct cgroup *dsct;
+ struct cgroup_subsys_state *d_css;
+
+ cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+ dsct->old_subtree_control = dsct->subtree_control;
+ dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
+ }
+}
+
+/**
+ * cgroup_propagate_control - refresh control masks of a subtree
+ * @cgrp: root of the target subtree
+ *
+ * For @cgrp and its subtree, ensure ->subtree_ss_mask matches
+ * ->subtree_control and propagate controller availability through the
+ * subtree so that descendants don't have unavailable controllers enabled.
+ */
+static void cgroup_propagate_control(struct cgroup *cgrp)
+{
+ struct cgroup *dsct;
+ struct cgroup_subsys_state *d_css;
+
+ cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+ dsct->subtree_control &= cgroup_control(dsct);
+ dsct->subtree_ss_mask =
+ cgroup_calc_subtree_ss_mask(dsct->subtree_control,
+ cgroup_ss_mask(dsct));
+ }
+}
+
+/**
+ * cgroup_restore_control - restore control masks of a subtree
+ * @cgrp: root of the target subtree
+ *
+ * Restore ->subtree_control and ->subtree_ss_mask from the respective old_
+ * prefixed fields for @cgrp's subtree including @cgrp itself.
+ */
+static void cgroup_restore_control(struct cgroup *cgrp)
+{
+ struct cgroup *dsct;
+ struct cgroup_subsys_state *d_css;
+
+ cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
+ dsct->subtree_control = dsct->old_subtree_control;
+ dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
+ }
+}
+
+static bool css_visible(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys *ss = css->ss;
+ struct cgroup *cgrp = css->cgroup;
+
+ if (cgroup_control(cgrp) & (1 << ss->id))
+ return true;
+ if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))
+ return false;
+ return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
+}
+
+/**
+ * cgroup_apply_control_enable - enable or show csses according to control
+ * @cgrp: root of the target subtree
+ *
+ * Walk @cgrp's subtree and create new csses or make the existing ones
+ * visible. A css is created invisible if it's being implicitly enabled
+ * through dependency. An invisible css is made visible when the userland
+ * explicitly enables it.
+ *
+ * Returns 0 on success, -errno on failure. On failure, csses which have
+ * been processed already aren't cleaned up. The caller is responsible for
+ * cleaning up with cgroup_apply_control_disable().
+ */
+static int cgroup_apply_control_enable(struct cgroup *cgrp)
+{
+ struct cgroup *dsct;
+ struct cgroup_subsys_state *d_css;
+ struct cgroup_subsys *ss;
+ int ssid, ret;
+
+ cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+ for_each_subsys(ss, ssid) {
+ struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
+
+ WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
+
+ if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
+ continue;
+
+ if (!css) {
+ css = css_create(dsct, ss);
+ if (IS_ERR(css))
+ return PTR_ERR(css);
+ }
+
+ if (css_visible(css)) {
+ ret = css_populate_dir(css);
+ if (ret)
+ return ret;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * cgroup_apply_control_disable - kill or hide csses according to control
+ * @cgrp: root of the target subtree
+ *
+ * Walk @cgrp's subtree and kill and hide csses so that they match
+ * cgroup_ss_mask() and cgroup_visible_mask().
+ *
+ * A css is hidden when the userland requests it to be disabled while other
+ * subsystems are still depending on it. The css must not actively control
+ * resources and be in the vanilla state if it's made visible again later.
+ * Controllers which may be depended upon should provide ->css_reset() for
+ * this purpose.
+ */
+static void cgroup_apply_control_disable(struct cgroup *cgrp)
+{
+ struct cgroup *dsct;
+ struct cgroup_subsys_state *d_css;
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
+ for_each_subsys(ss, ssid) {
+ struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
+
+ WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
+
+ if (!css)
+ continue;
+
+ if (css->parent &&
+ !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
+ kill_css(css);
+ } else if (!css_visible(css)) {
+ css_clear_dir(css);
+ if (ss->css_reset)
+ ss->css_reset(css);
+ }
+ }
+ }
+}
+
+/**
+ * cgroup_apply_control - apply control mask updates to the subtree
+ * @cgrp: root of the target subtree
+ *
+ * Subsystems can be enabled and disabled in a subtree using the following
+ * steps.
+ *
+ * 1. Call cgroup_save_control() to stash the current state.
+ * 2. Update ->subtree_control masks in the subtree as desired.
+ * 3. Call cgroup_apply_control() to apply the changes.
+ * 4. Optionally perform other related operations.
+ * 5. Call cgroup_finalize_control() to finish up.
+ *
+ * This function implements step 3 and propagates the mask changes
+ * throughout @cgrp's subtree, updates csses accordingly and performs
+ * process migrations.
+ */
+static int cgroup_apply_control(struct cgroup *cgrp)
+{
+ int ret;
+
+ cgroup_propagate_control(cgrp);
+
+ ret = cgroup_apply_control_enable(cgrp);
+ if (ret)
+ return ret;
+
+ /*
+ * At this point, cgroup_e_css() results reflect the new csses
+ * making the following cgroup_update_dfl_csses() properly update
+ * css associations of all tasks in the subtree.
+ */
+ ret = cgroup_update_dfl_csses(cgrp);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+/**
+ * cgroup_finalize_control - finalize control mask update
+ * @cgrp: root of the target subtree
+ * @ret: the result of the update
+ *
+ * Finalize control mask update. See cgroup_apply_control() for more info.
+ */
+static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
+{
+ if (ret) {
+ cgroup_restore_control(cgrp);
+ cgroup_propagate_control(cgrp);
+ }
+
+ cgroup_apply_control_disable(cgrp);
+}
+
+/* change the enabled child controllers for a cgroup in the default hierarchy */
+static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ u16 enable = 0, disable = 0;
+ struct cgroup *cgrp, *child;
+ struct cgroup_subsys *ss;
+ char *tok;
+ int ssid, ret;
+
+ /*
+ * Parse input - space separated list of subsystem names prefixed
+ * with either + or -.
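+ *
+ * Editor's note (illustrative, not part of the patch): a write of
+ * "+memory -io" leaves enable == (1 << memory_cgrp_id) and
+ * disable == (1 << io_cgrp_id) once the loop below has consumed
+ * both tokens.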
+ */ + buf = strstrip(buf); + while ((tok = strsep(&buf, " "))) { + if (tok[0] == '\0') + continue; + do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) { + if (!cgroup_ssid_enabled(ssid) || + strcmp(tok + 1, ss->name)) + continue; + + if (*tok == '+') { + enable |= 1 << ssid; + disable &= ~(1 << ssid); + } else if (*tok == '-') { + disable |= 1 << ssid; + enable &= ~(1 << ssid); + } else { + return -EINVAL; + } + break; + } while_each_subsys_mask(); + if (ssid == CGROUP_SUBSYS_COUNT) + return -EINVAL; + } + + cgrp = cgroup_kn_lock_live(of->kn, true); + if (!cgrp) + return -ENODEV; + + for_each_subsys(ss, ssid) { + if (enable & (1 << ssid)) { + if (cgrp->subtree_control & (1 << ssid)) { + enable &= ~(1 << ssid); + continue; + } + + if (!(cgroup_control(cgrp) & (1 << ssid))) { + ret = -ENOENT; + goto out_unlock; + } + } else if (disable & (1 << ssid)) { + if (!(cgrp->subtree_control & (1 << ssid))) { + disable &= ~(1 << ssid); + continue; + } + + /* a child has it enabled? */ + cgroup_for_each_live_child(child, cgrp) { + if (child->subtree_control & (1 << ssid)) { + ret = -EBUSY; + goto out_unlock; + } + } + } + } + + if (!enable && !disable) { + ret = 0; + goto out_unlock; + } + + /* + * Except for the root, subtree_control must be zero for a cgroup + * with tasks so that child cgroups don't compete against tasks. + */ + if (enable && cgroup_parent(cgrp)) { + struct cgrp_cset_link *link; + + /* + * Because namespaces pin csets too, @cgrp->cset_links + * might not be empty even when @cgrp is empty. Walk and + * verify each cset. + */ + spin_lock_irq(&css_set_lock); + + ret = 0; + list_for_each_entry(link, &cgrp->cset_links, cset_link) { + if (css_set_populated(link->cset)) { + ret = -EBUSY; + break; + } + } + + spin_unlock_irq(&css_set_lock); + + if (ret) + goto out_unlock; + } + + /* save and update control masks and prepare csses */ + cgroup_save_control(cgrp); + + cgrp->subtree_control |= enable; + cgrp->subtree_control &= ~disable; + + ret = cgroup_apply_control(cgrp); + + cgroup_finalize_control(cgrp, ret); + + kernfs_activate(cgrp->kn); + ret = 0; +out_unlock: + cgroup_kn_unlock(of->kn); + return ret ?: nbytes; +} + +static int cgroup_events_show(struct seq_file *seq, void *v) +{ + seq_printf(seq, "populated %d\n", + cgroup_is_populated(seq_css(seq)->cgroup)); + return 0; +} + +static int cgroup_file_open(struct kernfs_open_file *of) +{ + struct cftype *cft = of->kn->priv; + + if (cft->open) + return cft->open(of); + return 0; +} + +static void cgroup_file_release(struct kernfs_open_file *of) +{ + struct cftype *cft = of->kn->priv; + + if (cft->release) + cft->release(of); +} + +static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct cgroup *cgrp = of->kn->parent->priv; + struct cftype *cft = of->kn->priv; + struct cgroup_subsys_state *css; + int ret; + + if (cft->write) + return cft->write(of, buf, nbytes, off); + + /* + * kernfs guarantees that a file isn't deleted with operations in + * flight, which means that the matching css is and stays alive and + * doesn't need to be pinned. The RCU locking is not necessary + * either. It's just for the convenience of using cgroup_css(). 
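+ *
+ * Editor's note: a controller that is happy with this fallback only
+ * fills in ->write_u64; a minimal sketch with a hypothetical handler
+ * name (illustration only):
+ *
+ *   static struct cftype my_files[] = {
+ *           { .name = "limit", .write_u64 = my_write_u64 },
+ *           { }
+ *   };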
+ */ + rcu_read_lock(); + css = cgroup_css(cgrp, cft->ss); + rcu_read_unlock(); + + if (cft->write_u64) { + unsigned long long v; + ret = kstrtoull(buf, 0, &v); + if (!ret) + ret = cft->write_u64(css, cft, v); + } else if (cft->write_s64) { + long long v; + ret = kstrtoll(buf, 0, &v); + if (!ret) + ret = cft->write_s64(css, cft, v); + } else { + ret = -EINVAL; + } + + return ret ?: nbytes; +} + +static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) +{ + return seq_cft(seq)->seq_start(seq, ppos); +} + +static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) +{ + return seq_cft(seq)->seq_next(seq, v, ppos); +} + +static void cgroup_seqfile_stop(struct seq_file *seq, void *v) +{ + if (seq_cft(seq)->seq_stop) + seq_cft(seq)->seq_stop(seq, v); +} + +static int cgroup_seqfile_show(struct seq_file *m, void *arg) +{ + struct cftype *cft = seq_cft(m); + struct cgroup_subsys_state *css = seq_css(m); + + if (cft->seq_show) + return cft->seq_show(m, arg); + + if (cft->read_u64) + seq_printf(m, "%llu\n", cft->read_u64(css, cft)); + else if (cft->read_s64) + seq_printf(m, "%lld\n", cft->read_s64(css, cft)); + else + return -EINVAL; + return 0; +} + +static struct kernfs_ops cgroup_kf_single_ops = { + .atomic_write_len = PAGE_SIZE, + .open = cgroup_file_open, + .release = cgroup_file_release, + .write = cgroup_file_write, + .seq_show = cgroup_seqfile_show, +}; + +static struct kernfs_ops cgroup_kf_ops = { + .atomic_write_len = PAGE_SIZE, + .open = cgroup_file_open, + .release = cgroup_file_release, + .write = cgroup_file_write, + .seq_start = cgroup_seqfile_start, + .seq_next = cgroup_seqfile_next, + .seq_stop = cgroup_seqfile_stop, + .seq_show = cgroup_seqfile_show, +}; + +/* + * cgroup_rename - Only allow simple rename of directories in place. + */ +static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name_str) +{ + struct cgroup *cgrp = kn->priv; + int ret; + + if (kernfs_type(kn) != KERNFS_DIR) + return -ENOTDIR; + if (kn->parent != new_parent) + return -EIO; + + /* + * This isn't a proper migration and its usefulness is very + * limited. Disallow on the default hierarchy. + */ + if (cgroup_on_dfl(cgrp)) + return -EPERM; + + /* + * We're gonna grab cgroup_mutex which nests outside kernfs + * active_ref. kernfs_rename() doesn't require active_ref + * protection. Break them before grabbing cgroup_mutex. 
+ */ + kernfs_break_active_protection(new_parent); + kernfs_break_active_protection(kn); + + mutex_lock(&cgroup_mutex); + + ret = kernfs_rename(kn, new_parent, new_name_str); + if (!ret) + trace_cgroup_rename(cgrp); + + mutex_unlock(&cgroup_mutex); + + kernfs_unbreak_active_protection(kn); + kernfs_unbreak_active_protection(new_parent); + return ret; +} + +/* set uid and gid of cgroup dirs and files to that of the creator */ +static int cgroup_kn_set_ugid(struct kernfs_node *kn) +{ + struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = current_fsuid(), + .ia_gid = current_fsgid(), }; + + if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && + gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) + return 0; + + return kernfs_setattr(kn, &iattr); +} + +static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, + struct cftype *cft) +{ + char name[CGROUP_FILE_NAME_MAX]; + struct kernfs_node *kn; + struct lock_class_key *key = NULL; + int ret; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + key = &cft->lockdep_key; +#endif + kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), + cgroup_file_mode(cft), 0, cft->kf_ops, cft, + NULL, key); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + ret = cgroup_kn_set_ugid(kn); + if (ret) { + kernfs_remove(kn); + return ret; + } + + if (cft->file_offset) { + struct cgroup_file *cfile = (void *)css + cft->file_offset; + + spin_lock_irq(&cgroup_file_kn_lock); + cfile->kn = kn; + spin_unlock_irq(&cgroup_file_kn_lock); + } + + return 0; +} + +/** + * cgroup_addrm_files - add or remove files to a cgroup directory + * @css: the target css + * @cgrp: the target cgroup (usually css->cgroup) + * @cfts: array of cftypes to be added + * @is_add: whether to add or remove + * + * Depending on @is_add, add or remove files defined by @cfts on @cgrp. + * For removals, this function never fails. + */ +static int cgroup_addrm_files(struct cgroup_subsys_state *css, + struct cgroup *cgrp, struct cftype cfts[], + bool is_add) +{ + struct cftype *cft, *cft_end = NULL; + int ret = 0; + + lockdep_assert_held(&cgroup_mutex); + +restart: + for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) { + /* does cft->flags tell us to skip this file on @cgrp? 
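+ * Editor's note: e.g. a v1-only file registered through
+ * cgroup_add_legacy_cftypes() carries __CFTYPE_NOT_ON_DFL and is
+ * skipped here on the default hierarchy.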
*/ + if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) + continue; + if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp)) + continue; + if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) + continue; + if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) + continue; + + if (is_add) { + ret = cgroup_add_file(css, cgrp, cft); + if (ret) { + pr_warn("%s: failed to add %s, err=%d\n", + __func__, cft->name, ret); + cft_end = cft; + is_add = false; + goto restart; + } + } else { + cgroup_rm_file(cgrp, cft); + } + } + return ret; +} + +static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) +{ + LIST_HEAD(pending); + struct cgroup_subsys *ss = cfts[0].ss; + struct cgroup *root = &ss->root->cgrp; + struct cgroup_subsys_state *css; + int ret = 0; + + lockdep_assert_held(&cgroup_mutex); + + /* add/rm files for all cgroups created before */ + css_for_each_descendant_pre(css, cgroup_css(root, ss)) { + struct cgroup *cgrp = css->cgroup; + + if (!(css->flags & CSS_VISIBLE)) + continue; + + ret = cgroup_addrm_files(css, cgrp, cfts, is_add); + if (ret) + break; + } + + if (is_add && !ret) + kernfs_activate(root->kn); + return ret; +} + +static void cgroup_exit_cftypes(struct cftype *cfts) +{ + struct cftype *cft; + + for (cft = cfts; cft->name[0] != '\0'; cft++) { + /* free copy for custom atomic_write_len, see init_cftypes() */ + if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) + kfree(cft->kf_ops); + cft->kf_ops = NULL; + cft->ss = NULL; + + /* revert flags set by cgroup core while adding @cfts */ + cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL); + } +} + +static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +{ + struct cftype *cft; + + for (cft = cfts; cft->name[0] != '\0'; cft++) { + struct kernfs_ops *kf_ops; + + WARN_ON(cft->ss || cft->kf_ops); + + if (cft->seq_start) + kf_ops = &cgroup_kf_ops; + else + kf_ops = &cgroup_kf_single_ops; + + /* + * Ugh... if @cft wants a custom max_write_len, we need to + * make a copy of kf_ops to set its atomic_write_len. + */ + if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) { + kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL); + if (!kf_ops) { + cgroup_exit_cftypes(cfts); + return -ENOMEM; + } + kf_ops->atomic_write_len = cft->max_write_len; + } + + cft->kf_ops = kf_ops; + cft->ss = ss; + } + + return 0; +} + +static int cgroup_rm_cftypes_locked(struct cftype *cfts) +{ + lockdep_assert_held(&cgroup_mutex); + + if (!cfts || !cfts[0].ss) + return -ENOENT; + + list_del(&cfts->node); + cgroup_apply_cftypes(cfts, false); + cgroup_exit_cftypes(cfts); + return 0; +} + +/** + * cgroup_rm_cftypes - remove an array of cftypes from a subsystem + * @cfts: zero-length name terminated array of cftypes + * + * Unregister @cfts. Files described by @cfts are removed from all + * existing cgroups and all future cgroups won't have them either. This + * function can be called anytime whether @cfts' subsys is attached or not. + * + * Returns 0 on successful unregistration, -ENOENT if @cfts is not + * registered. + */ +int cgroup_rm_cftypes(struct cftype *cfts) +{ + int ret; + + mutex_lock(&cgroup_mutex); + ret = cgroup_rm_cftypes_locked(cfts); + mutex_unlock(&cgroup_mutex); + return ret; +} + +/** + * cgroup_add_cftypes - add an array of cftypes to a subsystem + * @ss: target cgroup subsystem + * @cfts: zero-length name terminated array of cftypes + * + * Register @cfts to @ss. 
Files described by @cfts are created for all + * existing cgroups to which @ss is attached and all future cgroups will + * have them too. This function can be called anytime whether @ss is + * attached or not. + * + * Returns 0 on successful registration, -errno on failure. Note that this + * function currently returns 0 as long as @cfts registration is successful + * even if some file creation attempts on existing cgroups fail. + */ +static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +{ + int ret; + + if (!cgroup_ssid_enabled(ss->id)) + return 0; + + if (!cfts || cfts[0].name[0] == '\0') + return 0; + + ret = cgroup_init_cftypes(ss, cfts); + if (ret) + return ret; + + mutex_lock(&cgroup_mutex); + + list_add_tail(&cfts->node, &ss->cfts); + ret = cgroup_apply_cftypes(cfts, true); + if (ret) + cgroup_rm_cftypes_locked(cfts); + + mutex_unlock(&cgroup_mutex); + return ret; +} + +/** + * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy + * @ss: target cgroup subsystem + * @cfts: zero-length name terminated array of cftypes + * + * Similar to cgroup_add_cftypes() but the added files are only used for + * the default hierarchy. + */ +int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +{ + struct cftype *cft; + + for (cft = cfts; cft && cft->name[0] != '\0'; cft++) + cft->flags |= __CFTYPE_ONLY_ON_DFL; + return cgroup_add_cftypes(ss, cfts); +} + +/** + * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies + * @ss: target cgroup subsystem + * @cfts: zero-length name terminated array of cftypes + * + * Similar to cgroup_add_cftypes() but the added files are only used for + * the legacy hierarchies. + */ +int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +{ + struct cftype *cft; + + for (cft = cfts; cft && cft->name[0] != '\0'; cft++) + cft->flags |= __CFTYPE_NOT_ON_DFL; + return cgroup_add_cftypes(ss, cfts); +} + +/** + * cgroup_file_notify - generate a file modified event for a cgroup_file + * @cfile: target cgroup_file + * + * @cfile must have been obtained by setting cftype->file_offset. + */ +void cgroup_file_notify(struct cgroup_file *cfile) +{ + unsigned long flags; + + spin_lock_irqsave(&cgroup_file_kn_lock, flags); + if (cfile->kn) + kernfs_notify(cfile->kn); + spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); +} + +/** + * cgroup_task_count - count the number of tasks in a cgroup. + * @cgrp: the cgroup in question + * + * Return the number of tasks in the cgroup. The returned number can be + * higher than the actual number of tasks due to css_set references from + * namespace roots and temporary usages. + */ +static int cgroup_task_count(const struct cgroup *cgrp) +{ + int count = 0; + struct cgrp_cset_link *link; + + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &cgrp->cset_links, cset_link) + count += atomic_read(&link->cset->refcount); + spin_unlock_irq(&css_set_lock); + return count; +} + +/** + * css_next_child - find the next child of a given css + * @pos: the current position (%NULL to initiate traversal) + * @parent: css whose children to walk + * + * This function returns the next child of @parent and should be called + * under either cgroup_mutex or RCU read lock. The only requirement is + * that @parent and @pos are accessible. The next sibling is guaranteed to + * be returned regardless of their states. 
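+ *
+ * Editor's note: a typical child walk built on this helper, with
+ * hypothetical names (illustration only):
+ *
+ *   rcu_read_lock();
+ *   css_for_each_child(child, parent_css)
+ *           do_something(child);
+ *   rcu_read_unlock();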
+ * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. + */ +struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *parent) +{ + struct cgroup_subsys_state *next; + + cgroup_assert_mutex_or_rcu_locked(); + + /* + * @pos could already have been unlinked from the sibling list. + * Once a cgroup is removed, its ->sibling.next is no longer + * updated when its next sibling changes. CSS_RELEASED is set when + * @pos is taken off list, at which time its next pointer is valid, + * and, as releases are serialized, the one pointed to by the next + * pointer is guaranteed to not have started release yet. This + * implies that if we observe !CSS_RELEASED on @pos in this RCU + * critical section, the one pointed to by its next pointer is + * guaranteed to not have finished its RCU grace period even if we + * have dropped rcu_read_lock() inbetween iterations. + * + * If @pos has CSS_RELEASED set, its next pointer can't be + * dereferenced; however, as each css is given a monotonically + * increasing unique serial number and always appended to the + * sibling list, the next one can be found by walking the parent's + * children until the first css with higher serial number than + * @pos's. While this path can be slower, it happens iff iteration + * races against release and the race window is very small. + */ + if (!pos) { + next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling); + } else if (likely(!(pos->flags & CSS_RELEASED))) { + next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling); + } else { + list_for_each_entry_rcu(next, &parent->children, sibling) + if (next->serial_nr > pos->serial_nr) + break; + } + + /* + * @next, if not pointing to the head, can be dereferenced and is + * the next sibling. + */ + if (&next->sibling != &parent->children) + return next; + return NULL; +} + +/** + * css_next_descendant_pre - find the next descendant for pre-order walk + * @pos: the current position (%NULL to initiate traversal) + * @root: css whose descendants to walk + * + * To be used by css_for_each_descendant_pre(). Find the next descendant + * to visit for pre-order traversal of @root's descendants. @root is + * included in the iteration and the first node to be visited. + * + * While this function requires cgroup_mutex or RCU read locking, it + * doesn't require the whole traversal to be contained in a single critical + * section. This function will return the correct next descendant as long + * as both @pos and @root are accessible and @pos is a descendant of @root. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. 
+ */
+struct cgroup_subsys_state *
+css_next_descendant_pre(struct cgroup_subsys_state *pos,
+ struct cgroup_subsys_state *root)
+{
+ struct cgroup_subsys_state *next;
+
+ cgroup_assert_mutex_or_rcu_locked();
+
+ /* if first iteration, visit @root */
+ if (!pos)
+ return root;
+
+ /* visit the first child if exists */
+ next = css_next_child(NULL, pos);
+ if (next)
+ return next;
+
+ /* no child, visit my or the closest ancestor's next sibling */
+ while (pos != root) {
+ next = css_next_child(pos, pos->parent);
+ if (next)
+ return next;
+ pos = pos->parent;
+ }
+
+ return NULL;
+}
+
+/**
+ * css_rightmost_descendant - return the rightmost descendant of a css
+ * @pos: css of interest
+ *
+ * Return the rightmost descendant of @pos. If there's no descendant, @pos
+ * is returned. This can be used during pre-order traversal to skip
+ * subtree of @pos.
+ *
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section. This function will return the correct rightmost descendant as
+ * long as @pos is accessible.
+ */
+struct cgroup_subsys_state *
+css_rightmost_descendant(struct cgroup_subsys_state *pos)
+{
+ struct cgroup_subsys_state *last, *tmp;
+
+ cgroup_assert_mutex_or_rcu_locked();
+
+ do {
+ last = pos;
+ /* ->prev isn't RCU safe, walk ->next till the end */
+ pos = NULL;
+ css_for_each_child(tmp, last)
+ pos = tmp;
+ } while (pos);
+
+ return last;
+}
+
+static struct cgroup_subsys_state *
+css_leftmost_descendant(struct cgroup_subsys_state *pos)
+{
+ struct cgroup_subsys_state *last;
+
+ do {
+ last = pos;
+ pos = css_next_child(NULL, pos);
+ } while (pos);
+
+ return last;
+}
+
+/**
+ * css_next_descendant_post - find the next descendant for post-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @root: css whose descendants to walk
+ *
+ * To be used by css_for_each_descendant_post(). Find the next descendant
+ * to visit for post-order traversal of @root's descendants. @root is
+ * included in the iteration and the last node to be visited.
+ *
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section. This function will return the correct next descendant as long
+ * as both @pos and @root are accessible and @pos is a descendant of
+ * @root.
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal. It's each subsystem's
+ * responsibility to synchronize against on/offlining.
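+ *
+ * Editor's note: post-order is the natural shape for teardown walks,
+ * children before parents, e.g. with hypothetical names (illustration
+ * only):
+ *
+ *   css_for_each_descendant_post(pos, root_css)
+ *           do_something(pos);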
+ */
+struct cgroup_subsys_state *
+css_next_descendant_post(struct cgroup_subsys_state *pos,
+ struct cgroup_subsys_state *root)
+{
+ struct cgroup_subsys_state *next;
+
+ cgroup_assert_mutex_or_rcu_locked();
+
+ /* if first iteration, visit leftmost descendant which may be @root */
+ if (!pos)
+ return css_leftmost_descendant(root);
+
+ /* if we visited @root, we're done */
+ if (pos == root)
+ return NULL;
+
+ /* if there's an unvisited sibling, visit its leftmost descendant */
+ next = css_next_child(pos, pos->parent);
+ if (next)
+ return css_leftmost_descendant(next);
+
+ /* no sibling left, visit parent */
+ return pos->parent;
+}
+
+/**
+ * css_has_online_children - does a css have online children
+ * @css: the target css
+ *
+ * Returns %true if @css has any online children; otherwise, %false. This
+ * function can be called from any context but the caller is responsible
+ * for synchronizing against on/offlining as necessary.
+ */
+bool css_has_online_children(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys_state *child;
+ bool ret = false;
+
+ rcu_read_lock();
+ css_for_each_child(child, css) {
+ if (child->flags & CSS_ONLINE) {
+ ret = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+/**
+ * css_task_iter_advance_css_set - advance a task iterator to the next css_set
+ * @it: the iterator to advance
+ *
+ * Advance @it to the next css_set to walk.
+ */
+static void css_task_iter_advance_css_set(struct css_task_iter *it)
+{
+ struct list_head *l = it->cset_pos;
+ struct cgrp_cset_link *link;
+ struct css_set *cset;
+
+ lockdep_assert_held(&css_set_lock);
+
+ /* Advance to the next non-empty css_set */
+ do {
+ l = l->next;
+ if (l == it->cset_head) {
+ it->cset_pos = NULL;
+ it->task_pos = NULL;
+ return;
+ }
+
+ if (it->ss) {
+ cset = container_of(l, struct css_set,
+ e_cset_node[it->ss->id]);
+ } else {
+ link = list_entry(l, struct cgrp_cset_link, cset_link);
+ cset = link->cset;
+ }
+ } while (!css_set_populated(cset));
+
+ it->cset_pos = l;
+
+ if (!list_empty(&cset->tasks))
+ it->task_pos = cset->tasks.next;
+ else
+ it->task_pos = cset->mg_tasks.next;
+
+ it->tasks_head = &cset->tasks;
+ it->mg_tasks_head = &cset->mg_tasks;
+
+ /*
+ * We don't keep css_sets locked across iteration steps and thus
+ * need to take steps to ensure that iteration can be resumed after
+ * the lock is re-acquired. Iteration is performed at two levels -
+ * css_sets and tasks in them.
+ *
+ * Once created, a css_set never leaves its cgroup lists, so a
+ * pinned css_set is guaranteed to stay put and we can resume
+ * iteration afterwards.
+ *
+ * Tasks may leave @cset across iteration steps. This is resolved
+ * by registering each iterator with the css_set currently being
+ * walked and making css_set_move_task() advance iterators whose
+ * next task is leaving.
+ */
+ if (it->cur_cset) {
+ list_del(&it->iters_node);
+ put_css_set_locked(it->cur_cset);
+ }
+ get_css_set(cset);
+ it->cur_cset = cset;
+ list_add(&it->iters_node, &cset->task_iters);
+}
+
+static void css_task_iter_advance(struct css_task_iter *it)
+{
+ struct list_head *l = it->task_pos;
+
+ lockdep_assert_held(&css_set_lock);
+ WARN_ON_ONCE(!l);
+
+ /*
+ * Advance iterator to find next entry. cset->tasks is consumed
+ * first and then ->mg_tasks. After ->mg_tasks, we move onto the
+ * next cset.
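+ *
+ * Editor's note, visit order within one cset (illustration):
+ *
+ *   cset->tasks:    t1 -> t2 -> ... -> tasks_head
+ *   cset->mg_tasks: m1 -> m2 -> ... -> mg_tasks_head
+ *
+ * and only then does the walk hop to the next cset via
+ * css_task_iter_advance_css_set().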
+ */
+ l = l->next;
+
+ if (l == it->tasks_head)
+ l = it->mg_tasks_head->next;
+
+ if (l == it->mg_tasks_head)
+ css_task_iter_advance_css_set(it);
+ else
+ it->task_pos = l;
+}
+
+/**
+ * css_task_iter_start - initiate task iteration
+ * @css: the css to walk tasks of
+ * @it: the task iterator to use
+ *
+ * Initiate iteration through the tasks of @css. The caller can call
+ * css_task_iter_next() to walk through the tasks until the function
+ * returns NULL. On completion of iteration, css_task_iter_end() must be
+ * called.
+ */
+void css_task_iter_start(struct cgroup_subsys_state *css,
+ struct css_task_iter *it)
+{
+ /* no one should try to iterate before mounting cgroups */
+ WARN_ON_ONCE(!use_task_css_set_links);
+
+ memset(it, 0, sizeof(*it));
+
+ spin_lock_irq(&css_set_lock);
+
+ it->ss = css->ss;
+
+ if (it->ss)
+ it->cset_pos = &css->cgroup->e_csets[css->ss->id];
+ else
+ it->cset_pos = &css->cgroup->cset_links;
+
+ it->cset_head = it->cset_pos;
+
+ css_task_iter_advance_css_set(it);
+
+ spin_unlock_irq(&css_set_lock);
+}
+
+/**
+ * css_task_iter_next - return the next task for the iterator
+ * @it: the task iterator being iterated
+ *
+ * The "next" function for task iteration. @it should have been
+ * initialized via css_task_iter_start(). Returns NULL when the iteration
+ * reaches the end.
+ */
+struct task_struct *css_task_iter_next(struct css_task_iter *it)
+{
+ if (it->cur_task) {
+ put_task_struct(it->cur_task);
+ it->cur_task = NULL;
+ }
+
+ spin_lock_irq(&css_set_lock);
+
+ if (it->task_pos) {
+ it->cur_task = list_entry(it->task_pos, struct task_struct,
+ cg_list);
+ get_task_struct(it->cur_task);
+ css_task_iter_advance(it);
+ }
+
+ spin_unlock_irq(&css_set_lock);
+
+ return it->cur_task;
+}
+
+/**
+ * css_task_iter_end - finish task iteration
+ * @it: the task iterator to finish
+ *
+ * Finish task iteration started by css_task_iter_start().
+ */
+void css_task_iter_end(struct css_task_iter *it)
+{
+ if (it->cur_cset) {
+ spin_lock_irq(&css_set_lock);
+ list_del(&it->iters_node);
+ put_css_set_locked(it->cur_cset);
+ spin_unlock_irq(&css_set_lock);
+ }
+
+ if (it->cur_task)
+ put_task_struct(it->cur_task);
+}
+
+/**
+ * cgroup_transfer_tasks - move tasks from one cgroup to another
+ * @to: cgroup to which the tasks will be moved
+ * @from: cgroup in which the tasks currently reside
+ *
+ * Locking rules between cgroup_post_fork() and the migration path
+ * guarantee that, if a task is forking while being migrated, the new child
+ * is guaranteed to be either visible in the source cgroup after the
+ * parent's migration is complete or put into the target cgroup. No task
+ * can slip out of migration through forking.
+ */
+int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
+{
+ LIST_HEAD(preloaded_csets);
+ struct cgrp_cset_link *link;
+ struct css_task_iter it;
+ struct task_struct *task;
+ int ret;
+
+ if (cgroup_on_dfl(to))
+ return -EINVAL;
+
+ if (!cgroup_may_migrate_to(to))
+ return -EBUSY;
+
+ mutex_lock(&cgroup_mutex);
+
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+
+ /* all tasks in @from are being moved, all csets are source */
+ spin_lock_irq(&css_set_lock);
+ list_for_each_entry(link, &from->cset_links, cset_link)
+ cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
+ spin_unlock_irq(&css_set_lock);
+
+ ret = cgroup_migrate_prepare_dst(&preloaded_csets);
+ if (ret)
+ goto out_err;
+
+ /*
+ * Migrate tasks one-by-one until @from is empty. This fails iff
+ * ->can_attach() fails.
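+ *
+ * Editor's note: the only in-tree caller at this point is cpuset,
+ * which evacuates an emptied v1 cpuset roughly as (illustration):
+ *
+ *   cgroup_transfer_tasks(parent_cs->css.cgroup, cs->css.cgroup);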
+ */ + do { + css_task_iter_start(&from->self, &it); + task = css_task_iter_next(&it); + if (task) + get_task_struct(task); + css_task_iter_end(&it); + + if (task) { + ret = cgroup_migrate(task, false, to->root); + if (!ret) + trace_cgroup_transfer_tasks(to, task, false); + put_task_struct(task); + } + } while (task && !ret); +out_err: + cgroup_migrate_finish(&preloaded_csets); + percpu_up_write(&cgroup_threadgroup_rwsem); + mutex_unlock(&cgroup_mutex); + return ret; +} + +static void cgroup_procs_release(struct kernfs_open_file *of) +{ + if (of->priv) { + css_task_iter_end(of->priv); + kfree(of->priv); + } +} + +static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct kernfs_open_file *of = s->private; + struct css_task_iter *it = of->priv; + struct task_struct *task; + + do { + task = css_task_iter_next(it); + } while (task && !thread_group_leader(task)); + + return task; +} + +static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) +{ + struct kernfs_open_file *of = s->private; + struct cgroup *cgrp = seq_css(s)->cgroup; + struct css_task_iter *it = of->priv; + + /* + * When a seq_file is seeked, it's always traversed sequentially + * from position 0, so we can simply keep iterating on !0 *pos. + */ + if (!it) { + if (WARN_ON_ONCE((*pos)++)) + return ERR_PTR(-EINVAL); + + it = kzalloc(sizeof(*it), GFP_KERNEL); + if (!it) + return ERR_PTR(-ENOMEM); + of->priv = it; + css_task_iter_start(&cgrp->self, it); + } else if (!(*pos)++) { + css_task_iter_end(it); + css_task_iter_start(&cgrp->self, it); + } + + return cgroup_procs_next(s, NULL, NULL); +} + +static int cgroup_procs_show(struct seq_file *s, void *v) +{ + seq_printf(s, "%d\n", task_tgid_vnr(v)); + return 0; +} + +/* + * Stuff for reading the 'tasks'/'procs' files. + * + * Reading this file can return large amounts of data if a cgroup has + * *lots* of attached tasks. So it may need several calls to read(), + * but we cannot guarantee that the information we produce is correct + * unless we produce it entirely atomically. + * + */ + +/* which pidlist file are we talking about? */ +enum cgroup_filetype { + CGROUP_FILE_PROCS, + CGROUP_FILE_TASKS, +}; + +/* + * A pidlist is a list of pids that virtually represents the contents of one + * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, + * a pair (one each for procs, tasks) for each pid namespace that's relevant + * to the cgroup. + */ +struct cgroup_pidlist { + /* + * used to find which pidlist is wanted. doesn't change as long as + * this particular list stays in the list. + */ + struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; + /* array of xids */ + pid_t *list; + /* how many elements the above list has */ + int length; + /* each of these stored in a list by its cgroup */ + struct list_head links; + /* pointer to the cgroup we belong to, for list removal purposes */ + struct cgroup *owner; + /* for delayed destruction */ + struct delayed_work destroy_dwork; +}; + +/* + * The following two functions "fix" the issue where there are more pids + * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. 
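+ * For example, with 4 KiB pages the kmalloc() path serves up to
+ * PAGE_SIZE * 2 / sizeof(pid_t) == 8192 / 4 == 2048 pids; larger
+ * lists fall back to vmalloc() (editor's illustration).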
+ * TODO: replace with a kernel-wide solution to this problem + */ +#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) +static void *pidlist_allocate(int count) +{ + if (PIDLIST_TOO_LARGE(count)) + return vmalloc(count * sizeof(pid_t)); + else + return kmalloc(count * sizeof(pid_t), GFP_KERNEL); +} + +static void pidlist_free(void *p) +{ + kvfree(p); +} + +/* + * Used to destroy all pidlists lingering waiting for destroy timer. None + * should be left afterwards. + */ +static void cgroup_pidlist_destroy_all(struct cgroup *cgrp) +{ + struct cgroup_pidlist *l, *tmp_l; + + mutex_lock(&cgrp->pidlist_mutex); + list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); + mutex_unlock(&cgrp->pidlist_mutex); + + flush_workqueue(cgroup_pidlist_destroy_wq); + BUG_ON(!list_empty(&cgrp->pidlists)); +} + +static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, + destroy_dwork); + struct cgroup_pidlist *tofree = NULL; + + mutex_lock(&l->owner->pidlist_mutex); + + /* + * Destroy iff we didn't get queued again. The state won't change + * as destroy_dwork can only be queued while locked. + */ + if (!delayed_work_pending(dwork)) { + list_del(&l->links); + pidlist_free(l->list); + put_pid_ns(l->key.ns); + tofree = l; + } + + mutex_unlock(&l->owner->pidlist_mutex); + kfree(tofree); +} + +/* + * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries + * Returns the number of unique elements. + */ +static int pidlist_uniq(pid_t *list, int length) +{ + int src, dest = 1; + + /* + * we presume the 0th element is unique, so i starts at 1. trivial + * edge cases first; no work needs to be done for either + */ + if (length == 0 || length == 1) + return length; + /* src and dest walk down the list; dest counts unique elements */ + for (src = 1; src < length; src++) { + /* find next unique element */ + while (list[src] == list[src-1]) { + src++; + if (src == length) + goto after; + } + /* dest always points to where the next unique element goes */ + list[dest] = list[src]; + dest++; + } +after: + return dest; +} + +/* + * The two pid files - task and cgroup.procs - guaranteed that the result + * is sorted, which forced this whole pidlist fiasco. As pid order is + * different per namespace, each namespace needs differently sorted list, + * making it impossible to use, for example, single rbtree of member tasks + * sorted by task pointer. As pidlists can be fairly large, allocating one + * per open file is dangerous, so cgroup had to implement shared pool of + * pidlists keyed by cgroup and namespace. + */ +static int cmppid(const void *a, const void *b) +{ + return *(pid_t *)a - *(pid_t *)b; +} + +static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, + enum cgroup_filetype type) +{ + struct cgroup_pidlist *l; + /* don't need task_nsproxy() if we're looking at ourself */ + struct pid_namespace *ns = task_active_pid_ns(current); + + lockdep_assert_held(&cgrp->pidlist_mutex); + + list_for_each_entry(l, &cgrp->pidlists, links) + if (l->key.type == type && l->key.ns == ns) + return l; + return NULL; +} + +/* + * find the appropriate pidlist for our purpose (given procs vs tasks) + * returns with the lock on that pidlist already held, and takes care + * of the use count, or returns NULL with no locks held if we're out of + * memory. 
+ */ +static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, + enum cgroup_filetype type) +{ + struct cgroup_pidlist *l; + + lockdep_assert_held(&cgrp->pidlist_mutex); + + l = cgroup_pidlist_find(cgrp, type); + if (l) + return l; + + /* entry not found; create a new one */ + l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); + if (!l) + return l; + + INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); + l->key.type = type; + /* don't need task_nsproxy() if we're looking at ourself */ + l->key.ns = get_pid_ns(task_active_pid_ns(current)); + l->owner = cgrp; + list_add(&l->links, &cgrp->pidlists); + return l; +} + +/* + * Load a cgroup's pidarray with either procs' tgids or tasks' pids + */ +static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, + struct cgroup_pidlist **lp) +{ + pid_t *array; + int length; + int pid, n = 0; /* used for populating the array */ + struct css_task_iter it; + struct task_struct *tsk; + struct cgroup_pidlist *l; + + lockdep_assert_held(&cgrp->pidlist_mutex); + + /* + * If cgroup gets more users after we read count, we won't have + * enough space - tough. This race is indistinguishable to the + * caller from the case that the additional cgroup users didn't + * show up until sometime later on. + */ + length = cgroup_task_count(cgrp); + array = pidlist_allocate(length); + if (!array) + return -ENOMEM; + /* now, populate the array */ + css_task_iter_start(&cgrp->self, &it); + while ((tsk = css_task_iter_next(&it))) { + if (unlikely(n == length)) + break; + /* get tgid or pid for procs or tasks file respectively */ + if (type == CGROUP_FILE_PROCS) + pid = task_tgid_vnr(tsk); + else + pid = task_pid_vnr(tsk); + if (pid > 0) /* make sure to only use valid results */ + array[n++] = pid; + } + css_task_iter_end(&it); + length = n; + /* now sort & (if procs) strip out duplicates */ + sort(array, length, sizeof(pid_t), cmppid, NULL); + if (type == CGROUP_FILE_PROCS) + length = pidlist_uniq(array, length); + + l = cgroup_pidlist_find_create(cgrp, type); + if (!l) { + pidlist_free(array); + return -ENOMEM; + } + + /* store array, freeing old if necessary */ + pidlist_free(l->list); + l->list = array; + l->length = length; + *lp = l; + return 0; +} + +/** + * cgroupstats_build - build and fill cgroupstats + * @stats: cgroupstats to fill information into + * @dentry: A dentry entry belonging to the cgroup for which stats have + * been requested. + * + * Build and fill cgroupstats so that taskstats can export it to user + * space. + */ +int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) +{ + struct kernfs_node *kn = kernfs_node_from_dentry(dentry); + struct cgroup *cgrp; + struct css_task_iter it; + struct task_struct *tsk; + + /* it should be kernfs_node belonging to cgroupfs and is a directory */ + if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || + kernfs_type(kn) != KERNFS_DIR) + return -EINVAL; + + mutex_lock(&cgroup_mutex); + + /* + * We aren't being called from kernfs and there's no guarantee on + * @kn->priv's validity. For this and css_tryget_online_from_dir(), + * @kn->priv is RCU safe. Let's do the RCU dancing. 
+ */ + rcu_read_lock(); + cgrp = rcu_dereference(kn->priv); + if (!cgrp || cgroup_is_dead(cgrp)) { + rcu_read_unlock(); + mutex_unlock(&cgroup_mutex); + return -ENOENT; + } + rcu_read_unlock(); + + css_task_iter_start(&cgrp->self, &it); + while ((tsk = css_task_iter_next(&it))) { + switch (tsk->state) { + case TASK_RUNNING: + stats->nr_running++; + break; + case TASK_INTERRUPTIBLE: + stats->nr_sleeping++; + break; + case TASK_UNINTERRUPTIBLE: + stats->nr_uninterruptible++; + break; + case TASK_STOPPED: + stats->nr_stopped++; + break; + default: + if (delayacct_is_task_waiting_on_io(tsk)) + stats->nr_io_wait++; + break; + } + } + css_task_iter_end(&it); + + mutex_unlock(&cgroup_mutex); + return 0; +} + + +/* + * seq_file methods for the tasks/procs files. The seq_file position is the + * next pid to display; the seq_file iterator is a pointer to the pid + * in the cgroup->l->list array. + */ + +static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) +{ + /* + * Initially we receive a position value that corresponds to + * one more than the last pid shown (or 0 on the first call or + * after a seek to the start). Use a binary-search to find the + * next pid to display, if any + */ + struct kernfs_open_file *of = s->private; + struct cgroup *cgrp = seq_css(s)->cgroup; + struct cgroup_pidlist *l; + enum cgroup_filetype type = seq_cft(s)->private; + int index = 0, pid = *pos; + int *iter, ret; + + mutex_lock(&cgrp->pidlist_mutex); + + /* + * !NULL @of->priv indicates that this isn't the first start() + * after open. If the matching pidlist is around, we can use that. + * Look for it. Note that @of->priv can't be used directly. It + * could already have been destroyed. + */ + if (of->priv) + of->priv = cgroup_pidlist_find(cgrp, type); + + /* + * Either this is the first start() after open or the matching + * pidlist has been destroyed inbetween. Create a new one. + */ + if (!of->priv) { + ret = pidlist_array_load(cgrp, type, + (struct cgroup_pidlist **)&of->priv); + if (ret) + return ERR_PTR(ret); + } + l = of->priv; + + if (pid) { + int end = l->length; + + while (index < end) { + int mid = (index + end) / 2; + if (l->list[mid] == pid) { + index = mid; + break; + } else if (l->list[mid] <= pid) + index = mid + 1; + else + end = mid; + } + } + /* If we're off the end of the array, we're done */ + if (index >= l->length) + return NULL; + /* Update the abstract position to be the actual pid that we found */ + iter = l->list + index; + *pos = *iter; + return iter; +} + +static void cgroup_pidlist_stop(struct seq_file *s, void *v) +{ + struct kernfs_open_file *of = s->private; + struct cgroup_pidlist *l = of->priv; + + if (l) + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, + CGROUP_PIDLIST_DESTROY_DELAY); + mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); +} + +static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct kernfs_open_file *of = s->private; + struct cgroup_pidlist *l = of->priv; + pid_t *p = v; + pid_t *end = l->list + l->length; + /* + * Advance to the next pid in the array. 
If this goes off the + * end, we're done + */ + p++; + if (p >= end) { + return NULL; + } else { + *pos = *p; + return p; + } +} + +static int cgroup_pidlist_show(struct seq_file *s, void *v) +{ + seq_printf(s, "%d\n", *(int *)v); + + return 0; +} + +static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return notify_on_release(css->cgroup); +} + +static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + if (val) + set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); + else + clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); + return 0; +} + +static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); +} + +static int cgroup_clone_children_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + if (val) + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); + else + clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); + return 0; +} + +/* cgroup core interface files for the default hierarchy */ +static struct cftype cgroup_dfl_base_files[] = { + { + .name = "cgroup.procs", + .file_offset = offsetof(struct cgroup, procs_file), + .release = cgroup_procs_release, + .seq_start = cgroup_procs_start, + .seq_next = cgroup_procs_next, + .seq_show = cgroup_procs_show, + .write = cgroup_procs_write, + }, + { + .name = "cgroup.controllers", + .seq_show = cgroup_controllers_show, + }, + { + .name = "cgroup.subtree_control", + .seq_show = cgroup_subtree_control_show, + .write = cgroup_subtree_control_write, + }, + { + .name = "cgroup.events", + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct cgroup, events_file), + .seq_show = cgroup_events_show, + }, + { } /* terminate */ +}; + +/* cgroup core interface files for the legacy hierarchies */ +static struct cftype cgroup_legacy_base_files[] = { + { + .name = "cgroup.procs", + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_PROCS, + .write = cgroup_procs_write, + }, + { + .name = "cgroup.clone_children", + .read_u64 = cgroup_clone_children_read, + .write_u64 = cgroup_clone_children_write, + }, + { + .name = "cgroup.sane_behavior", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_sane_behavior_show, + }, + { + .name = "tasks", + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_TASKS, + .write = cgroup_tasks_write, + }, + { + .name = "notify_on_release", + .read_u64 = cgroup_read_notify_on_release, + .write_u64 = cgroup_write_notify_on_release, + }, + { + .name = "release_agent", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_release_agent_show, + .write = cgroup_release_agent_write, + .max_write_len = PATH_MAX - 1, + }, + { } /* terminate */ +}; + +/* + * css destruction is four-stage process. + * + * 1. Destruction starts. Killing of the percpu_ref is initiated. + * Implemented in kill_css(). + * + * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs + * and thus css_tryget_online() is guaranteed to fail, the css can be + * offlined by invoking offline_css(). After offlining, the base ref is + * put. Implemented in css_killed_work_fn(). + * + * 3. 
When the percpu_ref reaches zero, the only possible remaining + * accessors are inside RCU read sections. css_release() schedules the + * RCU callback. + * + * 4. After the grace period, the css can be freed. Implemented in + * css_free_work_fn(). + * + * It is actually hairier because both step 2 and 4 require process context + * and thus involve punting to css->destroy_work adding two additional + * steps to the already complex sequence. + */ +static void css_free_work_fn(struct work_struct *work) +{ + struct cgroup_subsys_state *css = + container_of(work, struct cgroup_subsys_state, destroy_work); + struct cgroup_subsys *ss = css->ss; + struct cgroup *cgrp = css->cgroup; + + percpu_ref_exit(&css->refcnt); + + if (ss) { + /* css free path */ + struct cgroup_subsys_state *parent = css->parent; + int id = css->id; + + ss->css_free(css); + cgroup_idr_remove(&ss->css_idr, id); + cgroup_put(cgrp); + + if (parent) + css_put(parent); + } else { + /* cgroup free path */ + atomic_dec(&cgrp->root->nr_cgrps); + cgroup_pidlist_destroy_all(cgrp); + cancel_work_sync(&cgrp->release_agent_work); + + if (cgroup_parent(cgrp)) { + /* + * We get a ref to the parent, and put the ref when + * this cgroup is being freed, so it's guaranteed + * that the parent won't be destroyed before its + * children. + */ + cgroup_put(cgroup_parent(cgrp)); + kernfs_put(cgrp->kn); + kfree(cgrp); + } else { + /* + * This is root cgroup's refcnt reaching zero, + * which indicates that the root should be + * released. + */ + cgroup_destroy_root(cgrp->root); + } + } +} + +static void css_free_rcu_fn(struct rcu_head *rcu_head) +{ + struct cgroup_subsys_state *css = + container_of(rcu_head, struct cgroup_subsys_state, rcu_head); + + INIT_WORK(&css->destroy_work, css_free_work_fn); + queue_work(cgroup_destroy_wq, &css->destroy_work); +} + +static void css_release_work_fn(struct work_struct *work) +{ + struct cgroup_subsys_state *css = + container_of(work, struct cgroup_subsys_state, destroy_work); + struct cgroup_subsys *ss = css->ss; + struct cgroup *cgrp = css->cgroup; + + mutex_lock(&cgroup_mutex); + + css->flags |= CSS_RELEASED; + list_del_rcu(&css->sibling); + + if (ss) { + /* css release path */ + cgroup_idr_replace(&ss->css_idr, NULL, css->id); + if (ss->css_released) + ss->css_released(css); + } else { + /* cgroup release path */ + trace_cgroup_release(cgrp); + + cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); + cgrp->id = -1; + + /* + * There are two control paths which try to determine + * cgroup from dentry without going through kernfs - + * cgroupstats_build() and css_tryget_online_from_dir(). + * Those are supported by RCU protecting clearing of + * cgrp->kn->priv backpointer. 
+ */ + if (cgrp->kn) + RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, + NULL); + + cgroup_bpf_put(cgrp); + } + + mutex_unlock(&cgroup_mutex); + + call_rcu(&css->rcu_head, css_free_rcu_fn); +} + +static void css_release(struct percpu_ref *ref) +{ + struct cgroup_subsys_state *css = + container_of(ref, struct cgroup_subsys_state, refcnt); + + INIT_WORK(&css->destroy_work, css_release_work_fn); + queue_work(cgroup_destroy_wq, &css->destroy_work); +} + +static void init_and_link_css(struct cgroup_subsys_state *css, + struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + lockdep_assert_held(&cgroup_mutex); + + cgroup_get(cgrp); + + memset(css, 0, sizeof(*css)); + css->cgroup = cgrp; + css->ss = ss; + css->id = -1; + INIT_LIST_HEAD(&css->sibling); + INIT_LIST_HEAD(&css->children); + css->serial_nr = css_serial_nr_next++; + atomic_set(&css->online_cnt, 0); + + if (cgroup_parent(cgrp)) { + css->parent = cgroup_css(cgroup_parent(cgrp), ss); + css_get(css->parent); + } + + BUG_ON(cgroup_css(cgrp, ss)); +} + +/* invoke ->css_online() on a new CSS and mark it online if successful */ +static int online_css(struct cgroup_subsys_state *css) +{ + struct cgroup_subsys *ss = css->ss; + int ret = 0; + + lockdep_assert_held(&cgroup_mutex); + + if (ss->css_online) + ret = ss->css_online(css); + if (!ret) { + css->flags |= CSS_ONLINE; + rcu_assign_pointer(css->cgroup->subsys[ss->id], css); + + atomic_inc(&css->online_cnt); + if (css->parent) + atomic_inc(&css->parent->online_cnt); + } + return ret; +} + +/* if the CSS is online, invoke ->css_offline() on it and mark it offline */ +static void offline_css(struct cgroup_subsys_state *css) +{ + struct cgroup_subsys *ss = css->ss; + + lockdep_assert_held(&cgroup_mutex); + + if (!(css->flags & CSS_ONLINE)) + return; + + if (ss->css_reset) + ss->css_reset(css); + + if (ss->css_offline) + ss->css_offline(css); + + css->flags &= ~CSS_ONLINE; + RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); + + wake_up_all(&css->cgroup->offline_waitq); +} + +/** + * css_create - create a cgroup_subsys_state + * @cgrp: the cgroup new css will be associated with + * @ss: the subsys of new css + * + * Create a new css associated with @cgrp - @ss pair. On success, the new + * css is online and installed in @cgrp. This function doesn't create the + * interface files. Returns 0 on success, -errno on failure. + */ +static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); + struct cgroup_subsys_state *css; + int err; + + lockdep_assert_held(&cgroup_mutex); + + css = ss->css_alloc(parent_css); + if (!css) + css = ERR_PTR(-ENOMEM); + if (IS_ERR(css)) + return css; + + init_and_link_css(css, ss, cgrp); + + err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL); + if (err) + goto err_free_css; + + err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL); + if (err < 0) + goto err_free_css; + css->id = err; + + /* @css is ready to be brought online now, make it visible */ + list_add_tail_rcu(&css->sibling, &parent_css->children); + cgroup_idr_replace(&ss->css_idr, css, css->id); + + err = online_css(css); + if (err) + goto err_list_del; + + if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && + cgroup_parent(parent)) { + pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. 
Nested cgroups may change behavior in the future.\n", + current->comm, current->pid, ss->name); + if (!strcmp(ss->name, "memory")) + pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n"); + ss->warned_broken_hierarchy = true; + } + + return css; + +err_list_del: + list_del_rcu(&css->sibling); +err_free_css: + call_rcu(&css->rcu_head, css_free_rcu_fn); + return ERR_PTR(err); +} + +static struct cgroup *cgroup_create(struct cgroup *parent) +{ + struct cgroup_root *root = parent->root; + struct cgroup *cgrp, *tcgrp; + int level = parent->level + 1; + int ret; + + /* allocate the cgroup and its ID, 0 is reserved for the root */ + cgrp = kzalloc(sizeof(*cgrp) + + sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL); + if (!cgrp) + return ERR_PTR(-ENOMEM); + + ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); + if (ret) + goto out_free_cgrp; + + /* + * Temporarily set the pointer to NULL, so idr_find() won't return + * a half-baked cgroup. + */ + cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); + if (cgrp->id < 0) { + ret = -ENOMEM; + goto out_cancel_ref; + } + + init_cgroup_housekeeping(cgrp); + + cgrp->self.parent = &parent->self; + cgrp->root = root; + cgrp->level = level; + + for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) + cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; + + if (notify_on_release(parent)) + set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); + + cgrp->self.serial_nr = css_serial_nr_next++; + + /* allocation complete, commit to creation */ + list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); + atomic_inc(&root->nr_cgrps); + cgroup_get(parent); + + /* + * @cgrp is now fully operational. If something fails after this + * point, it'll be released via the normal destruction path. + */ + cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); + + /* + * On the default hierarchy, a child doesn't automatically inherit + * subtree_control from the parent. Each is configured manually. + */ + if (!cgroup_on_dfl(cgrp)) + cgrp->subtree_control = cgroup_control(cgrp); + + if (parent) + cgroup_bpf_inherit(cgrp, parent); + + cgroup_propagate_control(cgrp); + + /* @cgrp doesn't have dir yet so the following will only create csses */ + ret = cgroup_apply_control_enable(cgrp); + if (ret) + goto out_destroy; + + return cgrp; + +out_cancel_ref: + percpu_ref_exit(&cgrp->self.refcnt); +out_free_cgrp: + kfree(cgrp); + return ERR_PTR(ret); +out_destroy: + cgroup_destroy_locked(cgrp); + return ERR_PTR(ret); +} + +static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) +{ + struct cgroup *parent, *cgrp; + struct kernfs_node *kn; + int ret; + + /* do not accept '\n' to prevent making /proc//cgroup unparsable */ + if (strchr(name, '\n')) + return -EINVAL; + + parent = cgroup_kn_lock_live(parent_kn, false); + if (!parent) + return -ENODEV; + + cgrp = cgroup_create(parent); + if (IS_ERR(cgrp)) { + ret = PTR_ERR(cgrp); + goto out_unlock; + } + + /* create the directory */ + kn = kernfs_create_dir(parent->kn, name, mode, cgrp); + if (IS_ERR(kn)) { + ret = PTR_ERR(kn); + goto out_destroy; + } + cgrp->kn = kn; + + /* + * This extra ref will be put in cgroup_free_fn() and guarantees + * that @cgrp->kn is always accessible. 
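+ *
+ * Editor's note: this function is the backend of a plain userland
+ * mkdir(2) on a cgroup mount, e.g. (illustration, mount point varies):
+ *
+ *   mkdir("/sys/fs/cgroup/unified/mygroup", 0755);
+ *
+ * dispatched through cgroup_kf_syscall_ops further below.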
+ */ + kernfs_get(kn); + + ret = cgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + ret = css_populate_dir(&cgrp->self); + if (ret) + goto out_destroy; + + ret = cgroup_apply_control_enable(cgrp); + if (ret) + goto out_destroy; + + trace_cgroup_mkdir(cgrp); + + /* let's create and online css's */ + kernfs_activate(kn); + + ret = 0; + goto out_unlock; + +out_destroy: + cgroup_destroy_locked(cgrp); +out_unlock: + cgroup_kn_unlock(parent_kn); + return ret; +} + +/* + * This is called when the refcnt of a css is confirmed to be killed. + * css_tryget_online() is now guaranteed to fail. Tell the subsystem to + * initate destruction and put the css ref from kill_css(). + */ +static void css_killed_work_fn(struct work_struct *work) +{ + struct cgroup_subsys_state *css = + container_of(work, struct cgroup_subsys_state, destroy_work); + + mutex_lock(&cgroup_mutex); + + do { + offline_css(css); + css_put(css); + /* @css can't go away while we're holding cgroup_mutex */ + css = css->parent; + } while (css && atomic_dec_and_test(&css->online_cnt)); + + mutex_unlock(&cgroup_mutex); +} + +/* css kill confirmation processing requires process context, bounce */ +static void css_killed_ref_fn(struct percpu_ref *ref) +{ + struct cgroup_subsys_state *css = + container_of(ref, struct cgroup_subsys_state, refcnt); + + if (atomic_dec_and_test(&css->online_cnt)) { + INIT_WORK(&css->destroy_work, css_killed_work_fn); + queue_work(cgroup_destroy_wq, &css->destroy_work); + } +} + +/** + * kill_css - destroy a css + * @css: css to destroy + * + * This function initiates destruction of @css by removing cgroup interface + * files and putting its base reference. ->css_offline() will be invoked + * asynchronously once css_tryget_online() is guaranteed to fail and when + * the reference count reaches zero, @css will be released. + */ +static void kill_css(struct cgroup_subsys_state *css) +{ + lockdep_assert_held(&cgroup_mutex); + + /* + * This must happen before css is disassociated with its cgroup. + * See seq_css() for details. + */ + css_clear_dir(css); + + /* + * Killing would put the base ref, but we need to keep it alive + * until after ->css_offline(). + */ + css_get(css); + + /* + * cgroup core guarantees that, by the time ->css_offline() is + * invoked, no new css reference will be given out via + * css_tryget_online(). We can't simply call percpu_ref_kill() and + * proceed to offlining css's because percpu_ref_kill() doesn't + * guarantee that the ref is seen as killed on all CPUs on return. + * + * Use percpu_ref_kill_and_confirm() to get notifications as each + * css is confirmed to be seen as killed on all CPUs. + */ + percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); +} + +/** + * cgroup_destroy_locked - the first stage of cgroup destruction + * @cgrp: cgroup to be destroyed + * + * css's make use of percpu refcnts whose killing latency shouldn't be + * exposed to userland and are RCU protected. Also, cgroup core needs to + * guarantee that css_tryget_online() won't succeed by the time + * ->css_offline() is invoked. To satisfy all the requirements, + * destruction is implemented in the following two steps. + * + * s1. Verify @cgrp can be destroyed and mark it dying. Remove all + * userland visible parts and start killing the percpu refcnts of + * css's. Set up so that the next stage will be kicked off once all + * the percpu refcnts are confirmed to be killed. + * + * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the + * rest of destruction. 
Once all cgroup references are gone, the + * cgroup is RCU-freed. + * + * This function implements s1. After this step, @cgrp is gone as far as + * the userland is concerned and a new cgroup with the same name may be + * created. As cgroup doesn't care about the names internally, this + * doesn't cause any problem. + */ +static int cgroup_destroy_locked(struct cgroup *cgrp) + __releases(&cgroup_mutex) __acquires(&cgroup_mutex) +{ + struct cgroup_subsys_state *css; + struct cgrp_cset_link *link; + int ssid; + + lockdep_assert_held(&cgroup_mutex); + + /* + * Only migration can raise populated from zero and we're already + * holding cgroup_mutex. + */ + if (cgroup_is_populated(cgrp)) + return -EBUSY; + + /* + * Make sure there's no live children. We can't test emptiness of + * ->self.children as dead children linger on it while being + * drained; otherwise, "rmdir parent/child parent" may fail. + */ + if (css_has_online_children(&cgrp->self)) + return -EBUSY; + + /* + * Mark @cgrp and the associated csets dead. The former prevents + * further task migration and child creation by disabling + * cgroup_lock_live_group(). The latter makes the csets ignored by + * the migration path. + */ + cgrp->self.flags &= ~CSS_ONLINE; + + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &cgrp->cset_links, cset_link) + link->cset->dead = true; + spin_unlock_irq(&css_set_lock); + + /* initiate massacre of all css's */ + for_each_css(css, ssid, cgrp) + kill_css(css); + + /* + * Remove @cgrp directory along with the base files. @cgrp has an + * extra ref on its kn. + */ + kernfs_remove(cgrp->kn); + + check_for_release(cgroup_parent(cgrp)); + + /* put the base reference */ + percpu_ref_kill(&cgrp->self.refcnt); + + return 0; +}; + +static int cgroup_rmdir(struct kernfs_node *kn) +{ + struct cgroup *cgrp; + int ret = 0; + + cgrp = cgroup_kn_lock_live(kn, false); + if (!cgrp) + return 0; + + ret = cgroup_destroy_locked(cgrp); + + if (!ret) + trace_cgroup_rmdir(cgrp); + + cgroup_kn_unlock(kn); + return ret; +} + +static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { + .remount_fs = cgroup_remount, + .show_options = cgroup_show_options, + .mkdir = cgroup_mkdir, + .rmdir = cgroup_rmdir, + .rename = cgroup_rename, + .show_path = cgroup_show_path, +}; + +static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) +{ + struct cgroup_subsys_state *css; + + pr_debug("Initializing cgroup subsys %s\n", ss->name); + + mutex_lock(&cgroup_mutex); + + idr_init(&ss->css_idr); + INIT_LIST_HEAD(&ss->cfts); + + /* Create the root cgroup state for this subsystem */ + ss->root = &cgrp_dfl_root; + css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); + /* We don't handle early failures gracefully */ + BUG_ON(IS_ERR(css)); + init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); + + /* + * Root csses are never destroyed and we can't initialize + * percpu_ref during early init. Disable refcnting. + */ + css->flags |= CSS_NO_REF; + + if (early) { + /* allocation can't be done safely during early init */ + css->id = 1; + } else { + css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); + BUG_ON(css->id < 0); + } + + /* Update the init_css_set to contain a subsys + * pointer to this state - since the subsystem is + * newly registered, all tasks and hence the + * init_css_set is in the subsystem's root cgroup. 
*/ + init_css_set.subsys[ss->id] = css; + + have_fork_callback |= (bool)ss->fork << ss->id; + have_exit_callback |= (bool)ss->exit << ss->id; + have_free_callback |= (bool)ss->free << ss->id; + have_canfork_callback |= (bool)ss->can_fork << ss->id; + + /* At system boot, before all subsystems have been + * registered, no tasks have been forked, so we don't + * need to invoke fork callbacks here. */ + BUG_ON(!list_empty(&init_task.tasks)); + + BUG_ON(online_css(css)); + + mutex_unlock(&cgroup_mutex); +} + +/** + * cgroup_init_early - cgroup initialization at system boot + * + * Initialize cgroups at system boot, and initialize any + * subsystems that request early init. + */ +int __init cgroup_init_early(void) +{ + static struct cgroup_sb_opts __initdata opts; + struct cgroup_subsys *ss; + int i; + + init_cgroup_root(&cgrp_dfl_root, &opts); + cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; + + RCU_INIT_POINTER(init_task.cgroups, &init_css_set); + + for_each_subsys(ss, i) { + WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id, + "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n", + i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, + ss->id, ss->name); + WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, + "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]); + + ss->id = i; + ss->name = cgroup_subsys_name[i]; + if (!ss->legacy_name) + ss->legacy_name = cgroup_subsys_name[i]; + + if (ss->early_init) + cgroup_init_subsys(ss, true); + } + return 0; +} + +static u16 cgroup_disable_mask __initdata; + +/** + * cgroup_init - cgroup initialization + * + * Register cgroup filesystem and /proc file, and initialize + * any subsystems that didn't request early init. + */ +int __init cgroup_init(void) +{ + struct cgroup_subsys *ss; + int ssid; + + BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); + BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); + + /* + * The latency of the synchronize_sched() is too high for cgroups, + * avoid it at the cost of forcing all readers into the slow path. + */ + rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); + + get_user_ns(init_cgroup_ns.user_ns); + + mutex_lock(&cgroup_mutex); + + /* + * Add init_css_set to the hash table so that dfl_root can link to + * it during init. + */ + hash_add(css_set_table, &init_css_set.hlist, + css_set_hash(init_css_set.subsys)); + + BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); + + mutex_unlock(&cgroup_mutex); + + for_each_subsys(ss, ssid) { + if (ss->early_init) { + struct cgroup_subsys_state *css = + init_css_set.subsys[ss->id]; + + css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, + GFP_KERNEL); + BUG_ON(css->id < 0); + } else { + cgroup_init_subsys(ss, false); + } + + list_add_tail(&init_css_set.e_cset_node[ssid], + &cgrp_dfl_root.cgrp.e_csets[ssid]); + + /* + * Setting dfl_root subsys_mask needs to consider the + * disabled flag and cftype registration needs kmalloc, + * both of which aren't available during early_init. 
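+	 * (The "disabled flag" checked just below is the mask filled in
+	 * from the "cgroup_disable=" boot parameter; see cgroup_disable()
+	 * further down in this file.)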
+ */ + if (cgroup_disable_mask & (1 << ssid)) { + static_branch_disable(cgroup_subsys_enabled_key[ssid]); + printk(KERN_INFO "Disabling %s control group subsystem\n", + ss->name); + continue; + } + + if (cgroup_ssid_no_v1(ssid)) + printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n", + ss->name); + + cgrp_dfl_root.subsys_mask |= 1 << ss->id; + + if (ss->implicit_on_dfl) + cgrp_dfl_implicit_ss_mask |= 1 << ss->id; + else if (!ss->dfl_cftypes) + cgrp_dfl_inhibit_ss_mask |= 1 << ss->id; + + if (ss->dfl_cftypes == ss->legacy_cftypes) { + WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); + } else { + WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); + WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); + } + + if (ss->bind) + ss->bind(init_css_set.subsys[ssid]); + } + + /* init_css_set.subsys[] has been updated, re-hash */ + hash_del(&init_css_set.hlist); + hash_add(css_set_table, &init_css_set.hlist, + css_set_hash(init_css_set.subsys)); + + WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); + WARN_ON(register_filesystem(&cgroup_fs_type)); + WARN_ON(register_filesystem(&cgroup2_fs_type)); + WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations)); + + return 0; +} + +static int __init cgroup_wq_init(void) +{ + /* + * There isn't much point in executing destruction path in + * parallel. Good chunk is serialized with cgroup_mutex anyway. + * Use 1 for @max_active. + * + * We would prefer to do this in cgroup_init() above, but that + * is called before init_workqueues(): so leave this until after. + */ + cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); + BUG_ON(!cgroup_destroy_wq); + + /* + * Used to destroy pidlists and separate to serve as flush domain. + * Cap @max_active to 1 too. + */ + cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", + 0, 1); + BUG_ON(!cgroup_pidlist_destroy_wq); + + return 0; +} +core_initcall(cgroup_wq_init); + +/* + * proc_cgroup_show() + * - Print task's cgroup paths into seq_file, one line for each hierarchy + * - Used for /proc//cgroup. + */ +int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk) +{ + char *buf; + int retval; + struct cgroup_root *root; + + retval = -ENOMEM; + buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!buf) + goto out; + + mutex_lock(&cgroup_mutex); + spin_lock_irq(&css_set_lock); + + for_each_root(root) { + struct cgroup_subsys *ss; + struct cgroup *cgrp; + int ssid, count = 0; + + if (root == &cgrp_dfl_root && !cgrp_dfl_visible) + continue; + + seq_printf(m, "%d:", root->hierarchy_id); + if (root != &cgrp_dfl_root) + for_each_subsys(ss, ssid) + if (root->subsys_mask & (1 << ssid)) + seq_printf(m, "%s%s", count++ ? "," : "", + ss->legacy_name); + if (strlen(root->name)) + seq_printf(m, "%sname=%s", count ? "," : "", + root->name); + seq_putc(m, ':'); + + cgrp = task_cgroup_from_root(tsk, root); + + /* + * On traditional hierarchies, all zombie tasks show up as + * belonging to the root cgroup. On the default hierarchy, + * while a zombie doesn't show up in "cgroup.procs" and + * thus can't be migrated, its /proc/PID/cgroup keeps + * reporting the cgroup it belonged to before exiting. If + * the cgroup is removed before the zombie is reaped, + * " (deleted)" is appended to the cgroup path. 
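+		 *
+		 * For illustration only: a v1 hierarchy entry looks like
+		 * "4:cpu,cpuacct:/user", while the default hierarchy always
+		 * prints ID 0 with no controller list, e.g. "0::/foo",
+		 * becoming "0::/foo (deleted)" once the cgroup is removed.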
+ */
+ if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
+ retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
+ current->nsproxy->cgroup_ns);
+ if (retval >= PATH_MAX)
+ retval = -ENAMETOOLONG;
+ if (retval < 0)
+ goto out_unlock;
+
+ seq_puts(m, buf);
+ } else {
+ seq_puts(m, "/");
+ }
+
+ if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
+ seq_puts(m, " (deleted)\n");
+ else
+ seq_putc(m, '\n');
+ }
+
+ retval = 0;
+out_unlock:
+ spin_unlock_irq(&css_set_lock);
+ mutex_unlock(&cgroup_mutex);
+ kfree(buf);
+out:
+ return retval;
+}
+
+/* Display information about each subsystem and each hierarchy */
+static int proc_cgroupstats_show(struct seq_file *m, void *v)
+{
+ struct cgroup_subsys *ss;
+ int i;
+
+ seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
+ /*
+ * Ideally we don't want subsystems moving around while we do this.
+ * cgroup_mutex is also necessary to guarantee an atomic snapshot of
+ * subsys/hierarchy state.
+ */
+ mutex_lock(&cgroup_mutex);
+
+ for_each_subsys(ss, i)
+ seq_printf(m, "%s\t%d\t%d\t%d\n",
+ ss->legacy_name, ss->root->hierarchy_id,
+ atomic_read(&ss->root->nr_cgrps),
+ cgroup_ssid_enabled(i));
+
+ mutex_unlock(&cgroup_mutex);
+ return 0;
+}
+
+static int cgroupstats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, proc_cgroupstats_show, NULL);
+}
+
+static const struct file_operations proc_cgroupstats_operations = {
+ .open = cgroupstats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/**
+ * cgroup_fork - initialize cgroup related fields during copy_process()
+ * @child: pointer to task_struct of the forking child process.
+ *
+ * A task is associated with the init_css_set until cgroup_post_fork()
+ * attaches it to the parent's css_set. An empty cg_list indicates that
+ * @child isn't holding a reference to its css_set.
+ */
+void cgroup_fork(struct task_struct *child)
+{
+ RCU_INIT_POINTER(child->cgroups, &init_css_set);
+ INIT_LIST_HEAD(&child->cg_list);
+}
+
+/**
+ * cgroup_can_fork - called on a new task before the process is exposed
+ * @child: the task in question.
+ *
+ * This calls the subsystem can_fork() callbacks. If the can_fork() callback
+ * returns an error, the fork aborts with that error code. This allows for
+ * a cgroup subsystem to conditionally allow or deny new forks.
+ */
+int cgroup_can_fork(struct task_struct *child)
+{
+ struct cgroup_subsys *ss;
+ int i, j, ret;
+
+ do_each_subsys_mask(ss, i, have_canfork_callback) {
+ ret = ss->can_fork(child);
+ if (ret)
+ goto out_revert;
+ } while_each_subsys_mask();
+
+ return 0;
+
+out_revert:
+ for_each_subsys(ss, j) {
+ if (j >= i)
+ break;
+ if (ss->cancel_fork)
+ ss->cancel_fork(child);
+ }
+
+ return ret;
+}
+
+/**
+ * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
+ * @child: the task in question
+ *
+ * This calls the cancel_fork() callbacks if a fork failed *after*
+ * cgroup_can_fork() succeeded.
+ */
+void cgroup_cancel_fork(struct task_struct *child)
+{
+ struct cgroup_subsys *ss;
+ int i;
+
+ for_each_subsys(ss, i)
+ if (ss->cancel_fork)
+ ss->cancel_fork(child);
+}
+
+/**
+ * cgroup_post_fork - called on a new task after adding it to the task list
+ * @child: the task in question
+ *
+ * Adds the task to the list running through its css_set if necessary and
+ * calls the subsystem fork() callbacks. Has to be after the task is
+ * visible on the task list in case we race with the first call to
+ * css_task_iter_start() - to guarantee that the new task ends up on its
+ * list.
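+ *
+ * For orientation, a rough sketch of the ordering on the fork path
+ * (illustrative, not a verbatim copy of copy_process()):
+ *
+ *   cgroup_fork(child);
+ *   ret = cgroup_can_fork(child);   - subsystems may veto the fork
+ *   ...                             - child is linked on the task list
+ *   cgroup_post_fork(child);        - or cgroup_cancel_fork() on error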
+ */ +void cgroup_post_fork(struct task_struct *child) +{ + struct cgroup_subsys *ss; + int i; + + /* + * This may race against cgroup_enable_task_cg_lists(). As that + * function sets use_task_css_set_links before grabbing + * tasklist_lock and we just went through tasklist_lock to add + * @child, it's guaranteed that either we see the set + * use_task_css_set_links or cgroup_enable_task_cg_lists() sees + * @child during its iteration. + * + * If we won the race, @child is associated with %current's + * css_set. Grabbing css_set_lock guarantees both that the + * association is stable, and, on completion of the parent's + * migration, @child is visible in the source of migration or + * already in the destination cgroup. This guarantee is necessary + * when implementing operations which need to migrate all tasks of + * a cgroup to another. + * + * Note that if we lose to cgroup_enable_task_cg_lists(), @child + * will remain in init_css_set. This is safe because all tasks are + * in the init_css_set before cg_links is enabled and there's no + * operation which transfers all tasks out of init_css_set. + */ + if (use_task_css_set_links) { + struct css_set *cset; + + spin_lock_irq(&css_set_lock); + cset = task_css_set(current); + if (list_empty(&child->cg_list)) { + get_css_set(cset); + css_set_move_task(child, NULL, cset, false); + } + spin_unlock_irq(&css_set_lock); + } + + /* + * Call ss->fork(). This must happen after @child is linked on + * css_set; otherwise, @child might change state between ->fork() + * and addition to css_set. + */ + do_each_subsys_mask(ss, i, have_fork_callback) { + ss->fork(child); + } while_each_subsys_mask(); +} + +/** + * cgroup_exit - detach cgroup from exiting task + * @tsk: pointer to task_struct of exiting process + * + * Description: Detach cgroup from @tsk and release it. + * + * Note that cgroups marked notify_on_release force every task in + * them to take the global cgroup_mutex mutex when exiting. + * This could impact scaling on very large systems. Be reluctant to + * use notify_on_release cgroups where very high task exit scaling + * is required on large systems. + * + * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We + * call cgroup_exit() while the task is still competent to handle + * notify_on_release(), then leave the task attached to the root cgroup in + * each hierarchy for the remainder of its exit. No need to bother with + * init_css_set refcnting. init_css_set never goes away and we can't race + * with migration path - PF_EXITING is visible to migration path. + */ +void cgroup_exit(struct task_struct *tsk) +{ + struct cgroup_subsys *ss; + struct css_set *cset; + int i; + + /* + * Unlink from @tsk from its css_set. As migration path can't race + * with us, we can check css_set and cg_list without synchronization. 
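+	 * If @tsk was never linked on a css_set (->cg_list empty), it holds
+	 * no css_set reference yet, so one is taken below - presumably to
+	 * keep the final put_css_set() in cgroup_free() balanced (cf. the
+	 * cgroup_fork() comment above).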
+ */ + cset = task_css_set(tsk); + + if (!list_empty(&tsk->cg_list)) { + spin_lock_irq(&css_set_lock); + css_set_move_task(tsk, cset, NULL, false); + spin_unlock_irq(&css_set_lock); + } else { + get_css_set(cset); + } + + /* see cgroup_post_fork() for details */ + do_each_subsys_mask(ss, i, have_exit_callback) { + ss->exit(tsk); + } while_each_subsys_mask(); +} + +void cgroup_free(struct task_struct *task) +{ + struct css_set *cset = task_css_set(task); + struct cgroup_subsys *ss; + int ssid; + + do_each_subsys_mask(ss, ssid, have_free_callback) { + ss->free(task); + } while_each_subsys_mask(); + + put_css_set(cset); +} + +static void check_for_release(struct cgroup *cgrp) +{ + if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && + !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) + schedule_work(&cgrp->release_agent_work); +} + +/* + * Notify userspace when a cgroup is released, by running the + * configured release agent with the name of the cgroup (path + * relative to the root of cgroup file system) as the argument. + * + * Most likely, this user command will try to rmdir this cgroup. + * + * This races with the possibility that some other task will be + * attached to this cgroup before it is removed, or that some other + * user task will 'mkdir' a child cgroup of this cgroup. That's ok. + * The presumed 'rmdir' will fail quietly if this cgroup is no longer + * unused, and this cgroup will be reprieved from its death sentence, + * to continue to serve a useful existence. Next time it's released, + * we will get notified again, if it still has 'notify_on_release' set. + * + * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which + * means only wait until the task is successfully execve()'d. The + * separate release agent task is forked by call_usermodehelper(), + * then control in this thread returns here, without waiting for the + * release agent task. We don't bother to wait because the caller of + * this routine has no use for the exit status of the release agent + * task, so no sense holding our caller up for that. 
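+ *
+ * Purely illustrative example: with a hypothetical agent path of
+ * "/sbin/my_release_agent" configured and cgroup "/foo" being
+ * released, the code below is roughly equivalent to running
+ *
+ *   HOME=/ PATH=/sbin:/bin:/usr/sbin:/usr/bin /sbin/my_release_agent /foo
+ *
+ * from usermode, without waiting for the agent to exit.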
+ */ +static void cgroup_release_agent(struct work_struct *work) +{ + struct cgroup *cgrp = + container_of(work, struct cgroup, release_agent_work); + char *pathbuf = NULL, *agentbuf = NULL; + char *argv[3], *envp[3]; + int ret; + + mutex_lock(&cgroup_mutex); + + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); + agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); + if (!pathbuf || !agentbuf) + goto out; + + spin_lock_irq(&css_set_lock); + ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); + spin_unlock_irq(&css_set_lock); + if (ret < 0 || ret >= PATH_MAX) + goto out; + + argv[0] = agentbuf; + argv[1] = pathbuf; + argv[2] = NULL; + + /* minimal command environment */ + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[2] = NULL; + + mutex_unlock(&cgroup_mutex); + call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + goto out_free; +out: + mutex_unlock(&cgroup_mutex); +out_free: + kfree(agentbuf); + kfree(pathbuf); +} + +static int __init cgroup_disable(char *str) +{ + struct cgroup_subsys *ss; + char *token; + int i; + + while ((token = strsep(&str, ",")) != NULL) { + if (!*token) + continue; + + for_each_subsys(ss, i) { + if (strcmp(token, ss->name) && + strcmp(token, ss->legacy_name)) + continue; + cgroup_disable_mask |= 1 << i; + } + } + return 1; +} +__setup("cgroup_disable=", cgroup_disable); + +static int __init cgroup_no_v1(char *str) +{ + struct cgroup_subsys *ss; + char *token; + int i; + + while ((token = strsep(&str, ",")) != NULL) { + if (!*token) + continue; + + if (!strcmp(token, "all")) { + cgroup_no_v1_mask = U16_MAX; + break; + } + + for_each_subsys(ss, i) { + if (strcmp(token, ss->name) && + strcmp(token, ss->legacy_name)) + continue; + + cgroup_no_v1_mask |= 1 << i; + } + } + return 1; +} +__setup("cgroup_no_v1=", cgroup_no_v1); + +/** + * css_tryget_online_from_dir - get corresponding css from a cgroup dentry + * @dentry: directory dentry of interest + * @ss: subsystem of interest + * + * If @dentry is a directory for a cgroup which has @ss enabled on it, try + * to get the corresponding css and return it. If such css doesn't exist + * or can't be pinned, an ERR_PTR value is returned. + */ +struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss) +{ + struct kernfs_node *kn = kernfs_node_from_dentry(dentry); + struct file_system_type *s_type = dentry->d_sb->s_type; + struct cgroup_subsys_state *css = NULL; + struct cgroup *cgrp; + + /* is @dentry a cgroup dir? */ + if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) || + !kn || kernfs_type(kn) != KERNFS_DIR) + return ERR_PTR(-EBADF); + + rcu_read_lock(); + + /* + * This path doesn't originate from kernfs and @kn could already + * have been or be removed at any point. @kn->priv is RCU + * protected for this access. See css_release_work_fn() for details. + */ + cgrp = rcu_dereference(kn->priv); + if (cgrp) + css = cgroup_css(cgrp, ss); + + if (!css || !css_tryget_online(css)) + css = ERR_PTR(-ENOENT); + + rcu_read_unlock(); + return css; +} + +/** + * css_from_id - lookup css by id + * @id: the cgroup id + * @ss: cgroup subsys to be looked into + * + * Returns the css if there's valid one with @id, otherwise returns NULL. + * Should be called under rcu_read_lock(). 
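+ *
+ * Minimal usage sketch (hypothetical caller; the returned css must
+ * still be pinned, e.g. via css_tryget_online(), before use outside
+ * the RCU section):
+ *
+ *   rcu_read_lock();
+ *   css = css_from_id(id, &memory_cgrp_subsys);
+ *   if (css && css_tryget_online(css))
+ *           use_css(css);   - hypothetical helper
+ *   rcu_read_unlock();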
+ */ +struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return idr_find(&ss->css_idr, id); +} + +/** + * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path + * @path: path on the default hierarchy + * + * Find the cgroup at @path on the default hierarchy, increment its + * reference count and return it. Returns pointer to the found cgroup on + * success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR) + * if @path points to a non-directory. + */ +struct cgroup *cgroup_get_from_path(const char *path) +{ + struct kernfs_node *kn; + struct cgroup *cgrp; + + mutex_lock(&cgroup_mutex); + + kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path); + if (kn) { + if (kernfs_type(kn) == KERNFS_DIR) { + cgrp = kn->priv; + cgroup_get(cgrp); + } else { + cgrp = ERR_PTR(-ENOTDIR); + } + kernfs_put(kn); + } else { + cgrp = ERR_PTR(-ENOENT); + } + + mutex_unlock(&cgroup_mutex); + return cgrp; +} +EXPORT_SYMBOL_GPL(cgroup_get_from_path); + +/** + * cgroup_get_from_fd - get a cgroup pointer from a fd + * @fd: fd obtained by open(cgroup2_dir) + * + * Find the cgroup from a fd which should be obtained + * by opening a cgroup directory. Returns a pointer to the + * cgroup on success. ERR_PTR is returned if the cgroup + * cannot be found. + */ +struct cgroup *cgroup_get_from_fd(int fd) +{ + struct cgroup_subsys_state *css; + struct cgroup *cgrp; + struct file *f; + + f = fget_raw(fd); + if (!f) + return ERR_PTR(-EBADF); + + css = css_tryget_online_from_dir(f->f_path.dentry, NULL); + fput(f); + if (IS_ERR(css)) + return ERR_CAST(css); + + cgrp = css->cgroup; + if (!cgroup_on_dfl(cgrp)) { + cgroup_put(cgrp); + return ERR_PTR(-EBADF); + } + + return cgrp; +} +EXPORT_SYMBOL_GPL(cgroup_get_from_fd); + +/* + * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data + * definition in cgroup-defs.h. 
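+ * In short, skcd->val caches either a pointer to the socket's cgroup
+ * on the default hierarchy (the fast path below) or, once net_prio or
+ * net_cls is in use, the v1 prioidx/classid pair - hence
+ * cgroup_sk_alloc_disable().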
+ */ +#ifdef CONFIG_SOCK_CGROUP_DATA + +#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) + +DEFINE_SPINLOCK(cgroup_sk_update_lock); +static bool cgroup_sk_alloc_disabled __read_mostly; + +void cgroup_sk_alloc_disable(void) +{ + if (cgroup_sk_alloc_disabled) + return; + pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n"); + cgroup_sk_alloc_disabled = true; +} + +#else + +#define cgroup_sk_alloc_disabled false + +#endif + +void cgroup_sk_alloc(struct sock_cgroup_data *skcd) +{ + if (cgroup_sk_alloc_disabled) + return; + + /* Socket clone path */ + if (skcd->val) { + cgroup_get(sock_cgroup_ptr(skcd)); + return; + } + + rcu_read_lock(); + + while (true) { + struct css_set *cset; + + cset = task_css_set(current); + if (likely(cgroup_tryget(cset->dfl_cgrp))) { + skcd->val = (unsigned long)cset->dfl_cgrp; + break; + } + cpu_relax(); + } + + rcu_read_unlock(); +} + +void cgroup_sk_free(struct sock_cgroup_data *skcd) +{ + cgroup_put(sock_cgroup_ptr(skcd)); +} + +#endif /* CONFIG_SOCK_CGROUP_DATA */ + +/* cgroup namespaces */ + +static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); +} + +static void dec_cgroup_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); +} + +static struct cgroup_namespace *alloc_cgroup_ns(void) +{ + struct cgroup_namespace *new_ns; + int ret; + + new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); + if (!new_ns) + return ERR_PTR(-ENOMEM); + ret = ns_alloc_inum(&new_ns->ns); + if (ret) { + kfree(new_ns); + return ERR_PTR(ret); + } + atomic_set(&new_ns->count, 1); + new_ns->ns.ops = &cgroupns_operations; + return new_ns; +} + +void free_cgroup_ns(struct cgroup_namespace *ns) +{ + put_css_set(ns->root_cset); + dec_cgroup_namespaces(ns->ucounts); + put_user_ns(ns->user_ns); + ns_free_inum(&ns->ns); + kfree(ns); +} +EXPORT_SYMBOL(free_cgroup_ns); + +struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, + struct user_namespace *user_ns, + struct cgroup_namespace *old_ns) +{ + struct cgroup_namespace *new_ns; + struct ucounts *ucounts; + struct css_set *cset; + + BUG_ON(!old_ns); + + if (!(flags & CLONE_NEWCGROUP)) { + get_cgroup_ns(old_ns); + return old_ns; + } + + /* Allow only sysadmin to create cgroup namespace. */ + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + ucounts = inc_cgroup_namespaces(user_ns); + if (!ucounts) + return ERR_PTR(-ENOSPC); + + /* It is not safe to take cgroup_mutex here */ + spin_lock_irq(&css_set_lock); + cset = task_css_set(current); + get_css_set(cset); + spin_unlock_irq(&css_set_lock); + + new_ns = alloc_cgroup_ns(); + if (IS_ERR(new_ns)) { + put_css_set(cset); + dec_cgroup_namespaces(ucounts); + return new_ns; + } + + new_ns->user_ns = get_user_ns(user_ns); + new_ns->ucounts = ucounts; + new_ns->root_cset = cset; + + return new_ns; +} + +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) +{ + return container_of(ns, struct cgroup_namespace, ns); +} + +static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns) +{ + struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); + + if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) || + !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + /* Don't need to do anything if we are attaching to our own cgroupns. 
*/ + if (cgroup_ns == nsproxy->cgroup_ns) + return 0; + + get_cgroup_ns(cgroup_ns); + put_cgroup_ns(nsproxy->cgroup_ns); + nsproxy->cgroup_ns = cgroup_ns; + + return 0; +} + +static struct ns_common *cgroupns_get(struct task_struct *task) +{ + struct cgroup_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->cgroup_ns; + get_cgroup_ns(ns); + } + task_unlock(task); + + return ns ? &ns->ns : NULL; +} + +static void cgroupns_put(struct ns_common *ns) +{ + put_cgroup_ns(to_cg_ns(ns)); +} + +static struct user_namespace *cgroupns_owner(struct ns_common *ns) +{ + return to_cg_ns(ns)->user_ns; +} + +const struct proc_ns_operations cgroupns_operations = { + .name = "cgroup", + .type = CLONE_NEWCGROUP, + .get = cgroupns_get, + .put = cgroupns_put, + .install = cgroupns_install, + .owner = cgroupns_owner, +}; + +static __init int cgroup_namespaces_init(void) +{ + return 0; +} +subsys_initcall(cgroup_namespaces_init); + +#ifdef CONFIG_CGROUP_BPF +void cgroup_bpf_update(struct cgroup *cgrp, + struct bpf_prog *prog, + enum bpf_attach_type type) +{ + struct cgroup *parent = cgroup_parent(cgrp); + + mutex_lock(&cgroup_mutex); + __cgroup_bpf_update(cgrp, parent, prog, type); + mutex_unlock(&cgroup_mutex); +} +#endif /* CONFIG_CGROUP_BPF */ + +#ifdef CONFIG_CGROUP_DEBUG +static struct cgroup_subsys_state * +debug_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); + + if (!css) + return ERR_PTR(-ENOMEM); + + return css; +} + +static void debug_css_free(struct cgroup_subsys_state *css) +{ + kfree(css); +} + +static u64 debug_taskcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return cgroup_task_count(css->cgroup); +} + +static u64 current_css_set_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return (u64)(unsigned long)current->cgroups; +} + +static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + u64 count; + + rcu_read_lock(); + count = atomic_read(&task_css_set(current)->refcount); + rcu_read_unlock(); + return count; +} + +static int current_css_set_cg_links_read(struct seq_file *seq, void *v) +{ + struct cgrp_cset_link *link; + struct css_set *cset; + char *name_buf; + + name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!name_buf) + return -ENOMEM; + + spin_lock_irq(&css_set_lock); + rcu_read_lock(); + cset = rcu_dereference(current->cgroups); + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + struct cgroup *c = link->cgrp; + + cgroup_name(c, name_buf, NAME_MAX + 1); + seq_printf(seq, "Root %d group %s\n", + c->root->hierarchy_id, name_buf); + } + rcu_read_unlock(); + spin_unlock_irq(&css_set_lock); + kfree(name_buf); + return 0; +} + +#define MAX_TASKS_SHOWN_PER_CSS 25 +static int cgroup_css_links_read(struct seq_file *seq, void *v) +{ + struct cgroup_subsys_state *css = seq_css(seq); + struct cgrp_cset_link *link; + + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { + struct css_set *cset = link->cset; + struct task_struct *task; + int count = 0; + + seq_printf(seq, "css_set %p\n", cset); + + list_for_each_entry(task, &cset->tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); + } + + list_for_each_entry(task, &cset->mg_tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", 
task_pid_vnr(task)); + } + continue; + overflow: + seq_puts(seq, " ...\n"); + } + spin_unlock_irq(&css_set_lock); + return 0; +} + +static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + return (!cgroup_is_populated(css->cgroup) && + !css_has_online_children(&css->cgroup->self)); +} + +static struct cftype debug_files[] = { + { + .name = "taskcount", + .read_u64 = debug_taskcount_read, + }, + + { + .name = "current_css_set", + .read_u64 = current_css_set_read, + }, + + { + .name = "current_css_set_refcount", + .read_u64 = current_css_set_refcount_read, + }, + + { + .name = "current_css_set_cg_links", + .seq_show = current_css_set_cg_links_read, + }, + + { + .name = "cgroup_css_links", + .seq_show = cgroup_css_links_read, + }, + + { + .name = "releasable", + .read_u64 = releasable_read, + }, + + { } /* terminate */ +}; + +struct cgroup_subsys debug_cgrp_subsys = { + .css_alloc = debug_css_alloc, + .css_free = debug_css_free, + .legacy_cftypes = debug_files, +}; +#endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c new file mode 100644 index 000000000000..b3088886cd37 --- /dev/null +++ b/kernel/cgroup/cpuset.c @@ -0,0 +1,2752 @@ +/* + * kernel/cpuset.c + * + * Processor and Memory placement constraints for sets of tasks. + * + * Copyright (C) 2003 BULL SA. + * Copyright (C) 2004-2007 Silicon Graphics, Inc. + * Copyright (C) 2006 Google, Inc + * + * Portions derived from Patrick Mochel's sysfs code. + * sysfs is Copyright (c) 2001-3 Patrick Mochel + * + * 2003-10-10 Written by Simon Derr. + * 2003-10-22 Updates by Stephen Hemminger. + * 2004 May-July Rework by Paul Jackson. + * 2006 Rework by Paul Menage to use generic cgroups + * 2008 Rework of the scheduler domains and CPU hotplug handling + * by Max Krasnyansky + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); + +/* See "Frequency meter" comments, below. */ + +struct fmeter { + int cnt; /* unprocessed events count */ + int val; /* most recent output value */ + time64_t time; /* clock (secs) when val computed */ + spinlock_t lock; /* guards read or write of above */ +}; + +struct cpuset { + struct cgroup_subsys_state css; + + unsigned long flags; /* "unsigned long" so bitops work */ + + /* + * On default hierarchy: + * + * The user-configured masks can only be changed by writing to + * cpuset.cpus and cpuset.mems, and won't be limited by the + * parent masks. + * + * The effective masks is the real masks that apply to the tasks + * in the cpuset. They may be changed if the configured masks are + * changed or hotplug happens. + * + * effective_mask == configured_mask & parent's effective_mask, + * and if it ends up empty, it will inherit the parent's mask. + * + * + * On legacy hierachy: + * + * The user-configured masks are always the same with effective masks. 
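+ *
+ * Worked example (illustrative): with a child's configured mask of
+ * 0-3 and a parent's effective mask of 2-5, the child's effective
+ * mask becomes 2-3; were the intersection empty, the child would
+ * inherit the parent's 2-5 instead.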
+ */
+
+ /* user-configured CPUs and Memory Nodes allowed to tasks */
+ cpumask_var_t cpus_allowed;
+ nodemask_t mems_allowed;
+
+ /* effective CPUs and Memory Nodes allowed to tasks */
+ cpumask_var_t effective_cpus;
+ nodemask_t effective_mems;
+
+ /*
+ * These are the old Memory Nodes that tasks took on.
+ *
+ * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
+ * - A new cpuset's old_mems_allowed is initialized when some
+ * task is moved into it.
+ * - old_mems_allowed is used in cpuset_migrate_mm() when we change
+ * cpuset.mems_allowed and have tasks' nodemask updated, and
+ * then old_mems_allowed is updated to mems_allowed.
+ */
+ nodemask_t old_mems_allowed;
+
+ struct fmeter fmeter; /* memory_pressure filter */
+
+ /*
+ * Tasks are being attached to this cpuset. Used to prevent
+ * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
+ */
+ int attach_in_progress;
+
+ /* partition number for rebuild_sched_domains() */
+ int pn;
+
+ /* for custom sched domain */
+ int relax_domain_level;
+};
+
+static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct cpuset, css) : NULL;
+}
+
+/* Retrieve the cpuset for a task */
+static inline struct cpuset *task_cs(struct task_struct *task)
+{
+ return css_cs(task_css(task, cpuset_cgrp_id));
+}
+
+static inline struct cpuset *parent_cs(struct cpuset *cs)
+{
+ return css_cs(cs->css.parent);
+}
+
+#ifdef CONFIG_NUMA
+static inline bool task_has_mempolicy(struct task_struct *task)
+{
+ return task->mempolicy;
+}
+#else
+static inline bool task_has_mempolicy(struct task_struct *task)
+{
+ return false;
+}
+#endif
+
+
+/* bits in struct cpuset flags field */
+typedef enum {
+ CS_ONLINE,
+ CS_CPU_EXCLUSIVE,
+ CS_MEM_EXCLUSIVE,
+ CS_MEM_HARDWALL,
+ CS_MEMORY_MIGRATE,
+ CS_SCHED_LOAD_BALANCE,
+ CS_SPREAD_PAGE,
+ CS_SPREAD_SLAB,
+} cpuset_flagbits_t;
+
+/* convenient tests for these bits */
+static inline bool is_cpuset_online(const struct cpuset *cs)
+{
+ return test_bit(CS_ONLINE, &cs->flags);
+}
+
+static inline int is_cpu_exclusive(const struct cpuset *cs)
+{
+ return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
+}
+
+static inline int is_mem_exclusive(const struct cpuset *cs)
+{
+ return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
+}
+
+static inline int is_mem_hardwall(const struct cpuset *cs)
+{
+ return test_bit(CS_MEM_HARDWALL, &cs->flags);
+}
+
+static inline int is_sched_load_balance(const struct cpuset *cs)
+{
+ return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+}
+
+static inline int is_memory_migrate(const struct cpuset *cs)
+{
+ return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
+}
+
+static inline int is_spread_page(const struct cpuset *cs)
+{
+ return test_bit(CS_SPREAD_PAGE, &cs->flags);
+}
+
+static inline int is_spread_slab(const struct cpuset *cs)
+{
+ return test_bit(CS_SPREAD_SLAB, &cs->flags);
+}
+
+static struct cpuset top_cpuset = {
+ .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
+ (1 << CS_MEM_EXCLUSIVE)),
+};
+
+/**
+ * cpuset_for_each_child - traverse online children of a cpuset
+ * @child_cs: loop cursor pointing to the current child
+ * @pos_css: used for iteration
+ * @parent_cs: target cpuset to walk children of
+ *
+ * Walk @child_cs through the online children of @parent_cs. Must be used
+ * with RCU read locked.
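+ *
+ * Typical usage (sketch; validate_change() below follows this
+ * pattern):
+ *
+ *   rcu_read_lock();
+ *   cpuset_for_each_child(child, pos_css, parent)
+ *           check(child);   - hypothetical per-child check
+ *   rcu_read_unlock();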
+ */ +#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ + css_for_each_child((pos_css), &(parent_cs)->css) \ + if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) + +/** + * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants + * @des_cs: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @root_cs: target cpuset to walk ancestor of + * + * Walk @des_cs through the online descendants of @root_cs. Must be used + * with RCU read locked. The caller may modify @pos_css by calling + * css_rightmost_descendant() to skip subtree. @root_cs is included in the + * iteration and the first node to be visited. + */ +#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ + css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ + if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) + +/* + * There are two global locks guarding cpuset structures - cpuset_mutex and + * callback_lock. We also require taking task_lock() when dereferencing a + * task's cpuset pointer. See "The task_lock() exception", at the end of this + * comment. + * + * A task must hold both locks to modify cpusets. If a task holds + * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it + * is the only task able to also acquire callback_lock and be able to + * modify cpusets. It can perform various checks on the cpuset structure + * first, knowing nothing will change. It can also allocate memory while + * just holding cpuset_mutex. While it is performing these checks, various + * callback routines can briefly acquire callback_lock to query cpusets. + * Once it is ready to make the changes, it takes callback_lock, blocking + * everyone else. + * + * Calls to the kernel memory allocator can not be made while holding + * callback_lock, as that would risk double tripping on callback_lock + * from one of the callbacks into the cpuset code from within + * __alloc_pages(). + * + * If a task is only holding callback_lock, then it has read-only + * access to cpusets. + * + * Now, the task_struct fields mems_allowed and mempolicy may be changed + * by other task, we use alloc_lock in the task_struct fields to protect + * them. + * + * The cpuset_common_file_read() handlers only hold callback_lock across + * small pieces of code, such as when reading out possibly multi-word + * cpumasks and nodemasks. + * + * Accessing a task's cpuset should be done in accordance with the + * guidelines for accessing subsystem state in kernel/cgroup.c + */ + +static DEFINE_MUTEX(cpuset_mutex); +static DEFINE_SPINLOCK(callback_lock); + +static struct workqueue_struct *cpuset_migrate_mm_wq; + +/* + * CPU / memory hotplug is handled asynchronously. + */ +static void cpuset_hotplug_workfn(struct work_struct *work); +static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); + +static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); + +/* + * This is ugly, but preserves the userspace API for existing cpuset + * users. 
If someone tries to mount the "cpuset" filesystem, we
+ * silently switch it to mount "cgroup" instead.
+ */
+static struct dentry *cpuset_mount(struct file_system_type *fs_type,
+ int flags, const char *unused_dev_name, void *data)
+{
+ struct file_system_type *cgroup_fs = get_fs_type("cgroup");
+ struct dentry *ret = ERR_PTR(-ENODEV);
+ if (cgroup_fs) {
+ char mountopts[] =
+ "cpuset,noprefix,"
+ "release_agent=/sbin/cpuset_release_agent";
+ ret = cgroup_fs->mount(cgroup_fs, flags,
+ unused_dev_name, mountopts);
+ put_filesystem(cgroup_fs);
+ }
+ return ret;
+}
+
+static struct file_system_type cpuset_fs_type = {
+ .name = "cpuset",
+ .mount = cpuset_mount,
+};
+
+/*
+ * Return in *pmask the portion of a cpuset's cpus_allowed that
+ * are online. If none are online, walk up the cpuset hierarchy
+ * until we find one that does have some online cpus.
+ *
+ * One way or another, we guarantee to return some non-empty subset
+ * of cpu_online_mask.
+ *
+ * Call with callback_lock or cpuset_mutex held.
+ */
+static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
+{
+ while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
+ cs = parent_cs(cs);
+ if (unlikely(!cs)) {
+ /*
+ * The top cpuset doesn't have any online cpu as a
+ * consequence of a race between cpuset_hotplug_work
+ * and the cpu hotplug notifier. But we know the top
+ * cpuset's effective_cpus is on its way to be
+ * identical to cpu_online_mask.
+ */
+ cpumask_copy(pmask, cpu_online_mask);
+ return;
+ }
+ }
+ cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
+}
+
+/*
+ * Return in *pmask the portion of a cpuset's mems_allowed that
+ * are online, with memory. If none are online with memory, walk
+ * up the cpuset hierarchy until we find one that does have some
+ * online mems. The top cpuset always has some mems online.
+ *
+ * One way or another, we guarantee to return some non-empty subset
+ * of node_states[N_MEMORY].
+ *
+ * Call with callback_lock or cpuset_mutex held.
+ */
+static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
+{
+ while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
+ cs = parent_cs(cs);
+ nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
+}
+
+/*
+ * Update a task's spread flags if the cpuset's page/slab spread flags
+ * are set.
+ *
+ * Call with callback_lock or cpuset_mutex held.
+ */
+static void cpuset_update_task_spread_flag(struct cpuset *cs,
+ struct task_struct *tsk)
+{
+ if (is_spread_page(cs))
+ task_set_spread_page(tsk);
+ else
+ task_clear_spread_page(tsk);
+
+ if (is_spread_slab(cs))
+ task_set_spread_slab(tsk);
+ else
+ task_clear_spread_slab(tsk);
+}
+
+/*
+ * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
+ *
+ * One cpuset is a subset of another if all its allowed CPUs and
+ * Memory Nodes are a subset of the other, and its exclusive flags
+ * are only set if the other's are set. Call holding cpuset_mutex.
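+ *
+ * E.g. (illustrative): a cpuset with cpus 0-1 and mems 0 is a subset
+ * of one with cpus 0-3 and mems 0-1, provided it doesn't set an
+ * exclusive flag that the other lacks.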
+ */ + +static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) +{ + return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && + nodes_subset(p->mems_allowed, q->mems_allowed) && + is_cpu_exclusive(p) <= is_cpu_exclusive(q) && + is_mem_exclusive(p) <= is_mem_exclusive(q); +} + +/** + * alloc_trial_cpuset - allocate a trial cpuset + * @cs: the cpuset that the trial cpuset duplicates + */ +static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) +{ + struct cpuset *trial; + + trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL); + if (!trial) + return NULL; + + if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) + goto free_cs; + if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL)) + goto free_cpus; + + cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); + cpumask_copy(trial->effective_cpus, cs->effective_cpus); + return trial; + +free_cpus: + free_cpumask_var(trial->cpus_allowed); +free_cs: + kfree(trial); + return NULL; +} + +/** + * free_trial_cpuset - free the trial cpuset + * @trial: the trial cpuset to be freed + */ +static void free_trial_cpuset(struct cpuset *trial) +{ + free_cpumask_var(trial->effective_cpus); + free_cpumask_var(trial->cpus_allowed); + kfree(trial); +} + +/* + * validate_change() - Used to validate that any proposed cpuset change + * follows the structural rules for cpusets. + * + * If we replaced the flag and mask values of the current cpuset + * (cur) with those values in the trial cpuset (trial), would + * our various subset and exclusive rules still be valid? Presumes + * cpuset_mutex held. + * + * 'cur' is the address of an actual, in-use cpuset. Operations + * such as list traversal that depend on the actual address of the + * cpuset in the list must use cur below, not trial. + * + * 'trial' is the address of bulk structure copy of cur, with + * perhaps one or more of the fields cpus_allowed, mems_allowed, + * or flags changed to new, trial values. + * + * Return 0 if valid, -errno if not. + */ + +static int validate_change(struct cpuset *cur, struct cpuset *trial) +{ + struct cgroup_subsys_state *css; + struct cpuset *c, *par; + int ret; + + rcu_read_lock(); + + /* Each of our child cpusets must be a subset of us */ + ret = -EBUSY; + cpuset_for_each_child(c, css, cur) + if (!is_cpuset_subset(c, trial)) + goto out; + + /* Remaining checks don't apply to root cpuset */ + ret = 0; + if (cur == &top_cpuset) + goto out; + + par = parent_cs(cur); + + /* On legacy hiearchy, we must be a subset of our parent cpuset. */ + ret = -EACCES; + if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + !is_cpuset_subset(trial, par)) + goto out; + + /* + * If either I or some sibling (!= me) is exclusive, we can't + * overlap + */ + ret = -EINVAL; + cpuset_for_each_child(c, css, par) { + if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && + c != cur && + cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) + goto out; + if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && + c != cur && + nodes_intersects(trial->mems_allowed, c->mems_allowed)) + goto out; + } + + /* + * Cpusets with tasks - existing or newly being attached - can't + * be changed to have empty cpus_allowed or mems_allowed. 
+ */ + ret = -ENOSPC; + if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) { + if (!cpumask_empty(cur->cpus_allowed) && + cpumask_empty(trial->cpus_allowed)) + goto out; + if (!nodes_empty(cur->mems_allowed) && + nodes_empty(trial->mems_allowed)) + goto out; + } + + /* + * We can't shrink if we won't have enough room for SCHED_DEADLINE + * tasks. + */ + ret = -EBUSY; + if (is_cpu_exclusive(cur) && + !cpuset_cpumask_can_shrink(cur->cpus_allowed, + trial->cpus_allowed)) + goto out; + + ret = 0; +out: + rcu_read_unlock(); + return ret; +} + +#ifdef CONFIG_SMP +/* + * Helper routine for generate_sched_domains(). + * Do cpusets a, b have overlapping effective cpus_allowed masks? + */ +static int cpusets_overlap(struct cpuset *a, struct cpuset *b) +{ + return cpumask_intersects(a->effective_cpus, b->effective_cpus); +} + +static void +update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) +{ + if (dattr->relax_domain_level < c->relax_domain_level) + dattr->relax_domain_level = c->relax_domain_level; + return; +} + +static void update_domain_attr_tree(struct sched_domain_attr *dattr, + struct cpuset *root_cs) +{ + struct cpuset *cp; + struct cgroup_subsys_state *pos_css; + + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { + /* skip the whole subtree if @cp doesn't have any CPU */ + if (cpumask_empty(cp->cpus_allowed)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } + + if (is_sched_load_balance(cp)) + update_domain_attr(dattr, cp); + } + rcu_read_unlock(); +} + +/* + * generate_sched_domains() + * + * This function builds a partial partition of the systems CPUs + * A 'partial partition' is a set of non-overlapping subsets whose + * union is a subset of that set. + * The output of this function needs to be passed to kernel/sched/core.c + * partition_sched_domains() routine, which will rebuild the scheduler's + * load balancing domains (sched domains) as specified by that partial + * partition. + * + * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt + * for a background explanation of this. + * + * Does not return errors, on the theory that the callers of this + * routine would rather not worry about failures to rebuild sched + * domains when operating in the severe memory shortage situations + * that could cause allocation failures below. + * + * Must be called with cpuset_mutex held. + * + * The three key local variables below are: + * q - a linked-list queue of cpuset pointers, used to implement a + * top-down scan of all cpusets. This scan loads a pointer + * to each cpuset marked is_sched_load_balance into the + * array 'csa'. For our purposes, rebuilding the schedulers + * sched domains, we can ignore !is_sched_load_balance cpusets. + * csa - (for CpuSet Array) Array of pointers to all the cpusets + * that need to be load balanced, for convenient iterative + * access by the subsequent code that finds the best partition, + * i.e the set of domains (subsets) of CPUs such that the + * cpus_allowed of every cpuset marked is_sched_load_balance + * is a subset of one of these domains, while there are as + * many such domains as possible, each as small as possible. + * doms - Conversion of 'csa' to an array of cpumasks, for passing to + * the kernel/sched/core.c routine partition_sched_domains() in a + * convenient format, that can be easily compared to the prior + * value to determine what partition elements (sched domains) + * were changed (added or removed.) 
+ * + * Finding the best partition (set of domains): + * The triple nested loops below over i, j, k scan over the + * load balanced cpusets (using the array of cpuset pointers in + * csa[]) looking for pairs of cpusets that have overlapping + * cpus_allowed, but which don't have the same 'pn' partition + * number and gives them in the same partition number. It keeps + * looping on the 'restart' label until it can no longer find + * any such pairs. + * + * The union of the cpus_allowed masks from the set of + * all cpusets having the same 'pn' value then form the one + * element of the partition (one sched domain) to be passed to + * partition_sched_domains(). + */ +static int generate_sched_domains(cpumask_var_t **domains, + struct sched_domain_attr **attributes) +{ + struct cpuset *cp; /* scans q */ + struct cpuset **csa; /* array of all cpuset ptrs */ + int csn; /* how many cpuset ptrs in csa so far */ + int i, j, k; /* indices for partition finding loops */ + cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ + cpumask_var_t non_isolated_cpus; /* load balanced CPUs */ + struct sched_domain_attr *dattr; /* attributes for custom domains */ + int ndoms = 0; /* number of sched domains in result */ + int nslot; /* next empty doms[] struct cpumask slot */ + struct cgroup_subsys_state *pos_css; + + doms = NULL; + dattr = NULL; + csa = NULL; + + if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL)) + goto done; + cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); + + /* Special case for the 99% of systems with one, full, sched domain */ + if (is_sched_load_balance(&top_cpuset)) { + ndoms = 1; + doms = alloc_sched_domains(ndoms); + if (!doms) + goto done; + + dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); + if (dattr) { + *dattr = SD_ATTR_INIT; + update_domain_attr_tree(dattr, &top_cpuset); + } + cpumask_and(doms[0], top_cpuset.effective_cpus, + non_isolated_cpus); + + goto done; + } + + csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL); + if (!csa) + goto done; + csn = 0; + + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { + if (cp == &top_cpuset) + continue; + /* + * Continue traversing beyond @cp iff @cp has some CPUs and + * isn't load balancing. The former is obvious. The + * latter: All child cpusets contain a subset of the + * parent's cpus, so just skip them, and then we call + * update_domain_attr_tree() to calc relax_domain_level of + * the corresponding sched domain. + */ + if (!cpumask_empty(cp->cpus_allowed) && + !(is_sched_load_balance(cp) && + cpumask_intersects(cp->cpus_allowed, non_isolated_cpus))) + continue; + + if (is_sched_load_balance(cp)) + csa[csn++] = cp; + + /* skip @cp's subtree */ + pos_css = css_rightmost_descendant(pos_css); + } + rcu_read_unlock(); + + for (i = 0; i < csn; i++) + csa[i]->pn = i; + ndoms = csn; + +restart: + /* Find the best partition (set of sched domains) */ + for (i = 0; i < csn; i++) { + struct cpuset *a = csa[i]; + int apn = a->pn; + + for (j = 0; j < csn; j++) { + struct cpuset *b = csa[j]; + int bpn = b->pn; + + if (apn != bpn && cpusets_overlap(a, b)) { + for (k = 0; k < csn; k++) { + struct cpuset *c = csa[k]; + + if (c->pn == bpn) + c->pn = apn; + } + ndoms--; /* one less element */ + goto restart; + } + } + } + + /* + * Now we know how many domains to create. + * Convert to and populate cpu masks. 
+ */ + doms = alloc_sched_domains(ndoms); + if (!doms) + goto done; + + /* + * The rest of the code, including the scheduler, can deal with + * dattr==NULL case. No need to abort if alloc fails. + */ + dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); + + for (nslot = 0, i = 0; i < csn; i++) { + struct cpuset *a = csa[i]; + struct cpumask *dp; + int apn = a->pn; + + if (apn < 0) { + /* Skip completed partitions */ + continue; + } + + dp = doms[nslot]; + + if (nslot == ndoms) { + static int warnings = 10; + if (warnings) { + pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n", + nslot, ndoms, csn, i, apn); + warnings--; + } + continue; + } + + cpumask_clear(dp); + if (dattr) + *(dattr + nslot) = SD_ATTR_INIT; + for (j = i; j < csn; j++) { + struct cpuset *b = csa[j]; + + if (apn == b->pn) { + cpumask_or(dp, dp, b->effective_cpus); + cpumask_and(dp, dp, non_isolated_cpus); + if (dattr) + update_domain_attr_tree(dattr + nslot, b); + + /* Done with this partition */ + b->pn = -1; + } + } + nslot++; + } + BUG_ON(nslot != ndoms); + +done: + free_cpumask_var(non_isolated_cpus); + kfree(csa); + + /* + * Fallback to the default domain if kmalloc() failed. + * See comments in partition_sched_domains(). + */ + if (doms == NULL) + ndoms = 1; + + *domains = doms; + *attributes = dattr; + return ndoms; +} + +/* + * Rebuild scheduler domains. + * + * If the flag 'sched_load_balance' of any cpuset with non-empty + * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset + * which has that flag enabled, or if any cpuset with a non-empty + * 'cpus' is removed, then call this routine to rebuild the + * scheduler's dynamic sched domains. + * + * Call with cpuset_mutex held. Takes get_online_cpus(). + */ +static void rebuild_sched_domains_locked(void) +{ + struct sched_domain_attr *attr; + cpumask_var_t *doms; + int ndoms; + + lockdep_assert_held(&cpuset_mutex); + get_online_cpus(); + + /* + * We have raced with CPU hotplug. Don't do anything to avoid + * passing doms with offlined cpu to partition_sched_domains(). + * Anyways, hotplug work item will rebuild sched domains. + */ + if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) + goto out; + + /* Generate domain masks and attrs */ + ndoms = generate_sched_domains(&doms, &attr); + + /* Have scheduler rebuild the domains */ + partition_sched_domains(ndoms, doms, attr); +out: + put_online_cpus(); +} +#else /* !CONFIG_SMP */ +static void rebuild_sched_domains_locked(void) +{ +} +#endif /* CONFIG_SMP */ + +void rebuild_sched_domains(void) +{ + mutex_lock(&cpuset_mutex); + rebuild_sched_domains_locked(); + mutex_unlock(&cpuset_mutex); +} + +/** + * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. + * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed + * + * Iterate through each task of @cs updating its cpus_allowed to the + * effective cpuset's. As this function is called with cpuset_mutex held, + * cpuset membership stays stable. 
+ */
+static void update_tasks_cpumask(struct cpuset *cs)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&cs->css, &it);
+ while ((task = css_task_iter_next(&it)))
+ set_cpus_allowed_ptr(task, cs->effective_cpus);
+ css_task_iter_end(&it);
+}
+
+/*
+ * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
+ * @cs: the cpuset to consider
+ * @new_cpus: temp variable for calculating new effective_cpus
+ *
+ * When the configured cpumask is changed, the effective cpumasks of this
+ * cpuset and all its descendants need to be updated.
+ *
+ * On the legacy hierarchy, effective_cpus will be the same as cpus_allowed.
+ *
+ * Called with cpuset_mutex held
+ */
+static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
+{
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
+ bool need_rebuild_sched_domains = false;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+ struct cpuset *parent = parent_cs(cp);
+
+ cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
+
+ /*
+ * If it becomes empty, inherit the effective mask of the
+ * parent, which is guaranteed to have some CPUs.
+ */
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ cpumask_empty(new_cpus))
+ cpumask_copy(new_cpus, parent->effective_cpus);
+
+ /* Skip the whole subtree if the cpumask remains the same. */
+ if (cpumask_equal(new_cpus, cp->effective_cpus)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+
+ if (!css_tryget_online(&cp->css))
+ continue;
+ rcu_read_unlock();
+
+ spin_lock_irq(&callback_lock);
+ cpumask_copy(cp->effective_cpus, new_cpus);
+ spin_unlock_irq(&callback_lock);
+
+ WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
+
+ update_tasks_cpumask(cp);
+
+ /*
+ * If the effective cpumask of any non-empty cpuset is changed,
+ * we need to rebuild sched domains.
+ */
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ is_sched_load_balance(cp))
+ need_rebuild_sched_domains = true;
+
+ rcu_read_lock();
+ css_put(&cp->css);
+ }
+ rcu_read_unlock();
+
+ if (need_rebuild_sched_domains)
+ rebuild_sched_domains_locked();
+}
+
+/**
+ * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
+ * @cs: the cpuset to consider
+ * @trialcs: trial cpuset
+ * @buf: buffer of cpu numbers written to this cpuset
+ */
+static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
+ const char *buf)
+{
+ int retval;
+
+ /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
+ if (cs == &top_cpuset)
+ return -EACCES;
+
+ /*
+ * An empty cpus_allowed is ok only if the cpuset has no tasks.
+ * Since cpulist_parse() fails on an empty mask, we special case
+ * that parsing. The validate_change() call ensures that cpusets
+ * with tasks have cpus.
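+ *
+ * For example (editorial note, not part of this patch): writing
+ * "0-3,6" to the "cpus" file arrives here as @buf, and
+ * cpulist_parse() fills trialcs->cpus_allowed with CPUs
+ * {0,1,2,3,6}; writing an empty string clears the mask, which
+ * validate_change() then rejects if the cpuset still has tasks.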
+ */ + if (!*buf) { + cpumask_clear(trialcs->cpus_allowed); + } else { + retval = cpulist_parse(buf, trialcs->cpus_allowed); + if (retval < 0) + return retval; + + if (!cpumask_subset(trialcs->cpus_allowed, + top_cpuset.cpus_allowed)) + return -EINVAL; + } + + /* Nothing to do if the cpus didn't change */ + if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) + return 0; + + retval = validate_change(cs, trialcs); + if (retval < 0) + return retval; + + spin_lock_irq(&callback_lock); + cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); + spin_unlock_irq(&callback_lock); + + /* use trialcs->cpus_allowed as a temp variable */ + update_cpumasks_hier(cs, trialcs->cpus_allowed); + return 0; +} + +/* + * Migrate memory region from one set of nodes to another. This is + * performed asynchronously as it can be called from process migration path + * holding locks involved in process management. All mm migrations are + * performed in the queued order and can be waited for by flushing + * cpuset_migrate_mm_wq. + */ + +struct cpuset_migrate_mm_work { + struct work_struct work; + struct mm_struct *mm; + nodemask_t from; + nodemask_t to; +}; + +static void cpuset_migrate_mm_workfn(struct work_struct *work) +{ + struct cpuset_migrate_mm_work *mwork = + container_of(work, struct cpuset_migrate_mm_work, work); + + /* on a wq worker, no need to worry about %current's mems_allowed */ + do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL); + mmput(mwork->mm); + kfree(mwork); +} + +static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, + const nodemask_t *to) +{ + struct cpuset_migrate_mm_work *mwork; + + mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); + if (mwork) { + mwork->mm = mm; + mwork->from = *from; + mwork->to = *to; + INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn); + queue_work(cpuset_migrate_mm_wq, &mwork->work); + } else { + mmput(mm); + } +} + +static void cpuset_post_attach(void) +{ + flush_workqueue(cpuset_migrate_mm_wq); +} + +/* + * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy + * @tsk: the task to change + * @newmems: new nodes that the task will be set + * + * In order to avoid seeing no nodes if the old and new nodes are disjoint, + * we structure updates as setting all new allowed nodes, then clearing newly + * disallowed ones. + */ +static void cpuset_change_task_nodemask(struct task_struct *tsk, + nodemask_t *newmems) +{ + bool need_loop; + + task_lock(tsk); + /* + * Determine if a loop is necessary if another thread is doing + * read_mems_allowed_begin(). If at least one node remains unchanged and + * tsk does not have a mempolicy, then an empty nodemask will not be + * possible when mems_allowed is larger than a word. + */ + need_loop = task_has_mempolicy(tsk) || + !nodes_intersects(*newmems, tsk->mems_allowed); + + if (need_loop) { + local_irq_disable(); + write_seqcount_begin(&tsk->mems_allowed_seq); + } + + nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); + mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); + + mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); + tsk->mems_allowed = *newmems; + + if (need_loop) { + write_seqcount_end(&tsk->mems_allowed_seq); + local_irq_enable(); + } + + task_unlock(tsk); +} + +static void *cpuset_being_rebound; + +/** + * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. + * @cs: the cpuset in which each task's mems_allowed mask needs to be changed + * + * Iterate through each task of @cs updating its mems_allowed to the + * effective cpuset's. 
As this function is called with cpuset_mutex held,
+ * cpuset membership stays stable.
+ */
+static void update_tasks_nodemask(struct cpuset *cs)
+{
+ static nodemask_t newmems; /* protected by cpuset_mutex */
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
+
+ guarantee_online_mems(cs, &newmems);
+
+ /*
+ * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
+ * take while holding tasklist_lock. Forks can happen - the
+ * mpol_dup() cpuset_being_rebound check will catch such forks,
+ * and rebind their vma mempolicies too. Because we still hold
+ * the global cpuset_mutex, we know that no other rebind effort
+ * will be contending for the global variable cpuset_being_rebound.
+ * It's ok if we rebind the same mm twice; mpol_rebind_mm()
+ * is idempotent. Also migrate pages in each mm to new nodes.
+ */
+ css_task_iter_start(&cs->css, &it);
+ while ((task = css_task_iter_next(&it))) {
+ struct mm_struct *mm;
+ bool migrate;
+
+ cpuset_change_task_nodemask(task, &newmems);
+
+ mm = get_task_mm(task);
+ if (!mm)
+ continue;
+
+ migrate = is_memory_migrate(cs);
+
+ mpol_rebind_mm(mm, &cs->mems_allowed);
+ if (migrate)
+ cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
+ else
+ mmput(mm);
+ }
+ css_task_iter_end(&it);
+
+ /*
+ * All the tasks' nodemasks have been updated, update
+ * cs->old_mems_allowed.
+ */
+ cs->old_mems_allowed = newmems;
+
+ /* We're done rebinding vmas to this cpuset's new mems_allowed. */
+ cpuset_being_rebound = NULL;
+}
+
+/*
+ * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
+ * @cs: the cpuset to consider
+ * @new_mems: a temp variable for calculating new effective_mems
+ *
+ * When the configured nodemask is changed, the effective nodemasks of this
+ * cpuset and all its descendants need to be updated.
+ *
+ * On the legacy hierarchy, effective_mems will be the same as mems_allowed.
+ *
+ * Called with cpuset_mutex held
+ */
+static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
+{
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+ struct cpuset *parent = parent_cs(cp);
+
+ nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
+
+ /*
+ * If it becomes empty, inherit the effective mask of the
+ * parent, which is guaranteed to have some MEMs.
+ */
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ nodes_empty(*new_mems))
+ *new_mems = parent->effective_mems;
+
+ /* Skip the whole subtree if the nodemask remains the same. */
+ if (nodes_equal(*new_mems, cp->effective_mems)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+
+ if (!css_tryget_online(&cp->css))
+ continue;
+ rcu_read_unlock();
+
+ spin_lock_irq(&callback_lock);
+ cp->effective_mems = *new_mems;
+ spin_unlock_irq(&callback_lock);
+
+ WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ !nodes_equal(cp->mems_allowed, cp->effective_mems));
+
+ update_tasks_nodemask(cp);
+
+ rcu_read_lock();
+ css_put(&cp->css);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * Handle user request to change the 'mems' memory placement
+ * of a cpuset. Needs to validate the request, update the
+ * cpuset's mems_allowed, and for each task in the cpuset,
+ * update mems_allowed, rebind the task's mempolicy and any vma
+ * mempolicies and, if the cpuset is marked 'memory_migrate',
+ * migrate the task's pages to the new memory.
+ *
+ * Call with cpuset_mutex held. May take callback_lock during call.
+ * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
+ * lock each such task's mm->mmap_sem, scan its vma's and rebind
+ * their mempolicies to the cpuset's new mems_allowed.
+ */
+static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
+ const char *buf)
+{
+ int retval;
+
+ /*
+ * top_cpuset.mems_allowed tracks node_states[N_MEMORY];
+ * it's read-only
+ */
+ if (cs == &top_cpuset) {
+ retval = -EACCES;
+ goto done;
+ }
+
+ /*
+ * An empty mems_allowed is ok iff there are no tasks in the cpuset.
+ * Since nodelist_parse() fails on an empty mask, we special case
+ * that parsing. The validate_change() call ensures that cpusets
+ * with tasks have memory.
+ */
+ if (!*buf) {
+ nodes_clear(trialcs->mems_allowed);
+ } else {
+ retval = nodelist_parse(buf, trialcs->mems_allowed);
+ if (retval < 0)
+ goto done;
+
+ if (!nodes_subset(trialcs->mems_allowed,
+ top_cpuset.mems_allowed)) {
+ retval = -EINVAL;
+ goto done;
+ }
+ }
+
+ if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
+ retval = 0; /* Too easy - nothing to do */
+ goto done;
+ }
+ retval = validate_change(cs, trialcs);
+ if (retval < 0)
+ goto done;
+
+ spin_lock_irq(&callback_lock);
+ cs->mems_allowed = trialcs->mems_allowed;
+ spin_unlock_irq(&callback_lock);
+
+ /* use trialcs->mems_allowed as a temp variable */
+ update_nodemasks_hier(cs, &trialcs->mems_allowed);
+done:
+ return retval;
+}
+
+int current_cpuset_is_being_rebound(void)
+{
+ int ret;
+
+ rcu_read_lock();
+ ret = task_cs(current) == cpuset_being_rebound;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int update_relax_domain_level(struct cpuset *cs, s64 val)
+{
+#ifdef CONFIG_SMP
+ if (val < -1 || val >= sched_domain_level_max)
+ return -EINVAL;
+#endif
+
+ if (val != cs->relax_domain_level) {
+ cs->relax_domain_level = val;
+ if (!cpumask_empty(cs->cpus_allowed) &&
+ is_sched_load_balance(cs))
+ rebuild_sched_domains_locked();
+ }
+
+ return 0;
+}
+
+/**
+ * update_tasks_flags - update the spread flags of tasks in the cpuset.
+ * @cs: the cpuset in which each task's spread flags needs to be changed
+ *
+ * Iterate through each task of @cs updating its spread flags. As this
+ * function is called with cpuset_mutex held, cpuset membership stays
+ * stable.
+ */
+static void update_tasks_flags(struct cpuset *cs)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&cs->css, &it);
+ while ((task = css_task_iter_next(&it)))
+ cpuset_update_task_spread_flag(cs, task);
+ css_task_iter_end(&it);
+}
+
+/*
+ * update_flag - read a 0 or a 1 in a file and update associated flag
+ * bit: the bit to update (see cpuset_flagbits_t)
+ * cs: the cpuset to update
+ * turning_on: whether the flag is being set or cleared
+ *
+ * Call with cpuset_mutex held.
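+ *
+ * For example (editorial note, not part of this patch): writing "1"
+ * to the "sched_load_balance" file reaches this function via
+ * cpuset_write_u64() as
+ *
+ * update_flag(CS_SCHED_LOAD_BALANCE, cs, 1);
+ *
+ * which validates the change on a trial cpuset before committing it
+ * and, if the balance flag actually flipped, rebuilds the sched
+ * domains.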
+ */ + +static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, + int turning_on) +{ + struct cpuset *trialcs; + int balance_flag_changed; + int spread_flag_changed; + int err; + + trialcs = alloc_trial_cpuset(cs); + if (!trialcs) + return -ENOMEM; + + if (turning_on) + set_bit(bit, &trialcs->flags); + else + clear_bit(bit, &trialcs->flags); + + err = validate_change(cs, trialcs); + if (err < 0) + goto out; + + balance_flag_changed = (is_sched_load_balance(cs) != + is_sched_load_balance(trialcs)); + + spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) + || (is_spread_page(cs) != is_spread_page(trialcs))); + + spin_lock_irq(&callback_lock); + cs->flags = trialcs->flags; + spin_unlock_irq(&callback_lock); + + if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) + rebuild_sched_domains_locked(); + + if (spread_flag_changed) + update_tasks_flags(cs); +out: + free_trial_cpuset(trialcs); + return err; +} + +/* + * Frequency meter - How fast is some event occurring? + * + * These routines manage a digitally filtered, constant time based, + * event frequency meter. There are four routines: + * fmeter_init() - initialize a frequency meter. + * fmeter_markevent() - called each time the event happens. + * fmeter_getrate() - returns the recent rate of such events. + * fmeter_update() - internal routine used to update fmeter. + * + * A common data structure is passed to each of these routines, + * which is used to keep track of the state required to manage the + * frequency meter and its digital filter. + * + * The filter works on the number of events marked per unit time. + * The filter is single-pole low-pass recursive (IIR). The time unit + * is 1 second. Arithmetic is done using 32-bit integers scaled to + * simulate 3 decimal digits of precision (multiplied by 1000). + * + * With an FM_COEF of 933, and a time base of 1 second, the filter + * has a half-life of 10 seconds, meaning that if the events quit + * happening, then the rate returned from the fmeter_getrate() + * will be cut in half each 10 seconds, until it converges to zero. + * + * It is not worth doing a real infinitely recursive filter. If more + * than FM_MAXTICKS ticks have elapsed since the last filter event, + * just compute FM_MAXTICKS ticks worth, by which point the level + * will be stable. + * + * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid + * arithmetic overflow in the fmeter_update() routine. + * + * Given the simple 32 bit integer arithmetic used, this meter works + * best for reporting rates between one per millisecond (msec) and + * one per 32 (approx) seconds. At constant rates faster than one + * per msec it maxes out at values just under 1,000,000. At constant + * rates between one per msec, and one per second it will stabilize + * to a value N*1000, where N is the rate of events per second. + * At constant rates between one per second and one per 32 seconds, + * it will be choppy, moving up on the seconds that have an event, + * and then decaying until the next event. At rates slower than + * about one in 32 seconds, it decays all the way back to zero between + * each event. 
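+ *
+ * Worked example (editorial, not part of this patch): one marked
+ * event bumps cnt by FM_SCALE, and the next update folds it in as
+ * ((FM_SCALE - FM_COEF) * 1000) / FM_SCALE = 67 added to val. Each
+ * further idle second decays val to val * 933 / 1000, so after 10
+ * idle seconds val is scaled by 0.933^10, roughly one half, which
+ * is the 10 second half-life quoted above.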
+ */ + +#define FM_COEF 933 /* coefficient for half-life of 10 secs */ +#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ +#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ +#define FM_SCALE 1000 /* faux fixed point scale */ + +/* Initialize a frequency meter */ +static void fmeter_init(struct fmeter *fmp) +{ + fmp->cnt = 0; + fmp->val = 0; + fmp->time = 0; + spin_lock_init(&fmp->lock); +} + +/* Internal meter update - process cnt events and update value */ +static void fmeter_update(struct fmeter *fmp) +{ + time64_t now; + u32 ticks; + + now = ktime_get_seconds(); + ticks = now - fmp->time; + + if (ticks == 0) + return; + + ticks = min(FM_MAXTICKS, ticks); + while (ticks-- > 0) + fmp->val = (FM_COEF * fmp->val) / FM_SCALE; + fmp->time = now; + + fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; + fmp->cnt = 0; +} + +/* Process any previous ticks, then bump cnt by one (times scale). */ +static void fmeter_markevent(struct fmeter *fmp) +{ + spin_lock(&fmp->lock); + fmeter_update(fmp); + fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); + spin_unlock(&fmp->lock); +} + +/* Process any previous ticks, then return current value. */ +static int fmeter_getrate(struct fmeter *fmp) +{ + int val; + + spin_lock(&fmp->lock); + fmeter_update(fmp); + val = fmp->val; + spin_unlock(&fmp->lock); + return val; +} + +static struct cpuset *cpuset_attach_old_cs; + +/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ +static int cpuset_can_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + struct cpuset *cs; + struct task_struct *task; + int ret; + + /* used later by cpuset_attach() */ + cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); + cs = css_cs(css); + + mutex_lock(&cpuset_mutex); + + /* allow moving tasks into an empty cpuset if on default hierarchy */ + ret = -ENOSPC; + if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) + goto out_unlock; + + cgroup_taskset_for_each(task, css, tset) { + ret = task_can_attach(task, cs->cpus_allowed); + if (ret) + goto out_unlock; + ret = security_task_setscheduler(task); + if (ret) + goto out_unlock; + } + + /* + * Mark attach is in progress. This makes validate_change() fail + * changes which zero cpus/mems_allowed. + */ + cs->attach_in_progress++; + ret = 0; +out_unlock: + mutex_unlock(&cpuset_mutex); + return ret; +} + +static void cpuset_cancel_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + struct cpuset *cs; + + cgroup_taskset_first(tset, &css); + cs = css_cs(css); + + mutex_lock(&cpuset_mutex); + css_cs(css)->attach_in_progress--; + mutex_unlock(&cpuset_mutex); +} + +/* + * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach() + * but we can't allocate it dynamically there. Define it global and + * allocate from cpuset_init(). 
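+ *
+ * (Editorial note, not part of this patch: cpuset_attach() below
+ * fills this buffer with either cpu_possible_mask, for the top
+ * cpuset, or the online subset of the target's cpus_allowed, and
+ * then applies it to every migrating task via set_cpus_allowed_ptr().)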
+ */ +static cpumask_var_t cpus_attach; + +static void cpuset_attach(struct cgroup_taskset *tset) +{ + /* static buf protected by cpuset_mutex */ + static nodemask_t cpuset_attach_nodemask_to; + struct task_struct *task; + struct task_struct *leader; + struct cgroup_subsys_state *css; + struct cpuset *cs; + struct cpuset *oldcs = cpuset_attach_old_cs; + + cgroup_taskset_first(tset, &css); + cs = css_cs(css); + + mutex_lock(&cpuset_mutex); + + /* prepare for attach */ + if (cs == &top_cpuset) + cpumask_copy(cpus_attach, cpu_possible_mask); + else + guarantee_online_cpus(cs, cpus_attach); + + guarantee_online_mems(cs, &cpuset_attach_nodemask_to); + + cgroup_taskset_for_each(task, css, tset) { + /* + * can_attach beforehand should guarantee that this doesn't + * fail. TODO: have a better way to handle failure here + */ + WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); + + cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); + cpuset_update_task_spread_flag(cs, task); + } + + /* + * Change mm for all threadgroup leaders. This is expensive and may + * sleep and should be moved outside migration path proper. + */ + cpuset_attach_nodemask_to = cs->effective_mems; + cgroup_taskset_for_each_leader(leader, css, tset) { + struct mm_struct *mm = get_task_mm(leader); + + if (mm) { + mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); + + /* + * old_mems_allowed is the same with mems_allowed + * here, except if this task is being moved + * automatically due to hotplug. In that case + * @mems_allowed has been updated and is empty, so + * @old_mems_allowed is the right nodesets that we + * migrate mm from. + */ + if (is_memory_migrate(cs)) + cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, + &cpuset_attach_nodemask_to); + else + mmput(mm); + } + } + + cs->old_mems_allowed = cpuset_attach_nodemask_to; + + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); + + mutex_unlock(&cpuset_mutex); +} + +/* The various types of files and directories in a cpuset file system */ + +typedef enum { + FILE_MEMORY_MIGRATE, + FILE_CPULIST, + FILE_MEMLIST, + FILE_EFFECTIVE_CPULIST, + FILE_EFFECTIVE_MEMLIST, + FILE_CPU_EXCLUSIVE, + FILE_MEM_EXCLUSIVE, + FILE_MEM_HARDWALL, + FILE_SCHED_LOAD_BALANCE, + FILE_SCHED_RELAX_DOMAIN_LEVEL, + FILE_MEMORY_PRESSURE_ENABLED, + FILE_MEMORY_PRESSURE, + FILE_SPREAD_PAGE, + FILE_SPREAD_SLAB, +} cpuset_filetype_t; + +static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val) +{ + struct cpuset *cs = css_cs(css); + cpuset_filetype_t type = cft->private; + int retval = 0; + + mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) { + retval = -ENODEV; + goto out_unlock; + } + + switch (type) { + case FILE_CPU_EXCLUSIVE: + retval = update_flag(CS_CPU_EXCLUSIVE, cs, val); + break; + case FILE_MEM_EXCLUSIVE: + retval = update_flag(CS_MEM_EXCLUSIVE, cs, val); + break; + case FILE_MEM_HARDWALL: + retval = update_flag(CS_MEM_HARDWALL, cs, val); + break; + case FILE_SCHED_LOAD_BALANCE: + retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val); + break; + case FILE_MEMORY_MIGRATE: + retval = update_flag(CS_MEMORY_MIGRATE, cs, val); + break; + case FILE_MEMORY_PRESSURE_ENABLED: + cpuset_memory_pressure_enabled = !!val; + break; + case FILE_SPREAD_PAGE: + retval = update_flag(CS_SPREAD_PAGE, cs, val); + break; + case FILE_SPREAD_SLAB: + retval = update_flag(CS_SPREAD_SLAB, cs, val); + break; + default: + retval = -EINVAL; + break; + } +out_unlock: + mutex_unlock(&cpuset_mutex); + return retval; +} + +static int 
cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
+ s64 val)
+{
+ struct cpuset *cs = css_cs(css);
+ cpuset_filetype_t type = cft->private;
+ int retval = -ENODEV;
+
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
+
+ switch (type) {
+ case FILE_SCHED_RELAX_DOMAIN_LEVEL:
+ retval = update_relax_domain_level(cs, val);
+ break;
+ default:
+ retval = -EINVAL;
+ break;
+ }
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return retval;
+}
+
+/*
+ * Common handling for a write to a "cpus" or "mems" file.
+ */
+static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cpuset *cs = css_cs(of_css(of));
+ struct cpuset *trialcs;
+ int retval = -ENODEV;
+
+ buf = strstrip(buf);
+
+ /*
+ * CPU or memory hotunplug may leave @cs w/o any execution
+ * resources, in which case the hotplug code asynchronously updates
+ * configuration and transfers all tasks to the nearest ancestor
+ * which can execute.
+ *
+ * As writes to "cpus" or "mems" may restore @cs's execution
+ * resources, wait for the previously scheduled operations before
+ * proceeding, so that we don't keep removing tasks that were added
+ * after execution capability is restored.
+ *
+ * cpuset_hotplug_work calls back into cgroup core via
+ * cgroup_transfer_tasks() and waiting for it from a cgroupfs
+ * operation like this one can lead to a deadlock through kernfs
+ * active_ref protection. Let's break the protection. Losing the
+ * protection is okay as we check whether @cs is online after
+ * grabbing cpuset_mutex anyway. This only happens on the legacy
+ * hierarchies.
+ */
+ css_get(&cs->css);
+ kernfs_break_active_protection(of->kn);
+ flush_work(&cpuset_hotplug_work);
+
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
+
+ trialcs = alloc_trial_cpuset(cs);
+ if (!trialcs) {
+ retval = -ENOMEM;
+ goto out_unlock;
+ }
+
+ switch (of_cft(of)->private) {
+ case FILE_CPULIST:
+ retval = update_cpumask(cs, trialcs, buf);
+ break;
+ case FILE_MEMLIST:
+ retval = update_nodemask(cs, trialcs, buf);
+ break;
+ default:
+ retval = -EINVAL;
+ break;
+ }
+
+ free_trial_cpuset(trialcs);
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ kernfs_unbreak_active_protection(of->kn);
+ css_put(&cs->css);
+ flush_workqueue(cpuset_migrate_mm_wq);
+ return retval ?: nbytes;
+}
+
+/*
+ * These ascii lists should be read in a single call, by using a user
+ * buffer large enough to hold the entire map. If read in smaller
+ * chunks, there is no guarantee of atomicity. Since the display format
+ * used (a list of ranges of sequential numbers) is variable length,
+ * and since these maps can change value dynamically, one could read
+ * gibberish by doing partial reads while a list was changing.
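+ *
+ * For example (editorial sketch, not part of this patch), a reader
+ * wanting a consistent snapshot should issue one large read:
+ *
+ * char buf[4096]; /* big enough for the whole list */
+ * ssize_t n = read(fd, buf, sizeof(buf));
+ *
+ * so that a map such as "0-3,8-11\n" comes from a single seq_file
+ * fill rather than being stitched together from partial reads.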
+ */
+static int cpuset_common_seq_show(struct seq_file *sf, void *v)
+{
+ struct cpuset *cs = css_cs(seq_css(sf));
+ cpuset_filetype_t type = seq_cft(sf)->private;
+ int ret = 0;
+
+ spin_lock_irq(&callback_lock);
+
+ switch (type) {
+ case FILE_CPULIST:
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
+ break;
+ case FILE_MEMLIST:
+ seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
+ break;
+ case FILE_EFFECTIVE_CPULIST:
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
+ break;
+ case FILE_EFFECTIVE_MEMLIST:
+ seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ spin_unlock_irq(&callback_lock);
+ return ret;
+}
+
+static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct cpuset *cs = css_cs(css);
+ cpuset_filetype_t type = cft->private;
+ switch (type) {
+ case FILE_CPU_EXCLUSIVE:
+ return is_cpu_exclusive(cs);
+ case FILE_MEM_EXCLUSIVE:
+ return is_mem_exclusive(cs);
+ case FILE_MEM_HARDWALL:
+ return is_mem_hardwall(cs);
+ case FILE_SCHED_LOAD_BALANCE:
+ return is_sched_load_balance(cs);
+ case FILE_MEMORY_MIGRATE:
+ return is_memory_migrate(cs);
+ case FILE_MEMORY_PRESSURE_ENABLED:
+ return cpuset_memory_pressure_enabled;
+ case FILE_MEMORY_PRESSURE:
+ return fmeter_getrate(&cs->fmeter);
+ case FILE_SPREAD_PAGE:
+ return is_spread_page(cs);
+ case FILE_SPREAD_SLAB:
+ return is_spread_slab(cs);
+ default:
+ BUG();
+ }
+
+ /* Unreachable but makes gcc happy */
+ return 0;
+}
+
+static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct cpuset *cs = css_cs(css);
+ cpuset_filetype_t type = cft->private;
+ switch (type) {
+ case FILE_SCHED_RELAX_DOMAIN_LEVEL:
+ return cs->relax_domain_level;
+ default:
+ BUG();
+ }
+
+ /* Unreachable but makes gcc happy */
+ return 0;
+}
+
+
+/*
+ * for the common functions, 'private' gives the type of file
+ */
+
+static struct cftype files[] = {
+ {
+ .name = "cpus",
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * NR_CPUS),
+ .private = FILE_CPULIST,
+ },
+
+ {
+ .name = "mems",
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * MAX_NUMNODES),
+ .private = FILE_MEMLIST,
+ },
+
+ {
+ .name = "effective_cpus",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_CPULIST,
+ },
+
+ {
+ .name = "effective_mems",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_MEMLIST,
+ },
+
+ {
+ .name = "cpu_exclusive",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_CPU_EXCLUSIVE,
+ },
+
+ {
+ .name = "mem_exclusive",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEM_EXCLUSIVE,
+ },
+
+ {
+ .name = "mem_hardwall",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEM_HARDWALL,
+ },
+
+ {
+ .name = "sched_load_balance",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_SCHED_LOAD_BALANCE,
+ },
+
+ {
+ .name = "sched_relax_domain_level",
+ .read_s64 = cpuset_read_s64,
+ .write_s64 = cpuset_write_s64,
+ .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
+ },
+
+ {
+ .name = "memory_migrate",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEMORY_MIGRATE,
+ },
+
+ {
+ .name = "memory_pressure",
+ .read_u64 = cpuset_read_u64,
+ },
+
+ {
+ .name = "memory_spread_page",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 =
cpuset_write_u64,
+ .private = FILE_SPREAD_PAGE,
+ },
+
+ {
+ .name = "memory_spread_slab",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_SPREAD_SLAB,
+ },
+
+ {
+ .name = "memory_pressure_enabled",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEMORY_PRESSURE_ENABLED,
+ },
+
+ { } /* terminate */
+};
+
+/*
+ * cpuset_css_alloc - allocate a cpuset css
+ * cgrp: control group that the new cpuset will be part of
+ */
+
+static struct cgroup_subsys_state *
+cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct cpuset *cs;
+
+ if (!parent_css)
+ return &top_cpuset.css;
+
+ cs = kzalloc(sizeof(*cs), GFP_KERNEL);
+ if (!cs)
+ return ERR_PTR(-ENOMEM);
+ if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
+ goto free_cs;
+ if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
+ goto free_cpus;
+
+ set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+ cpumask_clear(cs->cpus_allowed);
+ nodes_clear(cs->mems_allowed);
+ cpumask_clear(cs->effective_cpus);
+ nodes_clear(cs->effective_mems);
+ fmeter_init(&cs->fmeter);
+ cs->relax_domain_level = -1;
+
+ return &cs->css;
+
+free_cpus:
+ free_cpumask_var(cs->cpus_allowed);
+free_cs:
+ kfree(cs);
+ return ERR_PTR(-ENOMEM);
+}
+
+static int cpuset_css_online(struct cgroup_subsys_state *css)
+{
+ struct cpuset *cs = css_cs(css);
+ struct cpuset *parent = parent_cs(cs);
+ struct cpuset *tmp_cs;
+ struct cgroup_subsys_state *pos_css;
+
+ if (!parent)
+ return 0;
+
+ mutex_lock(&cpuset_mutex);
+
+ set_bit(CS_ONLINE, &cs->flags);
+ if (is_spread_page(parent))
+ set_bit(CS_SPREAD_PAGE, &cs->flags);
+ if (is_spread_slab(parent))
+ set_bit(CS_SPREAD_SLAB, &cs->flags);
+
+ cpuset_inc();
+
+ spin_lock_irq(&callback_lock);
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+ cpumask_copy(cs->effective_cpus, parent->effective_cpus);
+ cs->effective_mems = parent->effective_mems;
+ }
+ spin_unlock_irq(&callback_lock);
+
+ if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
+ goto out_unlock;
+
+ /*
+ * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
+ * set. This flag handling is implemented in cgroup core for
+ * historical reasons - the flag may be specified during mount.
+ *
+ * Currently, if any sibling cpusets have exclusive cpus or mem, we
+ * refuse to clone the configuration - thereby refusing the task to
+ * be entered, and as a result refusing the sys_unshare() or
+ * clone() which initiated it. If this becomes a problem for some
+ * users who wish to allow that scenario, then this could be
+ * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
+ * (and likewise for mems) to the new cgroup.
+ */
+ rcu_read_lock();
+ cpuset_for_each_child(tmp_cs, pos_css, parent) {
+ if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
+ rcu_read_unlock();
+ goto out_unlock;
+ }
+ }
+ rcu_read_unlock();
+
+ spin_lock_irq(&callback_lock);
+ cs->mems_allowed = parent->mems_allowed;
+ cs->effective_mems = parent->mems_allowed;
+ cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
+ cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
+ spin_unlock_irq(&callback_lock);
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return 0;
+}
+
+/*
+ * If the cpuset being removed has its flag 'sched_load_balance'
+ * enabled, then simulate turning sched_load_balance off, which
+ * will call rebuild_sched_domains_locked().
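+ *
+ * (Editorial illustration, not part of this patch: removing a
+ * balanced cpuset with e.g. "rmdir cpuset/foo" therefore ends up
+ * calling update_flag(CS_SCHED_LOAD_BALANCE, cs, 0) from the
+ * css_offline path below, which rebuilds the sched domains without
+ * the departing cpuset.)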
+ */
+
+static void cpuset_css_offline(struct cgroup_subsys_state *css)
+{
+ struct cpuset *cs = css_cs(css);
+
+ mutex_lock(&cpuset_mutex);
+
+ if (is_sched_load_balance(cs))
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+
+ cpuset_dec();
+ clear_bit(CS_ONLINE, &cs->flags);
+
+ mutex_unlock(&cpuset_mutex);
+}
+
+static void cpuset_css_free(struct cgroup_subsys_state *css)
+{
+ struct cpuset *cs = css_cs(css);
+
+ free_cpumask_var(cs->effective_cpus);
+ free_cpumask_var(cs->cpus_allowed);
+ kfree(cs);
+}
+
+static void cpuset_bind(struct cgroup_subsys_state *root_css)
+{
+ mutex_lock(&cpuset_mutex);
+ spin_lock_irq(&callback_lock);
+
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+ cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
+ top_cpuset.mems_allowed = node_possible_map;
+ } else {
+ cpumask_copy(top_cpuset.cpus_allowed,
+ top_cpuset.effective_cpus);
+ top_cpuset.mems_allowed = top_cpuset.effective_mems;
+ }
+
+ spin_unlock_irq(&callback_lock);
+ mutex_unlock(&cpuset_mutex);
+}
+
+/*
+ * Make sure the new task conforms to the current state of its parent,
+ * which could have been changed by cpuset just after it inherits the
+ * state from the parent and before it sits on the cgroup's task list.
+ */
+static void cpuset_fork(struct task_struct *task)
+{
+ if (task_css_is_root(task, cpuset_cgrp_id))
+ return;
+
+ set_cpus_allowed_ptr(task, &current->cpus_allowed);
+ task->mems_allowed = current->mems_allowed;
+}
+
+struct cgroup_subsys cpuset_cgrp_subsys = {
+ .css_alloc = cpuset_css_alloc,
+ .css_online = cpuset_css_online,
+ .css_offline = cpuset_css_offline,
+ .css_free = cpuset_css_free,
+ .can_attach = cpuset_can_attach,
+ .cancel_attach = cpuset_cancel_attach,
+ .attach = cpuset_attach,
+ .post_attach = cpuset_post_attach,
+ .bind = cpuset_bind,
+ .fork = cpuset_fork,
+ .legacy_cftypes = files,
+ .early_init = true,
+};
+
+/**
+ * cpuset_init - initialize cpusets at system boot
+ *
+ * Description: Initialize top_cpuset and the cpuset internal file system.
+ **/
+
+int __init cpuset_init(void)
+{
+ int err = 0;
+
+ if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
+ BUG();
+ if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
+ BUG();
+
+ cpumask_setall(top_cpuset.cpus_allowed);
+ nodes_setall(top_cpuset.mems_allowed);
+ cpumask_setall(top_cpuset.effective_cpus);
+ nodes_setall(top_cpuset.effective_mems);
+
+ fmeter_init(&top_cpuset.fmeter);
+ set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
+ top_cpuset.relax_domain_level = -1;
+
+ err = register_filesystem(&cpuset_fs_type);
+ if (err < 0)
+ return err;
+
+ if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
+ BUG();
+
+ return 0;
+}
+
+/*
+ * If CPU and/or memory hotplug handlers, below, unplug any CPUs
+ * or memory nodes, we need to walk over the cpuset hierarchy,
+ * removing that CPU or node from all cpusets. If this removes the
+ * last CPU or node from a cpuset, then move the tasks in the empty
+ * cpuset to its next-highest non-empty parent.
+ */
+static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
+{
+ struct cpuset *parent;
+
+ /*
+ * Find its next-highest non-empty parent (the top cpuset
+ * has online cpus, so it can't be empty).
+ */
+ parent = parent_cs(cs);
+ while (cpumask_empty(parent->cpus_allowed) ||
+ nodes_empty(parent->mems_allowed))
+ parent = parent_cs(parent);
+
+ if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
+ pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
+ pr_cont_cgroup_name(cs->css.cgroup);
+ pr_cont("\n");
+ }
+}
+
+static void
+hotplug_update_tasks_legacy(struct cpuset *cs,
+ struct cpumask *new_cpus, nodemask_t *new_mems,
+ bool cpus_updated, bool mems_updated)
+{
+ bool is_empty;
+
+ spin_lock_irq(&callback_lock);
+ cpumask_copy(cs->cpus_allowed, new_cpus);
+ cpumask_copy(cs->effective_cpus, new_cpus);
+ cs->mems_allowed = *new_mems;
+ cs->effective_mems = *new_mems;
+ spin_unlock_irq(&callback_lock);
+
+ /*
+ * Don't call update_tasks_cpumask() if the cpuset becomes empty,
+ * as the tasks will be migrated to an ancestor.
+ */
+ if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
+ update_tasks_cpumask(cs);
+ if (mems_updated && !nodes_empty(cs->mems_allowed))
+ update_tasks_nodemask(cs);
+
+ is_empty = cpumask_empty(cs->cpus_allowed) ||
+ nodes_empty(cs->mems_allowed);
+
+ mutex_unlock(&cpuset_mutex);
+
+ /*
+ * Move tasks to the nearest ancestor with execution resources.
+ * This is a full cgroup operation which will also call back into
+ * cpuset. Should be done outside any lock.
+ */
+ if (is_empty)
+ remove_tasks_in_empty_cpuset(cs);
+
+ mutex_lock(&cpuset_mutex);
+}
+
+static void
+hotplug_update_tasks(struct cpuset *cs,
+ struct cpumask *new_cpus, nodemask_t *new_mems,
+ bool cpus_updated, bool mems_updated)
+{
+ if (cpumask_empty(new_cpus))
+ cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
+ if (nodes_empty(*new_mems))
+ *new_mems = parent_cs(cs)->effective_mems;
+
+ spin_lock_irq(&callback_lock);
+ cpumask_copy(cs->effective_cpus, new_cpus);
+ cs->effective_mems = *new_mems;
+ spin_unlock_irq(&callback_lock);
+
+ if (cpus_updated)
+ update_tasks_cpumask(cs);
+ if (mems_updated)
+ update_tasks_nodemask(cs);
+}
+
+/**
+ * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
+ * @cs: cpuset of interest
+ *
+ * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
+ * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
+ * all its tasks are moved to the nearest ancestor with both resources.
+ */
+static void cpuset_hotplug_update_tasks(struct cpuset *cs)
+{
+ static cpumask_t new_cpus;
+ static nodemask_t new_mems;
+ bool cpus_updated;
+ bool mems_updated;
+retry:
+ wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
+
+ mutex_lock(&cpuset_mutex);
+
+ /*
+ * We have raced with task attaching. We wait until attaching
+ * is finished, so we won't attach a task to an empty cpuset.
+ */
+ if (cs->attach_in_progress) {
+ mutex_unlock(&cpuset_mutex);
+ goto retry;
+ }
+
+ cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
+ nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
+
+ cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
+ mems_updated = !nodes_equal(new_mems, cs->effective_mems);
+
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ hotplug_update_tasks(cs, &new_cpus, &new_mems,
+ cpus_updated, mems_updated);
+ else
+ hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
+ cpus_updated, mems_updated);
+
+ mutex_unlock(&cpuset_mutex);
+}
+
+/**
+ * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
+ *
+ * This function is called after either CPU or memory configuration has
+ * changed and updates cpuset accordingly.
The top_cpuset is always
+ * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
+ * order to make cpusets transparent (of no effect) on systems that are
+ * actively using CPU hotplug but making no active use of cpusets.
+ *
+ * Non-root cpusets are only affected by offlining. If any CPUs or memory
+ * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
+ * all descendants.
+ *
+ * Note that CPU offlining during suspend is ignored. We don't modify
+ * cpusets across suspend/resume cycles at all.
+ */
+static void cpuset_hotplug_workfn(struct work_struct *work)
+{
+ static cpumask_t new_cpus;
+ static nodemask_t new_mems;
+ bool cpus_updated, mems_updated;
+ bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
+
+ mutex_lock(&cpuset_mutex);
+
+ /* fetch the available cpus/mems and find out which changed how */
+ cpumask_copy(&new_cpus, cpu_active_mask);
+ new_mems = node_states[N_MEMORY];
+
+ cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
+ mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
+
+ /* synchronize cpus_allowed to cpu_active_mask */
+ if (cpus_updated) {
+ spin_lock_irq(&callback_lock);
+ if (!on_dfl)
+ cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+ cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
+ spin_unlock_irq(&callback_lock);
+ /* we don't mess with cpumasks of tasks in top_cpuset */
+ }
+
+ /* synchronize mems_allowed to N_MEMORY */
+ if (mems_updated) {
+ spin_lock_irq(&callback_lock);
+ if (!on_dfl)
+ top_cpuset.mems_allowed = new_mems;
+ top_cpuset.effective_mems = new_mems;
+ spin_unlock_irq(&callback_lock);
+ update_tasks_nodemask(&top_cpuset);
+ }
+
+ mutex_unlock(&cpuset_mutex);
+
+ /* if cpus or mems changed, we need to propagate to descendants */
+ if (cpus_updated || mems_updated) {
+ struct cpuset *cs;
+ struct cgroup_subsys_state *pos_css;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
+ if (cs == &top_cpuset || !css_tryget_online(&cs->css))
+ continue;
+ rcu_read_unlock();
+
+ cpuset_hotplug_update_tasks(cs);
+
+ rcu_read_lock();
+ css_put(&cs->css);
+ }
+ rcu_read_unlock();
+ }
+
+ /* rebuild sched domains if cpus_allowed has changed */
+ if (cpus_updated)
+ rebuild_sched_domains();
+}
+
+void cpuset_update_active_cpus(bool cpu_online)
+{
+ /*
+ * We're inside cpu hotplug critical region which usually nests
+ * inside cgroup synchronization. Bounce actual hotplug processing
+ * to a work item to avoid reverse locking order.
+ *
+ * We still need to do partition_sched_domains() synchronously;
+ * otherwise, the scheduler will get confused and put tasks on the
+ * dead CPU. Fall back to the default single domain.
+ * cpuset_hotplug_workfn() will rebuild it as necessary.
+ */
+ partition_sched_domains(1, NULL, NULL);
+ schedule_work(&cpuset_hotplug_work);
+}
+
+/*
+ * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
+ * Call this routine anytime after node_states[N_MEMORY] changes.
+ * See cpuset_update_active_cpus() for CPU hotplug handling.
+ */
+static int cpuset_track_online_nodes(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ schedule_work(&cpuset_hotplug_work);
+ return NOTIFY_OK;
+}
+
+static struct notifier_block cpuset_track_online_nodes_nb = {
+ .notifier_call = cpuset_track_online_nodes,
+ .priority = 10, /* ??!
*/
+};
+
+/**
+ * cpuset_init_smp - initialize cpus_allowed
+ *
+ * Description: Finish top cpuset after cpu, node maps are initialized
+ */
+void __init cpuset_init_smp(void)
+{
+ cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
+ top_cpuset.mems_allowed = node_states[N_MEMORY];
+ top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
+
+ cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
+ top_cpuset.effective_mems = node_states[N_MEMORY];
+
+ register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
+
+ cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
+ BUG_ON(!cpuset_migrate_mm_wq);
+}
+
+/**
+ * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
+ * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
+ *
+ * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
+ * attached to the specified @tsk. Guaranteed to return some non-empty
+ * subset of cpu_online_mask, even if this means going outside the
+ * task's cpuset.
+ **/
+
+void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&callback_lock, flags);
+ rcu_read_lock();
+ guarantee_online_cpus(task_cs(tsk), pmask);
+ rcu_read_unlock();
+ spin_unlock_irqrestore(&callback_lock, flags);
+}
+
+void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
+{
+ rcu_read_lock();
+ do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
+ rcu_read_unlock();
+
+ /*
+ * We own tsk->cpus_allowed, nobody can change it under us.
+ *
+ * But we used cs && cs->cpus_allowed lockless and thus can
+ * race with cgroup_attach_task() or update_cpumask() and get
+ * the wrong tsk->cpus_allowed. However, both cases imply the
+ * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
+ * which takes task_rq_lock().
+ *
+ * If we are called after it dropped the lock we must see all
+ * changes in task_cs()->cpus_allowed. Otherwise we can temporarily
+ * set any mask even if it is not right from task_cs() pov,
+ * the pending set_cpus_allowed_ptr() will fix things.
+ *
+ * select_fallback_rq() will fix things up and set cpu_possible_mask
+ * if required.
+ */
+}
+
+void __init cpuset_init_current_mems_allowed(void)
+{
+ nodes_setall(current->mems_allowed);
+}
+
+/**
+ * cpuset_mems_allowed - return mems_allowed mask from a task's cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
+ *
+ * Description: Returns the nodemask_t mems_allowed of the cpuset
+ * attached to the specified @tsk. Guaranteed to return some non-empty
+ * subset of node_states[N_MEMORY], even if this means going outside the
+ * task's cpuset.
+ **/
+
+nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
+{
+ nodemask_t mask;
+ unsigned long flags;
+
+ spin_lock_irqsave(&callback_lock, flags);
+ rcu_read_lock();
+ guarantee_online_mems(task_cs(tsk), &mask);
+ rcu_read_unlock();
+ spin_unlock_irqrestore(&callback_lock, flags);
+
+ return mask;
+}
+
+/**
+ * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
+ * @nodemask: the nodemask to be checked
+ *
+ * Are any of the nodes in the nodemask allowed in current->mems_allowed?
+ */
+int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
+{
+ return nodes_intersects(*nodemask, current->mems_allowed);
+}
+
+/*
+ * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
+ * mem_hardwall ancestor to the specified cpuset.
Call holding
+ * callback_lock. If no ancestor is mem_exclusive or mem_hardwall
+ * (an unusual configuration), then returns the root cpuset.
+ */
+static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
+{
+ while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
+ cs = parent_cs(cs);
+ return cs;
+}
+
+/**
+ * cpuset_node_allowed - Can we allocate on a memory node?
+ * @node: is this an allowed node?
+ * @gfp_mask: memory allocation flags
+ *
+ * If we're in interrupt, yes, we can always allocate. If @node is set in
+ * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
+ * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
+ * yes. If current has access to memory reserves due to TIF_MEMDIE, yes.
+ * Otherwise, no.
+ *
+ * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
+ * and do not allow allocations outside the current task's cpuset
+ * unless the task has been OOM killed and is marked TIF_MEMDIE.
+ * GFP_KERNEL allocations are not so marked, so can escape to the
+ * nearest enclosing hardwalled ancestor cpuset.
+ *
+ * Scanning up parent cpusets requires callback_lock. The
+ * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
+ * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
+ * current task's mems_allowed came up empty on the first pass over
+ * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
+ * cpuset are short of memory, might require taking the callback_lock.
+ *
+ * The first call here from mm/page_alloc:get_page_from_freelist()
+ * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
+ * so no allocation on a node outside the cpuset is allowed (unless
+ * in interrupt, of course).
+ *
+ * The second pass through get_page_from_freelist() doesn't even call
+ * here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
+ * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
+ * in alloc_flags. That logic and the checks below have the combined
+ * effect that:
+ * in_interrupt - any node ok (current task context irrelevant)
+ * GFP_ATOMIC - any node ok
+ * TIF_MEMDIE - any node ok
+ * GFP_KERNEL - any node in enclosing hardwalled cpuset ok
+ * GFP_USER - only nodes in current task's mems_allowed ok.
+ */
+bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
+{
+ struct cpuset *cs; /* current cpuset ancestors */
+ int allowed; /* is allocation in zone z allowed? */
+ unsigned long flags;
+
+ if (in_interrupt())
+ return true;
+ if (node_isset(node, current->mems_allowed))
+ return true;
+ /*
+ * Allow tasks that have access to memory reserves because they have
+ * been OOM killed to get memory anywhere.
+ */
+ if (unlikely(test_thread_flag(TIF_MEMDIE)))
+ return true;
+ if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
+ return false;
+
+ if (current->flags & PF_EXITING) /* Let dying task have memory */
+ return true;
+
+ /* Not hardwall and node outside mems_allowed: scan up cpusets */
+ spin_lock_irqsave(&callback_lock, flags);
+
+ rcu_read_lock();
+ cs = nearest_hardwall_ancestor(task_cs(current));
+ allowed = node_isset(node, cs->mems_allowed);
+ rcu_read_unlock();
+
+ spin_unlock_irqrestore(&callback_lock, flags);
+ return allowed;
+}
+
+/**
+ * cpuset_mem_spread_node() - On which node to begin search for a file page
+ * cpuset_slab_spread_node() - On which node to begin search for a slab page
+ *
+ * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
+ * tasks in a cpuset with is_spread_page or is_spread_slab set),
+ * and if the memory allocation used cpuset_mem_spread_node()
+ * to determine on which node to start looking, as it will for
+ * certain page cache or slab cache pages such as used for file
+ * system buffers and inode caches, then instead of starting on the
+ * local node to look for a free page, rather spread the starting
+ * node around the task's mems_allowed nodes.
+ *
+ * We don't have to worry about the returned node being offline
+ * because "it can't happen", and even if it did, it would be ok.
+ *
+ * The routines calling guarantee_online_mems() are careful to
+ * only set nodes in task->mems_allowed that are online. So it
+ * should not be possible for the following code to return an
+ * offline node. But if it did, that would be ok, as this routine
+ * is not returning the node where the allocation must be, only
+ * the node where the search should start. The zonelist passed to
+ * __alloc_pages() will include all nodes. If the slab allocator
+ * is passed an offline node, it will fall back to the local node.
+ * See kmem_cache_alloc_node().
+ */
+
+static int cpuset_spread_node(int *rotor)
+{
+ return *rotor = next_node_in(*rotor, current->mems_allowed);
+}
+
+int cpuset_mem_spread_node(void)
+{
+ if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
+ current->cpuset_mem_spread_rotor =
+ node_random(&current->mems_allowed);
+
+ return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
+}
+
+int cpuset_slab_spread_node(void)
+{
+ if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
+ current->cpuset_slab_spread_rotor =
+ node_random(&current->mems_allowed);
+
+ return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
+}
+
+EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
+
+/**
+ * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
+ * @tsk1: pointer to task_struct of some task.
+ * @tsk2: pointer to task_struct of some other task.
+ *
+ * Description: Return true if @tsk1's mems_allowed intersects the
+ * mems_allowed of @tsk2. Used by the OOM killer to determine if
+ * one task's memory usage might impact the memory available
+ * to the other.
+ **/
+
+int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
+ const struct task_struct *tsk2)
+{
+ return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
+}
+
+/**
+ * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
+ *
+ * Description: Prints current's name, cpuset name, and cached copy of its
+ * mems_allowed to the kernel log.
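+ *
+ * (Editorial example, not part of this patch: given the pr_info()
+ * and pr_cont() calls below, the resulting log line looks like
+ * "myapp cpuset=/foo mems_allowed=0-1", where "myapp" and "/foo"
+ * are a hypothetical task comm and cgroup name.)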
+ */
+void cpuset_print_current_mems_allowed(void)
+{
+ struct cgroup *cgrp;
+
+ rcu_read_lock();
+
+ cgrp = task_cs(current)->css.cgroup;
+ pr_info("%s cpuset=", current->comm);
+ pr_cont_cgroup_name(cgrp);
+ pr_cont(" mems_allowed=%*pbl\n",
+ nodemask_pr_args(&current->mems_allowed));
+
+ rcu_read_unlock();
+}
+
+/*
+ * Collection of memory_pressure is suppressed unless
+ * this flag is enabled by writing "1" to the special
+ * cpuset file 'memory_pressure_enabled' in the root cpuset.
+ */
+
+int cpuset_memory_pressure_enabled __read_mostly;
+
+/**
+ * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
+ *
+ * Keep a running average of the rate of synchronous (direct)
+ * page reclaim efforts initiated by tasks in each cpuset.
+ *
+ * This represents the rate at which some task in the cpuset
+ * ran low on memory on all nodes it was allowed to use, and
+ * had to enter the kernel's page reclaim code in an effort to
+ * create more free memory by tossing clean pages or swapping
+ * or writing dirty pages.
+ *
+ * Display to user space in the per-cpuset read-only file
+ * "memory_pressure". Value displayed is an integer
+ * representing the recent rate of entry into the synchronous
+ * (direct) page reclaim by any task attached to the cpuset.
+ **/
+
+void __cpuset_memory_pressure_bump(void)
+{
+ rcu_read_lock();
+ fmeter_markevent(&task_cs(current)->fmeter);
+ rcu_read_unlock();
+}
+
+#ifdef CONFIG_PROC_PID_CPUSET
+/*
+ * proc_cpuset_show()
+ * - Print task's cpuset path into seq_file.
+ * - Used for /proc/<pid>/cpuset.
+ * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
+ * doesn't really matter if tsk->cpuset changes after we read it,
+ * and we take cpuset_mutex, keeping cpuset_attach() from changing it
+ * anyway.
+ */
+int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *tsk)
+{
+ char *buf;
+ struct cgroup_subsys_state *css;
+ int retval;
+
+ retval = -ENOMEM;
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf)
+ goto out;
+
+ css = task_get_css(tsk, cpuset_cgrp_id);
+ retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
+ current->nsproxy->cgroup_ns);
+ css_put(css);
+ if (retval >= PATH_MAX)
+ retval = -ENAMETOOLONG;
+ if (retval < 0)
+ goto out_free;
+ seq_puts(m, buf);
+ seq_putc(m, '\n');
+ retval = 0;
+out_free:
+ kfree(buf);
+out:
+ return retval;
+}
+#endif /* CONFIG_PROC_PID_CPUSET */
+
+/* Display task mems_allowed in /proc/<pid>/status file. */
+void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
+{
+ seq_printf(m, "Mems_allowed:\t%*pb\n",
+ nodemask_pr_args(&task->mems_allowed));
+ seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
+ nodemask_pr_args(&task->mems_allowed));
+}
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
new file mode 100644
index 000000000000..1b72d56edce5
--- /dev/null
+++ b/kernel/cgroup/freezer.c
@@ -0,0 +1,481 @@
+/*
+ * cgroup_freezer.c - control group freezer subsystem
+ *
+ * Copyright IBM Corporation, 2007
+ *
+ * Author : Cedric Le Goater
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/freezer.h>
+#include <linux/seq_file.h>
+#include <linux/mutex.h>
+
+/*
+ * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
+ * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared
+ * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING
+ * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of
+ * its ancestors has FREEZING_SELF set.
+ */
+enum freezer_state_flags {
+ CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */
+ CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */
+ CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */
+ CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */
+
+ /* mask for all FREEZING flags */
+ CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
+};
+
+struct freezer {
+ struct cgroup_subsys_state css;
+ unsigned int state;
+};
+
+static DEFINE_MUTEX(freezer_mutex);
+
+static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct freezer, css) : NULL;
+}
+
+static inline struct freezer *task_freezer(struct task_struct *task)
+{
+ return css_freezer(task_css(task, freezer_cgrp_id));
+}
+
+static struct freezer *parent_freezer(struct freezer *freezer)
+{
+ return css_freezer(freezer->css.parent);
+}
+
+bool cgroup_freezing(struct task_struct *task)
+{
+ bool ret;
+
+ rcu_read_lock();
+ ret = task_freezer(task)->state & CGROUP_FREEZING;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static const char *freezer_state_strs(unsigned int state)
+{
+ if (state & CGROUP_FROZEN)
+ return "FROZEN";
+ if (state & CGROUP_FREEZING)
+ return "FREEZING";
+ return "THAWED";
+}
+
+static struct cgroup_subsys_state *
+freezer_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct freezer *freezer;
+
+ freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL);
+ if (!freezer)
+ return ERR_PTR(-ENOMEM);
+
+ return &freezer->css;
+}
+
+/**
+ * freezer_css_online - commit creation of a freezer css
+ * @css: css being created
+ *
+ * We're committing to creation of @css. Mark it online and inherit
+ * the parent's freezing state while holding freezer_mutex.
+ */
+static int freezer_css_online(struct cgroup_subsys_state *css)
+{
+ struct freezer *freezer = css_freezer(css);
+ struct freezer *parent = parent_freezer(freezer);
+
+ mutex_lock(&freezer_mutex);
+
+ freezer->state |= CGROUP_FREEZER_ONLINE;
+
+ if (parent && (parent->state & CGROUP_FREEZING)) {
+ freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
+ atomic_inc(&system_freezing_cnt);
+ }
+
+ mutex_unlock(&freezer_mutex);
+ return 0;
+}
+
+/**
+ * freezer_css_offline - initiate destruction of a freezer css
+ * @css: css being destroyed
+ *
+ * @css is going away. Mark it dead and decrement system_freezing_cnt if
+ * it was holding one.
+ */
+static void freezer_css_offline(struct cgroup_subsys_state *css)
+{
+ struct freezer *freezer = css_freezer(css);
+
+ mutex_lock(&freezer_mutex);
+
+ if (freezer->state & CGROUP_FREEZING)
+ atomic_dec(&system_freezing_cnt);
+
+ freezer->state = 0;
+
+ mutex_unlock(&freezer_mutex);
+}
+
+static void freezer_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css_freezer(css));
+}
+
+/*
+ * Tasks can be migrated into a different freezer anytime regardless of its
+ * current state. freezer_attach() is responsible for making new tasks
+ * conform to the current state.
+ *
+ * Freezer state changes and task migration are synchronized via
+ * freezer_mutex.
freezer_attach() makes the new tasks conform to the + * current state and all following state changes can see the new tasks. + */ +static void freezer_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *new_css; + + mutex_lock(&freezer_mutex); + + /* + * Make the new tasks conform to the current state of @new_css. + * For simplicity, when migrating any task to a FROZEN cgroup, we + * revert it to FREEZING and let update_if_frozen() determine the + * correct state later. + * + * Tasks in @tset are on @new_css but may not conform to its + * current state before executing the following - !frozen tasks may + * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. + */ + cgroup_taskset_for_each(task, new_css, tset) { + struct freezer *freezer = css_freezer(new_css); + + if (!(freezer->state & CGROUP_FREEZING)) { + __thaw_task(task); + } else { + freeze_task(task); + /* clear FROZEN and propagate upwards */ + while (freezer && (freezer->state & CGROUP_FROZEN)) { + freezer->state &= ~CGROUP_FROZEN; + freezer = parent_freezer(freezer); + } + } + } + + mutex_unlock(&freezer_mutex); +} + +/** + * freezer_fork - cgroup post fork callback + * @task: a task which has just been forked + * + * @task has just been created and should conform to the current state of + * the cgroup_freezer it belongs to. This function may race against + * freezer_attach(). Losing to freezer_attach() means that we don't have + * to do anything as freezer_attach() will put @task into the appropriate + * state. + */ +static void freezer_fork(struct task_struct *task) +{ + struct freezer *freezer; + + /* + * The root cgroup is non-freezable, so we can skip locking the + * freezer. This is safe regardless of race with task migration. + * If we didn't race or won, skipping is obviously the right thing + * to do. If we lost and root is the new cgroup, noop is still the + * right thing to do. + */ + if (task_css_is_root(task, freezer_cgrp_id)) + return; + + mutex_lock(&freezer_mutex); + rcu_read_lock(); + + freezer = task_freezer(task); + if (freezer->state & CGROUP_FREEZING) + freeze_task(task); + + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); +} + +/** + * update_if_frozen - update whether a cgroup finished freezing + * @css: css of interest + * + * Once FREEZING is initiated, transition to FROZEN is lazily updated by + * calling this function. If the current state is FREEZING but not FROZEN, + * this function checks whether all tasks of this cgroup and the descendant + * cgroups finished freezing and, if so, sets FROZEN. + * + * The caller is responsible for grabbing RCU read lock and calling + * update_if_frozen() on all descendants prior to invoking this function. + * + * Task states and freezer state might disagree while tasks are being + * migrated into or out of @css, so we can't verify task states against + * @freezer state here. See freezer_attach() for details. + */ +static void update_if_frozen(struct cgroup_subsys_state *css) +{ + struct freezer *freezer = css_freezer(css); + struct cgroup_subsys_state *pos; + struct css_task_iter it; + struct task_struct *task; + + lockdep_assert_held(&freezer_mutex); + + if (!(freezer->state & CGROUP_FREEZING) || + (freezer->state & CGROUP_FROZEN)) + return; + + /* are all (live) children frozen? 
*/ + rcu_read_lock(); + css_for_each_child(pos, css) { + struct freezer *child = css_freezer(pos); + + if ((child->state & CGROUP_FREEZER_ONLINE) && + !(child->state & CGROUP_FROZEN)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + + /* are all tasks frozen? */ + css_task_iter_start(css, &it); + + while ((task = css_task_iter_next(&it))) { + if (freezing(task)) { + /* + * freezer_should_skip() indicates that the task + * should be skipped when determining freezing + * completion. Consider it frozen in addition to + * the usual frozen condition. + */ + if (!frozen(task) && !freezer_should_skip(task)) + goto out_iter_end; + } + } + + freezer->state |= CGROUP_FROZEN; +out_iter_end: + css_task_iter_end(&it); +} + +static int freezer_read(struct seq_file *m, void *v) +{ + struct cgroup_subsys_state *css = seq_css(m), *pos; + + mutex_lock(&freezer_mutex); + rcu_read_lock(); + + /* update states bottom-up */ + css_for_each_descendant_post(pos, css) { + if (!css_tryget_online(pos)) + continue; + rcu_read_unlock(); + + update_if_frozen(pos); + + rcu_read_lock(); + css_put(pos); + } + + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); + + seq_puts(m, freezer_state_strs(css_freezer(css)->state)); + seq_putc(m, '\n'); + return 0; +} + +static void freeze_cgroup(struct freezer *freezer) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&freezer->css, &it); + while ((task = css_task_iter_next(&it))) + freeze_task(task); + css_task_iter_end(&it); +} + +static void unfreeze_cgroup(struct freezer *freezer) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&freezer->css, &it); + while ((task = css_task_iter_next(&it))) + __thaw_task(task); + css_task_iter_end(&it); +} + +/** + * freezer_apply_state - apply state change to a single cgroup_freezer + * @freezer: freezer to apply state change to + * @freeze: whether to freeze or unfreeze + * @state: CGROUP_FREEZING_* flag to set or clear + * + * Set or clear @state on @cgroup according to @freeze, and perform + * freezing or thawing as necessary. + */ +static void freezer_apply_state(struct freezer *freezer, bool freeze, + unsigned int state) +{ + /* also synchronizes against task migration, see freezer_attach() */ + lockdep_assert_held(&freezer_mutex); + + if (!(freezer->state & CGROUP_FREEZER_ONLINE)) + return; + + if (freeze) { + if (!(freezer->state & CGROUP_FREEZING)) + atomic_inc(&system_freezing_cnt); + freezer->state |= state; + freeze_cgroup(freezer); + } else { + bool was_freezing = freezer->state & CGROUP_FREEZING; + + freezer->state &= ~state; + + if (!(freezer->state & CGROUP_FREEZING)) { + if (was_freezing) + atomic_dec(&system_freezing_cnt); + freezer->state &= ~CGROUP_FROZEN; + unfreeze_cgroup(freezer); + } + } +} + +/** + * freezer_change_state - change the freezing state of a cgroup_freezer + * @freezer: freezer of interest + * @freeze: whether to freeze or thaw + * + * Freeze or thaw @freezer according to @freeze. The operations are + * recursive - all descendants of @freezer will be affected. + */ +static void freezer_change_state(struct freezer *freezer, bool freeze) +{ + struct cgroup_subsys_state *pos; + + /* + * Update all its descendants in pre-order traversal. Each + * descendant will try to inherit its parent's FREEZING state as + * CGROUP_FREEZING_PARENT. 
+ */ + mutex_lock(&freezer_mutex); + rcu_read_lock(); + css_for_each_descendant_pre(pos, &freezer->css) { + struct freezer *pos_f = css_freezer(pos); + struct freezer *parent = parent_freezer(pos_f); + + if (!css_tryget_online(pos)) + continue; + rcu_read_unlock(); + + if (pos_f == freezer) + freezer_apply_state(pos_f, freeze, + CGROUP_FREEZING_SELF); + else + freezer_apply_state(pos_f, + parent->state & CGROUP_FREEZING, + CGROUP_FREEZING_PARENT); + + rcu_read_lock(); + css_put(pos); + } + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); +} + +static ssize_t freezer_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + bool freeze; + + buf = strstrip(buf); + + if (strcmp(buf, freezer_state_strs(0)) == 0) + freeze = false; + else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) + freeze = true; + else + return -EINVAL; + + freezer_change_state(css_freezer(of_css(of)), freeze); + return nbytes; +} + +static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct freezer *freezer = css_freezer(css); + + return (bool)(freezer->state & CGROUP_FREEZING_SELF); +} + +static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct freezer *freezer = css_freezer(css); + + return (bool)(freezer->state & CGROUP_FREEZING_PARENT); +} + +static struct cftype files[] = { + { + .name = "state", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = freezer_read, + .write = freezer_write, + }, + { + .name = "self_freezing", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = freezer_self_freezing_read, + }, + { + .name = "parent_freezing", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = freezer_parent_freezing_read, + }, + { } /* terminate */ +}; + +struct cgroup_subsys freezer_cgrp_subsys = { + .css_alloc = freezer_css_alloc, + .css_online = freezer_css_online, + .css_offline = freezer_css_offline, + .css_free = freezer_css_free, + .attach = freezer_attach, + .fork = freezer_fork, + .legacy_cftypes = files, +}; diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c new file mode 100644 index 000000000000..2bd673783f1a --- /dev/null +++ b/kernel/cgroup/pids.c @@ -0,0 +1,348 @@ +/* + * Process number limiting controller for cgroups. + * + * Used to allow a cgroup hierarchy to stop any new processes from fork()ing + * after a certain limit is reached. + * + * Since it is trivial to hit the task limit without hitting any kmemcg limits + * in place, PIDs are a fundamental resource. As such, PID exhaustion must be + * preventable in the scope of a cgroup hierarchy by allowing resource limiting + * of the number of tasks in a cgroup. + * + * In order to use the `pids` controller, set the maximum number of tasks in + * pids.max (this is not available in the root cgroup for obvious reasons). The + * number of processes currently in the cgroup is given by pids.current. + * Organisational operations are not blocked by cgroup policies, so it is + * possible to have pids.current > pids.max. However, it is not possible to + * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking + * would cause a cgroup policy to be violated. + * + * To set a cgroup to have no limit, set pids.max to "max". This is the default + * for all new cgroups (N.B. that PID limits are hierarchical, so the most + * stringent limit in the hierarchy is followed). + * + * pids.current tracks all child cgroup hierarchies, so parent/pids.current is + * a superset of parent/child/pids.current. 
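The "superset" property described above is maintained by charging every ancestor on fork and uncharging on exit. As a rough stand-alone sketch of the try-charge/revert walk implemented by pids_try_charge() further down (user space, no atomics or locking, all identifiers invented; unlike the kernel code, which never charges the root cgroup, this toy charges every level):

#include <stdio.h>

struct toy_pids {
	struct toy_pids *parent;
	long long counter;	/* pids.current */
	long long limit;	/* pids.max */
};

/* Charge each level walking up; on the first level that would exceed
 * its limit, undo the levels already charged and fail. */
static int toy_try_charge(struct toy_pids *pids, int num)
{
	struct toy_pids *p, *q;

	for (p = pids; p; p = p->parent) {
		p->counter += num;
		if (p->counter > p->limit)
			goto revert;
	}
	return 0;

revert:
	for (q = pids; q != p; q = q->parent)
		q->counter -= num;
	p->counter -= num;
	return -1;		/* the kernel returns -EAGAIN here */
}

int main(void)
{
	struct toy_pids parent = { .limit = 2 };
	struct toy_pids child = { .parent = &parent, .limit = 10 };
	int r1 = toy_try_charge(&child, 1);
	int r2 = toy_try_charge(&child, 1);
	int r3 = toy_try_charge(&child, 1);	/* parent's limit trips */

	printf("%d %d %d parent=%lld child=%lld\n",
	       r1, r2, r3, parent.counter, child.counter);
	return 0;
}

The third charge fails on the parent's limit even though the child's own limit still has room, which is exactly the hierarchical behaviour promised above.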
+ *
+ * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/threads.h>
+#include <linux/atomic.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+
+#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
+#define PIDS_MAX_STR "max"
+
+struct pids_cgroup {
+ struct cgroup_subsys_state css;
+
+ /*
+ * Use 64-bit types so that we can safely represent "max" as
+ * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
+ */
+ atomic64_t counter;
+ int64_t limit;
+
+ /* Handle for "pids.events" */
+ struct cgroup_file events_file;
+
+ /* Number of times fork failed because limit was hit. */
+ atomic64_t events_limit;
+};
+
+static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
+{
+ return container_of(css, struct pids_cgroup, css);
+}
+
+static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
+{
+ return css_pids(pids->css.parent);
+}
+
+static struct cgroup_subsys_state *
+pids_css_alloc(struct cgroup_subsys_state *parent)
+{
+ struct pids_cgroup *pids;
+
+ pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
+ if (!pids)
+ return ERR_PTR(-ENOMEM);
+
+ pids->limit = PIDS_MAX;
+ atomic64_set(&pids->counter, 0);
+ atomic64_set(&pids->events_limit, 0);
+ return &pids->css;
+}
+
+static void pids_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css_pids(css));
+}
+
+/**
+ * pids_cancel - uncharge the local pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to cancel
+ *
+ * This function will WARN if the pid count goes under 0, because such a case is
+ * a bug in the pids controller proper.
+ */
+static void pids_cancel(struct pids_cgroup *pids, int num)
+{
+ /*
+ * A negative count (or overflow for that matter) is invalid,
+ * and indicates a bug in the `pids` controller proper.
+ */
+ WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
+}
+
+/**
+ * pids_uncharge - hierarchically uncharge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to uncharge
+ */
+static void pids_uncharge(struct pids_cgroup *pids, int num)
+{
+ struct pids_cgroup *p;
+
+ for (p = pids; parent_pids(p); p = parent_pids(p))
+ pids_cancel(p, num);
+}
+
+/**
+ * pids_charge - hierarchically charge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to charge
+ *
+ * This function does *not* follow the pid limit set. It cannot fail and the new
+ * pid count may exceed the limit. This is only used for reverting failed
+ * attaches, where there is no other way out than violating the limit.
+ */
+static void pids_charge(struct pids_cgroup *pids, int num)
+{
+ struct pids_cgroup *p;
+
+ for (p = pids; parent_pids(p); p = parent_pids(p))
+ atomic64_add(num, &p->counter);
+}
+
+/**
+ * pids_try_charge - hierarchically try to charge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to charge
+ *
+ * This function follows the set limit. It will fail if the charge would cause
+ * the new value to exceed the hierarchical limit. Returns 0 if the charge
+ * succeeded, otherwise -EAGAIN.
+ */
+static int pids_try_charge(struct pids_cgroup *pids, int num)
+{
+ struct pids_cgroup *p, *q;
+
+ for (p = pids; parent_pids(p); p = parent_pids(p)) {
+ int64_t new = atomic64_add_return(num, &p->counter);
+
+ /*
+ * Since new is capped to the maximum number of pid_t, if
+ * p->limit is %PIDS_MAX then we know that this test will never
+ * fail. 
+ */ + if (new > p->limit) + goto revert; + } + + return 0; + +revert: + for (q = pids; q != p; q = parent_pids(q)) + pids_cancel(q, num); + pids_cancel(p, num); + + return -EAGAIN; +} + +static int pids_can_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *dst_css; + + cgroup_taskset_for_each(task, dst_css, tset) { + struct pids_cgroup *pids = css_pids(dst_css); + struct cgroup_subsys_state *old_css; + struct pids_cgroup *old_pids; + + /* + * No need to pin @old_css between here and cancel_attach() + * because cgroup core protects it from being freed before + * the migration completes or fails. + */ + old_css = task_css(task, pids_cgrp_id); + old_pids = css_pids(old_css); + + pids_charge(pids, 1); + pids_uncharge(old_pids, 1); + } + + return 0; +} + +static void pids_cancel_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *dst_css; + + cgroup_taskset_for_each(task, dst_css, tset) { + struct pids_cgroup *pids = css_pids(dst_css); + struct cgroup_subsys_state *old_css; + struct pids_cgroup *old_pids; + + old_css = task_css(task, pids_cgrp_id); + old_pids = css_pids(old_css); + + pids_charge(old_pids, 1); + pids_uncharge(pids, 1); + } +} + +/* + * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies + * on threadgroup_change_begin() held by the copy_process(). + */ +static int pids_can_fork(struct task_struct *task) +{ + struct cgroup_subsys_state *css; + struct pids_cgroup *pids; + int err; + + css = task_css_check(current, pids_cgrp_id, true); + pids = css_pids(css); + err = pids_try_charge(pids, 1); + if (err) { + /* Only log the first time events_limit is incremented. */ + if (atomic64_inc_return(&pids->events_limit) == 1) { + pr_info("cgroup: fork rejected by pids controller in "); + pr_cont_cgroup_path(task_cgroup(current, pids_cgrp_id)); + pr_cont("\n"); + } + cgroup_file_notify(&pids->events_file); + } + return err; +} + +static void pids_cancel_fork(struct task_struct *task) +{ + struct cgroup_subsys_state *css; + struct pids_cgroup *pids; + + css = task_css_check(current, pids_cgrp_id, true); + pids = css_pids(css); + pids_uncharge(pids, 1); +} + +static void pids_free(struct task_struct *task) +{ + struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id)); + + pids_uncharge(pids, 1); +} + +static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct cgroup_subsys_state *css = of_css(of); + struct pids_cgroup *pids = css_pids(css); + int64_t limit; + int err; + + buf = strstrip(buf); + if (!strcmp(buf, PIDS_MAX_STR)) { + limit = PIDS_MAX; + goto set_limit; + } + + err = kstrtoll(buf, 0, &limit); + if (err) + return err; + + if (limit < 0 || limit >= PIDS_MAX) + return -EINVAL; + +set_limit: + /* + * Limit updates don't need to be mutex'd, since it isn't + * critical that any racing fork()s follow the new limit. 
+ */ + pids->limit = limit; + return nbytes; +} + +static int pids_max_show(struct seq_file *sf, void *v) +{ + struct cgroup_subsys_state *css = seq_css(sf); + struct pids_cgroup *pids = css_pids(css); + int64_t limit = pids->limit; + + if (limit >= PIDS_MAX) + seq_printf(sf, "%s\n", PIDS_MAX_STR); + else + seq_printf(sf, "%lld\n", limit); + + return 0; +} + +static s64 pids_current_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct pids_cgroup *pids = css_pids(css); + + return atomic64_read(&pids->counter); +} + +static int pids_events_show(struct seq_file *sf, void *v) +{ + struct pids_cgroup *pids = css_pids(seq_css(sf)); + + seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit)); + return 0; +} + +static struct cftype pids_files[] = { + { + .name = "max", + .write = pids_max_write, + .seq_show = pids_max_show, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "current", + .read_s64 = pids_current_read, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "events", + .seq_show = pids_events_show, + .file_offset = offsetof(struct pids_cgroup, events_file), + .flags = CFTYPE_NOT_ON_ROOT, + }, + { } /* terminate */ +}; + +struct cgroup_subsys pids_cgrp_subsys = { + .css_alloc = pids_css_alloc, + .css_free = pids_css_free, + .can_attach = pids_can_attach, + .cancel_attach = pids_cancel_attach, + .can_fork = pids_can_fork, + .cancel_fork = pids_cancel_fork, + .free = pids_free, + .legacy_cftypes = pids_files, + .dfl_cftypes = pids_files, +}; diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c deleted file mode 100644 index 1b72d56edce5..000000000000 --- a/kernel/cgroup_freezer.c +++ /dev/null @@ -1,481 +0,0 @@ -/* - * cgroup_freezer.c - control group freezer subsystem - * - * Copyright IBM Corporation, 2007 - * - * Author : Cedric Le Goater - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is - * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared - * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING - * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of - * its ancestors has FREEZING_SELF set. - */ -enum freezer_state_flags { - CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */ - CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ - CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */ - CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ - - /* mask for all FREEZING flags */ - CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT, -}; - -struct freezer { - struct cgroup_subsys_state css; - unsigned int state; -}; - -static DEFINE_MUTEX(freezer_mutex); - -static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) -{ - return css ? 
container_of(css, struct freezer, css) : NULL; -} - -static inline struct freezer *task_freezer(struct task_struct *task) -{ - return css_freezer(task_css(task, freezer_cgrp_id)); -} - -static struct freezer *parent_freezer(struct freezer *freezer) -{ - return css_freezer(freezer->css.parent); -} - -bool cgroup_freezing(struct task_struct *task) -{ - bool ret; - - rcu_read_lock(); - ret = task_freezer(task)->state & CGROUP_FREEZING; - rcu_read_unlock(); - - return ret; -} - -static const char *freezer_state_strs(unsigned int state) -{ - if (state & CGROUP_FROZEN) - return "FROZEN"; - if (state & CGROUP_FREEZING) - return "FREEZING"; - return "THAWED"; -}; - -static struct cgroup_subsys_state * -freezer_css_alloc(struct cgroup_subsys_state *parent_css) -{ - struct freezer *freezer; - - freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); - if (!freezer) - return ERR_PTR(-ENOMEM); - - return &freezer->css; -} - -/** - * freezer_css_online - commit creation of a freezer css - * @css: css being created - * - * We're committing to creation of @css. Mark it online and inherit - * parent's freezing state while holding both parent's and our - * freezer->lock. - */ -static int freezer_css_online(struct cgroup_subsys_state *css) -{ - struct freezer *freezer = css_freezer(css); - struct freezer *parent = parent_freezer(freezer); - - mutex_lock(&freezer_mutex); - - freezer->state |= CGROUP_FREEZER_ONLINE; - - if (parent && (parent->state & CGROUP_FREEZING)) { - freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; - atomic_inc(&system_freezing_cnt); - } - - mutex_unlock(&freezer_mutex); - return 0; -} - -/** - * freezer_css_offline - initiate destruction of a freezer css - * @css: css being destroyed - * - * @css is going away. Mark it dead and decrement system_freezing_count if - * it was holding one. - */ -static void freezer_css_offline(struct cgroup_subsys_state *css) -{ - struct freezer *freezer = css_freezer(css); - - mutex_lock(&freezer_mutex); - - if (freezer->state & CGROUP_FREEZING) - atomic_dec(&system_freezing_cnt); - - freezer->state = 0; - - mutex_unlock(&freezer_mutex); -} - -static void freezer_css_free(struct cgroup_subsys_state *css) -{ - kfree(css_freezer(css)); -} - -/* - * Tasks can be migrated into a different freezer anytime regardless of its - * current state. freezer_attach() is responsible for making new tasks - * conform to the current state. - * - * Freezer state changes and task migration are synchronized via - * @freezer->lock. freezer_attach() makes the new tasks conform to the - * current state and all following state changes can see the new tasks. - */ -static void freezer_attach(struct cgroup_taskset *tset) -{ - struct task_struct *task; - struct cgroup_subsys_state *new_css; - - mutex_lock(&freezer_mutex); - - /* - * Make the new tasks conform to the current state of @new_css. - * For simplicity, when migrating any task to a FROZEN cgroup, we - * revert it to FREEZING and let update_if_frozen() determine the - * correct state later. - * - * Tasks in @tset are on @new_css but may not conform to its - * current state before executing the following - !frozen tasks may - * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. 
- */ - cgroup_taskset_for_each(task, new_css, tset) { - struct freezer *freezer = css_freezer(new_css); - - if (!(freezer->state & CGROUP_FREEZING)) { - __thaw_task(task); - } else { - freeze_task(task); - /* clear FROZEN and propagate upwards */ - while (freezer && (freezer->state & CGROUP_FROZEN)) { - freezer->state &= ~CGROUP_FROZEN; - freezer = parent_freezer(freezer); - } - } - } - - mutex_unlock(&freezer_mutex); -} - -/** - * freezer_fork - cgroup post fork callback - * @task: a task which has just been forked - * - * @task has just been created and should conform to the current state of - * the cgroup_freezer it belongs to. This function may race against - * freezer_attach(). Losing to freezer_attach() means that we don't have - * to do anything as freezer_attach() will put @task into the appropriate - * state. - */ -static void freezer_fork(struct task_struct *task) -{ - struct freezer *freezer; - - /* - * The root cgroup is non-freezable, so we can skip locking the - * freezer. This is safe regardless of race with task migration. - * If we didn't race or won, skipping is obviously the right thing - * to do. If we lost and root is the new cgroup, noop is still the - * right thing to do. - */ - if (task_css_is_root(task, freezer_cgrp_id)) - return; - - mutex_lock(&freezer_mutex); - rcu_read_lock(); - - freezer = task_freezer(task); - if (freezer->state & CGROUP_FREEZING) - freeze_task(task); - - rcu_read_unlock(); - mutex_unlock(&freezer_mutex); -} - -/** - * update_if_frozen - update whether a cgroup finished freezing - * @css: css of interest - * - * Once FREEZING is initiated, transition to FROZEN is lazily updated by - * calling this function. If the current state is FREEZING but not FROZEN, - * this function checks whether all tasks of this cgroup and the descendant - * cgroups finished freezing and, if so, sets FROZEN. - * - * The caller is responsible for grabbing RCU read lock and calling - * update_if_frozen() on all descendants prior to invoking this function. - * - * Task states and freezer state might disagree while tasks are being - * migrated into or out of @css, so we can't verify task states against - * @freezer state here. See freezer_attach() for details. - */ -static void update_if_frozen(struct cgroup_subsys_state *css) -{ - struct freezer *freezer = css_freezer(css); - struct cgroup_subsys_state *pos; - struct css_task_iter it; - struct task_struct *task; - - lockdep_assert_held(&freezer_mutex); - - if (!(freezer->state & CGROUP_FREEZING) || - (freezer->state & CGROUP_FROZEN)) - return; - - /* are all (live) children frozen? */ - rcu_read_lock(); - css_for_each_child(pos, css) { - struct freezer *child = css_freezer(pos); - - if ((child->state & CGROUP_FREEZER_ONLINE) && - !(child->state & CGROUP_FROZEN)) { - rcu_read_unlock(); - return; - } - } - rcu_read_unlock(); - - /* are all tasks frozen? */ - css_task_iter_start(css, &it); - - while ((task = css_task_iter_next(&it))) { - if (freezing(task)) { - /* - * freezer_should_skip() indicates that the task - * should be skipped when determining freezing - * completion. Consider it frozen in addition to - * the usual frozen condition. 
- */ - if (!frozen(task) && !freezer_should_skip(task)) - goto out_iter_end; - } - } - - freezer->state |= CGROUP_FROZEN; -out_iter_end: - css_task_iter_end(&it); -} - -static int freezer_read(struct seq_file *m, void *v) -{ - struct cgroup_subsys_state *css = seq_css(m), *pos; - - mutex_lock(&freezer_mutex); - rcu_read_lock(); - - /* update states bottom-up */ - css_for_each_descendant_post(pos, css) { - if (!css_tryget_online(pos)) - continue; - rcu_read_unlock(); - - update_if_frozen(pos); - - rcu_read_lock(); - css_put(pos); - } - - rcu_read_unlock(); - mutex_unlock(&freezer_mutex); - - seq_puts(m, freezer_state_strs(css_freezer(css)->state)); - seq_putc(m, '\n'); - return 0; -} - -static void freeze_cgroup(struct freezer *freezer) -{ - struct css_task_iter it; - struct task_struct *task; - - css_task_iter_start(&freezer->css, &it); - while ((task = css_task_iter_next(&it))) - freeze_task(task); - css_task_iter_end(&it); -} - -static void unfreeze_cgroup(struct freezer *freezer) -{ - struct css_task_iter it; - struct task_struct *task; - - css_task_iter_start(&freezer->css, &it); - while ((task = css_task_iter_next(&it))) - __thaw_task(task); - css_task_iter_end(&it); -} - -/** - * freezer_apply_state - apply state change to a single cgroup_freezer - * @freezer: freezer to apply state change to - * @freeze: whether to freeze or unfreeze - * @state: CGROUP_FREEZING_* flag to set or clear - * - * Set or clear @state on @cgroup according to @freeze, and perform - * freezing or thawing as necessary. - */ -static void freezer_apply_state(struct freezer *freezer, bool freeze, - unsigned int state) -{ - /* also synchronizes against task migration, see freezer_attach() */ - lockdep_assert_held(&freezer_mutex); - - if (!(freezer->state & CGROUP_FREEZER_ONLINE)) - return; - - if (freeze) { - if (!(freezer->state & CGROUP_FREEZING)) - atomic_inc(&system_freezing_cnt); - freezer->state |= state; - freeze_cgroup(freezer); - } else { - bool was_freezing = freezer->state & CGROUP_FREEZING; - - freezer->state &= ~state; - - if (!(freezer->state & CGROUP_FREEZING)) { - if (was_freezing) - atomic_dec(&system_freezing_cnt); - freezer->state &= ~CGROUP_FROZEN; - unfreeze_cgroup(freezer); - } - } -} - -/** - * freezer_change_state - change the freezing state of a cgroup_freezer - * @freezer: freezer of interest - * @freeze: whether to freeze or thaw - * - * Freeze or thaw @freezer according to @freeze. The operations are - * recursive - all descendants of @freezer will be affected. - */ -static void freezer_change_state(struct freezer *freezer, bool freeze) -{ - struct cgroup_subsys_state *pos; - - /* - * Update all its descendants in pre-order traversal. Each - * descendant will try to inherit its parent's FREEZING state as - * CGROUP_FREEZING_PARENT. 
- */ - mutex_lock(&freezer_mutex); - rcu_read_lock(); - css_for_each_descendant_pre(pos, &freezer->css) { - struct freezer *pos_f = css_freezer(pos); - struct freezer *parent = parent_freezer(pos_f); - - if (!css_tryget_online(pos)) - continue; - rcu_read_unlock(); - - if (pos_f == freezer) - freezer_apply_state(pos_f, freeze, - CGROUP_FREEZING_SELF); - else - freezer_apply_state(pos_f, - parent->state & CGROUP_FREEZING, - CGROUP_FREEZING_PARENT); - - rcu_read_lock(); - css_put(pos); - } - rcu_read_unlock(); - mutex_unlock(&freezer_mutex); -} - -static ssize_t freezer_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - bool freeze; - - buf = strstrip(buf); - - if (strcmp(buf, freezer_state_strs(0)) == 0) - freeze = false; - else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) - freeze = true; - else - return -EINVAL; - - freezer_change_state(css_freezer(of_css(of)), freeze); - return nbytes; -} - -static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - struct freezer *freezer = css_freezer(css); - - return (bool)(freezer->state & CGROUP_FREEZING_SELF); -} - -static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - struct freezer *freezer = css_freezer(css); - - return (bool)(freezer->state & CGROUP_FREEZING_PARENT); -} - -static struct cftype files[] = { - { - .name = "state", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = freezer_read, - .write = freezer_write, - }, - { - .name = "self_freezing", - .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = freezer_self_freezing_read, - }, - { - .name = "parent_freezing", - .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = freezer_parent_freezing_read, - }, - { } /* terminate */ -}; - -struct cgroup_subsys freezer_cgrp_subsys = { - .css_alloc = freezer_css_alloc, - .css_online = freezer_css_online, - .css_offline = freezer_css_offline, - .css_free = freezer_css_free, - .attach = freezer_attach, - .fork = freezer_fork, - .legacy_cftypes = files, -}; diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c deleted file mode 100644 index 2bd673783f1a..000000000000 --- a/kernel/cgroup_pids.c +++ /dev/null @@ -1,348 +0,0 @@ -/* - * Process number limiting controller for cgroups. - * - * Used to allow a cgroup hierarchy to stop any new processes from fork()ing - * after a certain limit is reached. - * - * Since it is trivial to hit the task limit without hitting any kmemcg limits - * in place, PIDs are a fundamental resource. As such, PID exhaustion must be - * preventable in the scope of a cgroup hierarchy by allowing resource limiting - * of the number of tasks in a cgroup. - * - * In order to use the `pids` controller, set the maximum number of tasks in - * pids.max (this is not available in the root cgroup for obvious reasons). The - * number of processes currently in the cgroup is given by pids.current. - * Organisational operations are not blocked by cgroup policies, so it is - * possible to have pids.current > pids.max. However, it is not possible to - * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking - * would cause a cgroup policy to be violated. - * - * To set a cgroup to have no limit, set pids.max to "max". This is the default - * for all new cgroups (N.B. that PID limits are hierarchical, so the most - * stringent limit in the hierarchy is followed). - * - * pids.current tracks all child cgroup hierarchies, so parent/pids.current is - * a superset of parent/child/pids.current. 
- * - * Copyright (C) 2015 Aleksa Sarai - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. - */ - -#include -#include -#include -#include -#include - -#define PIDS_MAX (PID_MAX_LIMIT + 1ULL) -#define PIDS_MAX_STR "max" - -struct pids_cgroup { - struct cgroup_subsys_state css; - - /* - * Use 64-bit types so that we can safely represent "max" as - * %PIDS_MAX = (%PID_MAX_LIMIT + 1). - */ - atomic64_t counter; - int64_t limit; - - /* Handle for "pids.events" */ - struct cgroup_file events_file; - - /* Number of times fork failed because limit was hit. */ - atomic64_t events_limit; -}; - -static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css) -{ - return container_of(css, struct pids_cgroup, css); -} - -static struct pids_cgroup *parent_pids(struct pids_cgroup *pids) -{ - return css_pids(pids->css.parent); -} - -static struct cgroup_subsys_state * -pids_css_alloc(struct cgroup_subsys_state *parent) -{ - struct pids_cgroup *pids; - - pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL); - if (!pids) - return ERR_PTR(-ENOMEM); - - pids->limit = PIDS_MAX; - atomic64_set(&pids->counter, 0); - atomic64_set(&pids->events_limit, 0); - return &pids->css; -} - -static void pids_css_free(struct cgroup_subsys_state *css) -{ - kfree(css_pids(css)); -} - -/** - * pids_cancel - uncharge the local pid count - * @pids: the pid cgroup state - * @num: the number of pids to cancel - * - * This function will WARN if the pid count goes under 0, because such a case is - * a bug in the pids controller proper. - */ -static void pids_cancel(struct pids_cgroup *pids, int num) -{ - /* - * A negative count (or overflow for that matter) is invalid, - * and indicates a bug in the `pids` controller proper. - */ - WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter)); -} - -/** - * pids_uncharge - hierarchically uncharge the pid count - * @pids: the pid cgroup state - * @num: the number of pids to uncharge - */ -static void pids_uncharge(struct pids_cgroup *pids, int num) -{ - struct pids_cgroup *p; - - for (p = pids; parent_pids(p); p = parent_pids(p)) - pids_cancel(p, num); -} - -/** - * pids_charge - hierarchically charge the pid count - * @pids: the pid cgroup state - * @num: the number of pids to charge - * - * This function does *not* follow the pid limit set. It cannot fail and the new - * pid count may exceed the limit. This is only used for reverting failed - * attaches, where there is no other way out than violating the limit. - */ -static void pids_charge(struct pids_cgroup *pids, int num) -{ - struct pids_cgroup *p; - - for (p = pids; parent_pids(p); p = parent_pids(p)) - atomic64_add(num, &p->counter); -} - -/** - * pids_try_charge - hierarchically try to charge the pid count - * @pids: the pid cgroup state - * @num: the number of pids to charge - * - * This function follows the set limit. It will fail if the charge would cause - * the new value to exceed the hierarchical limit. Returns 0 if the charge - * succeeded, otherwise -EAGAIN. - */ -static int pids_try_charge(struct pids_cgroup *pids, int num) -{ - struct pids_cgroup *p, *q; - - for (p = pids; parent_pids(p); p = parent_pids(p)) { - int64_t new = atomic64_add_return(num, &p->counter); - - /* - * Since new is capped to the maximum number of pid_t, if - * p->limit is %PIDS_MAX then we know that this test will never - * fail. 
- */ - if (new > p->limit) - goto revert; - } - - return 0; - -revert: - for (q = pids; q != p; q = parent_pids(q)) - pids_cancel(q, num); - pids_cancel(p, num); - - return -EAGAIN; -} - -static int pids_can_attach(struct cgroup_taskset *tset) -{ - struct task_struct *task; - struct cgroup_subsys_state *dst_css; - - cgroup_taskset_for_each(task, dst_css, tset) { - struct pids_cgroup *pids = css_pids(dst_css); - struct cgroup_subsys_state *old_css; - struct pids_cgroup *old_pids; - - /* - * No need to pin @old_css between here and cancel_attach() - * because cgroup core protects it from being freed before - * the migration completes or fails. - */ - old_css = task_css(task, pids_cgrp_id); - old_pids = css_pids(old_css); - - pids_charge(pids, 1); - pids_uncharge(old_pids, 1); - } - - return 0; -} - -static void pids_cancel_attach(struct cgroup_taskset *tset) -{ - struct task_struct *task; - struct cgroup_subsys_state *dst_css; - - cgroup_taskset_for_each(task, dst_css, tset) { - struct pids_cgroup *pids = css_pids(dst_css); - struct cgroup_subsys_state *old_css; - struct pids_cgroup *old_pids; - - old_css = task_css(task, pids_cgrp_id); - old_pids = css_pids(old_css); - - pids_charge(old_pids, 1); - pids_uncharge(pids, 1); - } -} - -/* - * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies - * on threadgroup_change_begin() held by the copy_process(). - */ -static int pids_can_fork(struct task_struct *task) -{ - struct cgroup_subsys_state *css; - struct pids_cgroup *pids; - int err; - - css = task_css_check(current, pids_cgrp_id, true); - pids = css_pids(css); - err = pids_try_charge(pids, 1); - if (err) { - /* Only log the first time events_limit is incremented. */ - if (atomic64_inc_return(&pids->events_limit) == 1) { - pr_info("cgroup: fork rejected by pids controller in "); - pr_cont_cgroup_path(task_cgroup(current, pids_cgrp_id)); - pr_cont("\n"); - } - cgroup_file_notify(&pids->events_file); - } - return err; -} - -static void pids_cancel_fork(struct task_struct *task) -{ - struct cgroup_subsys_state *css; - struct pids_cgroup *pids; - - css = task_css_check(current, pids_cgrp_id, true); - pids = css_pids(css); - pids_uncharge(pids, 1); -} - -static void pids_free(struct task_struct *task) -{ - struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id)); - - pids_uncharge(pids, 1); -} - -static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off) -{ - struct cgroup_subsys_state *css = of_css(of); - struct pids_cgroup *pids = css_pids(css); - int64_t limit; - int err; - - buf = strstrip(buf); - if (!strcmp(buf, PIDS_MAX_STR)) { - limit = PIDS_MAX; - goto set_limit; - } - - err = kstrtoll(buf, 0, &limit); - if (err) - return err; - - if (limit < 0 || limit >= PIDS_MAX) - return -EINVAL; - -set_limit: - /* - * Limit updates don't need to be mutex'd, since it isn't - * critical that any racing fork()s follow the new limit. 
- */ - pids->limit = limit; - return nbytes; -} - -static int pids_max_show(struct seq_file *sf, void *v) -{ - struct cgroup_subsys_state *css = seq_css(sf); - struct pids_cgroup *pids = css_pids(css); - int64_t limit = pids->limit; - - if (limit >= PIDS_MAX) - seq_printf(sf, "%s\n", PIDS_MAX_STR); - else - seq_printf(sf, "%lld\n", limit); - - return 0; -} - -static s64 pids_current_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - struct pids_cgroup *pids = css_pids(css); - - return atomic64_read(&pids->counter); -} - -static int pids_events_show(struct seq_file *sf, void *v) -{ - struct pids_cgroup *pids = css_pids(seq_css(sf)); - - seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit)); - return 0; -} - -static struct cftype pids_files[] = { - { - .name = "max", - .write = pids_max_write, - .seq_show = pids_max_show, - .flags = CFTYPE_NOT_ON_ROOT, - }, - { - .name = "current", - .read_s64 = pids_current_read, - .flags = CFTYPE_NOT_ON_ROOT, - }, - { - .name = "events", - .seq_show = pids_events_show, - .file_offset = offsetof(struct pids_cgroup, events_file), - .flags = CFTYPE_NOT_ON_ROOT, - }, - { } /* terminate */ -}; - -struct cgroup_subsys pids_cgrp_subsys = { - .css_alloc = pids_css_alloc, - .css_free = pids_css_free, - .can_attach = pids_can_attach, - .cancel_attach = pids_cancel_attach, - .can_fork = pids_can_fork, - .cancel_fork = pids_cancel_fork, - .free = pids_free, - .legacy_cftypes = pids_files, - .dfl_cftypes = pids_files, -}; diff --git a/kernel/cpuset.c b/kernel/cpuset.c deleted file mode 100644 index b3088886cd37..000000000000 --- a/kernel/cpuset.c +++ /dev/null @@ -1,2752 +0,0 @@ -/* - * kernel/cpuset.c - * - * Processor and Memory placement constraints for sets of tasks. - * - * Copyright (C) 2003 BULL SA. - * Copyright (C) 2004-2007 Silicon Graphics, Inc. - * Copyright (C) 2006 Google, Inc - * - * Portions derived from Patrick Mochel's sysfs code. - * sysfs is Copyright (c) 2001-3 Patrick Mochel - * - * 2003-10-10 Written by Simon Derr. - * 2003-10-22 Updates by Stephen Hemminger. - * 2004 May-July Rework by Paul Jackson. - * 2006 Rework by Paul Menage to use generic cgroups - * 2008 Rework of the scheduler domains and CPU hotplug handling - * by Max Krasnyansky - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of the Linux - * distribution for more details. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); - -/* See "Frequency meter" comments, below. */ - -struct fmeter { - int cnt; /* unprocessed events count */ - int val; /* most recent output value */ - time64_t time; /* clock (secs) when val computed */ - spinlock_t lock; /* guards read or write of above */ -}; - -struct cpuset { - struct cgroup_subsys_state css; - - unsigned long flags; /* "unsigned long" so bitops work */ - - /* - * On default hierarchy: - * - * The user-configured masks can only be changed by writing to - * cpuset.cpus and cpuset.mems, and won't be limited by the - * parent masks. - * - * The effective masks is the real masks that apply to the tasks - * in the cpuset. 
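The invariant spelled out just below (on the default hierarchy, effective = configured & parent's effective, inheriting the parent's mask when the intersection ends up empty) reduces to a two-line bit operation. A stand-alone sketch, with a plain unsigned long standing in for cpumask_t (purely illustrative, not the kernel helpers):

#include <stdio.h>

typedef unsigned long toy_mask;	/* one bit per CPU */

static toy_mask toy_effective(toy_mask configured, toy_mask parent_eff)
{
	toy_mask m = configured & parent_eff;

	return m ? m : parent_eff;	/* empty result inherits the parent's */
}

int main(void)
{
	toy_mask parent_eff = 0x0f;	/* parent's effective CPUs: 0-3 */

	printf("%#lx\n", toy_effective(0x0c, parent_eff)); /* 0xc: overlap kept */
	printf("%#lx\n", toy_effective(0x30, parent_eff)); /* 0xf: inherited */
	return 0;
}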
They may be changed if the configured masks are - * changed or hotplug happens. - * - * effective_mask == configured_mask & parent's effective_mask, - * and if it ends up empty, it will inherit the parent's mask. - * - * - * On legacy hierachy: - * - * The user-configured masks are always the same with effective masks. - */ - - /* user-configured CPUs and Memory Nodes allow to tasks */ - cpumask_var_t cpus_allowed; - nodemask_t mems_allowed; - - /* effective CPUs and Memory Nodes allow to tasks */ - cpumask_var_t effective_cpus; - nodemask_t effective_mems; - - /* - * This is old Memory Nodes tasks took on. - * - * - top_cpuset.old_mems_allowed is initialized to mems_allowed. - * - A new cpuset's old_mems_allowed is initialized when some - * task is moved into it. - * - old_mems_allowed is used in cpuset_migrate_mm() when we change - * cpuset.mems_allowed and have tasks' nodemask updated, and - * then old_mems_allowed is updated to mems_allowed. - */ - nodemask_t old_mems_allowed; - - struct fmeter fmeter; /* memory_pressure filter */ - - /* - * Tasks are being attached to this cpuset. Used to prevent - * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). - */ - int attach_in_progress; - - /* partition number for rebuild_sched_domains() */ - int pn; - - /* for custom sched domain */ - int relax_domain_level; -}; - -static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct cpuset, css) : NULL; -} - -/* Retrieve the cpuset for a task */ -static inline struct cpuset *task_cs(struct task_struct *task) -{ - return css_cs(task_css(task, cpuset_cgrp_id)); -} - -static inline struct cpuset *parent_cs(struct cpuset *cs) -{ - return css_cs(cs->css.parent); -} - -#ifdef CONFIG_NUMA -static inline bool task_has_mempolicy(struct task_struct *task) -{ - return task->mempolicy; -} -#else -static inline bool task_has_mempolicy(struct task_struct *task) -{ - return false; -} -#endif - - -/* bits in struct cpuset flags field */ -typedef enum { - CS_ONLINE, - CS_CPU_EXCLUSIVE, - CS_MEM_EXCLUSIVE, - CS_MEM_HARDWALL, - CS_MEMORY_MIGRATE, - CS_SCHED_LOAD_BALANCE, - CS_SPREAD_PAGE, - CS_SPREAD_SLAB, -} cpuset_flagbits_t; - -/* convenient tests for these bits */ -static inline bool is_cpuset_online(const struct cpuset *cs) -{ - return test_bit(CS_ONLINE, &cs->flags); -} - -static inline int is_cpu_exclusive(const struct cpuset *cs) -{ - return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); -} - -static inline int is_mem_exclusive(const struct cpuset *cs) -{ - return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); -} - -static inline int is_mem_hardwall(const struct cpuset *cs) -{ - return test_bit(CS_MEM_HARDWALL, &cs->flags); -} - -static inline int is_sched_load_balance(const struct cpuset *cs) -{ - return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); -} - -static inline int is_memory_migrate(const struct cpuset *cs) -{ - return test_bit(CS_MEMORY_MIGRATE, &cs->flags); -} - -static inline int is_spread_page(const struct cpuset *cs) -{ - return test_bit(CS_SPREAD_PAGE, &cs->flags); -} - -static inline int is_spread_slab(const struct cpuset *cs) -{ - return test_bit(CS_SPREAD_SLAB, &cs->flags); -} - -static struct cpuset top_cpuset = { - .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | - (1 << CS_MEM_EXCLUSIVE)), -}; - -/** - * cpuset_for_each_child - traverse online children of a cpuset - * @child_cs: loop cursor pointing to the current child - * @pos_css: used for iteration - * @parent_cs: target cpuset to walk children of - * - * Walk @child_cs through 
the online children of @parent_cs. Must be used - * with RCU read locked. - */ -#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ - css_for_each_child((pos_css), &(parent_cs)->css) \ - if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) - -/** - * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants - * @des_cs: loop cursor pointing to the current descendant - * @pos_css: used for iteration - * @root_cs: target cpuset to walk ancestor of - * - * Walk @des_cs through the online descendants of @root_cs. Must be used - * with RCU read locked. The caller may modify @pos_css by calling - * css_rightmost_descendant() to skip subtree. @root_cs is included in the - * iteration and the first node to be visited. - */ -#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ - css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ - if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) - -/* - * There are two global locks guarding cpuset structures - cpuset_mutex and - * callback_lock. We also require taking task_lock() when dereferencing a - * task's cpuset pointer. See "The task_lock() exception", at the end of this - * comment. - * - * A task must hold both locks to modify cpusets. If a task holds - * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it - * is the only task able to also acquire callback_lock and be able to - * modify cpusets. It can perform various checks on the cpuset structure - * first, knowing nothing will change. It can also allocate memory while - * just holding cpuset_mutex. While it is performing these checks, various - * callback routines can briefly acquire callback_lock to query cpusets. - * Once it is ready to make the changes, it takes callback_lock, blocking - * everyone else. - * - * Calls to the kernel memory allocator can not be made while holding - * callback_lock, as that would risk double tripping on callback_lock - * from one of the callbacks into the cpuset code from within - * __alloc_pages(). - * - * If a task is only holding callback_lock, then it has read-only - * access to cpusets. - * - * Now, the task_struct fields mems_allowed and mempolicy may be changed - * by other task, we use alloc_lock in the task_struct fields to protect - * them. - * - * The cpuset_common_file_read() handlers only hold callback_lock across - * small pieces of code, such as when reading out possibly multi-word - * cpumasks and nodemasks. - * - * Accessing a task's cpuset should be done in accordance with the - * guidelines for accessing subsystem state in kernel/cgroup.c - */ - -static DEFINE_MUTEX(cpuset_mutex); -static DEFINE_SPINLOCK(callback_lock); - -static struct workqueue_struct *cpuset_migrate_mm_wq; - -/* - * CPU / memory hotplug is handled asynchronously. - */ -static void cpuset_hotplug_workfn(struct work_struct *work); -static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); - -static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); - -/* - * This is ugly, but preserves the userspace API for existing cpuset - * users. 
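The two-lock discipline documented above (cpuset_mutex for anyone modifying cpusets, callback_lock for short read-side queries) follows a familiar shape: writers hold the outer lock across the whole update and take the inner lock only around the final publication, while readers take the inner lock alone. A stand-alone pthread sketch of that shape (illustrative only; ordinary mutexes stand in for the kernel's mutex and spinlock):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t update_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t query_lock = PTHREAD_MUTEX_INITIALIZER;
static int shared_state;

static void writer(int v)
{
	pthread_mutex_lock(&update_lock);  /* excludes other writers */
	/* validation, allocation etc. can safely happen here */
	pthread_mutex_lock(&query_lock);   /* blocks readers only briefly */
	shared_state = v;
	pthread_mutex_unlock(&query_lock);
	pthread_mutex_unlock(&update_lock);
}

static int reader(void)
{
	int v;

	pthread_mutex_lock(&query_lock);   /* read-only access */
	v = shared_state;
	pthread_mutex_unlock(&query_lock);
	return v;
}

int main(void)
{
	writer(42);
	printf("%d\n", reader());
	return 0;
}

Keeping allocation outside the inner lock is the point of the split: as the comment above notes, the kernel cannot call the memory allocator while holding callback_lock.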
If someone tries to mount the "cpuset" filesystem, we - * silently switch it to mount "cgroup" instead - */ -static struct dentry *cpuset_mount(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, void *data) -{ - struct file_system_type *cgroup_fs = get_fs_type("cgroup"); - struct dentry *ret = ERR_PTR(-ENODEV); - if (cgroup_fs) { - char mountopts[] = - "cpuset,noprefix," - "release_agent=/sbin/cpuset_release_agent"; - ret = cgroup_fs->mount(cgroup_fs, flags, - unused_dev_name, mountopts); - put_filesystem(cgroup_fs); - } - return ret; -} - -static struct file_system_type cpuset_fs_type = { - .name = "cpuset", - .mount = cpuset_mount, -}; - -/* - * Return in pmask the portion of a cpusets's cpus_allowed that - * are online. If none are online, walk up the cpuset hierarchy - * until we find one that does have some online cpus. - * - * One way or another, we guarantee to return some non-empty subset - * of cpu_online_mask. - * - * Call with callback_lock or cpuset_mutex held. - */ -static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) -{ - while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) { - cs = parent_cs(cs); - if (unlikely(!cs)) { - /* - * The top cpuset doesn't have any online cpu as a - * consequence of a race between cpuset_hotplug_work - * and cpu hotplug notifier. But we know the top - * cpuset's effective_cpus is on its way to to be - * identical to cpu_online_mask. - */ - cpumask_copy(pmask, cpu_online_mask); - return; - } - } - cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); -} - -/* - * Return in *pmask the portion of a cpusets's mems_allowed that - * are online, with memory. If none are online with memory, walk - * up the cpuset hierarchy until we find one that does have some - * online mems. The top cpuset always has some mems online. - * - * One way or another, we guarantee to return some non-empty subset - * of node_states[N_MEMORY]. - * - * Call with callback_lock or cpuset_mutex held. - */ -static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) -{ - while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY])) - cs = parent_cs(cs); - nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]); -} - -/* - * update task's spread flag if cpuset's page/slab spread flag is set - * - * Call with callback_lock or cpuset_mutex held. - */ -static void cpuset_update_task_spread_flag(struct cpuset *cs, - struct task_struct *tsk) -{ - if (is_spread_page(cs)) - task_set_spread_page(tsk); - else - task_clear_spread_page(tsk); - - if (is_spread_slab(cs)) - task_set_spread_slab(tsk); - else - task_clear_spread_slab(tsk); -} - -/* - * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? - * - * One cpuset is a subset of another if all its allowed CPUs and - * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. Call holding cpuset_mutex. 
- */ - -static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) -{ - return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && - nodes_subset(p->mems_allowed, q->mems_allowed) && - is_cpu_exclusive(p) <= is_cpu_exclusive(q) && - is_mem_exclusive(p) <= is_mem_exclusive(q); -} - -/** - * alloc_trial_cpuset - allocate a trial cpuset - * @cs: the cpuset that the trial cpuset duplicates - */ -static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) -{ - struct cpuset *trial; - - trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL); - if (!trial) - return NULL; - - if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) - goto free_cs; - if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL)) - goto free_cpus; - - cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); - cpumask_copy(trial->effective_cpus, cs->effective_cpus); - return trial; - -free_cpus: - free_cpumask_var(trial->cpus_allowed); -free_cs: - kfree(trial); - return NULL; -} - -/** - * free_trial_cpuset - free the trial cpuset - * @trial: the trial cpuset to be freed - */ -static void free_trial_cpuset(struct cpuset *trial) -{ - free_cpumask_var(trial->effective_cpus); - free_cpumask_var(trial->cpus_allowed); - kfree(trial); -} - -/* - * validate_change() - Used to validate that any proposed cpuset change - * follows the structural rules for cpusets. - * - * If we replaced the flag and mask values of the current cpuset - * (cur) with those values in the trial cpuset (trial), would - * our various subset and exclusive rules still be valid? Presumes - * cpuset_mutex held. - * - * 'cur' is the address of an actual, in-use cpuset. Operations - * such as list traversal that depend on the actual address of the - * cpuset in the list must use cur below, not trial. - * - * 'trial' is the address of bulk structure copy of cur, with - * perhaps one or more of the fields cpus_allowed, mems_allowed, - * or flags changed to new, trial values. - * - * Return 0 if valid, -errno if not. - */ - -static int validate_change(struct cpuset *cur, struct cpuset *trial) -{ - struct cgroup_subsys_state *css; - struct cpuset *c, *par; - int ret; - - rcu_read_lock(); - - /* Each of our child cpusets must be a subset of us */ - ret = -EBUSY; - cpuset_for_each_child(c, css, cur) - if (!is_cpuset_subset(c, trial)) - goto out; - - /* Remaining checks don't apply to root cpuset */ - ret = 0; - if (cur == &top_cpuset) - goto out; - - par = parent_cs(cur); - - /* On legacy hiearchy, we must be a subset of our parent cpuset. */ - ret = -EACCES; - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - !is_cpuset_subset(trial, par)) - goto out; - - /* - * If either I or some sibling (!= me) is exclusive, we can't - * overlap - */ - ret = -EINVAL; - cpuset_for_each_child(c, css, par) { - if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && - c != cur && - cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) - goto out; - if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && - c != cur && - nodes_intersects(trial->mems_allowed, c->mems_allowed)) - goto out; - } - - /* - * Cpusets with tasks - existing or newly being attached - can't - * be changed to have empty cpus_allowed or mems_allowed. 
- */ - ret = -ENOSPC; - if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) { - if (!cpumask_empty(cur->cpus_allowed) && - cpumask_empty(trial->cpus_allowed)) - goto out; - if (!nodes_empty(cur->mems_allowed) && - nodes_empty(trial->mems_allowed)) - goto out; - } - - /* - * We can't shrink if we won't have enough room for SCHED_DEADLINE - * tasks. - */ - ret = -EBUSY; - if (is_cpu_exclusive(cur) && - !cpuset_cpumask_can_shrink(cur->cpus_allowed, - trial->cpus_allowed)) - goto out; - - ret = 0; -out: - rcu_read_unlock(); - return ret; -} - -#ifdef CONFIG_SMP -/* - * Helper routine for generate_sched_domains(). - * Do cpusets a, b have overlapping effective cpus_allowed masks? - */ -static int cpusets_overlap(struct cpuset *a, struct cpuset *b) -{ - return cpumask_intersects(a->effective_cpus, b->effective_cpus); -} - -static void -update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) -{ - if (dattr->relax_domain_level < c->relax_domain_level) - dattr->relax_domain_level = c->relax_domain_level; - return; -} - -static void update_domain_attr_tree(struct sched_domain_attr *dattr, - struct cpuset *root_cs) -{ - struct cpuset *cp; - struct cgroup_subsys_state *pos_css; - - rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { - /* skip the whole subtree if @cp doesn't have any CPU */ - if (cpumask_empty(cp->cpus_allowed)) { - pos_css = css_rightmost_descendant(pos_css); - continue; - } - - if (is_sched_load_balance(cp)) - update_domain_attr(dattr, cp); - } - rcu_read_unlock(); -} - -/* - * generate_sched_domains() - * - * This function builds a partial partition of the systems CPUs - * A 'partial partition' is a set of non-overlapping subsets whose - * union is a subset of that set. - * The output of this function needs to be passed to kernel/sched/core.c - * partition_sched_domains() routine, which will rebuild the scheduler's - * load balancing domains (sched domains) as specified by that partial - * partition. - * - * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt - * for a background explanation of this. - * - * Does not return errors, on the theory that the callers of this - * routine would rather not worry about failures to rebuild sched - * domains when operating in the severe memory shortage situations - * that could cause allocation failures below. - * - * Must be called with cpuset_mutex held. - * - * The three key local variables below are: - * q - a linked-list queue of cpuset pointers, used to implement a - * top-down scan of all cpusets. This scan loads a pointer - * to each cpuset marked is_sched_load_balance into the - * array 'csa'. For our purposes, rebuilding the schedulers - * sched domains, we can ignore !is_sched_load_balance cpusets. - * csa - (for CpuSet Array) Array of pointers to all the cpusets - * that need to be load balanced, for convenient iterative - * access by the subsequent code that finds the best partition, - * i.e the set of domains (subsets) of CPUs such that the - * cpus_allowed of every cpuset marked is_sched_load_balance - * is a subset of one of these domains, while there are as - * many such domains as possible, each as small as possible. - * doms - Conversion of 'csa' to an array of cpumasks, for passing to - * the kernel/sched/core.c routine partition_sched_domains() in a - * convenient format, that can be easily compared to the prior - * value to determine what partition elements (sched domains) - * were changed (added or removed.) 
- * - * Finding the best partition (set of domains): - * The triple nested loops below over i, j, k scan over the - * load balanced cpusets (using the array of cpuset pointers in - * csa[]) looking for pairs of cpusets that have overlapping - * cpus_allowed, but which don't have the same 'pn' partition - * number, and puts them in the same partition number. It keeps - * looping on the 'restart' label until it can no longer find - * any such pairs. - * - * The union of the cpus_allowed masks from the set of - * all cpusets having the same 'pn' value then forms the one - * element of the partition (one sched domain) to be passed to - * partition_sched_domains(). - */ -static int generate_sched_domains(cpumask_var_t **domains, - struct sched_domain_attr **attributes) -{ - struct cpuset *cp; /* scans q */ - struct cpuset **csa; /* array of all cpuset ptrs */ - int csn; /* how many cpuset ptrs in csa so far */ - int i, j, k; /* indices for partition finding loops */ - cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ - cpumask_var_t non_isolated_cpus; /* load balanced CPUs */ - struct sched_domain_attr *dattr; /* attributes for custom domains */ - int ndoms = 0; /* number of sched domains in result */ - int nslot; /* next empty doms[] struct cpumask slot */ - struct cgroup_subsys_state *pos_css; - - doms = NULL; - dattr = NULL; - csa = NULL; - - if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL)) - goto done; - cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); - - /* Special case for the 99% of systems with one, full, sched domain */ - if (is_sched_load_balance(&top_cpuset)) { - ndoms = 1; - doms = alloc_sched_domains(ndoms); - if (!doms) - goto done; - - dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); - if (dattr) { - *dattr = SD_ATTR_INIT; - update_domain_attr_tree(dattr, &top_cpuset); - } - cpumask_and(doms[0], top_cpuset.effective_cpus, - non_isolated_cpus); - - goto done; - } - - csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL); - if (!csa) - goto done; - csn = 0; - - rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { - if (cp == &top_cpuset) - continue; - /* - * Continue traversing beyond @cp iff @cp has some CPUs and - * isn't load balancing. The former is obvious. The - * latter: All child cpusets contain a subset of the - * parent's cpus, so just skip them, and then we call - * update_domain_attr_tree() to calc relax_domain_level of - * the corresponding sched domain. - */ - if (!cpumask_empty(cp->cpus_allowed) && - !(is_sched_load_balance(cp) && - cpumask_intersects(cp->cpus_allowed, non_isolated_cpus))) - continue; - - if (is_sched_load_balance(cp)) - csa[csn++] = cp; - - /* skip @cp's subtree */ - pos_css = css_rightmost_descendant(pos_css); - } - rcu_read_unlock(); - - for (i = 0; i < csn; i++) - csa[i]->pn = i; - ndoms = csn; - -restart: - /* Find the best partition (set of sched domains) */ - for (i = 0; i < csn; i++) { - struct cpuset *a = csa[i]; - int apn = a->pn; - - for (j = 0; j < csn; j++) { - struct cpuset *b = csa[j]; - int bpn = b->pn; - - if (apn != bpn && cpusets_overlap(a, b)) { - for (k = 0; k < csn; k++) { - struct cpuset *c = csa[k]; - - if (c->pn == bpn) - c->pn = apn; - } - ndoms--; /* one less element */ - goto restart; - } - } - } - - /* - * Now we know how many domains to create. - * Convert to and populate cpu masks.
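The 'pn' merging loop is easier to see on a toy input. A standalone sketch of the restart-until-stable pass described above, with plain bitmasks standing in for cpumasks (all names hypothetical):

#include <stdint.h>
#include <stdio.h>

#define N 4

int main(void)
{
        /* cpus_allowed of four load-balanced cpusets, as bitmasks */
        uint64_t cpus[N] = { 0x03, 0x06, 0x30, 0xc0 };
        int pn[N];
        int i, j, k, ndoms = N;

        for (i = 0; i < N; i++)
                pn[i] = i;

restart:
        for (i = 0; i < N; i++) {
                for (j = 0; j < N; j++) {
                        if (pn[i] != pn[j] && (cpus[i] & cpus[j])) {
                                /* merge j's partition into i's */
                                int bpn = pn[j];

                                for (k = 0; k < N; k++)
                                        if (pn[k] == bpn)
                                                pn[k] = pn[i];
                                ndoms--;        /* one less sched domain */
                                goto restart;
                        }
                }
        }

        printf("ndoms=%d\n", ndoms);    /* 3: cpusets 0 and 1 merge */
        for (i = 0; i < N; i++)
                printf("cpuset %d -> pn %d\n", i, pn[i]);
        return 0;
}

Each successful merge restarts the scan, so the loop terminates only once no two overlapping cpusets carry different partition numbers.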
- */ - doms = alloc_sched_domains(ndoms); - if (!doms) - goto done; - - /* - * The rest of the code, including the scheduler, can deal with - * dattr==NULL case. No need to abort if alloc fails. - */ - dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); - - for (nslot = 0, i = 0; i < csn; i++) { - struct cpuset *a = csa[i]; - struct cpumask *dp; - int apn = a->pn; - - if (apn < 0) { - /* Skip completed partitions */ - continue; - } - - dp = doms[nslot]; - - if (nslot == ndoms) { - static int warnings = 10; - if (warnings) { - pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n", - nslot, ndoms, csn, i, apn); - warnings--; - } - continue; - } - - cpumask_clear(dp); - if (dattr) - *(dattr + nslot) = SD_ATTR_INIT; - for (j = i; j < csn; j++) { - struct cpuset *b = csa[j]; - - if (apn == b->pn) { - cpumask_or(dp, dp, b->effective_cpus); - cpumask_and(dp, dp, non_isolated_cpus); - if (dattr) - update_domain_attr_tree(dattr + nslot, b); - - /* Done with this partition */ - b->pn = -1; - } - } - nslot++; - } - BUG_ON(nslot != ndoms); - -done: - free_cpumask_var(non_isolated_cpus); - kfree(csa); - - /* - * Fallback to the default domain if kmalloc() failed. - * See comments in partition_sched_domains(). - */ - if (doms == NULL) - ndoms = 1; - - *domains = doms; - *attributes = dattr; - return ndoms; -} - -/* - * Rebuild scheduler domains. - * - * If the flag 'sched_load_balance' of any cpuset with non-empty - * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset - * which has that flag enabled, or if any cpuset with a non-empty - * 'cpus' is removed, then call this routine to rebuild the - * scheduler's dynamic sched domains. - * - * Call with cpuset_mutex held. Takes get_online_cpus(). - */ -static void rebuild_sched_domains_locked(void) -{ - struct sched_domain_attr *attr; - cpumask_var_t *doms; - int ndoms; - - lockdep_assert_held(&cpuset_mutex); - get_online_cpus(); - - /* - * We have raced with CPU hotplug. Don't do anything to avoid - * passing doms with offlined cpu to partition_sched_domains(). - * Anyways, hotplug work item will rebuild sched domains. - */ - if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) - goto out; - - /* Generate domain masks and attrs */ - ndoms = generate_sched_domains(&doms, &attr); - - /* Have scheduler rebuild the domains */ - partition_sched_domains(ndoms, doms, attr); -out: - put_online_cpus(); -} -#else /* !CONFIG_SMP */ -static void rebuild_sched_domains_locked(void) -{ -} -#endif /* CONFIG_SMP */ - -void rebuild_sched_domains(void) -{ - mutex_lock(&cpuset_mutex); - rebuild_sched_domains_locked(); - mutex_unlock(&cpuset_mutex); -} - -/** - * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. - * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed - * - * Iterate through each task of @cs updating its cpus_allowed to the - * effective cpuset's. As this function is called with cpuset_mutex held, - * cpuset membership stays stable. 
- */ -static void update_tasks_cpumask(struct cpuset *cs) -{ - struct css_task_iter it; - struct task_struct *task; - - css_task_iter_start(&cs->css, &it); - while ((task = css_task_iter_next(&it))) - set_cpus_allowed_ptr(task, cs->effective_cpus); - css_task_iter_end(&it); -} - -/* - * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree - * @cs: the cpuset to consider - * @new_cpus: temp variable for calculating new effective_cpus - * - * When the configured cpumask is changed, the effective cpumasks of this - * cpuset and all its descendants need to be updated. - * - * On the legacy hierarchy, effective_cpus will be the same as cpus_allowed. - * - * Called with cpuset_mutex held - */ -static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) -{ - struct cpuset *cp; - struct cgroup_subsys_state *pos_css; - bool need_rebuild_sched_domains = false; - - rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_css, cs) { - struct cpuset *parent = parent_cs(cp); - - cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus); - - /* - * If it becomes empty, inherit the effective mask of the - * parent, which is guaranteed to have some CPUs. - */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - cpumask_empty(new_cpus)) - cpumask_copy(new_cpus, parent->effective_cpus); - - /* Skip the whole subtree if the cpumask remains the same. */ - if (cpumask_equal(new_cpus, cp->effective_cpus)) { - pos_css = css_rightmost_descendant(pos_css); - continue; - } - - if (!css_tryget_online(&cp->css)) - continue; - rcu_read_unlock(); - - spin_lock_irq(&callback_lock); - cpumask_copy(cp->effective_cpus, new_cpus); - spin_unlock_irq(&callback_lock); - - WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); - - update_tasks_cpumask(cp); - - /* - * If the effective cpumask of any non-empty cpuset is changed, - * we need to rebuild sched domains. - */ - if (!cpumask_empty(cp->cpus_allowed) && - is_sched_load_balance(cp)) - need_rebuild_sched_domains = true; - - rcu_read_lock(); - css_put(&cp->css); - } - rcu_read_unlock(); - - if (need_rebuild_sched_domains) - rebuild_sched_domains_locked(); -} - -/** - * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it - * @cs: the cpuset to consider - * @trialcs: trial cpuset - * @buf: buffer of cpu numbers written to this cpuset - */ -static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, - const char *buf) -{ - int retval; - - /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ - if (cs == &top_cpuset) - return -EACCES; - - /* - * An empty cpus_allowed is ok only if the cpuset has no tasks. - * Since cpulist_parse() fails on an empty mask, we special case - * that parsing. The validate_change() call ensures that cpusets - * with tasks have cpus.
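The per-level rule that update_cpumasks_hier() applies (intersect the configured mask with the parent's effective mask, falling back to the parent's mask on the default hierarchy when the result is empty) can be sketched in isolation. Toy types and hypothetical names, not kernel code:

#include <stdint.h>
#include <stdio.h>

/* Toy node: configured mask plus computed effective mask. */
struct toy_cs {
        uint64_t cpus_allowed;
        uint64_t effective;
        struct toy_cs *parent;
};

/* On the default hierarchy an empty intersection falls back to the
 * parent's effective mask, which is guaranteed non-empty. */
static void propagate(struct toy_cs *cs, int on_dfl)
{
        uint64_t eff = cs->cpus_allowed & cs->parent->effective;

        if (on_dfl && !eff)
                eff = cs->parent->effective;
        cs->effective = eff;
}

int main(void)
{
        struct toy_cs root = { .cpus_allowed = 0xf, .effective = 0xf };
        struct toy_cs child = { .cpus_allowed = 0x30, .parent = &root };

        propagate(&child, 1);
        printf("effective=%#llx\n", (unsigned long long)child.effective); /* 0xf */
        return 0;
}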
- */ - if (!*buf) { - cpumask_clear(trialcs->cpus_allowed); - } else { - retval = cpulist_parse(buf, trialcs->cpus_allowed); - if (retval < 0) - return retval; - - if (!cpumask_subset(trialcs->cpus_allowed, - top_cpuset.cpus_allowed)) - return -EINVAL; - } - - /* Nothing to do if the cpus didn't change */ - if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) - return 0; - - retval = validate_change(cs, trialcs); - if (retval < 0) - return retval; - - spin_lock_irq(&callback_lock); - cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); - spin_unlock_irq(&callback_lock); - - /* use trialcs->cpus_allowed as a temp variable */ - update_cpumasks_hier(cs, trialcs->cpus_allowed); - return 0; -} - -/* - * Migrate memory region from one set of nodes to another. This is - * performed asynchronously as it can be called from process migration path - * holding locks involved in process management. All mm migrations are - * performed in the queued order and can be waited for by flushing - * cpuset_migrate_mm_wq. - */ - -struct cpuset_migrate_mm_work { - struct work_struct work; - struct mm_struct *mm; - nodemask_t from; - nodemask_t to; -}; - -static void cpuset_migrate_mm_workfn(struct work_struct *work) -{ - struct cpuset_migrate_mm_work *mwork = - container_of(work, struct cpuset_migrate_mm_work, work); - - /* on a wq worker, no need to worry about %current's mems_allowed */ - do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL); - mmput(mwork->mm); - kfree(mwork); -} - -static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, - const nodemask_t *to) -{ - struct cpuset_migrate_mm_work *mwork; - - mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); - if (mwork) { - mwork->mm = mm; - mwork->from = *from; - mwork->to = *to; - INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn); - queue_work(cpuset_migrate_mm_wq, &mwork->work); - } else { - mmput(mm); - } -} - -static void cpuset_post_attach(void) -{ - flush_workqueue(cpuset_migrate_mm_wq); -} - -/* - * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy - * @tsk: the task to change - * @newmems: new nodes that the task will be set - * - * In order to avoid seeing no nodes if the old and new nodes are disjoint, - * we structure updates as setting all new allowed nodes, then clearing newly - * disallowed ones. - */ -static void cpuset_change_task_nodemask(struct task_struct *tsk, - nodemask_t *newmems) -{ - bool need_loop; - - task_lock(tsk); - /* - * Determine if a loop is necessary if another thread is doing - * read_mems_allowed_begin(). If at least one node remains unchanged and - * tsk does not have a mempolicy, then an empty nodemask will not be - * possible when mems_allowed is larger than a word. - */ - need_loop = task_has_mempolicy(tsk) || - !nodes_intersects(*newmems, tsk->mems_allowed); - - if (need_loop) { - local_irq_disable(); - write_seqcount_begin(&tsk->mems_allowed_seq); - } - - nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); - - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); - tsk->mems_allowed = *newmems; - - if (need_loop) { - write_seqcount_end(&tsk->mems_allowed_seq); - local_irq_enable(); - } - - task_unlock(tsk); -} - -static void *cpuset_being_rebound; - -/** - * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. - * @cs: the cpuset in which each task's mems_allowed mask needs to be changed - * - * Iterate through each task of @cs updating its mems_allowed to the - * effective cpuset's. 
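The ordering in cpuset_change_task_nodemask() above is worth spelling out. A minimal sketch of the two-step update on plain words (hypothetical, not kernel code) shows why a concurrent reader never observes an empty mask even when the old and new node sets are disjoint:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t allowed = 0x3; /* old nodes {0,1} */
        uint64_t newmems = 0xc; /* new nodes {2,3}, disjoint from old */

        /* Step 1: OR in the new nodes. A concurrent reader sees a
         * superset of both masks, never an empty mask. */
        allowed |= newmems;
        printf("intermediate=%#llx\n", (unsigned long long)allowed); /* 0xf */

        /* Step 2: drop the no-longer-allowed nodes. */
        allowed = newmems;
        printf("final=%#llx\n", (unsigned long long)allowed);        /* 0xc */
        return 0;
}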
As this function is called with cpuset_mutex held, - * cpuset membership stays stable. - */ -static void update_tasks_nodemask(struct cpuset *cs) -{ - static nodemask_t newmems; /* protected by cpuset_mutex */ - struct css_task_iter it; - struct task_struct *task; - - cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ - - guarantee_online_mems(cs, &newmems); - - /* - * The mpol_rebind_mm() call takes mmap_sem, which we couldn't - * take while holding tasklist_lock. Forks can happen - the - * mpol_dup() cpuset_being_rebound check will catch such forks, - * and rebind their vma mempolicies too. Because we still hold - * the global cpuset_mutex, we know that no other rebind effort - * will be contending for the global variable cpuset_being_rebound. - * It's ok if we rebind the same mm twice; mpol_rebind_mm() - * is idempotent. Also migrate pages in each mm to new nodes. - */ - css_task_iter_start(&cs->css, &it); - while ((task = css_task_iter_next(&it))) { - struct mm_struct *mm; - bool migrate; - - cpuset_change_task_nodemask(task, &newmems); - - mm = get_task_mm(task); - if (!mm) - continue; - - migrate = is_memory_migrate(cs); - - mpol_rebind_mm(mm, &cs->mems_allowed); - if (migrate) - cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); - else - mmput(mm); - } - css_task_iter_end(&it); - - /* - * All the tasks' nodemasks have been updated, update - * cs->old_mems_allowed. - */ - cs->old_mems_allowed = newmems; - - /* We're done rebinding vmas to this cpuset's new mems_allowed. */ - cpuset_being_rebound = NULL; -} - -/* - * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree - * @cs: the cpuset to consider - * @new_mems: a temp variable for calculating new effective_mems - * - * When the configured nodemask is changed, the effective nodemasks of this - * cpuset and all its descendants need to be updated. - * - * On the legacy hierarchy, effective_mems will be the same as mems_allowed. - * - * Called with cpuset_mutex held - */ -static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) -{ - struct cpuset *cp; - struct cgroup_subsys_state *pos_css; - - rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_css, cs) { - struct cpuset *parent = parent_cs(cp); - - nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); - - /* - * If it becomes empty, inherit the effective mask of the - * parent, which is guaranteed to have some MEMs. - */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - nodes_empty(*new_mems)) - *new_mems = parent->effective_mems; - - /* Skip the whole subtree if the nodemask remains the same. */ - if (nodes_equal(*new_mems, cp->effective_mems)) { - pos_css = css_rightmost_descendant(pos_css); - continue; - } - - if (!css_tryget_online(&cp->css)) - continue; - rcu_read_unlock(); - - spin_lock_irq(&callback_lock); - cp->effective_mems = *new_mems; - spin_unlock_irq(&callback_lock); - - WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - !nodes_equal(cp->mems_allowed, cp->effective_mems)); - - update_tasks_nodemask(cp); - - rcu_read_lock(); - css_put(&cp->css); - } - rcu_read_unlock(); -} - -/* - * Handle user request to change the 'mems' memory placement - * of a cpuset. Needs to validate the request, update the - * cpuset's mems_allowed, and for each task in the cpuset, - * update mems_allowed and rebind task's mempolicy and any vma - * mempolicies, and, if the cpuset is marked 'memory_migrate', - * migrate the task's pages to the new memory. - * - * Call with cpuset_mutex held. May take callback_lock during call.
- * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, - * lock each such task's mm->mmap_sem, scan its vmas and rebind - * their mempolicies to the cpuset's new mems_allowed. - */ -static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, - const char *buf) -{ - int retval; - - /* - * top_cpuset.mems_allowed tracks node_states[N_MEMORY]; - * it's read-only - */ - if (cs == &top_cpuset) { - retval = -EACCES; - goto done; - } - - /* - * An empty mems_allowed is ok iff there are no tasks in the cpuset. - * Since nodelist_parse() fails on an empty mask, we special case - * that parsing. The validate_change() call ensures that cpusets - * with tasks have memory. - */ - if (!*buf) { - nodes_clear(trialcs->mems_allowed); - } else { - retval = nodelist_parse(buf, trialcs->mems_allowed); - if (retval < 0) - goto done; - - if (!nodes_subset(trialcs->mems_allowed, - top_cpuset.mems_allowed)) { - retval = -EINVAL; - goto done; - } - } - - if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) { - retval = 0; /* Too easy - nothing to do */ - goto done; - } - retval = validate_change(cs, trialcs); - if (retval < 0) - goto done; - - spin_lock_irq(&callback_lock); - cs->mems_allowed = trialcs->mems_allowed; - spin_unlock_irq(&callback_lock); - - /* use trialcs->mems_allowed as a temp variable */ - update_nodemasks_hier(cs, &trialcs->mems_allowed); -done: - return retval; -} - -int current_cpuset_is_being_rebound(void) -{ - int ret; - - rcu_read_lock(); - ret = task_cs(current) == cpuset_being_rebound; - rcu_read_unlock(); - - return ret; -} - -static int update_relax_domain_level(struct cpuset *cs, s64 val) -{ -#ifdef CONFIG_SMP - if (val < -1 || val >= sched_domain_level_max) - return -EINVAL; -#endif - - if (val != cs->relax_domain_level) { - cs->relax_domain_level = val; - if (!cpumask_empty(cs->cpus_allowed) && - is_sched_load_balance(cs)) - rebuild_sched_domains_locked(); - } - - return 0; -} - -/** - * update_tasks_flags - update the spread flags of tasks in the cpuset. - * @cs: the cpuset in which each task's spread flags needs to be changed - * - * Iterate through each task of @cs updating its spread flags. As this - * function is called with cpuset_mutex held, cpuset membership stays - * stable. - */ -static void update_tasks_flags(struct cpuset *cs) -{ - struct css_task_iter it; - struct task_struct *task; - - css_task_iter_start(&cs->css, &it); - while ((task = css_task_iter_next(&it))) - cpuset_update_task_spread_flag(cs, task); - css_task_iter_end(&it); -} - -/* - * update_flag - read a 0 or a 1 in a file and update associated flag - * bit: the bit to update (see cpuset_flagbits_t) - * cs: the cpuset to update - * turning_on: whether the flag is being set or cleared - * - * Call with cpuset_mutex held.
- */ - -static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, - int turning_on) -{ - struct cpuset *trialcs; - int balance_flag_changed; - int spread_flag_changed; - int err; - - trialcs = alloc_trial_cpuset(cs); - if (!trialcs) - return -ENOMEM; - - if (turning_on) - set_bit(bit, &trialcs->flags); - else - clear_bit(bit, &trialcs->flags); - - err = validate_change(cs, trialcs); - if (err < 0) - goto out; - - balance_flag_changed = (is_sched_load_balance(cs) != - is_sched_load_balance(trialcs)); - - spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) - || (is_spread_page(cs) != is_spread_page(trialcs))); - - spin_lock_irq(&callback_lock); - cs->flags = trialcs->flags; - spin_unlock_irq(&callback_lock); - - if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) - rebuild_sched_domains_locked(); - - if (spread_flag_changed) - update_tasks_flags(cs); -out: - free_trial_cpuset(trialcs); - return err; -} - -/* - * Frequency meter - How fast is some event occurring? - * - * These routines manage a digitally filtered, constant time based, - * event frequency meter. There are four routines: - * fmeter_init() - initialize a frequency meter. - * fmeter_markevent() - called each time the event happens. - * fmeter_getrate() - returns the recent rate of such events. - * fmeter_update() - internal routine used to update fmeter. - * - * A common data structure is passed to each of these routines, - * which is used to keep track of the state required to manage the - * frequency meter and its digital filter. - * - * The filter works on the number of events marked per unit time. - * The filter is single-pole low-pass recursive (IIR). The time unit - * is 1 second. Arithmetic is done using 32-bit integers scaled to - * simulate 3 decimal digits of precision (multiplied by 1000). - * - * With an FM_COEF of 933, and a time base of 1 second, the filter - * has a half-life of 10 seconds, meaning that if the events quit - * happening, then the rate returned from the fmeter_getrate() - * will be cut in half each 10 seconds, until it converges to zero. - * - * It is not worth doing a real infinitely recursive filter. If more - * than FM_MAXTICKS ticks have elapsed since the last filter event, - * just compute FM_MAXTICKS ticks worth, by which point the level - * will be stable. - * - * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid - * arithmetic overflow in the fmeter_update() routine. - * - * Given the simple 32 bit integer arithmetic used, this meter works - * best for reporting rates between one per millisecond (msec) and - * one per 32 (approx) seconds. At constant rates faster than one - * per msec it maxes out at values just under 1,000,000. At constant - * rates between one per msec, and one per second it will stabilize - * to a value N*1000, where N is the rate of events per second. - * At constant rates between one per second and one per 32 seconds, - * it will be choppy, moving up on the seconds that have an event, - * and then decaying until the next event. At rates slower than - * about one in 32 seconds, it decays all the way back to zero between - * each event. 
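The quoted ten-second half-life follows directly from the constants: 0.933 raised to the 10th power is about 0.50. A small standalone program replaying the per-second decay step used by fmeter_update() (FM_COEF and FM_SCALE as defined below; the rest is an illustrative sketch):

#include <stdio.h>

#define FM_COEF  933    /* per-second decay factor, scaled by 1000 */
#define FM_SCALE 1000

int main(void)
{
        int val = 1000000;      /* filter level just after a burst of events */
        int t;

        /* One decay step per elapsed second: val *= 0.933 */
        for (t = 1; t <= 10; t++) {
                val = (FM_COEF * val) / FM_SCALE;
                printf("t=%2ds val=%d\n", t, val);
        }
        /* After ten steps val is roughly 500000, i.e. half the starting
         * level: the ten-second half-life described above. */
        return 0;
}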
- */ - -#define FM_COEF 933 /* coefficient for half-life of 10 secs */ -#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ -#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ -#define FM_SCALE 1000 /* faux fixed point scale */ - -/* Initialize a frequency meter */ -static void fmeter_init(struct fmeter *fmp) -{ - fmp->cnt = 0; - fmp->val = 0; - fmp->time = 0; - spin_lock_init(&fmp->lock); -} - -/* Internal meter update - process cnt events and update value */ -static void fmeter_update(struct fmeter *fmp) -{ - time64_t now; - u32 ticks; - - now = ktime_get_seconds(); - ticks = now - fmp->time; - - if (ticks == 0) - return; - - ticks = min(FM_MAXTICKS, ticks); - while (ticks-- > 0) - fmp->val = (FM_COEF * fmp->val) / FM_SCALE; - fmp->time = now; - - fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; - fmp->cnt = 0; -} - -/* Process any previous ticks, then bump cnt by one (times scale). */ -static void fmeter_markevent(struct fmeter *fmp) -{ - spin_lock(&fmp->lock); - fmeter_update(fmp); - fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); - spin_unlock(&fmp->lock); -} - -/* Process any previous ticks, then return current value. */ -static int fmeter_getrate(struct fmeter *fmp) -{ - int val; - - spin_lock(&fmp->lock); - fmeter_update(fmp); - val = fmp->val; - spin_unlock(&fmp->lock); - return val; -} - -static struct cpuset *cpuset_attach_old_cs; - -/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ -static int cpuset_can_attach(struct cgroup_taskset *tset) -{ - struct cgroup_subsys_state *css; - struct cpuset *cs; - struct task_struct *task; - int ret; - - /* used later by cpuset_attach() */ - cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); - cs = css_cs(css); - - mutex_lock(&cpuset_mutex); - - /* allow moving tasks into an empty cpuset if on default hierarchy */ - ret = -ENOSPC; - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) - goto out_unlock; - - cgroup_taskset_for_each(task, css, tset) { - ret = task_can_attach(task, cs->cpus_allowed); - if (ret) - goto out_unlock; - ret = security_task_setscheduler(task); - if (ret) - goto out_unlock; - } - - /* - * Mark attach is in progress. This makes validate_change() fail - * changes which zero cpus/mems_allowed. - */ - cs->attach_in_progress++; - ret = 0; -out_unlock: - mutex_unlock(&cpuset_mutex); - return ret; -} - -static void cpuset_cancel_attach(struct cgroup_taskset *tset) -{ - struct cgroup_subsys_state *css; - struct cpuset *cs; - - cgroup_taskset_first(tset, &css); - cs = css_cs(css); - - mutex_lock(&cpuset_mutex); - css_cs(css)->attach_in_progress--; - mutex_unlock(&cpuset_mutex); -} - -/* - * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach() - * but we can't allocate it dynamically there. Define it global and - * allocate from cpuset_init(). 
- */ -static cpumask_var_t cpus_attach; - -static void cpuset_attach(struct cgroup_taskset *tset) -{ - /* static buf protected by cpuset_mutex */ - static nodemask_t cpuset_attach_nodemask_to; - struct task_struct *task; - struct task_struct *leader; - struct cgroup_subsys_state *css; - struct cpuset *cs; - struct cpuset *oldcs = cpuset_attach_old_cs; - - cgroup_taskset_first(tset, &css); - cs = css_cs(css); - - mutex_lock(&cpuset_mutex); - - /* prepare for attach */ - if (cs == &top_cpuset) - cpumask_copy(cpus_attach, cpu_possible_mask); - else - guarantee_online_cpus(cs, cpus_attach); - - guarantee_online_mems(cs, &cpuset_attach_nodemask_to); - - cgroup_taskset_for_each(task, css, tset) { - /* - * can_attach beforehand should guarantee that this doesn't - * fail. TODO: have a better way to handle failure here - */ - WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); - - cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); - cpuset_update_task_spread_flag(cs, task); - } - - /* - * Change mm for all threadgroup leaders. This is expensive and may - * sleep and should be moved outside the migration path proper. - */ - cpuset_attach_nodemask_to = cs->effective_mems; - cgroup_taskset_for_each_leader(leader, css, tset) { - struct mm_struct *mm = get_task_mm(leader); - - if (mm) { - mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); - - /* - * old_mems_allowed is the same as mems_allowed - * here, except if this task is being moved - * automatically due to hotplug. In that case - * @mems_allowed has been updated and is empty, so - * @old_mems_allowed is the right nodeset that we - * migrate the mm from. - */ - if (is_memory_migrate(cs)) - cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, - &cpuset_attach_nodemask_to); - else - mmput(mm); - } - } - - cs->old_mems_allowed = cpuset_attach_nodemask_to; - - cs->attach_in_progress--; - if (!cs->attach_in_progress) - wake_up(&cpuset_attach_wq); - - mutex_unlock(&cpuset_mutex); -} - -/* The various types of files and directories in a cpuset file system */ - -typedef enum { - FILE_MEMORY_MIGRATE, - FILE_CPULIST, - FILE_MEMLIST, - FILE_EFFECTIVE_CPULIST, - FILE_EFFECTIVE_MEMLIST, - FILE_CPU_EXCLUSIVE, - FILE_MEM_EXCLUSIVE, - FILE_MEM_HARDWALL, - FILE_SCHED_LOAD_BALANCE, - FILE_SCHED_RELAX_DOMAIN_LEVEL, - FILE_MEMORY_PRESSURE_ENABLED, - FILE_MEMORY_PRESSURE, - FILE_SPREAD_PAGE, - FILE_SPREAD_SLAB, -} cpuset_filetype_t; - -static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, - u64 val) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - int retval = 0; - - mutex_lock(&cpuset_mutex); - if (!is_cpuset_online(cs)) { - retval = -ENODEV; - goto out_unlock; - } - - switch (type) { - case FILE_CPU_EXCLUSIVE: - retval = update_flag(CS_CPU_EXCLUSIVE, cs, val); - break; - case FILE_MEM_EXCLUSIVE: - retval = update_flag(CS_MEM_EXCLUSIVE, cs, val); - break; - case FILE_MEM_HARDWALL: - retval = update_flag(CS_MEM_HARDWALL, cs, val); - break; - case FILE_SCHED_LOAD_BALANCE: - retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val); - break; - case FILE_MEMORY_MIGRATE: - retval = update_flag(CS_MEMORY_MIGRATE, cs, val); - break; - case FILE_MEMORY_PRESSURE_ENABLED: - cpuset_memory_pressure_enabled = !!val; - break; - case FILE_SPREAD_PAGE: - retval = update_flag(CS_SPREAD_PAGE, cs, val); - break; - case FILE_SPREAD_SLAB: - retval = update_flag(CS_SPREAD_SLAB, cs, val); - break; - default: - retval = -EINVAL; - break; - } -out_unlock: - mutex_unlock(&cpuset_mutex); - return retval; -} - -static int
cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, - s64 val) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - int retval = -ENODEV; - - mutex_lock(&cpuset_mutex); - if (!is_cpuset_online(cs)) - goto out_unlock; - - switch (type) { - case FILE_SCHED_RELAX_DOMAIN_LEVEL: - retval = update_relax_domain_level(cs, val); - break; - default: - retval = -EINVAL; - break; - } -out_unlock: - mutex_unlock(&cpuset_mutex); - return retval; -} - -/* - * Common handling for a write to a "cpus" or "mems" file. - */ -static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct cpuset *cs = css_cs(of_css(of)); - struct cpuset *trialcs; - int retval = -ENODEV; - - buf = strstrip(buf); - - /* - * CPU or memory hotunplug may leave @cs w/o any execution - * resources, in which case the hotplug code asynchronously updates - * configuration and transfers all tasks to the nearest ancestor - * which can execute. - * - * As writes to "cpus" or "mems" may restore @cs's execution - * resources, wait for the previously scheduled operations before - * proceeding, so that we don't end up repeatedly removing tasks - * added after execution capability is restored. - * - * cpuset_hotplug_work calls back into cgroup core via - * cgroup_transfer_tasks() and waiting for it from a cgroupfs - * operation like this one can lead to a deadlock through kernfs - * active_ref protection. Let's break the protection. Losing the - * protection is okay as we check whether @cs is online after - * grabbing cpuset_mutex anyway. This only happens on the legacy - * hierarchies. - */ - css_get(&cs->css); - kernfs_break_active_protection(of->kn); - flush_work(&cpuset_hotplug_work); - - mutex_lock(&cpuset_mutex); - if (!is_cpuset_online(cs)) - goto out_unlock; - - trialcs = alloc_trial_cpuset(cs); - if (!trialcs) { - retval = -ENOMEM; - goto out_unlock; - } - - switch (of_cft(of)->private) { - case FILE_CPULIST: - retval = update_cpumask(cs, trialcs, buf); - break; - case FILE_MEMLIST: - retval = update_nodemask(cs, trialcs, buf); - break; - default: - retval = -EINVAL; - break; - } - - free_trial_cpuset(trialcs); -out_unlock: - mutex_unlock(&cpuset_mutex); - kernfs_unbreak_active_protection(of->kn); - css_put(&cs->css); - flush_workqueue(cpuset_migrate_mm_wq); - return retval ?: nbytes; -} - -/* - * These ASCII lists should be read in a single call, by using a user - * buffer large enough to hold the entire map. If read in smaller - * chunks, there is no guarantee of atomicity. Since the display format - * used, list of ranges of sequential numbers, is variable length, - * and since these maps can change value dynamically, one could read - * gibberish by doing partial reads while a list was changing.
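From userspace that means sizing the buffer for the whole map and issuing one read(2). A minimal sketch, assuming a v1 cpuset hierarchy mounted at /sys/fs/cgroup/cpuset (the mount point varies by system):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];         /* large enough for the whole list */
        ssize_t n;
        int fd = open("/sys/fs/cgroup/cpuset/cpuset.cpus", O_RDONLY);

        if (fd < 0)
                return 1;
        /* One read() of the full map; chunked reads could interleave
         * with a concurrent update and return a torn list. */
        n = read(fd, buf, sizeof(buf) - 1);
        if (n > 0) {
                buf[n] = '\0';
                fputs(buf, stdout);
        }
        close(fd);
        return 0;
}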
- */ -static int cpuset_common_seq_show(struct seq_file *sf, void *v) -{ - struct cpuset *cs = css_cs(seq_css(sf)); - cpuset_filetype_t type = seq_cft(sf)->private; - int ret = 0; - - spin_lock_irq(&callback_lock); - - switch (type) { - case FILE_CPULIST: - seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed)); - break; - case FILE_MEMLIST: - seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed)); - break; - case FILE_EFFECTIVE_CPULIST: - seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus)); - break; - case FILE_EFFECTIVE_MEMLIST: - seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems)); - break; - default: - ret = -EINVAL; - } - - spin_unlock_irq(&callback_lock); - return ret; -} - -static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - switch (type) { - case FILE_CPU_EXCLUSIVE: - return is_cpu_exclusive(cs); - case FILE_MEM_EXCLUSIVE: - return is_mem_exclusive(cs); - case FILE_MEM_HARDWALL: - return is_mem_hardwall(cs); - case FILE_SCHED_LOAD_BALANCE: - return is_sched_load_balance(cs); - case FILE_MEMORY_MIGRATE: - return is_memory_migrate(cs); - case FILE_MEMORY_PRESSURE_ENABLED: - return cpuset_memory_pressure_enabled; - case FILE_MEMORY_PRESSURE: - return fmeter_getrate(&cs->fmeter); - case FILE_SPREAD_PAGE: - return is_spread_page(cs); - case FILE_SPREAD_SLAB: - return is_spread_slab(cs); - default: - BUG(); - } - - /* Unreachable but makes gcc happy */ - return 0; -} - -static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - switch (type) { - case FILE_SCHED_RELAX_DOMAIN_LEVEL: - return cs->relax_domain_level; - default: - BUG(); - } - - /* Unreachable but makes gcc happy */ - return 0; -} - - -/* - * for the common functions, 'private' gives the type of file - */ - -static struct cftype files[] = { - { - .name = "cpus", - .seq_show = cpuset_common_seq_show, - .write = cpuset_write_resmask, - .max_write_len = (100U + 6 * NR_CPUS), - .private = FILE_CPULIST, - }, - - { - .name = "mems", - .seq_show = cpuset_common_seq_show, - .write = cpuset_write_resmask, - .max_write_len = (100U + 6 * MAX_NUMNODES), - .private = FILE_MEMLIST, - }, - - { - .name = "effective_cpus", - .seq_show = cpuset_common_seq_show, - .private = FILE_EFFECTIVE_CPULIST, - }, - - { - .name = "effective_mems", - .seq_show = cpuset_common_seq_show, - .private = FILE_EFFECTIVE_MEMLIST, - }, - - { - .name = "cpu_exclusive", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_CPU_EXCLUSIVE, - }, - - { - .name = "mem_exclusive", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEM_EXCLUSIVE, - }, - - { - .name = "mem_hardwall", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEM_HARDWALL, - }, - - { - .name = "sched_load_balance", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SCHED_LOAD_BALANCE, - }, - - { - .name = "sched_relax_domain_level", - .read_s64 = cpuset_read_s64, - .write_s64 = cpuset_write_s64, - .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, - }, - - { - .name = "memory_migrate", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_MIGRATE, - }, - - { - .name = "memory_pressure", - .read_u64 = cpuset_read_u64, - }, - - { - .name = "memory_spread_page", - .read_u64 = cpuset_read_u64, - .write_u64 =
cpuset_write_u64, - .private = FILE_SPREAD_PAGE, - }, - - { - .name = "memory_spread_slab", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SPREAD_SLAB, - }, - - { - .name = "memory_pressure_enabled", - .flags = CFTYPE_ONLY_ON_ROOT, - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_PRESSURE_ENABLED, - }, - - { } /* terminate */ -}; - -/* - * cpuset_css_alloc - allocate a cpuset css - * cgrp: control group that the new cpuset will be part of - */ - -static struct cgroup_subsys_state * -cpuset_css_alloc(struct cgroup_subsys_state *parent_css) -{ - struct cpuset *cs; - - if (!parent_css) - return &top_cpuset.css; - - cs = kzalloc(sizeof(*cs), GFP_KERNEL); - if (!cs) - return ERR_PTR(-ENOMEM); - if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) - goto free_cs; - if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL)) - goto free_cpus; - - set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - cpumask_clear(cs->cpus_allowed); - nodes_clear(cs->mems_allowed); - cpumask_clear(cs->effective_cpus); - nodes_clear(cs->effective_mems); - fmeter_init(&cs->fmeter); - cs->relax_domain_level = -1; - - return &cs->css; - -free_cpus: - free_cpumask_var(cs->cpus_allowed); -free_cs: - kfree(cs); - return ERR_PTR(-ENOMEM); -} - -static int cpuset_css_online(struct cgroup_subsys_state *css) -{ - struct cpuset *cs = css_cs(css); - struct cpuset *parent = parent_cs(cs); - struct cpuset *tmp_cs; - struct cgroup_subsys_state *pos_css; - - if (!parent) - return 0; - - mutex_lock(&cpuset_mutex); - - set_bit(CS_ONLINE, &cs->flags); - if (is_spread_page(parent)) - set_bit(CS_SPREAD_PAGE, &cs->flags); - if (is_spread_slab(parent)) - set_bit(CS_SPREAD_SLAB, &cs->flags); - - cpuset_inc(); - - spin_lock_irq(&callback_lock); - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { - cpumask_copy(cs->effective_cpus, parent->effective_cpus); - cs->effective_mems = parent->effective_mems; - } - spin_unlock_irq(&callback_lock); - - if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) - goto out_unlock; - - /* - * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is - * set. This flag handling is implemented in cgroup core for - * historical reasons - the flag may be specified during mount. - * - * Currently, if any sibling cpusets have exclusive cpus or mem, we - * refuse to clone the configuration - thereby refusing the task to - * be entered, and as a result refusing the sys_unshare() or - * clone() which initiated it. If this becomes a problem for some - * users who wish to allow that scenario, then this could be - * changed to grant parent->cpus_allowed-sibling_cpus_exclusive - * (and likewise for mems) to the new cgroup. - */ - rcu_read_lock(); - cpuset_for_each_child(tmp_cs, pos_css, parent) { - if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { - rcu_read_unlock(); - goto out_unlock; - } - } - rcu_read_unlock(); - - spin_lock_irq(&callback_lock); - cs->mems_allowed = parent->mems_allowed; - cs->effective_mems = parent->mems_allowed; - cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); - cpumask_copy(cs->effective_cpus, parent->cpus_allowed); - spin_unlock_irq(&callback_lock); -out_unlock: - mutex_unlock(&cpuset_mutex); - return 0; -} - -/* - * If the cpuset being removed has its flag 'sched_load_balance' - * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains_locked().
- */ - -static void cpuset_css_offline(struct cgroup_subsys_state *css) -{ - struct cpuset *cs = css_cs(css); - - mutex_lock(&cpuset_mutex); - - if (is_sched_load_balance(cs)) - update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); - - cpuset_dec(); - clear_bit(CS_ONLINE, &cs->flags); - - mutex_unlock(&cpuset_mutex); -} - -static void cpuset_css_free(struct cgroup_subsys_state *css) -{ - struct cpuset *cs = css_cs(css); - - free_cpumask_var(cs->effective_cpus); - free_cpumask_var(cs->cpus_allowed); - kfree(cs); -} - -static void cpuset_bind(struct cgroup_subsys_state *root_css) -{ - mutex_lock(&cpuset_mutex); - spin_lock_irq(&callback_lock); - - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { - cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); - top_cpuset.mems_allowed = node_possible_map; - } else { - cpumask_copy(top_cpuset.cpus_allowed, - top_cpuset.effective_cpus); - top_cpuset.mems_allowed = top_cpuset.effective_mems; - } - - spin_unlock_irq(&callback_lock); - mutex_unlock(&cpuset_mutex); -} - -/* - * Make sure the new task conform to the current state of its parent, - * which could have been changed by cpuset just after it inherits the - * state from the parent and before it sits on the cgroup's task list. - */ -static void cpuset_fork(struct task_struct *task) -{ - if (task_css_is_root(task, cpuset_cgrp_id)) - return; - - set_cpus_allowed_ptr(task, ¤t->cpus_allowed); - task->mems_allowed = current->mems_allowed; -} - -struct cgroup_subsys cpuset_cgrp_subsys = { - .css_alloc = cpuset_css_alloc, - .css_online = cpuset_css_online, - .css_offline = cpuset_css_offline, - .css_free = cpuset_css_free, - .can_attach = cpuset_can_attach, - .cancel_attach = cpuset_cancel_attach, - .attach = cpuset_attach, - .post_attach = cpuset_post_attach, - .bind = cpuset_bind, - .fork = cpuset_fork, - .legacy_cftypes = files, - .early_init = true, -}; - -/** - * cpuset_init - initialize cpusets at system boot - * - * Description: Initialize top_cpuset and the cpuset internal file system, - **/ - -int __init cpuset_init(void) -{ - int err = 0; - - if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) - BUG(); - if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)) - BUG(); - - cpumask_setall(top_cpuset.cpus_allowed); - nodes_setall(top_cpuset.mems_allowed); - cpumask_setall(top_cpuset.effective_cpus); - nodes_setall(top_cpuset.effective_mems); - - fmeter_init(&top_cpuset.fmeter); - set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); - top_cpuset.relax_domain_level = -1; - - err = register_filesystem(&cpuset_fs_type); - if (err < 0) - return err; - - if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) - BUG(); - - return 0; -} - -/* - * If CPU and/or memory hotplug handlers, below, unplug any CPUs - * or memory nodes, we need to walk over the cpuset hierarchy, - * removing that CPU or node from all cpusets. If this removes the - * last CPU or node from a cpuset, then move the tasks in the empty - * cpuset to its next-highest non-empty parent. - */ -static void remove_tasks_in_empty_cpuset(struct cpuset *cs) -{ - struct cpuset *parent; - - /* - * Find its next-highest non-empty parent, (top cpuset - * has online cpus, so can't be empty). 
- */ - parent = parent_cs(cs); - while (cpumask_empty(parent->cpus_allowed) || - nodes_empty(parent->mems_allowed)) - parent = parent_cs(parent); - - if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { - pr_err("cpuset: failed to transfer tasks out of empty cpuset "); - pr_cont_cgroup_name(cs->css.cgroup); - pr_cont("\n"); - } -} - -static void -hotplug_update_tasks_legacy(struct cpuset *cs, - struct cpumask *new_cpus, nodemask_t *new_mems, - bool cpus_updated, bool mems_updated) -{ - bool is_empty; - - spin_lock_irq(&callback_lock); - cpumask_copy(cs->cpus_allowed, new_cpus); - cpumask_copy(cs->effective_cpus, new_cpus); - cs->mems_allowed = *new_mems; - cs->effective_mems = *new_mems; - spin_unlock_irq(&callback_lock); - - /* - * Don't call update_tasks_cpumask() if the cpuset becomes empty, - * as the tasks will be migrated to an ancestor. - */ - if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) - update_tasks_cpumask(cs); - if (mems_updated && !nodes_empty(cs->mems_allowed)) - update_tasks_nodemask(cs); - - is_empty = cpumask_empty(cs->cpus_allowed) || - nodes_empty(cs->mems_allowed); - - mutex_unlock(&cpuset_mutex); - - /* - * Move tasks to the nearest ancestor with execution resources. - * This is a full cgroup operation which will also call back into - * cpuset. Should be done outside any lock. - */ - if (is_empty) - remove_tasks_in_empty_cpuset(cs); - - mutex_lock(&cpuset_mutex); -} - -static void -hotplug_update_tasks(struct cpuset *cs, - struct cpumask *new_cpus, nodemask_t *new_mems, - bool cpus_updated, bool mems_updated) -{ - if (cpumask_empty(new_cpus)) - cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus); - if (nodes_empty(*new_mems)) - *new_mems = parent_cs(cs)->effective_mems; - - spin_lock_irq(&callback_lock); - cpumask_copy(cs->effective_cpus, new_cpus); - cs->effective_mems = *new_mems; - spin_unlock_irq(&callback_lock); - - if (cpus_updated) - update_tasks_cpumask(cs); - if (mems_updated) - update_tasks_nodemask(cs); -} - -/** - * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug - * @cs: cpuset of interest - * - * Compare @cs's cpu and mem masks against top_cpuset and if some have gone - * offline, update @cs accordingly. If @cs ends up with no CPU or memory, - * all its tasks are moved to the nearest ancestor with both resources. - */ -static void cpuset_hotplug_update_tasks(struct cpuset *cs) -{ - static cpumask_t new_cpus; - static nodemask_t new_mems; - bool cpus_updated; - bool mems_updated; -retry: - wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); - - mutex_lock(&cpuset_mutex); - - /* - * We have raced with task attaching. We wait until attaching - * is finished, so we won't attach a task to an empty cpuset. - */ - if (cs->attach_in_progress) { - mutex_unlock(&cpuset_mutex); - goto retry; - } - - cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus); - nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems); - - cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); - mems_updated = !nodes_equal(new_mems, cs->effective_mems); - - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) - hotplug_update_tasks(cs, &new_cpus, &new_mems, - cpus_updated, mems_updated); - else - hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, - cpus_updated, mems_updated); - - mutex_unlock(&cpuset_mutex); -} - -/** - * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset - * - * This function is called after either CPU or memory configuration has - * changed and updates cpuset accordingly.
The top_cpuset is always - * synchronized to cpu_active_mask and N_MEMORY, which is necessary in - * order to make cpusets transparent (of no effect) on systems that are - * actively using CPU hotplug but making no active use of cpusets. - * - * Non-root cpusets are only affected by offlining. If any CPUs or memory - * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on - * all descendants. - * - * Note that CPU offlining during suspend is ignored. We don't modify - * cpusets across suspend/resume cycles at all. - */ -static void cpuset_hotplug_workfn(struct work_struct *work) -{ - static cpumask_t new_cpus; - static nodemask_t new_mems; - bool cpus_updated, mems_updated; - bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys); - - mutex_lock(&cpuset_mutex); - - /* fetch the available cpus/mems and find out which changed how */ - cpumask_copy(&new_cpus, cpu_active_mask); - new_mems = node_states[N_MEMORY]; - - cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus); - mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); - - /* synchronize cpus_allowed to cpu_active_mask */ - if (cpus_updated) { - spin_lock_irq(&callback_lock); - if (!on_dfl) - cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); - cpumask_copy(top_cpuset.effective_cpus, &new_cpus); - spin_unlock_irq(&callback_lock); - /* we don't mess with cpumasks of tasks in top_cpuset */ - } - - /* synchronize mems_allowed to N_MEMORY */ - if (mems_updated) { - spin_lock_irq(&callback_lock); - if (!on_dfl) - top_cpuset.mems_allowed = new_mems; - top_cpuset.effective_mems = new_mems; - spin_unlock_irq(&callback_lock); - update_tasks_nodemask(&top_cpuset); - } - - mutex_unlock(&cpuset_mutex); - - /* if cpus or mems changed, we need to propagate to descendants */ - if (cpus_updated || mems_updated) { - struct cpuset *cs; - struct cgroup_subsys_state *pos_css; - - rcu_read_lock(); - cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { - if (cs == &top_cpuset || !css_tryget_online(&cs->css)) - continue; - rcu_read_unlock(); - - cpuset_hotplug_update_tasks(cs); - - rcu_read_lock(); - css_put(&cs->css); - } - rcu_read_unlock(); - } - - /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated) - rebuild_sched_domains(); -} - -void cpuset_update_active_cpus(bool cpu_online) -{ - /* - * We're inside cpu hotplug critical region which usually nests - * inside cgroup synchronization. Bounce actual hotplug processing - * to a work item to avoid reverse locking order. - * - * We still need to do partition_sched_domains() synchronously; - * otherwise, the scheduler will get confused and put tasks on the - * dead CPU. Fall back to the default single domain. - * cpuset_hotplug_workfn() will rebuild it as necessary. - */ - partition_sched_domains(1, NULL, NULL); - schedule_work(&cpuset_hotplug_work); -} - -/* - * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. - * Call this routine anytime after node_states[N_MEMORY] changes. - * See cpuset_update_active_cpus() for CPU hotplug handling. - */ -static int cpuset_track_online_nodes(struct notifier_block *self, - unsigned long action, void *arg) -{ - schedule_work(&cpuset_hotplug_work); - return NOTIFY_OK; -} - -static struct notifier_block cpuset_track_online_nodes_nb = { - .notifier_call = cpuset_track_online_nodes, - .priority = 10, /* ??!
*/ -}; - -/** - * cpuset_init_smp - initialize cpus_allowed - * - * Description: Finish top cpuset after cpu, node maps are initialized - */ -void __init cpuset_init_smp(void) -{ - cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); - top_cpuset.mems_allowed = node_states[N_MEMORY]; - top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; - - cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); - top_cpuset.effective_mems = node_states[N_MEMORY]; - - register_hotmemory_notifier(&cpuset_track_online_nodes_nb); - - cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); - BUG_ON(!cpuset_migrate_mm_wq); -} - -/** - * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset. - * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. - * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. - * - * Description: Returns the cpumask_var_t cpus_allowed of the cpuset - * attached to the specified @tsk. Guaranteed to return some non-empty - * subset of cpu_online_mask, even if this means going outside the - * task's cpuset. - **/ - -void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) -{ - unsigned long flags; - - spin_lock_irqsave(&callback_lock, flags); - rcu_read_lock(); - guarantee_online_cpus(task_cs(tsk), pmask); - rcu_read_unlock(); - spin_unlock_irqrestore(&callback_lock, flags); -} - -void cpuset_cpus_allowed_fallback(struct task_struct *tsk) -{ - rcu_read_lock(); - do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus); - rcu_read_unlock(); - - /* - * We own tsk->cpus_allowed, nobody can change it under us. - * - * But we used cs && cs->cpus_allowed lockless and thus can - * race with cgroup_attach_task() or update_cpumask() and get - * the wrong tsk->cpus_allowed. However, both cases imply the - * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr() - * which takes task_rq_lock(). - * - * If we are called after it dropped the lock we must see all - * changes in tsk_cs()->cpus_allowed. Otherwise we can temporarily - * set any mask even if it is not right from task_cs() pov; - * the pending set_cpus_allowed_ptr() will fix things. - * - * select_fallback_rq() will fix things up and set cpu_possible_mask - * if required. - */ -} - -void __init cpuset_init_current_mems_allowed(void) -{ - nodes_setall(current->mems_allowed); -} - -/** - * cpuset_mems_allowed - return mems_allowed mask from a task's cpuset. - * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed. - * - * Description: Returns the nodemask_t mems_allowed of the cpuset - * attached to the specified @tsk. Guaranteed to return some non-empty - * subset of node_states[N_MEMORY], even if this means going outside the - * task's cpuset. - **/ - -nodemask_t cpuset_mems_allowed(struct task_struct *tsk) -{ - nodemask_t mask; - unsigned long flags; - - spin_lock_irqsave(&callback_lock, flags); - rcu_read_lock(); - guarantee_online_mems(task_cs(tsk), &mask); - rcu_read_unlock(); - spin_unlock_irqrestore(&callback_lock, flags); - - return mask; -} - -/** - * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed - * @nodemask: the nodemask to be checked - * - * Are any of the nodes in the nodemask allowed in current->mems_allowed? - */ -int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) -{ - return nodes_intersects(*nodemask, current->mems_allowed); -} - -/* - * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or - * mem_hardwall ancestor to the specified cpuset.
Call holding - * callback_lock. If no ancestor is mem_exclusive or mem_hardwall - * (an unusual configuration), then returns the root cpuset. - */ -static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) -{ - while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) - cs = parent_cs(cs); - return cs; -} - -/** - * cpuset_node_allowed - Can we allocate on a memory node? - * @node: is this an allowed node? - * @gfp_mask: memory allocation flags - * - * If we're in interrupt, yes, we can always allocate. If @node is set in - * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this - * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, - * yes. If current has access to memory reserves due to TIF_MEMDIE, yes. - * Otherwise, no. - * - * GFP_USER allocations are marked with the __GFP_HARDWALL bit, - * and do not allow allocations outside the current task's cpuset - * unless the task has been OOM killed and is marked TIF_MEMDIE. - * GFP_KERNEL allocations are not so marked, so can escape to the - * nearest enclosing hardwalled ancestor cpuset. - * - * Scanning up parent cpusets requires callback_lock. The - * __alloc_pages() routine only calls here with __GFP_HARDWALL bit - * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the - * current task's mems_allowed came up empty on the first pass over - * the zonelist. So only GFP_KERNEL allocations, if all nodes in the - * cpuset are short of memory, might require taking the callback_lock. - * - * The first call here from mm/page_alloc:get_page_from_freelist() - * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, - * so no allocation on a node outside the cpuset is allowed (unless - * in interrupt, of course). - * - * The second pass through get_page_from_freelist() doesn't even call - * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() - * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set - * in alloc_flags. That logic and the checks below have the combined - * effect that: - * in_interrupt - any node ok (current task context irrelevant) - * GFP_ATOMIC - any node ok - * TIF_MEMDIE - any node ok - * GFP_KERNEL - any node in enclosing hardwalled cpuset ok - * GFP_USER - only nodes in the current task's mems_allowed ok. - */ -bool __cpuset_node_allowed(int node, gfp_t gfp_mask) -{ - struct cpuset *cs; /* current cpuset ancestors */ - int allowed; /* is allocation in zone z allowed? */ - unsigned long flags; - - if (in_interrupt()) - return true; - if (node_isset(node, current->mems_allowed)) - return true; - /* - * Allow tasks that have access to memory reserves because they have - * been OOM killed to get memory anywhere.
- */ - if (unlikely(test_thread_flag(TIF_MEMDIE))) - return true; - if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ - return false; - - if (current->flags & PF_EXITING) /* Let dying task have memory */ - return true; - - /* Not hardwall and node outside mems_allowed: scan up cpusets */ - spin_lock_irqsave(&callback_lock, flags); - - rcu_read_lock(); - cs = nearest_hardwall_ancestor(task_cs(current)); - allowed = node_isset(node, cs->mems_allowed); - rcu_read_unlock(); - - spin_unlock_irqrestore(&callback_lock, flags); - return allowed; -} - -/** - * cpuset_mem_spread_node() - On which node to begin search for a file page - * cpuset_slab_spread_node() - On which node to begin search for a slab page - * - * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for - * tasks in a cpuset with is_spread_page or is_spread_slab set), - * and if the memory allocation used cpuset_mem_spread_node() - * to determine on which node to start looking, as it will for - * certain page cache or slab cache pages such as used for file - * system buffers and inode caches, then instead of starting on the - * local node to look for a free page, rather spread the starting - * node around the tasks mems_allowed nodes. - * - * We don't have to worry about the returned node being offline - * because "it can't happen", and even if it did, it would be ok. - * - * The routines calling guarantee_online_mems() are careful to - * only set nodes in task->mems_allowed that are online. So it - * should not be possible for the following code to return an - * offline node. But if it did, that would be ok, as this routine - * is not returning the node where the allocation must be, only - * the node where the search should start. The zonelist passed to - * __alloc_pages() will include all nodes. If the slab allocator - * is passed an offline node, it will fall back to the local node. - * See kmem_cache_alloc_node(). - */ - -static int cpuset_spread_node(int *rotor) -{ - return *rotor = next_node_in(*rotor, current->mems_allowed); -} - -int cpuset_mem_spread_node(void) -{ - if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE) - current->cpuset_mem_spread_rotor = - node_random(¤t->mems_allowed); - - return cpuset_spread_node(¤t->cpuset_mem_spread_rotor); -} - -int cpuset_slab_spread_node(void) -{ - if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE) - current->cpuset_slab_spread_rotor = - node_random(¤t->mems_allowed); - - return cpuset_spread_node(¤t->cpuset_slab_spread_rotor); -} - -EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); - -/** - * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's? - * @tsk1: pointer to task_struct of some task. - * @tsk2: pointer to task_struct of some other task. - * - * Description: Return true if @tsk1's mems_allowed intersects the - * mems_allowed of @tsk2. Used by the OOM killer to determine if - * one of the task's memory usage might impact the memory available - * to the other. - **/ - -int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, - const struct task_struct *tsk2) -{ - return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); -} - -/** - * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed - * - * Description: Prints current's name, cpuset name, and cached copy of its - * mems_allowed to the kernel log. 
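Back in cpuset_spread_node() above, the rotor simply walks mems_allowed round-robin via next_node_in(). A toy rendering of that walk, where toy_next_node_in() is a hypothetical stand-in for the kernel helper:

#include <stdint.h>
#include <stdio.h>

/* next set bit after 'node' in mask, wrapping around */
static int toy_next_node_in(int node, uint64_t mask, int nbits)
{
        int i;

        for (i = 1; i <= nbits; i++) {
                int n = (node + i) % nbits;

                if (mask & (1ull << n))
                        return n;
        }
        return -1;
}

int main(void)
{
        uint64_t mems_allowed = 0x15;   /* nodes {0, 2, 4} */
        int rotor = 0, i;

        /* Successive allocations start their search on successive
         * allowed nodes, spreading pages across the nodemask. */
        for (i = 0; i < 5; i++) {
                rotor = toy_next_node_in(rotor, mems_allowed, 6);
                printf("start node %d\n", rotor);       /* 2 4 0 2 4 */
        }
        return 0;
}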
- */ -void cpuset_print_current_mems_allowed(void) -{ - struct cgroup *cgrp; - - rcu_read_lock(); - - cgrp = task_cs(current)->css.cgroup; - pr_info("%s cpuset=", current->comm); - pr_cont_cgroup_name(cgrp); - pr_cont(" mems_allowed=%*pbl\n", - nodemask_pr_args(&current->mems_allowed)); - - rcu_read_unlock(); -} - -/* - * Collection of memory_pressure is suppressed unless - * this flag is enabled by writing "1" to the special - * cpuset file 'memory_pressure_enabled' in the root cpuset. - */ - -int cpuset_memory_pressure_enabled __read_mostly; - -/** - * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. - * - * Keep a running average of the rate of synchronous (direct) - * page reclaim efforts initiated by tasks in each cpuset. - * - * This represents the rate at which some task in the cpuset - * ran low on memory on all nodes it was allowed to use, and - * had to enter the kernels page reclaim code in an effort to - * create more free memory by tossing clean pages or swapping - * or writing dirty pages. - * - * Display to user space in the per-cpuset read-only file - * "memory_pressure". Value displayed is an integer - * representing the recent rate of entry into the synchronous - * (direct) page reclaim by any task attached to the cpuset. - **/ - -void __cpuset_memory_pressure_bump(void) -{ - rcu_read_lock(); - fmeter_markevent(&task_cs(current)->fmeter); - rcu_read_unlock(); -} - -#ifdef CONFIG_PROC_PID_CPUSET -/* - * proc_cpuset_show() - * - Print tasks cpuset path into seq_file. - * - Used for /proc/<pid>/cpuset. - * - No need to task_lock(tsk) on this tsk->cpuset reference, as it - * doesn't really matter if tsk->cpuset changes after we read it, - * and we take cpuset_mutex, keeping cpuset_attach() from changing it - * anyway. - */ -int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *tsk) -{ - char *buf; - struct cgroup_subsys_state *css; - int retval; - - retval = -ENOMEM; - buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!buf) - goto out; - - css = task_get_css(tsk, cpuset_cgrp_id); - retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, - current->nsproxy->cgroup_ns); - css_put(css); - if (retval >= PATH_MAX) - retval = -ENAMETOOLONG; - if (retval < 0) - goto out_free; - seq_puts(m, buf); - seq_putc(m, '\n'); - retval = 0; -out_free: - kfree(buf); -out: - return retval; -} -#endif /* CONFIG_PROC_PID_CPUSET */ - -/* Display task mems_allowed in /proc/<pid>/status file. */ -void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) -{ - seq_printf(m, "Mems_allowed:\t%*pb\n", - nodemask_pr_args(&task->mems_allowed)); - seq_printf(m, "Mems_allowed_list:\t%*pbl\n", - nodemask_pr_args(&task->mems_allowed)); -} -- cgit v1.2.3 From 0a268dbd7932c78896f5a45c8a492b31729db6c0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Dec 2016 14:49:06 -0500 Subject: cgroup: move cgroup v1 specific code to kernel/cgroup/cgroup-v1.c cgroup.c is getting too unwieldy. Let's move out cgroup v1 specific code along with the debug controller into kernel/cgroup/cgroup-v1.c. v2: cgroup_mutex and css_set_lock made available in cgroup-internal.h regardless of CONFIG_PROVE_RCU.
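The split itself is mechanical: each shared symbol loses its static qualifier in cgroup.c and gains a declaration in the new internal header. A condensed sketch of the pattern (illustrative excerpt, not the complete header):

	/* cgroup-internal.h */
	extern struct mutex cgroup_mutex;	/* was (conditionally) static in cgroup.c */
	bool cgroup_ssid_enabled(int ssid);	/* was static in cgroup.c */

	/* cgroup.c and cgroup-v1.c both start with */
	#include "cgroup-internal.h"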
Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup/Makefile | 2 +- kernel/cgroup/cgroup-internal.h | 103 ++++ kernel/cgroup/cgroup-v1.c | 1027 +++++++++++++++++++++++++++++++++++ kernel/cgroup/cgroup.c | 1140 ++------------------------------------- 4 files changed, 1164 insertions(+), 1108 deletions(-) create mode 100644 kernel/cgroup/cgroup-internal.h create mode 100644 kernel/cgroup/cgroup-v1.c (limited to 'kernel') diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 4d561a50a5ac..719588cb18cd 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,4 +1,4 @@ -obj-y := cgroup.o +obj-y := cgroup.o cgroup-v1.o obj-$(CONFIG_CGROUP_FREEZER) += freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h new file mode 100644 index 000000000000..dca3193bd9d2 --- /dev/null +++ b/kernel/cgroup/cgroup-internal.h @@ -0,0 +1,103 @@ +#ifndef __CGROUP_INTERNAL_H +#define __CGROUP_INTERNAL_H + +#include +#include +#include +#include + +/* + * A cgroup can be associated with multiple css_sets as different tasks may + * belong to different cgroups on different hierarchies. In the other + * direction, a css_set is naturally associated with multiple cgroups. + * This M:N relationship is represented by the following link structure + * which exists for each association and allows traversing the associations + * from both sides. + */ +struct cgrp_cset_link { + /* the cgroup and css_set this link associates */ + struct cgroup *cgrp; + struct css_set *cset; + + /* list of cgrp_cset_links anchored at cgrp->cset_links */ + struct list_head cset_link; + + /* list of cgrp_cset_links anchored at css_set->cgrp_links */ + struct list_head cgrp_link; +}; + +extern struct mutex cgroup_mutex; +extern spinlock_t css_set_lock; +extern struct cgroup_subsys *cgroup_subsys[]; +extern struct list_head cgroup_roots; +extern struct file_system_type cgroup_fs_type; + +/* iterate across the hierarchies */ +#define for_each_root(root) \ + list_for_each_entry((root), &cgroup_roots, root_list) + +/** + * for_each_subsys - iterate all enabled cgroup subsystems + * @ss: the iteration cursor + * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end + */ +#define for_each_subsys(ss, ssid) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ + (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) + +static inline bool cgroup_is_dead(const struct cgroup *cgrp) +{ + return !(cgrp->self.flags & CSS_ONLINE); +} + +static inline bool notify_on_release(const struct cgroup *cgrp) +{ + return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); +} + +bool cgroup_ssid_enabled(int ssid); +bool cgroup_on_dfl(const struct cgroup *cgrp); + +struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root); +struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroup_root *root); +struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline); +void cgroup_kn_unlock(struct kernfs_node *kn); +int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns); + +int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); + +bool cgroup_may_migrate_to(struct cgroup *dst_cgrp); +void cgroup_migrate_finish(struct list_head *preloaded_csets); +void cgroup_migrate_add_src(struct css_set *src_cset, + struct cgroup *dst_cgrp, + struct list_head *preloaded_csets); +int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets); +int cgroup_migrate(struct
task_struct *leader, bool threadgroup, + struct cgroup_root *root); + +int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, + bool threadgroup); +ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off, bool threadgroup); +ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off); + +void cgroup_lock_and_drain_offline(struct cgroup *cgrp); + +/* + * cgroup-v1.c + */ +extern spinlock_t release_agent_path_lock; +extern struct cftype cgroup_legacy_base_files[]; +extern const struct file_operations proc_cgroupstats_operations; + +bool cgroup_ssid_no_v1(int ssid); +void cgroup_pidlist_destroy_all(struct cgroup *cgrp); +int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name_str); +void cgroup_release_agent(struct work_struct *work); +void check_for_release(struct cgroup *cgrp); + +#endif /* __CGROUP_INTERNAL_H */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c new file mode 100644 index 000000000000..7af745a46f91 --- /dev/null +++ b/kernel/cgroup/cgroup-v1.c @@ -0,0 +1,1027 @@ +#include "cgroup-internal.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * pidlists linger the following amount before being destroyed. The goal + * is avoiding frequent destruction in the middle of consecutive read calls + * Expiring in the middle is a performance problem not a correctness one. + * 1 sec should be enough. + */ +#define CGROUP_PIDLIST_DESTROY_DELAY HZ + +/* Controllers blocked by the commandline in v1 */ +static u16 cgroup_no_v1_mask; + +/* + * pidlist destructions need to be flushed on cgroup destruction. Use a + * separate workqueue as flush domain. + */ +static struct workqueue_struct *cgroup_pidlist_destroy_wq; + +/* + * Protects cgroup_subsys->release_agent_path. Modifying it also requires + * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. + */ +DEFINE_SPINLOCK(release_agent_path_lock); + +bool cgroup_ssid_no_v1(int ssid) +{ + return cgroup_no_v1_mask & (1 << ssid); +} + +/** + * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' + * @from: attach to all cgroups of a given task + * @tsk: the task to be attached + */ +int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) +{ + struct cgroup_root *root; + int retval = 0; + + mutex_lock(&cgroup_mutex); + percpu_down_write(&cgroup_threadgroup_rwsem); + for_each_root(root) { + struct cgroup *from_cgrp; + + if (root == &cgrp_dfl_root) + continue; + + spin_lock_irq(&css_set_lock); + from_cgrp = task_cgroup_from_root(from, root); + spin_unlock_irq(&css_set_lock); + + retval = cgroup_attach_task(from_cgrp, tsk, false); + if (retval) + break; + } + percpu_up_write(&cgroup_threadgroup_rwsem); + mutex_unlock(&cgroup_mutex); + + return retval; +} +EXPORT_SYMBOL_GPL(cgroup_attach_task_all); + +/** + * cgroup_transfer_tasks - move tasks from one cgroup to another + * @to: cgroup to which the tasks will be moved + * @from: cgroup in which the tasks currently reside + * + * Locking rules between cgroup_post_fork() and the migration path + * guarantee that, if a task is forking while being migrated, the new child + * is guaranteed to be either visible in the source cgroup after the + * parent's migration is complete or put into the target cgroup. No task + * can slip out of migration through forking.
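+ * + * (Illustrative caller, not part of this patch: cpuset's legacy hotplug path empties a cpuset whose resources went away with roughly + *	cgroup_transfer_tasks(parent_cs->css.cgroup, cs->css.cgroup); + * relying on the forking guarantee above.)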
+ */ +int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) +{ + LIST_HEAD(preloaded_csets); + struct cgrp_cset_link *link; + struct css_task_iter it; + struct task_struct *task; + int ret; + + if (cgroup_on_dfl(to)) + return -EINVAL; + + if (!cgroup_may_migrate_to(to)) + return -EBUSY; + + mutex_lock(&cgroup_mutex); + + percpu_down_write(&cgroup_threadgroup_rwsem); + + /* all tasks in @from are being moved, all csets are source */ + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &from->cset_links, cset_link) + cgroup_migrate_add_src(link->cset, to, &preloaded_csets); + spin_unlock_irq(&css_set_lock); + + ret = cgroup_migrate_prepare_dst(&preloaded_csets); + if (ret) + goto out_err; + + /* + * Migrate tasks one-by-one until @from is empty. This fails iff + * ->can_attach() fails. + */ + do { + css_task_iter_start(&from->self, &it); + task = css_task_iter_next(&it); + if (task) + get_task_struct(task); + css_task_iter_end(&it); + + if (task) { + ret = cgroup_migrate(task, false, to->root); + if (!ret) + trace_cgroup_transfer_tasks(to, task, false); + put_task_struct(task); + } + } while (task && !ret); +out_err: + cgroup_migrate_finish(&preloaded_csets); + percpu_up_write(&cgroup_threadgroup_rwsem); + mutex_unlock(&cgroup_mutex); + return ret; +} + +/* + * Stuff for reading the 'tasks'/'procs' files. + * + * Reading this file can return large amounts of data if a cgroup has + * *lots* of attached tasks. So it may need several calls to read(), + * but we cannot guarantee that the information we produce is correct + * unless we produce it entirely atomically. + * + */ + +/* which pidlist file are we talking about? */ +enum cgroup_filetype { + CGROUP_FILE_PROCS, + CGROUP_FILE_TASKS, +}; + +/* + * A pidlist is a list of pids that virtually represents the contents of one + * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, + * a pair (one each for procs, tasks) for each pid namespace that's relevant + * to the cgroup. + */ +struct cgroup_pidlist { + /* + * used to find which pidlist is wanted. doesn't change as long as + * this particular list stays in the list. + */ + struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; + /* array of xids */ + pid_t *list; + /* how many elements the above list has */ + int length; + /* each of these stored in a list by its cgroup */ + struct list_head links; + /* pointer to the cgroup we belong to, for list removal purposes */ + struct cgroup *owner; + /* for delayed destruction */ + struct delayed_work destroy_dwork; +}; + +/* + * The following two functions "fix" the issue where there are more pids + * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. + * TODO: replace with a kernel-wide solution to this problem + */ +#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) +static void *pidlist_allocate(int count) +{ + if (PIDLIST_TOO_LARGE(count)) + return vmalloc(count * sizeof(pid_t)); + else + return kmalloc(count * sizeof(pid_t), GFP_KERNEL); +} + +static void pidlist_free(void *p) +{ + kvfree(p); +} + +/* + * Used to destroy all pidlists lingering waiting for destroy timer. None + * should be left afterwards. 
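+ * + * (mod_delayed_work(..., 0) below expedites each lingering pidlist's destroy_dwork, the flush_workqueue() then waits for all of them to finish, and the BUG_ON() asserts that the pidlist list really is empty.)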
+ */ +void cgroup_pidlist_destroy_all(struct cgroup *cgrp) +{ + struct cgroup_pidlist *l, *tmp_l; + + mutex_lock(&cgrp->pidlist_mutex); + list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); + mutex_unlock(&cgrp->pidlist_mutex); + + flush_workqueue(cgroup_pidlist_destroy_wq); + BUG_ON(!list_empty(&cgrp->pidlists)); +} + +static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, + destroy_dwork); + struct cgroup_pidlist *tofree = NULL; + + mutex_lock(&l->owner->pidlist_mutex); + + /* + * Destroy iff we didn't get queued again. The state won't change + * as destroy_dwork can only be queued while locked. + */ + if (!delayed_work_pending(dwork)) { + list_del(&l->links); + pidlist_free(l->list); + put_pid_ns(l->key.ns); + tofree = l; + } + + mutex_unlock(&l->owner->pidlist_mutex); + kfree(tofree); +} + +/* + * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries + * Returns the number of unique elements. + */ +static int pidlist_uniq(pid_t *list, int length) +{ + int src, dest = 1; + + /* + * we presume the 0th element is unique, so i starts at 1. trivial + * edge cases first; no work needs to be done for either + */ + if (length == 0 || length == 1) + return length; + /* src and dest walk down the list; dest counts unique elements */ + for (src = 1; src < length; src++) { + /* find next unique element */ + while (list[src] == list[src-1]) { + src++; + if (src == length) + goto after; + } + /* dest always points to where the next unique element goes */ + list[dest] = list[src]; + dest++; + } +after: + return dest; +} + +/* + * The two pid files - task and cgroup.procs - guaranteed that the result + * is sorted, which forced this whole pidlist fiasco. As pid order is + * different per namespace, each namespace needs differently sorted list, + * making it impossible to use, for example, single rbtree of member tasks + * sorted by task pointer. As pidlists can be fairly large, allocating one + * per open file is dangerous, so cgroup had to implement shared pool of + * pidlists keyed by cgroup and namespace. + */ +static int cmppid(const void *a, const void *b) +{ + return *(pid_t *)a - *(pid_t *)b; +} + +static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, + enum cgroup_filetype type) +{ + struct cgroup_pidlist *l; + /* don't need task_nsproxy() if we're looking at ourself */ + struct pid_namespace *ns = task_active_pid_ns(current); + + lockdep_assert_held(&cgrp->pidlist_mutex); + + list_for_each_entry(l, &cgrp->pidlists, links) + if (l->key.type == type && l->key.ns == ns) + return l; + return NULL; +} + +/* + * find the appropriate pidlist for our purpose (given procs vs tasks) + * returns with the lock on that pidlist already held, and takes care + * of the use count, or returns NULL with no locks held if we're out of + * memory. 
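+ * + * (In the current form the caller, pidlist_array_load() below, already holds cgrp->pidlist_mutex throughout, which is what the lockdep assertion documents.)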
+ */ +static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, + enum cgroup_filetype type) +{ + struct cgroup_pidlist *l; + + lockdep_assert_held(&cgrp->pidlist_mutex); + + l = cgroup_pidlist_find(cgrp, type); + if (l) + return l; + + /* entry not found; create a new one */ + l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); + if (!l) + return l; + + INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); + l->key.type = type; + /* don't need task_nsproxy() if we're looking at ourself */ + l->key.ns = get_pid_ns(task_active_pid_ns(current)); + l->owner = cgrp; + list_add(&l->links, &cgrp->pidlists); + return l; +} + +/** + * cgroup_task_count - count the number of tasks in a cgroup. + * @cgrp: the cgroup in question + * + * Return the number of tasks in the cgroup. The returned number can be + * higher than the actual number of tasks due to css_set references from + * namespace roots and temporary usages. + */ +static int cgroup_task_count(const struct cgroup *cgrp) +{ + int count = 0; + struct cgrp_cset_link *link; + + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &cgrp->cset_links, cset_link) + count += atomic_read(&link->cset->refcount); + spin_unlock_irq(&css_set_lock); + return count; +} + +/* + * Load a cgroup's pidarray with either procs' tgids or tasks' pids + */ +static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, + struct cgroup_pidlist **lp) +{ + pid_t *array; + int length; + int pid, n = 0; /* used for populating the array */ + struct css_task_iter it; + struct task_struct *tsk; + struct cgroup_pidlist *l; + + lockdep_assert_held(&cgrp->pidlist_mutex); + + /* + * If cgroup gets more users after we read count, we won't have + * enough space - tough. This race is indistinguishable to the + * caller from the case that the additional cgroup users didn't + * show up until sometime later on. + */ + length = cgroup_task_count(cgrp); + array = pidlist_allocate(length); + if (!array) + return -ENOMEM; + /* now, populate the array */ + css_task_iter_start(&cgrp->self, &it); + while ((tsk = css_task_iter_next(&it))) { + if (unlikely(n == length)) + break; + /* get tgid or pid for procs or tasks file respectively */ + if (type == CGROUP_FILE_PROCS) + pid = task_tgid_vnr(tsk); + else + pid = task_pid_vnr(tsk); + if (pid > 0) /* make sure to only use valid results */ + array[n++] = pid; + } + css_task_iter_end(&it); + length = n; + /* now sort & (if procs) strip out duplicates */ + sort(array, length, sizeof(pid_t), cmppid, NULL); + if (type == CGROUP_FILE_PROCS) + length = pidlist_uniq(array, length); + + l = cgroup_pidlist_find_create(cgrp, type); + if (!l) { + pidlist_free(array); + return -ENOMEM; + } + + /* store array, freeing old if necessary */ + pidlist_free(l->list); + l->list = array; + l->length = length; + *lp = l; + return 0; +} + +/* + * seq_file methods for the tasks/procs files. The seq_file position is the + * next pid to display; the seq_file iterator is a pointer to the pid + * in the cgroup->l->list array. + */ + +static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) +{ + /* + * Initially we receive a position value that corresponds to + * one more than the last pid shown (or 0 on the first call or + * after a seek to the start). 
Use a binary-search to find the + * next pid to display, if any + */ + struct kernfs_open_file *of = s->private; + struct cgroup *cgrp = seq_css(s)->cgroup; + struct cgroup_pidlist *l; + enum cgroup_filetype type = seq_cft(s)->private; + int index = 0, pid = *pos; + int *iter, ret; + + mutex_lock(&cgrp->pidlist_mutex); + + /* + * !NULL @of->priv indicates that this isn't the first start() + * after open. If the matching pidlist is around, we can use that. + * Look for it. Note that @of->priv can't be used directly. It + * could already have been destroyed. + */ + if (of->priv) + of->priv = cgroup_pidlist_find(cgrp, type); + + /* + * Either this is the first start() after open or the matching + * pidlist has been destroyed inbetween. Create a new one. + */ + if (!of->priv) { + ret = pidlist_array_load(cgrp, type, + (struct cgroup_pidlist **)&of->priv); + if (ret) + return ERR_PTR(ret); + } + l = of->priv; + + if (pid) { + int end = l->length; + + while (index < end) { + int mid = (index + end) / 2; + if (l->list[mid] == pid) { + index = mid; + break; + } else if (l->list[mid] <= pid) + index = mid + 1; + else + end = mid; + } + } + /* If we're off the end of the array, we're done */ + if (index >= l->length) + return NULL; + /* Update the abstract position to be the actual pid that we found */ + iter = l->list + index; + *pos = *iter; + return iter; +} + +static void cgroup_pidlist_stop(struct seq_file *s, void *v) +{ + struct kernfs_open_file *of = s->private; + struct cgroup_pidlist *l = of->priv; + + if (l) + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, + CGROUP_PIDLIST_DESTROY_DELAY); + mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); +} + +static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct kernfs_open_file *of = s->private; + struct cgroup_pidlist *l = of->priv; + pid_t *p = v; + pid_t *end = l->list + l->length; + /* + * Advance to the next pid in the array. 
If this goes off the + * end, we're done + */ + p++; + if (p >= end) { + return NULL; + } else { + *pos = *p; + return p; + } +} + +static int cgroup_pidlist_show(struct seq_file *s, void *v) +{ + seq_printf(s, "%d\n", *(int *)v); + + return 0; +} + +static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return __cgroup_procs_write(of, buf, nbytes, off, false); +} + +static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + + BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + spin_lock(&release_agent_path_lock); + strlcpy(cgrp->root->release_agent_path, strstrip(buf), + sizeof(cgrp->root->release_agent_path)); + spin_unlock(&release_agent_path_lock); + cgroup_kn_unlock(of->kn); + return nbytes; +} + +static int cgroup_release_agent_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + spin_lock(&release_agent_path_lock); + seq_puts(seq, cgrp->root->release_agent_path); + spin_unlock(&release_agent_path_lock); + seq_putc(seq, '\n'); + return 0; +} + +static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) +{ + seq_puts(seq, "0\n"); + return 0; +} + +static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return notify_on_release(css->cgroup); +} + +static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + if (val) + set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); + else + clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); + return 0; +} + +static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); +} + +static int cgroup_clone_children_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + if (val) + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); + else + clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); + return 0; +} + +/* cgroup core interface files for the legacy hierarchies */ +struct cftype cgroup_legacy_base_files[] = { + { + .name = "cgroup.procs", + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_PROCS, + .write = cgroup_procs_write, + }, + { + .name = "cgroup.clone_children", + .read_u64 = cgroup_clone_children_read, + .write_u64 = cgroup_clone_children_write, + }, + { + .name = "cgroup.sane_behavior", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_sane_behavior_show, + }, + { + .name = "tasks", + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_TASKS, + .write = cgroup_tasks_write, + }, + { + .name = "notify_on_release", + .read_u64 = cgroup_read_notify_on_release, + .write_u64 = cgroup_write_notify_on_release, + }, + { + .name = "release_agent", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_release_agent_show, + .write = cgroup_release_agent_write, + .max_write_len = PATH_MAX - 1, + }, + { } /* terminate */ +}; + +/* Display information about each subsystem and each hierarchy */ +static int proc_cgroupstats_show(struct seq_file *m, void *v) +{ + struct cgroup_subsys *ss; + int i; + + seq_puts(m, 
"#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); + /* + * ideally we don't want subsystems moving around while we do this. + * cgroup_mutex is also necessary to guarantee an atomic snapshot of + * subsys/hierarchy state. + */ + mutex_lock(&cgroup_mutex); + + for_each_subsys(ss, i) + seq_printf(m, "%s\t%d\t%d\t%d\n", + ss->legacy_name, ss->root->hierarchy_id, + atomic_read(&ss->root->nr_cgrps), + cgroup_ssid_enabled(i)); + + mutex_unlock(&cgroup_mutex); + return 0; +} + +static int cgroupstats_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_cgroupstats_show, NULL); +} + +const struct file_operations proc_cgroupstats_operations = { + .open = cgroupstats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/** + * cgroupstats_build - build and fill cgroupstats + * @stats: cgroupstats to fill information into + * @dentry: A dentry entry belonging to the cgroup for which stats have + * been requested. + * + * Build and fill cgroupstats so that taskstats can export it to user + * space. + */ +int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) +{ + struct kernfs_node *kn = kernfs_node_from_dentry(dentry); + struct cgroup *cgrp; + struct css_task_iter it; + struct task_struct *tsk; + + /* it should be kernfs_node belonging to cgroupfs and is a directory */ + if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || + kernfs_type(kn) != KERNFS_DIR) + return -EINVAL; + + mutex_lock(&cgroup_mutex); + + /* + * We aren't being called from kernfs and there's no guarantee on + * @kn->priv's validity. For this and css_tryget_online_from_dir(), + * @kn->priv is RCU safe. Let's do the RCU dancing. + */ + rcu_read_lock(); + cgrp = rcu_dereference(kn->priv); + if (!cgrp || cgroup_is_dead(cgrp)) { + rcu_read_unlock(); + mutex_unlock(&cgroup_mutex); + return -ENOENT; + } + rcu_read_unlock(); + + css_task_iter_start(&cgrp->self, &it); + while ((tsk = css_task_iter_next(&it))) { + switch (tsk->state) { + case TASK_RUNNING: + stats->nr_running++; + break; + case TASK_INTERRUPTIBLE: + stats->nr_sleeping++; + break; + case TASK_UNINTERRUPTIBLE: + stats->nr_uninterruptible++; + break; + case TASK_STOPPED: + stats->nr_stopped++; + break; + default: + if (delayacct_is_task_waiting_on_io(tsk)) + stats->nr_io_wait++; + break; + } + } + css_task_iter_end(&it); + + mutex_unlock(&cgroup_mutex); + return 0; +} + +void check_for_release(struct cgroup *cgrp) +{ + if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && + !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) + schedule_work(&cgrp->release_agent_work); +} + +/* + * Notify userspace when a cgroup is released, by running the + * configured release agent with the name of the cgroup (path + * relative to the root of cgroup file system) as the argument. + * + * Most likely, this user command will try to rmdir this cgroup. + * + * This races with the possibility that some other task will be + * attached to this cgroup before it is removed, or that some other + * user task will 'mkdir' a child cgroup of this cgroup. That's ok. + * The presumed 'rmdir' will fail quietly if this cgroup is no longer + * unused, and this cgroup will be reprieved from its death sentence, + * to continue to serve a useful existence. Next time it's released, + * we will get notified again, if it still has 'notify_on_release' set. + * + * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which + * means only wait until the task is successfully execve()'d. 
The + * separate release agent task is forked by call_usermodehelper(), + * then control in this thread returns here, without waiting for the + * release agent task. We don't bother to wait because the caller of + * this routine has no use for the exit status of the release agent + * task, so no sense holding our caller up for that. + */ +void cgroup_release_agent(struct work_struct *work) +{ + struct cgroup *cgrp = + container_of(work, struct cgroup, release_agent_work); + char *pathbuf = NULL, *agentbuf = NULL; + char *argv[3], *envp[3]; + int ret; + + mutex_lock(&cgroup_mutex); + + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); + agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); + if (!pathbuf || !agentbuf) + goto out; + + spin_lock_irq(&css_set_lock); + ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); + spin_unlock_irq(&css_set_lock); + if (ret < 0 || ret >= PATH_MAX) + goto out; + + argv[0] = agentbuf; + argv[1] = pathbuf; + argv[2] = NULL; + + /* minimal command environment */ + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[2] = NULL; + + mutex_unlock(&cgroup_mutex); + call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + goto out_free; +out: + mutex_unlock(&cgroup_mutex); +out_free: + kfree(agentbuf); + kfree(pathbuf); +} + +/* + * cgroup_rename - Only allow simple rename of directories in place. + */ +int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name_str) +{ + struct cgroup *cgrp = kn->priv; + int ret; + + if (kernfs_type(kn) != KERNFS_DIR) + return -ENOTDIR; + if (kn->parent != new_parent) + return -EIO; + + /* + * This isn't a proper migration and its usefulness is very + * limited. Disallow on the default hierarchy. + */ + if (cgroup_on_dfl(cgrp)) + return -EPERM; + + /* + * We're gonna grab cgroup_mutex which nests outside kernfs + * active_ref. kernfs_rename() doesn't require active_ref + * protection. Break them before grabbing cgroup_mutex. + */ + kernfs_break_active_protection(new_parent); + kernfs_break_active_protection(kn); + + mutex_lock(&cgroup_mutex); + + ret = kernfs_rename(kn, new_parent, new_name_str); + if (!ret) + trace_cgroup_rename(cgrp); + + mutex_unlock(&cgroup_mutex); + + kernfs_unbreak_active_protection(kn); + kernfs_unbreak_active_protection(new_parent); + return ret; +} + +static int __init cgroup1_wq_init(void) +{ + /* + * Used to destroy pidlists and separate to serve as flush domain. + * Cap @max_active to 1 too. 
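+ * + * (Being dedicated, the flush_workqueue() in cgroup_pidlist_destroy_all() can never get stuck behind unrelated work items.)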
+ */ + cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", + 0, 1); + BUG_ON(!cgroup_pidlist_destroy_wq); + return 0; +} +core_initcall(cgroup1_wq_init); + +static int __init cgroup_no_v1(char *str) +{ + struct cgroup_subsys *ss; + char *token; + int i; + + while ((token = strsep(&str, ",")) != NULL) { + if (!*token) + continue; + + if (!strcmp(token, "all")) { + cgroup_no_v1_mask = U16_MAX; + break; + } + + for_each_subsys(ss, i) { + if (strcmp(token, ss->name) && + strcmp(token, ss->legacy_name)) + continue; + + cgroup_no_v1_mask |= 1 << i; + } + } + return 1; +} +__setup("cgroup_no_v1=", cgroup_no_v1); + + +#ifdef CONFIG_CGROUP_DEBUG +static struct cgroup_subsys_state * +debug_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); + + if (!css) + return ERR_PTR(-ENOMEM); + + return css; +} + +static void debug_css_free(struct cgroup_subsys_state *css) +{ + kfree(css); +} + +static u64 debug_taskcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return cgroup_task_count(css->cgroup); +} + +static u64 current_css_set_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return (u64)(unsigned long)current->cgroups; +} + +static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + u64 count; + + rcu_read_lock(); + count = atomic_read(&task_css_set(current)->refcount); + rcu_read_unlock(); + return count; +} + +static int current_css_set_cg_links_read(struct seq_file *seq, void *v) +{ + struct cgrp_cset_link *link; + struct css_set *cset; + char *name_buf; + + name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!name_buf) + return -ENOMEM; + + spin_lock_irq(&css_set_lock); + rcu_read_lock(); + cset = rcu_dereference(current->cgroups); + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + struct cgroup *c = link->cgrp; + + cgroup_name(c, name_buf, NAME_MAX + 1); + seq_printf(seq, "Root %d group %s\n", + c->root->hierarchy_id, name_buf); + } + rcu_read_unlock(); + spin_unlock_irq(&css_set_lock); + kfree(name_buf); + return 0; +} + +#define MAX_TASKS_SHOWN_PER_CSS 25 +static int cgroup_css_links_read(struct seq_file *seq, void *v) +{ + struct cgroup_subsys_state *css = seq_css(seq); + struct cgrp_cset_link *link; + + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { + struct css_set *cset = link->cset; + struct task_struct *task; + int count = 0; + + seq_printf(seq, "css_set %p\n", cset); + + list_for_each_entry(task, &cset->tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); + } + + list_for_each_entry(task, &cset->mg_tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); + } + continue; + overflow: + seq_puts(seq, " ...\n"); + } + spin_unlock_irq(&css_set_lock); + return 0; +} + +static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + return (!cgroup_is_populated(css->cgroup) && + !css_has_online_children(&css->cgroup->self)); +} + +static struct cftype debug_files[] = { + { + .name = "taskcount", + .read_u64 = debug_taskcount_read, + }, + + { + .name = "current_css_set", + .read_u64 = current_css_set_read, + }, + + { + .name = "current_css_set_refcount", + .read_u64 = current_css_set_refcount_read, + }, + + { + .name = "current_css_set_cg_links", + .seq_show = current_css_set_cg_links_read, + }, + + { + 
.name = "cgroup_css_links", + .seq_show = cgroup_css_links_read, + }, + + { + .name = "releasable", + .read_u64 = releasable_read, + }, + + { } /* terminate */ +}; + +struct cgroup_subsys debug_cgrp_subsys = { + .css_alloc = debug_css_alloc, + .css_free = debug_css_free, + .legacy_cftypes = debug_files, +}; +#endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1a815f275849..d34c170f87ef 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -28,15 +28,14 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include +#include "cgroup-internal.h" + #include #include #include #include #include -#include #include -#include #include #include #include @@ -47,14 +46,8 @@ #include #include #include -#include -#include -#include -#include #include -#include #include -#include /* TODO: replace with more sophisticated array */ #include #include #include @@ -67,14 +60,6 @@ #define CREATE_TRACE_POINTS #include -/* - * pidlists linger the following amount before being destroyed. The goal - * is avoiding frequent destruction in the middle of consecutive read calls - * Expiring in the middle is a performance problem not a correctness one. - * 1 sec should be enough. - */ -#define CGROUP_PIDLIST_DESTROY_DELAY HZ - #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ MAX_CFTYPE_NAME + 2) @@ -88,14 +73,12 @@ * These locks are exported if CONFIG_PROVE_RCU so that accessors in * cgroup.h can use them for lockdep annotations. */ -#ifdef CONFIG_PROVE_RCU DEFINE_MUTEX(cgroup_mutex); DEFINE_SPINLOCK(css_set_lock); + +#ifdef CONFIG_PROVE_RCU EXPORT_SYMBOL_GPL(cgroup_mutex); EXPORT_SYMBOL_GPL(css_set_lock); -#else -static DEFINE_MUTEX(cgroup_mutex); -static DEFINE_SPINLOCK(css_set_lock); #endif /* @@ -110,12 +93,6 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); */ static DEFINE_SPINLOCK(cgroup_file_kn_lock); -/* - * Protects cgroup_subsys->release_agent_path. Modifying it also requires - * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. - */ -static DEFINE_SPINLOCK(release_agent_path_lock); - struct percpu_rw_semaphore cgroup_threadgroup_rwsem; #define cgroup_assert_mutex_or_rcu_locked() \ @@ -131,15 +108,9 @@ struct percpu_rw_semaphore cgroup_threadgroup_rwsem; */ static struct workqueue_struct *cgroup_destroy_wq; -/* - * pidlist destructions need to be flushed on cgroup destruction. Use a - * separate workqueue as flush domain. 
- */ -static struct workqueue_struct *cgroup_pidlist_destroy_wq; - /* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, -static struct cgroup_subsys *cgroup_subsys[] = { +struct cgroup_subsys *cgroup_subsys[] = { #include }; #undef SUBSYS @@ -186,9 +157,6 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); */ static bool cgrp_dfl_visible; -/* Controllers blocked by the commandline in v1 */ -static u16 cgroup_no_v1_mask; - /* some controllers are not supported in the default hierarchy */ static u16 cgrp_dfl_inhibit_ss_mask; @@ -196,8 +164,7 @@ static u16 cgrp_dfl_inhibit_ss_mask; static unsigned long cgrp_dfl_implicit_ss_mask; /* The list of hierarchy roots */ - -static LIST_HEAD(cgroup_roots); +LIST_HEAD(cgroup_roots); static int cgroup_root_count; /* hierarchy ID allocation and mapping, protected by cgroup_mutex */ @@ -235,10 +202,7 @@ static u16 have_canfork_callback __read_mostly; static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_dfl_base_files[]; -static struct cftype cgroup_legacy_base_files[]; -static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); -static void cgroup_lock_and_drain_offline(struct cgroup *cgrp); static int cgroup_apply_control(struct cgroup *cgrp); static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void css_task_iter_advance(struct css_task_iter *it); @@ -259,7 +223,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, * is fine for individual subsystems but unsuitable for cgroup core. This * is slower static_key_enabled() based test indexed by @ssid. */ -static bool cgroup_ssid_enabled(int ssid) +bool cgroup_ssid_enabled(int ssid) { if (CGROUP_SUBSYS_COUNT == 0) return false; @@ -267,11 +231,6 @@ static bool cgroup_ssid_enabled(int ssid) return static_key_enabled(cgroup_subsys_enabled_key[ssid]); } -static bool cgroup_ssid_no_v1(int ssid) -{ - return cgroup_no_v1_mask & (1 << ssid); -} - /** * cgroup_on_dfl - test whether a cgroup is on the default hierarchy * @cgrp: the cgroup of interest @@ -325,7 +284,7 @@ static bool cgroup_ssid_no_v1(int ssid) * * - debug: disallowed on the default hierarchy. 
*/ -static bool cgroup_on_dfl(const struct cgroup *cgrp) +bool cgroup_on_dfl(const struct cgroup *cgrp) { return cgrp->root == &cgrp_dfl_root; } @@ -481,12 +440,6 @@ out_unlock: return css; } -/* convenient tests for these bits */ -static inline bool cgroup_is_dead(const struct cgroup *cgrp) -{ - return !(cgrp->self.flags & CSS_ONLINE); -} - static void cgroup_get(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); @@ -518,11 +471,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) } EXPORT_SYMBOL_GPL(of_css); -static int notify_on_release(const struct cgroup *cgrp) -{ - return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); -} - /** * for_each_css - iterate all css's of a cgroup * @css: the iteration cursor @@ -552,15 +500,6 @@ static int notify_on_release(const struct cgroup *cgrp) ; \ else -/** - * for_each_subsys - iterate all enabled cgroup subsystems - * @ss: the iteration cursor - * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end - */ -#define for_each_subsys(ss, ssid) \ - for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ - (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) - /** * do_each_subsys_mask - filter for_each_subsys with a bitmask * @ss: the iteration cursor @@ -585,10 +524,6 @@ static int notify_on_release(const struct cgroup *cgrp) } \ } while (false) -/* iterate across the hierarchies */ -#define for_each_root(root) \ - list_for_each_entry((root), &cgroup_roots, root_list) - /* iterate over child cgrps, lock should be held throughout iteration */ #define cgroup_for_each_live_child(child, cgrp) \ list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ @@ -615,29 +550,6 @@ static int notify_on_release(const struct cgroup *cgrp) ; \ else -static void cgroup_release_agent(struct work_struct *work); -static void check_for_release(struct cgroup *cgrp); - -/* - * A cgroup can be associated with multiple css_sets as different tasks may - * belong to different cgroups on different hierarchies. In the other - * direction, a css_set is naturally associated with multiple cgroups. - * This M:N relationship is represented by the following link structure - * which exists for each association and allows traversing the associations - * from both sides. - */ -struct cgrp_cset_link { - /* the cgroup and css_set this link associates */ - struct cgroup *cgrp; - struct css_set *cset; - - /* list of cgrp_cset_links anchored at cgrp->cset_links */ - struct list_head cset_link; - - /* list of cgrp_cset_links anchored at css_set->cgrp_links */ - struct list_head cgrp_link; -}; - /* * The default css_set - used by init and its children prior to any * hierarchies being mounted. It contains a pointer to the root state @@ -1138,7 +1050,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, return cset; } -static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) +struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { struct cgroup *root_cgrp = kf_root->kn->priv; @@ -1283,8 +1195,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, * Return the cgroup for "task" from the given hierarchy. Must be * called with cgroup_mutex and css_set_lock held. 
*/ -static struct cgroup *task_cgroup_from_root(struct task_struct *task, - struct cgroup_root *root) +struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroup_root *root) { /* * No need to lock the task - since we hold cgroup_mutex the @@ -1321,7 +1233,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, */ static struct kernfs_syscall_ops cgroup_kf_syscall_ops; -static const struct file_operations proc_cgroupstats_operations; static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, char *buf) @@ -1415,7 +1326,7 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) * inaccessible any time. If the caller intends to continue to access the * cgroup, it should pin it before invoking this function. */ -static void cgroup_kn_unlock(struct kernfs_node *kn) +void cgroup_kn_unlock(struct kernfs_node *kn) { struct cgroup *cgrp; @@ -1447,8 +1358,7 @@ static void cgroup_kn_unlock(struct kernfs_node *kn) * locking under kernfs active protection and allows all kernfs operations * including self-removal. */ -static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, - bool drain_offline) +struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline) { struct cgroup *cgrp; @@ -1559,7 +1469,7 @@ err: return ret; } -static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) +int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; @@ -1656,8 +1566,7 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, return len; } -static int cgroup_show_options(struct seq_file *seq, - struct kernfs_root *kf_root) +static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_subsys *ss; @@ -2311,7 +2220,7 @@ static void cgroup_kill_sb(struct super_block *sb) kernfs_kill_sb(sb); } -static struct file_system_type cgroup_fs_type = { +struct file_system_type cgroup_fs_type = { .name = "cgroup", .mount = cgroup_mount, .kill_sb = cgroup_kill_sb, @@ -2325,8 +2234,8 @@ static struct file_system_type cgroup2_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; -static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns) +int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) { struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); @@ -2616,7 +2525,7 @@ out_release_tset: * zero for migration destination cgroups with tasks so that child cgroups * don't compete against tasks. */ -static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) +bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) { return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) || !dst_cgrp->subtree_control; @@ -2629,7 +2538,7 @@ static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See * those functions for details. */ -static void cgroup_migrate_finish(struct list_head *preloaded_csets) +void cgroup_migrate_finish(struct list_head *preloaded_csets) { struct css_set *cset, *tmp_cset; @@ -2662,9 +2571,9 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) * into play and the preloaded css_sets are guaranteed to cover all * migrations. 
*/ -static void cgroup_migrate_add_src(struct css_set *src_cset, - struct cgroup *dst_cgrp, - struct list_head *preloaded_csets) +void cgroup_migrate_add_src(struct css_set *src_cset, + struct cgroup *dst_cgrp, + struct list_head *preloaded_csets) { struct cgroup *src_cgrp; @@ -2709,7 +2618,7 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, * using cgroup_migrate(), cgroup_migrate_finish() must be called on * @preloaded_csets. */ -static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) +int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) { LIST_HEAD(csets); struct css_set *src_cset, *tmp_cset; @@ -2773,8 +2682,8 @@ err: * decided for all targets by invoking group_migrate_prepare_dst() before * actually starting migrating. */ -static int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup_root *root) +int cgroup_migrate(struct task_struct *leader, bool threadgroup, + struct cgroup_root *root) { struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); struct task_struct *task; @@ -2806,8 +2715,8 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, * * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. */ -static int cgroup_attach_task(struct cgroup *dst_cgrp, - struct task_struct *leader, bool threadgroup) +int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, + bool threadgroup) { LIST_HEAD(preloaded_csets); struct task_struct *task; @@ -2888,8 +2797,8 @@ static int cgroup_procs_write_permission(struct task_struct *task, * function to attach either it or all tasks in its threadgroup. Will lock * cgroup_mutex and threadgroup. */ -static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off, bool threadgroup) +ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off, bool threadgroup) { struct task_struct *tsk; struct cgroup_subsys *ss; @@ -2950,86 +2859,12 @@ out_unlock_threadgroup: return ret ?: nbytes; } -/** - * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' - * @from: attach to all cgroups of a given task - * @tsk: the task to be attached - */ -int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) -{ - struct cgroup_root *root; - int retval = 0; - - mutex_lock(&cgroup_mutex); - percpu_down_write(&cgroup_threadgroup_rwsem); - for_each_root(root) { - struct cgroup *from_cgrp; - - if (root == &cgrp_dfl_root) - continue; - - spin_lock_irq(&css_set_lock); - from_cgrp = task_cgroup_from_root(from, root); - spin_unlock_irq(&css_set_lock); - - retval = cgroup_attach_task(from_cgrp, tsk, false); - if (retval) - break; - } - percpu_up_write(&cgroup_threadgroup_rwsem); - mutex_unlock(&cgroup_mutex); - - return retval; -} -EXPORT_SYMBOL_GPL(cgroup_attach_task_all); - -static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - return __cgroup_procs_write(of, buf, nbytes, off, false); -} - -static ssize_t cgroup_procs_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) +ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off) { return __cgroup_procs_write(of, buf, nbytes, off, true); } -static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct cgroup *cgrp; - - BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); - - cgrp = cgroup_kn_lock_live(of->kn, false); - if 
(!cgrp) - return -ENODEV; - spin_lock(&release_agent_path_lock); - strlcpy(cgrp->root->release_agent_path, strstrip(buf), - sizeof(cgrp->root->release_agent_path)); - spin_unlock(&release_agent_path_lock); - cgroup_kn_unlock(of->kn); - return nbytes; -} - -static int cgroup_release_agent_show(struct seq_file *seq, void *v) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - - spin_lock(&release_agent_path_lock); - seq_puts(seq, cgrp->root->release_agent_path); - spin_unlock(&release_agent_path_lock); - seq_putc(seq, '\n'); - return 0; -} - -static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) -{ - seq_puts(seq, "0\n"); - return 0; -} - static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) { struct cgroup_subsys *ss; @@ -3131,7 +2966,7 @@ out_finish: * controller while the previous css is still around. This function grabs * cgroup_mutex and drains the previous css instances of @cgrp's subtree. */ -static void cgroup_lock_and_drain_offline(struct cgroup *cgrp) +void cgroup_lock_and_drain_offline(struct cgroup *cgrp) __acquires(&cgroup_mutex) { struct cgroup *dsct; @@ -3610,48 +3445,6 @@ static struct kernfs_ops cgroup_kf_ops = { .seq_show = cgroup_seqfile_show, }; -/* - * cgroup_rename - Only allow simple rename of directories in place. - */ -static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, - const char *new_name_str) -{ - struct cgroup *cgrp = kn->priv; - int ret; - - if (kernfs_type(kn) != KERNFS_DIR) - return -ENOTDIR; - if (kn->parent != new_parent) - return -EIO; - - /* - * This isn't a proper migration and its usefulness is very - * limited. Disallow on the default hierarchy. - */ - if (cgroup_on_dfl(cgrp)) - return -EPERM; - - /* - * We're gonna grab cgroup_mutex which nests outside kernfs - * active_ref. kernfs_rename() doesn't require active_ref - * protection. Break them before grabbing cgroup_mutex. - */ - kernfs_break_active_protection(new_parent); - kernfs_break_active_protection(kn); - - mutex_lock(&cgroup_mutex); - - ret = kernfs_rename(kn, new_parent, new_name_str); - if (!ret) - trace_cgroup_rename(cgrp); - - mutex_unlock(&cgroup_mutex); - - kernfs_unbreak_active_protection(kn); - kernfs_unbreak_active_protection(new_parent); - return ret; -} - /* set uid and gid of cgroup dirs and files to that of the creator */ static int cgroup_kn_set_ugid(struct kernfs_node *kn) { @@ -3947,26 +3740,6 @@ void cgroup_file_notify(struct cgroup_file *cfile) spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } -/** - * cgroup_task_count - count the number of tasks in a cgroup. - * @cgrp: the cgroup in question - * - * Return the number of tasks in the cgroup. The returned number can be - * higher than the actual number of tasks due to css_set references from - * namespace roots and temporary usages. 
- */ -static int cgroup_task_count(const struct cgroup *cgrp) -{ - int count = 0; - struct cgrp_cset_link *link; - - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &cgrp->cset_links, cset_link) - count += atomic_read(&link->cset->refcount); - spin_unlock_irq(&css_set_lock); - return count; -} - /** * css_next_child - find the next child of a given css * @pos: the current position (%NULL to initiate traversal) @@ -4365,70 +4138,6 @@ void css_task_iter_end(struct css_task_iter *it) put_task_struct(it->cur_task); } -/** - * cgroup_trasnsfer_tasks - move tasks from one cgroup to another - * @to: cgroup to which the tasks will be moved - * @from: cgroup in which the tasks currently reside - * - * Locking rules between cgroup_post_fork() and the migration path - * guarantee that, if a task is forking while being migrated, the new child - * is guaranteed to be either visible in the source cgroup after the - * parent's migration is complete or put into the target cgroup. No task - * can slip out of migration through forking. - */ -int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) -{ - LIST_HEAD(preloaded_csets); - struct cgrp_cset_link *link; - struct css_task_iter it; - struct task_struct *task; - int ret; - - if (cgroup_on_dfl(to)) - return -EINVAL; - - if (!cgroup_may_migrate_to(to)) - return -EBUSY; - - mutex_lock(&cgroup_mutex); - - percpu_down_write(&cgroup_threadgroup_rwsem); - - /* all tasks in @from are being moved, all csets are source */ - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &from->cset_links, cset_link) - cgroup_migrate_add_src(link->cset, to, &preloaded_csets); - spin_unlock_irq(&css_set_lock); - - ret = cgroup_migrate_prepare_dst(&preloaded_csets); - if (ret) - goto out_err; - - /* - * Migrate tasks one-by-one until @from is empty. This fails iff - * ->can_attach() fails. - */ - do { - css_task_iter_start(&from->self, &it); - task = css_task_iter_next(&it); - if (task) - get_task_struct(task); - css_task_iter_end(&it); - - if (task) { - ret = cgroup_migrate(task, false, to->root); - if (!ret) - trace_cgroup_transfer_tasks(to, task, false); - put_task_struct(task); - } - } while (task && !ret); -out_err: - cgroup_migrate_finish(&preloaded_csets); - percpu_up_write(&cgroup_threadgroup_rwsem); - mutex_unlock(&cgroup_mutex); - return ret; -} - static void cgroup_procs_release(struct kernfs_open_file *of) { if (of->priv) { @@ -4483,456 +4192,6 @@ static int cgroup_procs_show(struct seq_file *s, void *v) return 0; } -/* - * Stuff for reading the 'tasks'/'procs' files. - * - * Reading this file can return large amounts of data if a cgroup has - * *lots* of attached tasks. So it may need several calls to read(), - * but we cannot guarantee that the information we produce is correct - * unless we produce it entirely atomically. - * - */ - -/* which pidlist file are we talking about? */ -enum cgroup_filetype { - CGROUP_FILE_PROCS, - CGROUP_FILE_TASKS, -}; - -/* - * A pidlist is a list of pids that virtually represents the contents of one - * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, - * a pair (one each for procs, tasks) for each pid namespace that's relevant - * to the cgroup. - */ -struct cgroup_pidlist { - /* - * used to find which pidlist is wanted. doesn't change as long as - * this particular list stays in the list. 
- */ - struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; - /* array of xids */ - pid_t *list; - /* how many elements the above list has */ - int length; - /* each of these stored in a list by its cgroup */ - struct list_head links; - /* pointer to the cgroup we belong to, for list removal purposes */ - struct cgroup *owner; - /* for delayed destruction */ - struct delayed_work destroy_dwork; -}; - -/* - * The following two functions "fix" the issue where there are more pids - * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. - * TODO: replace with a kernel-wide solution to this problem - */ -#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) -static void *pidlist_allocate(int count) -{ - if (PIDLIST_TOO_LARGE(count)) - return vmalloc(count * sizeof(pid_t)); - else - return kmalloc(count * sizeof(pid_t), GFP_KERNEL); -} - -static void pidlist_free(void *p) -{ - kvfree(p); -} - -/* - * Used to destroy all pidlists lingering waiting for destroy timer. None - * should be left afterwards. - */ -static void cgroup_pidlist_destroy_all(struct cgroup *cgrp) -{ - struct cgroup_pidlist *l, *tmp_l; - - mutex_lock(&cgrp->pidlist_mutex); - list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) - mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); - mutex_unlock(&cgrp->pidlist_mutex); - - flush_workqueue(cgroup_pidlist_destroy_wq); - BUG_ON(!list_empty(&cgrp->pidlists)); -} - -static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) -{ - struct delayed_work *dwork = to_delayed_work(work); - struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, - destroy_dwork); - struct cgroup_pidlist *tofree = NULL; - - mutex_lock(&l->owner->pidlist_mutex); - - /* - * Destroy iff we didn't get queued again. The state won't change - * as destroy_dwork can only be queued while locked. - */ - if (!delayed_work_pending(dwork)) { - list_del(&l->links); - pidlist_free(l->list); - put_pid_ns(l->key.ns); - tofree = l; - } - - mutex_unlock(&l->owner->pidlist_mutex); - kfree(tofree); -} - -/* - * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries - * Returns the number of unique elements. - */ -static int pidlist_uniq(pid_t *list, int length) -{ - int src, dest = 1; - - /* - * we presume the 0th element is unique, so i starts at 1. trivial - * edge cases first; no work needs to be done for either - */ - if (length == 0 || length == 1) - return length; - /* src and dest walk down the list; dest counts unique elements */ - for (src = 1; src < length; src++) { - /* find next unique element */ - while (list[src] == list[src-1]) { - src++; - if (src == length) - goto after; - } - /* dest always points to where the next unique element goes */ - list[dest] = list[src]; - dest++; - } -after: - return dest; -} - -/* - * The two pid files - task and cgroup.procs - guaranteed that the result - * is sorted, which forced this whole pidlist fiasco. As pid order is - * different per namespace, each namespace needs differently sorted list, - * making it impossible to use, for example, single rbtree of member tasks - * sorted by task pointer. As pidlists can be fairly large, allocating one - * per open file is dangerous, so cgroup had to implement shared pool of - * pidlists keyed by cgroup and namespace. 
- */ -static int cmppid(const void *a, const void *b) -{ - return *(pid_t *)a - *(pid_t *)b; -} - -static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, - enum cgroup_filetype type) -{ - struct cgroup_pidlist *l; - /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = task_active_pid_ns(current); - - lockdep_assert_held(&cgrp->pidlist_mutex); - - list_for_each_entry(l, &cgrp->pidlists, links) - if (l->key.type == type && l->key.ns == ns) - return l; - return NULL; -} - -/* - * find the appropriate pidlist for our purpose (given procs vs tasks) - * returns with the lock on that pidlist already held, and takes care - * of the use count, or returns NULL with no locks held if we're out of - * memory. - */ -static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, - enum cgroup_filetype type) -{ - struct cgroup_pidlist *l; - - lockdep_assert_held(&cgrp->pidlist_mutex); - - l = cgroup_pidlist_find(cgrp, type); - if (l) - return l; - - /* entry not found; create a new one */ - l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); - if (!l) - return l; - - INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); - l->key.type = type; - /* don't need task_nsproxy() if we're looking at ourself */ - l->key.ns = get_pid_ns(task_active_pid_ns(current)); - l->owner = cgrp; - list_add(&l->links, &cgrp->pidlists); - return l; -} - -/* - * Load a cgroup's pidarray with either procs' tgids or tasks' pids - */ -static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, - struct cgroup_pidlist **lp) -{ - pid_t *array; - int length; - int pid, n = 0; /* used for populating the array */ - struct css_task_iter it; - struct task_struct *tsk; - struct cgroup_pidlist *l; - - lockdep_assert_held(&cgrp->pidlist_mutex); - - /* - * If cgroup gets more users after we read count, we won't have - * enough space - tough. This race is indistinguishable to the - * caller from the case that the additional cgroup users didn't - * show up until sometime later on. - */ - length = cgroup_task_count(cgrp); - array = pidlist_allocate(length); - if (!array) - return -ENOMEM; - /* now, populate the array */ - css_task_iter_start(&cgrp->self, &it); - while ((tsk = css_task_iter_next(&it))) { - if (unlikely(n == length)) - break; - /* get tgid or pid for procs or tasks file respectively */ - if (type == CGROUP_FILE_PROCS) - pid = task_tgid_vnr(tsk); - else - pid = task_pid_vnr(tsk); - if (pid > 0) /* make sure to only use valid results */ - array[n++] = pid; - } - css_task_iter_end(&it); - length = n; - /* now sort & (if procs) strip out duplicates */ - sort(array, length, sizeof(pid_t), cmppid, NULL); - if (type == CGROUP_FILE_PROCS) - length = pidlist_uniq(array, length); - - l = cgroup_pidlist_find_create(cgrp, type); - if (!l) { - pidlist_free(array); - return -ENOMEM; - } - - /* store array, freeing old if necessary */ - pidlist_free(l->list); - l->list = array; - l->length = length; - *lp = l; - return 0; -} - -/** - * cgroupstats_build - build and fill cgroupstats - * @stats: cgroupstats to fill information into - * @dentry: A dentry entry belonging to the cgroup for which stats have - * been requested. - * - * Build and fill cgroupstats so that taskstats can export it to user - * space. 
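pidlist_array_load() above copes with tasks attaching between the count and the fill by re-checking the bound, so latecomers are dropped rather than overflowing the array. The same pattern in a userspace sketch, all names illustrative:

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/types.h>

    /* stand-in for a cgroup whose membership can grow under us */
    struct group {
            const pid_t *members;
            int nr_members;         /* may exceed an earlier snapshot */
    };

    /* fill at most @snapshot entries; extras are dropped, which is the
     * "tough" race the kernel comment shrugs at */
    static int load_array(const struct group *g, int snapshot, pid_t **out)
    {
            pid_t *array = malloc(snapshot * sizeof(*array));
            int i, n = 0;

            if (!array)
                    return -1;
            for (i = 0; i < g->nr_members && n < snapshot; i++)
                    array[n++] = g->members[i];
            *out = array;
            return n;
    }

    int main(void)
    {
            const pid_t tasks[] = { 7, 19, 42 };
            struct group g = { tasks, 3 };  /* grew after the snapshot */
            pid_t *arr = NULL;
            int n = load_array(&g, 2, &arr);        /* snapshot said 2 */

            if (n < 0)
                    return 1;
            printf("kept %d of %d tasks\n", n, g.nr_members); /* 2 of 3 */
            free(arr);
            return 0;
    }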
- */ -int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) -{ - struct kernfs_node *kn = kernfs_node_from_dentry(dentry); - struct cgroup *cgrp; - struct css_task_iter it; - struct task_struct *tsk; - - /* it should be kernfs_node belonging to cgroupfs and is a directory */ - if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || - kernfs_type(kn) != KERNFS_DIR) - return -EINVAL; - - mutex_lock(&cgroup_mutex); - - /* - * We aren't being called from kernfs and there's no guarantee on - * @kn->priv's validity. For this and css_tryget_online_from_dir(), - * @kn->priv is RCU safe. Let's do the RCU dancing. - */ - rcu_read_lock(); - cgrp = rcu_dereference(kn->priv); - if (!cgrp || cgroup_is_dead(cgrp)) { - rcu_read_unlock(); - mutex_unlock(&cgroup_mutex); - return -ENOENT; - } - rcu_read_unlock(); - - css_task_iter_start(&cgrp->self, &it); - while ((tsk = css_task_iter_next(&it))) { - switch (tsk->state) { - case TASK_RUNNING: - stats->nr_running++; - break; - case TASK_INTERRUPTIBLE: - stats->nr_sleeping++; - break; - case TASK_UNINTERRUPTIBLE: - stats->nr_uninterruptible++; - break; - case TASK_STOPPED: - stats->nr_stopped++; - break; - default: - if (delayacct_is_task_waiting_on_io(tsk)) - stats->nr_io_wait++; - break; - } - } - css_task_iter_end(&it); - - mutex_unlock(&cgroup_mutex); - return 0; -} - - -/* - * seq_file methods for the tasks/procs files. The seq_file position is the - * next pid to display; the seq_file iterator is a pointer to the pid - * in the cgroup->l->list array. - */ - -static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) -{ - /* - * Initially we receive a position value that corresponds to - * one more than the last pid shown (or 0 on the first call or - * after a seek to the start). Use a binary-search to find the - * next pid to display, if any - */ - struct kernfs_open_file *of = s->private; - struct cgroup *cgrp = seq_css(s)->cgroup; - struct cgroup_pidlist *l; - enum cgroup_filetype type = seq_cft(s)->private; - int index = 0, pid = *pos; - int *iter, ret; - - mutex_lock(&cgrp->pidlist_mutex); - - /* - * !NULL @of->priv indicates that this isn't the first start() - * after open. If the matching pidlist is around, we can use that. - * Look for it. Note that @of->priv can't be used directly. It - * could already have been destroyed. - */ - if (of->priv) - of->priv = cgroup_pidlist_find(cgrp, type); - - /* - * Either this is the first start() after open or the matching - * pidlist has been destroyed inbetween. Create a new one. 
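The seek semantics described above (the position is one more than the last pid shown) plus the binary search in the hunk below reduce to "find the first pid >= *pos in a sorted array". In isolation, with illustrative names:

    #include <stdio.h>
    #include <sys/types.h>

    /* first index with list[idx] >= pos; may return length (end of file) */
    static int next_index(const pid_t *list, int length, pid_t pos)
    {
            int index = 0, end = length;

            while (index < end) {
                    int mid = (index + end) / 2;

                    if (list[mid] == pos)
                            return mid;
                    else if (list[mid] < pos)
                            index = mid + 1;
                    else
                            end = mid;
            }
            return index;
    }

    int main(void)
    {
            const pid_t list[] = { 7, 19, 42 };
            /* the last read ended at pid 19, so the position is 20;
             * pid 20 doesn't exist, so the read resumes at 42 */
            int idx = next_index(list, 3, 20);

            if (idx < 3)
                    printf("resume at pid %d\n", (int)list[idx]);   /* 42 */
            return 0;
    }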
- */ - if (!of->priv) { - ret = pidlist_array_load(cgrp, type, - (struct cgroup_pidlist **)&of->priv); - if (ret) - return ERR_PTR(ret); - } - l = of->priv; - - if (pid) { - int end = l->length; - - while (index < end) { - int mid = (index + end) / 2; - if (l->list[mid] == pid) { - index = mid; - break; - } else if (l->list[mid] <= pid) - index = mid + 1; - else - end = mid; - } - } - /* If we're off the end of the array, we're done */ - if (index >= l->length) - return NULL; - /* Update the abstract position to be the actual pid that we found */ - iter = l->list + index; - *pos = *iter; - return iter; -} - -static void cgroup_pidlist_stop(struct seq_file *s, void *v) -{ - struct kernfs_open_file *of = s->private; - struct cgroup_pidlist *l = of->priv; - - if (l) - mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, - CGROUP_PIDLIST_DESTROY_DELAY); - mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); -} - -static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) -{ - struct kernfs_open_file *of = s->private; - struct cgroup_pidlist *l = of->priv; - pid_t *p = v; - pid_t *end = l->list + l->length; - /* - * Advance to the next pid in the array. If this goes off the - * end, we're done - */ - p++; - if (p >= end) { - return NULL; - } else { - *pos = *p; - return p; - } -} - -static int cgroup_pidlist_show(struct seq_file *s, void *v) -{ - seq_printf(s, "%d\n", *(int *)v); - - return 0; -} - -static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return notify_on_release(css->cgroup); -} - -static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, - struct cftype *cft, u64 val) -{ - if (val) - set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); - else - clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); - return 0; -} - -static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); -} - -static int cgroup_clone_children_write(struct cgroup_subsys_state *css, - struct cftype *cft, u64 val) -{ - if (val) - set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); - else - clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); - return 0; -} - /* cgroup core interface files for the default hierarchy */ static struct cftype cgroup_dfl_base_files[] = { { @@ -4962,51 +4221,6 @@ static struct cftype cgroup_dfl_base_files[] = { { } /* terminate */ }; -/* cgroup core interface files for the legacy hierarchies */ -static struct cftype cgroup_legacy_base_files[] = { - { - .name = "cgroup.procs", - .seq_start = cgroup_pidlist_start, - .seq_next = cgroup_pidlist_next, - .seq_stop = cgroup_pidlist_stop, - .seq_show = cgroup_pidlist_show, - .private = CGROUP_FILE_PROCS, - .write = cgroup_procs_write, - }, - { - .name = "cgroup.clone_children", - .read_u64 = cgroup_clone_children_read, - .write_u64 = cgroup_clone_children_write, - }, - { - .name = "cgroup.sane_behavior", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cgroup_sane_behavior_show, - }, - { - .name = "tasks", - .seq_start = cgroup_pidlist_start, - .seq_next = cgroup_pidlist_next, - .seq_stop = cgroup_pidlist_stop, - .seq_show = cgroup_pidlist_show, - .private = CGROUP_FILE_TASKS, - .write = cgroup_tasks_write, - }, - { - .name = "notify_on_release", - .read_u64 = cgroup_read_notify_on_release, - .write_u64 = cgroup_write_notify_on_release, - }, - { - .name = "release_agent", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cgroup_release_agent_show, 
- .write = cgroup_release_agent_write, - .max_write_len = PATH_MAX - 1, - }, - { } /* terminate */ -}; - /* * css destruction is four-stage process. * @@ -5792,15 +5006,6 @@ static int __init cgroup_wq_init(void) */ cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); BUG_ON(!cgroup_destroy_wq); - - /* - * Used to destroy pidlists and separate to serve as flush domain. - * Cap @max_active to 1 too. - */ - cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", - 0, 1); - BUG_ON(!cgroup_pidlist_destroy_wq); - return 0; } core_initcall(cgroup_wq_init); @@ -5883,42 +5088,6 @@ out: return retval; } -/* Display information about each subsystem and each hierarchy */ -static int proc_cgroupstats_show(struct seq_file *m, void *v) -{ - struct cgroup_subsys *ss; - int i; - - seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); - /* - * ideally we don't want subsystems moving around while we do this. - * cgroup_mutex is also necessary to guarantee an atomic snapshot of - * subsys/hierarchy state. - */ - mutex_lock(&cgroup_mutex); - - for_each_subsys(ss, i) - seq_printf(m, "%s\t%d\t%d\t%d\n", - ss->legacy_name, ss->root->hierarchy_id, - atomic_read(&ss->root->nr_cgrps), - cgroup_ssid_enabled(i)); - - mutex_unlock(&cgroup_mutex); - return 0; -} - -static int cgroupstats_open(struct inode *inode, struct file *file) -{ - return single_open(file, proc_cgroupstats_show, NULL); -} - -static const struct file_operations proc_cgroupstats_operations = { - .open = cgroupstats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - /** * cgroup_fork - initialize cgroup related fields during copy_process() * @child: pointer to task_struct of forking parent process. @@ -6098,76 +5267,6 @@ void cgroup_free(struct task_struct *task) put_css_set(cset); } -static void check_for_release(struct cgroup *cgrp) -{ - if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && - !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) - schedule_work(&cgrp->release_agent_work); -} - -/* - * Notify userspace when a cgroup is released, by running the - * configured release agent with the name of the cgroup (path - * relative to the root of cgroup file system) as the argument. - * - * Most likely, this user command will try to rmdir this cgroup. - * - * This races with the possibility that some other task will be - * attached to this cgroup before it is removed, or that some other - * user task will 'mkdir' a child cgroup of this cgroup. That's ok. - * The presumed 'rmdir' will fail quietly if this cgroup is no longer - * unused, and this cgroup will be reprieved from its death sentence, - * to continue to serve a useful existence. Next time it's released, - * we will get notified again, if it still has 'notify_on_release' set. - * - * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which - * means only wait until the task is successfully execve()'d. The - * separate release agent task is forked by call_usermodehelper(), - * then control in this thread returns here, without waiting for the - * release agent task. We don't bother to wait because the caller of - * this routine has no use for the exit status of the release agent - * task, so no sense holding our caller up for that. 
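The userspace half of this interface can be as small as the comment suggests: the agent receives the released cgroup's path, relative to the hierarchy root, as its only argument, and presumably removes the directory. A minimal sketch; the mount point is an assumption:

    #include <stdio.h>
    #include <unistd.h>

    /* hypothetical v1 mount point of the hierarchy being watched */
    #define HIERARCHY_ROOT "/sys/fs/cgroup/mygrp"

    int main(int argc, char *argv[])
    {
            char path[4096];

            if (argc < 2)
                    return 1;
            /* argv[1] is e.g. "/child"; prepend the mount point */
            snprintf(path, sizeof(path), HIERARCHY_ROOT "%s", argv[1]);
            /* the presumed rmdir; fails quietly if the group was reused */
            return rmdir(path) ? 1 : 0;
    }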
- */ -static void cgroup_release_agent(struct work_struct *work) -{ - struct cgroup *cgrp = - container_of(work, struct cgroup, release_agent_work); - char *pathbuf = NULL, *agentbuf = NULL; - char *argv[3], *envp[3]; - int ret; - - mutex_lock(&cgroup_mutex); - - pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); - agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); - if (!pathbuf || !agentbuf) - goto out; - - spin_lock_irq(&css_set_lock); - ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); - spin_unlock_irq(&css_set_lock); - if (ret < 0 || ret >= PATH_MAX) - goto out; - - argv[0] = agentbuf; - argv[1] = pathbuf; - argv[2] = NULL; - - /* minimal command environment */ - envp[0] = "HOME=/"; - envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; - envp[2] = NULL; - - mutex_unlock(&cgroup_mutex); - call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - goto out_free; -out: - mutex_unlock(&cgroup_mutex); -out_free: - kfree(agentbuf); - kfree(pathbuf); -} - static int __init cgroup_disable(char *str) { struct cgroup_subsys *ss; @@ -6189,33 +5288,6 @@ static int __init cgroup_disable(char *str) } __setup("cgroup_disable=", cgroup_disable); -static int __init cgroup_no_v1(char *str) -{ - struct cgroup_subsys *ss; - char *token; - int i; - - while ((token = strsep(&str, ",")) != NULL) { - if (!*token) - continue; - - if (!strcmp(token, "all")) { - cgroup_no_v1_mask = U16_MAX; - break; - } - - for_each_subsys(ss, i) { - if (strcmp(token, ss->name) && - strcmp(token, ss->legacy_name)) - continue; - - cgroup_no_v1_mask |= 1 << i; - } - } - return 1; -} -__setup("cgroup_no_v1=", cgroup_no_v1); - /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest @@ -6557,149 +5629,3 @@ void cgroup_bpf_update(struct cgroup *cgrp, mutex_unlock(&cgroup_mutex); } #endif /* CONFIG_CGROUP_BPF */ - -#ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state * -debug_css_alloc(struct cgroup_subsys_state *parent_css) -{ - struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); - - if (!css) - return ERR_PTR(-ENOMEM); - - return css; -} - -static void debug_css_free(struct cgroup_subsys_state *css) -{ - kfree(css); -} - -static u64 debug_taskcount_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return cgroup_task_count(css->cgroup); -} - -static u64 current_css_set_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return (u64)(unsigned long)current->cgroups; -} - -static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - u64 count; - - rcu_read_lock(); - count = atomic_read(&task_css_set(current)->refcount); - rcu_read_unlock(); - return count; -} - -static int current_css_set_cg_links_read(struct seq_file *seq, void *v) -{ - struct cgrp_cset_link *link; - struct css_set *cset; - char *name_buf; - - name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); - if (!name_buf) - return -ENOMEM; - - spin_lock_irq(&css_set_lock); - rcu_read_lock(); - cset = rcu_dereference(current->cgroups); - list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { - struct cgroup *c = link->cgrp; - - cgroup_name(c, name_buf, NAME_MAX + 1); - seq_printf(seq, "Root %d group %s\n", - c->root->hierarchy_id, name_buf); - } - rcu_read_unlock(); - spin_unlock_irq(&css_set_lock); - kfree(name_buf); - return 0; -} - -#define MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct seq_file *seq, void *v) -{ - struct cgroup_subsys_state *css = seq_css(seq); - struct cgrp_cset_link 
*link; - - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { - struct css_set *cset = link->cset; - struct task_struct *task; - int count = 0; - - seq_printf(seq, "css_set %p\n", cset); - - list_for_each_entry(task, &cset->tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) - goto overflow; - seq_printf(seq, " task %d\n", task_pid_vnr(task)); - } - - list_for_each_entry(task, &cset->mg_tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) - goto overflow; - seq_printf(seq, " task %d\n", task_pid_vnr(task)); - } - continue; - overflow: - seq_puts(seq, " ...\n"); - } - spin_unlock_irq(&css_set_lock); - return 0; -} - -static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) -{ - return (!cgroup_is_populated(css->cgroup) && - !css_has_online_children(&css->cgroup->self)); -} - -static struct cftype debug_files[] = { - { - .name = "taskcount", - .read_u64 = debug_taskcount_read, - }, - - { - .name = "current_css_set", - .read_u64 = current_css_set_read, - }, - - { - .name = "current_css_set_refcount", - .read_u64 = current_css_set_refcount_read, - }, - - { - .name = "current_css_set_cg_links", - .seq_show = current_css_set_cg_links_read, - }, - - { - .name = "cgroup_css_links", - .seq_show = cgroup_css_links_read, - }, - - { - .name = "releasable", - .read_u64 = releasable_read, - }, - - { } /* terminate */ -}; - -struct cgroup_subsys debug_cgrp_subsys = { - .css_alloc = debug_css_alloc, - .css_free = debug_css_free, - .legacy_cftypes = debug_files, -}; -#endif /* CONFIG_CGROUP_DEBUG */ -- cgit v1.2.3 From 633feee310de6b6c3191011140b88fe772f560cf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Dec 2016 14:49:07 -0500 Subject: cgroup: refactor mount path and clearly distinguish v1 and v2 paths While sharing some mechanisms, the mount paths of v1 and v2 are substantially different. Their implementations were mixed in cgroup_mount(). This patch splits them out so that they're easier to follow and organize. This patch causes one functional change - the WARN_ON(new_sb) gets lost. This is because the actual mounting gets moved to cgroup_do_mount() and thus @new_sb is no longer accessible by default to cgroup1_mount(). While we can add it as an explicit out parameter to cgroup_do_mount(), this part of code hasn't changed and the warning hasn't triggered for quite a while. Dropping it should be fine. Signed-off-by: Tejun Heo Acked-by: Acked-by: Zefan Li --- kernel/cgroup/cgroup.c | 140 ++++++++++++++++++++++++++++--------------------- 1 file changed, 79 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d34c170f87ef..228dd9ae708b 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1989,48 +1989,55 @@ out: return ret; } -static struct dentry *cgroup_mount(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, - void *data) +static struct dentry *cgroup_do_mount(struct file_system_type *fs_type, + int flags, struct cgroup_root *root, + unsigned long magic, + struct cgroup_namespace *ns) { - bool is_v2 = fs_type == &cgroup2_fs_type; - struct super_block *pinned_sb = NULL; - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; - struct cgroup_subsys *ss; - struct cgroup_root *root; - struct cgroup_sb_opts opts; struct dentry *dentry; - int ret; - int i; bool new_sb; - get_cgroup_ns(ns); - - /* Check if the caller has permission to mount. 
*/ - if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) { - put_cgroup_ns(ns); - return ERR_PTR(-EPERM); - } + dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb); /* - * The first time anyone tries to mount a cgroup, enable the list - * linking each css_set to its tasks and fix up all existing tasks. + * In non-init cgroup namespace, instead of root cgroup's dentry, + * we return the dentry corresponding to the cgroupns->root_cgrp. */ - if (!use_task_css_set_links) - cgroup_enable_task_cg_lists(); + if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { + struct dentry *nsdentry; + struct cgroup *cgrp; - if (is_v2) { - if (data) { - pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); - put_cgroup_ns(ns); - return ERR_PTR(-EINVAL); - } - cgrp_dfl_visible = true; - root = &cgrp_dfl_root; - cgroup_get(&root->cgrp); - goto out_mount; + mutex_lock(&cgroup_mutex); + spin_lock_irq(&css_set_lock); + + cgrp = cset_cgroup_from_root(ns->root_cset, root); + + spin_unlock_irq(&css_set_lock); + mutex_unlock(&cgroup_mutex); + + nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); + dput(dentry); + dentry = nsdentry; } + if (IS_ERR(dentry) || !new_sb) + cgroup_put(&root->cgrp); + + return dentry; +} + +static struct dentry *cgroup1_mount(struct file_system_type *fs_type, + int flags, void *data, + unsigned long magic, + struct cgroup_namespace *ns) +{ + struct super_block *pinned_sb = NULL; + struct cgroup_sb_opts opts; + struct cgroup_root *root; + struct cgroup_subsys *ss; + struct dentry *dentry; + int i, ret; + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* First find the desired set of subsystems */ @@ -2152,47 +2159,58 @@ out_free: kfree(opts.release_agent); kfree(opts.name); - if (ret) { - put_cgroup_ns(ns); + if (ret) return ERR_PTR(ret); - } -out_mount: - dentry = kernfs_mount(fs_type, flags, root->kf_root, - is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC, - &new_sb); + + dentry = cgroup_do_mount(&cgroup_fs_type, flags, root, + CGROUP_SUPER_MAGIC, ns); /* - * In non-init cgroup namespace, instead of root cgroup's - * dentry, we return the dentry corresponding to the - * cgroupns->root_cgrp. + * If @pinned_sb, we're reusing an existing root and holding an + * extra ref on its sb. Mount is complete. Put the extra ref. */ - if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { - struct dentry *nsdentry; - struct cgroup *cgrp; + if (pinned_sb) + deactivate_super(pinned_sb); - mutex_lock(&cgroup_mutex); - spin_lock_irq(&css_set_lock); + return dentry; +} - cgrp = cset_cgroup_from_root(ns->root_cset, root); +static struct dentry *cgroup_mount(struct file_system_type *fs_type, + int flags, const char *unused_dev_name, + void *data) +{ + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; + struct dentry *dentry; - spin_unlock_irq(&css_set_lock); - mutex_unlock(&cgroup_mutex); + get_cgroup_ns(ns); - nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); - dput(dentry); - dentry = nsdentry; + /* Check if the caller has permission to mount. */ + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) { + put_cgroup_ns(ns); + return ERR_PTR(-EPERM); } - if (IS_ERR(dentry) || !new_sb) - cgroup_put(&root->cgrp); - /* - * If @pinned_sb, we're reusing an existing root and holding an - * extra ref on its sb. Mount is complete. Put the extra ref. + * The first time anyone tries to mount a cgroup, enable the list + * linking each css_set to its tasks and fix up all existing tasks. 
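Both of the resulting mount paths are driven from userspace through mount(2) and the two filesystem type names, as the version check below shows. A sketch, assuming the target directories exist; mount points and option strings are illustrative:

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
            /* v2: a non-NULL data string is rejected with -EINVAL
             * ("cgroup2: unknown option"), so pass none */
            if (mount("none", "/sys/fs/cgroup/unified", "cgroup2", 0, NULL))
                    perror("mount cgroup2");

            /* v1: the data string selects subsystems and is handled by
             * cgroup1_mount(); noprefix is only valid together with cpuset */
            if (mount("none", "/sys/fs/cgroup/cpuset", "cgroup", 0,
                      "cpuset,noprefix"))
                    perror("mount cgroup v1");
            return 0;
    }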
*/ - if (pinned_sb) { - WARN_ON(new_sb); - deactivate_super(pinned_sb); + if (!use_task_css_set_links) + cgroup_enable_task_cg_lists(); + + if (fs_type == &cgroup2_fs_type) { + if (data) { + pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); + put_cgroup_ns(ns); + return ERR_PTR(-EINVAL); + } + cgrp_dfl_visible = true; + cgroup_get(&cgrp_dfl_root.cgrp); + + dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, + CGROUP2_SUPER_MAGIC, ns); + } else { + dentry = cgroup1_mount(&cgroup_fs_type, flags, data, + CGROUP_SUPER_MAGIC, ns); } put_cgroup_ns(ns); -- cgit v1.2.3 From fa069904dd38c2d8e121a3c7e37f8daaddb6dafa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Dec 2016 14:49:07 -0500 Subject: cgroup: separate out cgroup1_kf_syscall_ops Currently, cgroup_kf_syscall_ops is shared by v1 and v2 and the specific methods test the version and take different actions. Split out v1 functions and put them in cgroup1_kf_syscall_ops and remove the now unnecessary explicit branches in specific methods. Signed-off-by: Tejun Heo Acked-by: Acked-by: Zefan Li --- kernel/cgroup/cgroup-internal.h | 4 ++-- kernel/cgroup/cgroup-v1.c | 11 ++--------- kernel/cgroup/cgroup.c | 40 ++++++++++++++++++++++++++-------------- 3 files changed, 30 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index dca3193bd9d2..5790e5ff9a0f 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -95,8 +95,8 @@ extern const struct file_operations proc_cgroupstats_operations; bool cgroup_ssid_no_v1(int ssid); void cgroup_pidlist_destroy_all(struct cgroup *cgrp); -int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, - const char *new_name_str); +int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name_str); void cgroup_release_agent(struct work_struct *work); void check_for_release(struct cgroup *cgrp); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 7af745a46f91..0b2c24f0b310 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -800,8 +800,8 @@ out_free: /* * cgroup_rename - Only allow simple rename of directories in place. */ -int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, - const char *new_name_str) +int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name_str) { struct cgroup *cgrp = kn->priv; int ret; @@ -811,13 +811,6 @@ int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, if (kn->parent != new_parent) return -EIO; - /* - * This isn't a proper migration and its usefulness is very - * limited. Disallow on the default hierarchy. - */ - if (cgroup_on_dfl(cgrp)) - return -EPERM; - /* * We're gonna grab cgroup_mutex which nests outside kernfs * active_ref. 
kernfs_rename() doesn't require active_ref diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 228dd9ae708b..de6a2ac41d0b 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1232,6 +1232,7 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task, * update of a tasks cgroup pointer by cgroup_attach_task() */ +static struct kernfs_syscall_ops cgroup1_kf_syscall_ops; static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, @@ -1566,16 +1567,15 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, return len; } -static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) +static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_subsys *ss; int ssid; - if (root != &cgrp_dfl_root) - for_each_subsys(ss, ssid) - if (root->subsys_mask & (1 << ssid)) - seq_show_option(seq, ss->legacy_name, NULL); + for_each_subsys(ss, ssid) + if (root->subsys_mask & (1 << ssid)) + seq_show_option(seq, ss->legacy_name, NULL); if (root->flags & CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) @@ -1736,18 +1736,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return 0; } -static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) +static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) { int ret = 0; struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_sb_opts opts; u16 added_mask, removed_mask; - if (root == &cgrp_dfl_root) { - pr_err("remount is not allowed\n"); - return -EINVAL; - } - cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* See what subsystems are wanted */ @@ -1798,6 +1793,12 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) return ret; } +static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) +{ + pr_err("remount is not allowed\n"); + return -EINVAL; +} + /* * To reduce the fork() overhead for systems that are not actually using * their cgroups capability, we don't maintain the lists running through @@ -1900,6 +1901,7 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; + struct kernfs_syscall_ops *kf_sops; struct css_set *cset; int i, ret; @@ -1931,7 +1933,10 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto cancel_ref; - root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, + kf_sops = root == &cgrp_dfl_root ? 
+ &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops; + + root->kf_root = kernfs_create_root(kf_sops, KERNFS_ROOT_CREATE_DEACTIVATED, root_cgrp); if (IS_ERR(root->kf_root)) { @@ -4813,12 +4818,19 @@ static int cgroup_rmdir(struct kernfs_node *kn) return ret; } +static struct kernfs_syscall_ops cgroup1_kf_syscall_ops = { + .remount_fs = cgroup1_remount, + .show_options = cgroup1_show_options, + .rename = cgroup1_rename, + .mkdir = cgroup_mkdir, + .rmdir = cgroup_rmdir, + .show_path = cgroup_show_path, +}; + static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .remount_fs = cgroup_remount, - .show_options = cgroup_show_options, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, - .rename = cgroup_rename, .show_path = cgroup_show_path, }; -- cgit v1.2.3 From 1592c9b223749d59b933ebbfe37f1a8833d7a6cf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Dec 2016 14:49:08 -0500 Subject: cgroup: move v1 mount functions to kernel/cgroup/cgroup-v1.c Now that the v1 mount code is split into separate functions, move them to kernel/cgroup/cgroup-v1.c along with the mount option handling code. As this puts all v1-only kernfs_syscall_ops in cgroup-v1.c, move cgroup1_kf_syscall_ops to cgroup-v1.c too. Signed-off-by: Tejun Heo Acked-by: Acked-by: Zefan Li --- kernel/cgroup/cgroup-internal.h | 28 ++- kernel/cgroup/cgroup-v1.c | 381 ++++++++++++++++++++++++++++++++++++- kernel/cgroup/cgroup.c | 410 +--------------------------------------- 3 files changed, 413 insertions(+), 406 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 5790e5ff9a0f..710edeeb1f9f 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -26,6 +26,16 @@ struct cgrp_cset_link { struct list_head cgrp_link; }; +struct cgroup_sb_opts { + u16 subsys_mask; + unsigned int flags; + char *release_agent; + bool cpuset_clone_children; + char *name; + /* User explicitly requested empty subsystem */ + bool none; +}; + extern struct mutex cgroup_mutex; extern spinlock_t css_set_lock; extern struct cgroup_subsys *cgroup_subsys[]; @@ -66,7 +76,13 @@ void cgroup_kn_unlock(struct kernfs_node *kn); int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns); +void cgroup_free_root(struct cgroup_root *root); +void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts); +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); +struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, + struct cgroup_root *root, unsigned long magic, + struct cgroup_namespace *ns); bool cgroup_may_migrate_to(struct cgroup *dst_cgrp); void cgroup_migrate_finish(struct list_head *preloaded_csets); @@ -86,18 +102,24 @@ ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes void cgroup_lock_and_drain_offline(struct cgroup *cgrp); +int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode); +int cgroup_rmdir(struct kernfs_node *kn); +int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, + struct kernfs_root *kf_root); + /* * cgroup-v1.c */ -extern spinlock_t release_agent_path_lock; extern struct cftype cgroup_legacy_base_files[]; extern const struct file_operations proc_cgroupstats_operations; +extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops; bool cgroup_ssid_no_v1(int ssid); void cgroup_pidlist_destroy_all(struct cgroup *cgrp); -int cgroup1_rename(struct 
kernfs_node *kn, struct kernfs_node *new_parent, - const char *new_name_str); void cgroup_release_agent(struct work_struct *work); void check_for_release(struct cgroup *cgrp); +struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, + void *data, unsigned long magic, + struct cgroup_namespace *ns); #endif /* __CGROUP_INTERNAL_H */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 0b2c24f0b310..ae240c0d33cb 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1,7 +1,9 @@ #include "cgroup-internal.h" +#include #include #include +#include #include #include #include @@ -32,7 +34,7 @@ static struct workqueue_struct *cgroup_pidlist_destroy_wq; * Protects cgroup_subsys->release_agent_path. Modifying it also requires * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. */ -DEFINE_SPINLOCK(release_agent_path_lock); +static DEFINE_SPINLOCK(release_agent_path_lock); bool cgroup_ssid_no_v1(int ssid) { @@ -800,8 +802,8 @@ out_free: /* * cgroup_rename - Only allow simple rename of directories in place. */ -int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, - const char *new_name_str) +static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name_str) { struct cgroup *cgrp = kn->priv; int ret; @@ -832,6 +834,379 @@ int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, return ret; } +static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root) +{ + struct cgroup_root *root = cgroup_root_from_kf(kf_root); + struct cgroup_subsys *ss; + int ssid; + + for_each_subsys(ss, ssid) + if (root->subsys_mask & (1 << ssid)) + seq_show_option(seq, ss->legacy_name, NULL); + if (root->flags & CGRP_ROOT_NOPREFIX) + seq_puts(seq, ",noprefix"); + if (root->flags & CGRP_ROOT_XATTR) + seq_puts(seq, ",xattr"); + + spin_lock(&release_agent_path_lock); + if (strlen(root->release_agent_path)) + seq_show_option(seq, "release_agent", + root->release_agent_path); + spin_unlock(&release_agent_path_lock); + + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) + seq_puts(seq, ",clone_children"); + if (strlen(root->name)) + seq_show_option(seq, "name", root->name); + return 0; +} + +static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) +{ + char *token, *o = data; + bool all_ss = false, one_ss = false; + u16 mask = U16_MAX; + struct cgroup_subsys *ss; + int nr_opts = 0; + int i; + +#ifdef CONFIG_CPUSETS + mask = ~((u16)1 << cpuset_cgrp_id); +#endif + + memset(opts, 0, sizeof(*opts)); + + while ((token = strsep(&o, ",")) != NULL) { + nr_opts++; + + if (!*token) + return -EINVAL; + if (!strcmp(token, "none")) { + /* Explicitly have no subsystems */ + opts->none = true; + continue; + } + if (!strcmp(token, "all")) { + /* Mutually exclusive option 'all' + subsystem name */ + if (one_ss) + return -EINVAL; + all_ss = true; + continue; + } + if (!strcmp(token, "noprefix")) { + opts->flags |= CGRP_ROOT_NOPREFIX; + continue; + } + if (!strcmp(token, "clone_children")) { + opts->cpuset_clone_children = true; + continue; + } + if (!strcmp(token, "xattr")) { + opts->flags |= CGRP_ROOT_XATTR; + continue; + } + if (!strncmp(token, "release_agent=", 14)) { + /* Specifying two release agents is forbidden */ + if (opts->release_agent) + return -EINVAL; + opts->release_agent = + kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); + if (!opts->release_agent) + return -ENOMEM; + continue; + } + if (!strncmp(token, "name=", 5)) { + const char 
*name = token + 5; + /* Can't specify an empty name */ + if (!strlen(name)) + return -EINVAL; + /* Must match [\w.-]+ */ + for (i = 0; i < strlen(name); i++) { + char c = name[i]; + if (isalnum(c)) + continue; + if ((c == '.') || (c == '-') || (c == '_')) + continue; + return -EINVAL; + } + /* Specifying two names is forbidden */ + if (opts->name) + return -EINVAL; + opts->name = kstrndup(name, + MAX_CGROUP_ROOT_NAMELEN - 1, + GFP_KERNEL); + if (!opts->name) + return -ENOMEM; + + continue; + } + + for_each_subsys(ss, i) { + if (strcmp(token, ss->legacy_name)) + continue; + if (!cgroup_ssid_enabled(i)) + continue; + if (cgroup_ssid_no_v1(i)) + continue; + + /* Mutually exclusive option 'all' + subsystem name */ + if (all_ss) + return -EINVAL; + opts->subsys_mask |= (1 << i); + one_ss = true; + + break; + } + if (i == CGROUP_SUBSYS_COUNT) + return -ENOENT; + } + + /* + * If the 'all' option was specified select all the subsystems, + * otherwise if 'none', 'name=' and a subsystem name options were + * not specified, let's default to 'all' + */ + if (all_ss || (!one_ss && !opts->none && !opts->name)) + for_each_subsys(ss, i) + if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i)) + opts->subsys_mask |= (1 << i); + + /* + * We either have to specify by name or by subsystems. (So all + * empty hierarchies must have a name). + */ + if (!opts->subsys_mask && !opts->name) + return -EINVAL; + + /* + * Option noprefix was introduced just for backward compatibility + * with the old cpuset, so we allow noprefix only if mounting just + * the cpuset subsystem. + */ + if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) + return -EINVAL; + + /* Can't specify "none" and some subsystems */ + if (opts->subsys_mask && opts->none) + return -EINVAL; + + return 0; +} + +static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) +{ + int ret = 0; + struct cgroup_root *root = cgroup_root_from_kf(kf_root); + struct cgroup_sb_opts opts; + u16 added_mask, removed_mask; + + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); + + /* See what subsystems are wanted */ + ret = parse_cgroupfs_options(data, &opts); + if (ret) + goto out_unlock; + + if (opts.subsys_mask != root->subsys_mask || opts.release_agent) + pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", + task_tgid_nr(current), current->comm); + + added_mask = opts.subsys_mask & ~root->subsys_mask; + removed_mask = root->subsys_mask & ~opts.subsys_mask; + + /* Don't allow flags or name to change at remount */ + if ((opts.flags ^ root->flags) || + (opts.name && strcmp(opts.name, root->name))) { + pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", + opts.flags, opts.name ?: "", root->flags, root->name); + ret = -EINVAL; + goto out_unlock; + } + + /* remounting is not allowed for populated hierarchies */ + if (!list_empty(&root->cgrp.self.children)) { + ret = -EBUSY; + goto out_unlock; + } + + ret = rebind_subsystems(root, added_mask); + if (ret) + goto out_unlock; + + WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); + + if (opts.release_agent) { + spin_lock(&release_agent_path_lock); + strcpy(root->release_agent_path, opts.release_agent); + spin_unlock(&release_agent_path_lock); + } + + trace_cgroup_remount(root); + + out_unlock: + kfree(opts.release_agent); + kfree(opts.name); + mutex_unlock(&cgroup_mutex); + return ret; +} + +struct kernfs_syscall_ops cgroup1_kf_syscall_ops = { + .rename = cgroup1_rename, + .show_options = cgroup1_show_options, + .remount_fs = cgroup1_remount, 
+ .mkdir = cgroup_mkdir, + .rmdir = cgroup_rmdir, + .show_path = cgroup_show_path, +}; + +struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, + void *data, unsigned long magic, + struct cgroup_namespace *ns) +{ + struct super_block *pinned_sb = NULL; + struct cgroup_sb_opts opts; + struct cgroup_root *root; + struct cgroup_subsys *ss; + struct dentry *dentry; + int i, ret; + + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); + + /* First find the desired set of subsystems */ + ret = parse_cgroupfs_options(data, &opts); + if (ret) + goto out_unlock; + + /* + * Destruction of cgroup root is asynchronous, so subsystems may + * still be dying after the previous unmount. Let's drain the + * dying subsystems. We just need to ensure that the ones + * unmounted previously finish dying and don't care about new ones + * starting. Testing ref liveliness is good enough. + */ + for_each_subsys(ss, i) { + if (!(opts.subsys_mask & (1 << i)) || + ss->root == &cgrp_dfl_root) + continue; + + if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { + mutex_unlock(&cgroup_mutex); + msleep(10); + ret = restart_syscall(); + goto out_free; + } + cgroup_put(&ss->root->cgrp); + } + + for_each_root(root) { + bool name_match = false; + + if (root == &cgrp_dfl_root) + continue; + + /* + * If we asked for a name then it must match. Also, if + * name matches but sybsys_mask doesn't, we should fail. + * Remember whether name matched. + */ + if (opts.name) { + if (strcmp(opts.name, root->name)) + continue; + name_match = true; + } + + /* + * If we asked for subsystems (or explicitly for no + * subsystems) then they must match. + */ + if ((opts.subsys_mask || opts.none) && + (opts.subsys_mask != root->subsys_mask)) { + if (!name_match) + continue; + ret = -EBUSY; + goto out_unlock; + } + + if (root->flags ^ opts.flags) + pr_warn("new mount options do not match the existing superblock, will be ignored\n"); + + /* + * We want to reuse @root whose lifetime is governed by its + * ->cgrp. Let's check whether @root is alive and keep it + * that way. As cgroup_kill_sb() can happen anytime, we + * want to block it by pinning the sb so that @root doesn't + * get killed before mount is complete. + * + * With the sb pinned, tryget_live can reliably indicate + * whether @root can be reused. If it's being killed, + * drain it. We can use wait_queue for the wait but this + * path is super cold. Let's just sleep a bit and retry. + */ + pinned_sb = kernfs_pin_sb(root->kf_root, NULL); + if (IS_ERR(pinned_sb) || + !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { + mutex_unlock(&cgroup_mutex); + if (!IS_ERR_OR_NULL(pinned_sb)) + deactivate_super(pinned_sb); + msleep(10); + ret = restart_syscall(); + goto out_free; + } + + ret = 0; + goto out_unlock; + } + + /* + * No such thing, create a new one. name= matching without subsys + * specification is allowed for already existing hierarchies but we + * can't create new one without subsys specification. + */ + if (!opts.subsys_mask && !opts.none) { + ret = -EINVAL; + goto out_unlock; + } + + /* Hierarchies may only be created in the initial cgroup namespace. 
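parse_cgroupfs_options() above defines a small grammar for the v1 data string; a few example strings and the outcome the code implies, assuming CONFIG_CPUSETS and the named subsystems enabled:

    #include <stdio.h>

    /* expected results of parse_cgroupfs_options() per the code above */
    static const struct {
            const char *data;
            const char *result;
    } examples[] = {
            { "cpu,cpuacct",     "OK: explicit subsystem list" },
            { "all",             "OK: every enabled, non-no_v1 subsystem" },
            { "none,name=mygrp", "OK: named hierarchy, no controllers" },
            { "all,cpu",         "-EINVAL: 'all' excludes explicit names" },
            { "name=",           "-EINVAL: empty name" },
            { "noprefix,cpu",    "-EINVAL: noprefix is cpuset-only" },
            { "frobnicate",      "-ENOENT: unknown token" },
    };

    int main(void)
    {
            unsigned int i;

            for (i = 0; i < sizeof(examples) / sizeof(examples[0]); i++)
                    printf("%-20s %s\n", examples[i].data, examples[i].result);
            return 0;
    }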
*/ + if (ns != &init_cgroup_ns) { + ret = -EPERM; + goto out_unlock; + } + + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) { + ret = -ENOMEM; + goto out_unlock; + } + + init_cgroup_root(root, &opts); + + ret = cgroup_setup_root(root, opts.subsys_mask); + if (ret) + cgroup_free_root(root); + +out_unlock: + mutex_unlock(&cgroup_mutex); +out_free: + kfree(opts.release_agent); + kfree(opts.name); + + if (ret) + return ERR_PTR(ret); + + dentry = cgroup_do_mount(&cgroup_fs_type, flags, root, + CGROUP_SUPER_MAGIC, ns); + + /* + * If @pinned_sb, we're reusing an existing root and holding an + * extra ref on its sb. Mount is complete. Put the extra ref. + */ + if (pinned_sb) + deactivate_super(pinned_sb); + + return dentry; +} + static int __init cgroup1_wq_init(void) { /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index de6a2ac41d0b..4be306510aff 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -31,7 +31,6 @@ #include "cgroup-internal.h" #include -#include #include #include #include @@ -49,7 +48,6 @@ #include #include #include -#include #include #include #include @@ -1078,7 +1076,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); } -static void cgroup_free_root(struct cgroup_root *root) +void cgroup_free_root(struct cgroup_root *root) { if (root) { idr_destroy(&root->cgroup_idr); @@ -1232,7 +1230,6 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task, * update of a tasks cgroup pointer by cgroup_attach_task() */ -static struct kernfs_syscall_ops cgroup1_kf_syscall_ops; static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, @@ -1540,8 +1537,8 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) return 0; } -static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, - struct kernfs_root *kf_root) +int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, + struct kernfs_root *kf_root) { int len = 0; char *buf = NULL; @@ -1567,232 +1564,6 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, return len; } -static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root) -{ - struct cgroup_root *root = cgroup_root_from_kf(kf_root); - struct cgroup_subsys *ss; - int ssid; - - for_each_subsys(ss, ssid) - if (root->subsys_mask & (1 << ssid)) - seq_show_option(seq, ss->legacy_name, NULL); - if (root->flags & CGRP_ROOT_NOPREFIX) - seq_puts(seq, ",noprefix"); - if (root->flags & CGRP_ROOT_XATTR) - seq_puts(seq, ",xattr"); - - spin_lock(&release_agent_path_lock); - if (strlen(root->release_agent_path)) - seq_show_option(seq, "release_agent", - root->release_agent_path); - spin_unlock(&release_agent_path_lock); - - if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) - seq_puts(seq, ",clone_children"); - if (strlen(root->name)) - seq_show_option(seq, "name", root->name); - return 0; -} - -struct cgroup_sb_opts { - u16 subsys_mask; - unsigned int flags; - char *release_agent; - bool cpuset_clone_children; - char *name; - /* User explicitly requested empty subsystem */ - bool none; -}; - -static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) -{ - char *token, *o = data; - bool all_ss = false, one_ss = false; - u16 mask = U16_MAX; - struct cgroup_subsys *ss; - int nr_opts = 0; - int i; - -#ifdef CONFIG_CPUSETS - mask = ~((u16)1 << cpuset_cgrp_id); -#endif - - memset(opts, 0, sizeof(*opts)); - 
- while ((token = strsep(&o, ",")) != NULL) { - nr_opts++; - - if (!*token) - return -EINVAL; - if (!strcmp(token, "none")) { - /* Explicitly have no subsystems */ - opts->none = true; - continue; - } - if (!strcmp(token, "all")) { - /* Mutually exclusive option 'all' + subsystem name */ - if (one_ss) - return -EINVAL; - all_ss = true; - continue; - } - if (!strcmp(token, "noprefix")) { - opts->flags |= CGRP_ROOT_NOPREFIX; - continue; - } - if (!strcmp(token, "clone_children")) { - opts->cpuset_clone_children = true; - continue; - } - if (!strcmp(token, "xattr")) { - opts->flags |= CGRP_ROOT_XATTR; - continue; - } - if (!strncmp(token, "release_agent=", 14)) { - /* Specifying two release agents is forbidden */ - if (opts->release_agent) - return -EINVAL; - opts->release_agent = - kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); - if (!opts->release_agent) - return -ENOMEM; - continue; - } - if (!strncmp(token, "name=", 5)) { - const char *name = token + 5; - /* Can't specify an empty name */ - if (!strlen(name)) - return -EINVAL; - /* Must match [\w.-]+ */ - for (i = 0; i < strlen(name); i++) { - char c = name[i]; - if (isalnum(c)) - continue; - if ((c == '.') || (c == '-') || (c == '_')) - continue; - return -EINVAL; - } - /* Specifying two names is forbidden */ - if (opts->name) - return -EINVAL; - opts->name = kstrndup(name, - MAX_CGROUP_ROOT_NAMELEN - 1, - GFP_KERNEL); - if (!opts->name) - return -ENOMEM; - - continue; - } - - for_each_subsys(ss, i) { - if (strcmp(token, ss->legacy_name)) - continue; - if (!cgroup_ssid_enabled(i)) - continue; - if (cgroup_ssid_no_v1(i)) - continue; - - /* Mutually exclusive option 'all' + subsystem name */ - if (all_ss) - return -EINVAL; - opts->subsys_mask |= (1 << i); - one_ss = true; - - break; - } - if (i == CGROUP_SUBSYS_COUNT) - return -ENOENT; - } - - /* - * If the 'all' option was specified select all the subsystems, - * otherwise if 'none', 'name=' and a subsystem name options were - * not specified, let's default to 'all' - */ - if (all_ss || (!one_ss && !opts->none && !opts->name)) - for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i)) - opts->subsys_mask |= (1 << i); - - /* - * We either have to specify by name or by subsystems. (So all - * empty hierarchies must have a name). - */ - if (!opts->subsys_mask && !opts->name) - return -EINVAL; - - /* - * Option noprefix was introduced just for backward compatibility - * with the old cpuset, so we allow noprefix only if mounting just - * the cpuset subsystem. 
- */ - if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) - return -EINVAL; - - /* Can't specify "none" and some subsystems */ - if (opts->subsys_mask && opts->none) - return -EINVAL; - - return 0; -} - -static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) -{ - int ret = 0; - struct cgroup_root *root = cgroup_root_from_kf(kf_root); - struct cgroup_sb_opts opts; - u16 added_mask, removed_mask; - - cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); - - /* See what subsystems are wanted */ - ret = parse_cgroupfs_options(data, &opts); - if (ret) - goto out_unlock; - - if (opts.subsys_mask != root->subsys_mask || opts.release_agent) - pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", - task_tgid_nr(current), current->comm); - - added_mask = opts.subsys_mask & ~root->subsys_mask; - removed_mask = root->subsys_mask & ~opts.subsys_mask; - - /* Don't allow flags or name to change at remount */ - if ((opts.flags ^ root->flags) || - (opts.name && strcmp(opts.name, root->name))) { - pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", - opts.flags, opts.name ?: "", root->flags, root->name); - ret = -EINVAL; - goto out_unlock; - } - - /* remounting is not allowed for populated hierarchies */ - if (!list_empty(&root->cgrp.self.children)) { - ret = -EBUSY; - goto out_unlock; - } - - ret = rebind_subsystems(root, added_mask); - if (ret) - goto out_unlock; - - WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); - - if (opts.release_agent) { - spin_lock(&release_agent_path_lock); - strcpy(root->release_agent_path, opts.release_agent); - spin_unlock(&release_agent_path_lock); - } - - trace_cgroup_remount(root); - - out_unlock: - kfree(opts.release_agent); - kfree(opts.name); - mutex_unlock(&cgroup_mutex); - return ret; -} - static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) { pr_err("remount is not allowed\n"); @@ -1877,8 +1648,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent); } -static void init_cgroup_root(struct cgroup_root *root, - struct cgroup_sb_opts *opts) +void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) { struct cgroup *cgrp = &root->cgrp; @@ -1897,7 +1667,7 @@ static void init_cgroup_root(struct cgroup_root *root, set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; @@ -1994,10 +1764,9 @@ out: return ret; } -static struct dentry *cgroup_do_mount(struct file_system_type *fs_type, - int flags, struct cgroup_root *root, - unsigned long magic, - struct cgroup_namespace *ns) +struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, + struct cgroup_root *root, unsigned long magic, + struct cgroup_namespace *ns) { struct dentry *dentry; bool new_sb; @@ -2031,155 +1800,6 @@ static struct dentry *cgroup_do_mount(struct file_system_type *fs_type, return dentry; } -static struct dentry *cgroup1_mount(struct file_system_type *fs_type, - int flags, void *data, - unsigned long magic, - struct cgroup_namespace *ns) -{ - struct super_block *pinned_sb = NULL; - struct cgroup_sb_opts opts; - struct cgroup_root *root; - struct cgroup_subsys *ss; - struct dentry *dentry; - int i, ret; - - cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); - - /* First find the desired set of 
subsystems */ - ret = parse_cgroupfs_options(data, &opts); - if (ret) - goto out_unlock; - - /* - * Destruction of cgroup root is asynchronous, so subsystems may - * still be dying after the previous unmount. Let's drain the - * dying subsystems. We just need to ensure that the ones - * unmounted previously finish dying and don't care about new ones - * starting. Testing ref liveliness is good enough. - */ - for_each_subsys(ss, i) { - if (!(opts.subsys_mask & (1 << i)) || - ss->root == &cgrp_dfl_root) - continue; - - if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - msleep(10); - ret = restart_syscall(); - goto out_free; - } - cgroup_put(&ss->root->cgrp); - } - - for_each_root(root) { - bool name_match = false; - - if (root == &cgrp_dfl_root) - continue; - - /* - * If we asked for a name then it must match. Also, if - * name matches but sybsys_mask doesn't, we should fail. - * Remember whether name matched. - */ - if (opts.name) { - if (strcmp(opts.name, root->name)) - continue; - name_match = true; - } - - /* - * If we asked for subsystems (or explicitly for no - * subsystems) then they must match. - */ - if ((opts.subsys_mask || opts.none) && - (opts.subsys_mask != root->subsys_mask)) { - if (!name_match) - continue; - ret = -EBUSY; - goto out_unlock; - } - - if (root->flags ^ opts.flags) - pr_warn("new mount options do not match the existing superblock, will be ignored\n"); - - /* - * We want to reuse @root whose lifetime is governed by its - * ->cgrp. Let's check whether @root is alive and keep it - * that way. As cgroup_kill_sb() can happen anytime, we - * want to block it by pinning the sb so that @root doesn't - * get killed before mount is complete. - * - * With the sb pinned, tryget_live can reliably indicate - * whether @root can be reused. If it's being killed, - * drain it. We can use wait_queue for the wait but this - * path is super cold. Let's just sleep a bit and retry. - */ - pinned_sb = kernfs_pin_sb(root->kf_root, NULL); - if (IS_ERR(pinned_sb) || - !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - if (!IS_ERR_OR_NULL(pinned_sb)) - deactivate_super(pinned_sb); - msleep(10); - ret = restart_syscall(); - goto out_free; - } - - ret = 0; - goto out_unlock; - } - - /* - * No such thing, create a new one. name= matching without subsys - * specification is allowed for already existing hierarchies but we - * can't create new one without subsys specification. - */ - if (!opts.subsys_mask && !opts.none) { - ret = -EINVAL; - goto out_unlock; - } - - /* Hierarchies may only be created in the initial cgroup namespace. */ - if (ns != &init_cgroup_ns) { - ret = -EPERM; - goto out_unlock; - } - - root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) { - ret = -ENOMEM; - goto out_unlock; - } - - init_cgroup_root(root, &opts); - - ret = cgroup_setup_root(root, opts.subsys_mask); - if (ret) - cgroup_free_root(root); - -out_unlock: - mutex_unlock(&cgroup_mutex); -out_free: - kfree(opts.release_agent); - kfree(opts.name); - - if (ret) - return ERR_PTR(ret); - - dentry = cgroup_do_mount(&cgroup_fs_type, flags, root, - CGROUP_SUPER_MAGIC, ns); - - /* - * If @pinned_sb, we're reusing an existing root and holding an - * extra ref on its sb. Mount is complete. Put the extra ref. 
- */ - if (pinned_sb) - deactivate_super(pinned_sb); - - return dentry; -} - static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) @@ -4587,8 +4207,7 @@ out_destroy: return ERR_PTR(ret); } -static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) +int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { struct cgroup *parent, *cgrp; struct kernfs_node *kn; @@ -4800,7 +4419,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return 0; }; -static int cgroup_rmdir(struct kernfs_node *kn) +int cgroup_rmdir(struct kernfs_node *kn) { struct cgroup *cgrp; int ret = 0; @@ -4818,15 +4437,6 @@ static int cgroup_rmdir(struct kernfs_node *kn) return ret; } -static struct kernfs_syscall_ops cgroup1_kf_syscall_ops = { - .remount_fs = cgroup1_remount, - .show_options = cgroup1_show_options, - .rename = cgroup1_rename, - .mkdir = cgroup_mkdir, - .rmdir = cgroup_rmdir, - .show_path = cgroup_show_path, -}; - static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .remount_fs = cgroup_remount, .mkdir = cgroup_mkdir, -- cgit v1.2.3 From d62beb7f3dc6b45f9b9d381897e05fe8ba286d8a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Dec 2016 14:49:08 -0500 Subject: cgroup: rename functions for consistency Now that v1 functions are separated out, rename some functions for consistency. cgroup_dfl_base_files -> cgroup_base_files cgroup_legacy_base_files -> cgroup1_base_files cgroup_ssid_no_v1() -> cgroup1_ssid_disabled() cgroup_pidlist_destroy_all -> cgroup1_pidlist_destroy_all() cgroup_release_agent() -> cgroup1_release_agent() check_for_release() -> cgroup1_check_for_release() Signed-off-by: Tejun Heo Acked-by: Acked-by: Zefan Li --- kernel/cgroup/cgroup-internal.h | 10 +++++----- kernel/cgroup/cgroup-v1.c | 14 +++++++------- kernel/cgroup/cgroup.c | 22 +++++++++++----------- 3 files changed, 23 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 710edeeb1f9f..a890c92cb688 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -110,14 +110,14 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, /* * cgroup-v1.c */ -extern struct cftype cgroup_legacy_base_files[]; +extern struct cftype cgroup1_base_files[]; extern const struct file_operations proc_cgroupstats_operations; extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops; -bool cgroup_ssid_no_v1(int ssid); -void cgroup_pidlist_destroy_all(struct cgroup *cgrp); -void cgroup_release_agent(struct work_struct *work); -void check_for_release(struct cgroup *cgrp); +bool cgroup1_ssid_disabled(int ssid); +void cgroup1_pidlist_destroy_all(struct cgroup *cgrp); +void cgroup1_release_agent(struct work_struct *work); +void cgroup1_check_for_release(struct cgroup *cgrp); struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, void *data, unsigned long magic, struct cgroup_namespace *ns); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index ae240c0d33cb..37be09e09740 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -36,7 +36,7 @@ static struct workqueue_struct *cgroup_pidlist_destroy_wq; */ static DEFINE_SPINLOCK(release_agent_path_lock); -bool cgroup_ssid_no_v1(int ssid) +bool cgroup1_ssid_disabled(int ssid) { return cgroup_no_v1_mask & (1 << ssid); } @@ -201,7 +201,7 @@ static void pidlist_free(void *p) * Used to destroy all pidlists 
lingering waiting for destroy timer. None * should be left afterwards. */ -void cgroup_pidlist_destroy_all(struct cgroup *cgrp) +void cgroup1_pidlist_destroy_all(struct cgroup *cgrp) { struct cgroup_pidlist *l, *tmp_l; @@ -585,7 +585,7 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css, } /* cgroup core interface files for the legacy hierarchies */ -struct cftype cgroup_legacy_base_files[] = { +struct cftype cgroup1_base_files[] = { { .name = "cgroup.procs", .seq_start = cgroup_pidlist_start, @@ -729,7 +729,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) return 0; } -void check_for_release(struct cgroup *cgrp) +void cgroup1_check_for_release(struct cgroup *cgrp) { if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) @@ -759,7 +759,7 @@ void check_for_release(struct cgroup *cgrp) * this routine has no use for the exit status of the release agent * task, so no sense holding our caller up for that. */ -void cgroup_release_agent(struct work_struct *work) +void cgroup1_release_agent(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, release_agent_work); @@ -946,7 +946,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) continue; if (!cgroup_ssid_enabled(i)) continue; - if (cgroup_ssid_no_v1(i)) + if (cgroup1_ssid_disabled(i)) continue; /* Mutually exclusive option 'all' + subsystem name */ @@ -968,7 +968,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) */ if (all_ss || (!one_ss && !opts->none && !opts->name)) for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i)) + if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) opts->subsys_mask |= (1 << i); /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4be306510aff..a05a2dacf5dc 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -199,7 +199,7 @@ struct cgroup_namespace init_cgroup_ns = { static u16 have_canfork_callback __read_mostly; static struct file_system_type cgroup2_fs_type; -static struct cftype cgroup_dfl_base_files[]; +static struct cftype cgroup_base_files[]; static int cgroup_apply_control(struct cgroup *cgrp); static void cgroup_finalize_control(struct cgroup *cgrp, int ret); @@ -609,7 +609,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) if (!trigger) break; - check_for_release(cgrp); + cgroup1_check_for_release(cgrp); cgroup_file_notify(&cgrp->events_file); cgrp = cgroup_parent(cgrp); @@ -1440,9 +1440,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css) if (!css->ss) { if (cgroup_on_dfl(cgrp)) - cfts = cgroup_dfl_base_files; + cfts = cgroup_base_files; else - cfts = cgroup_legacy_base_files; + cfts = cgroup1_base_files; return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); } @@ -1645,7 +1645,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); init_waitqueue_head(&cgrp->offline_waitq); - INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent); + INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) @@ -3836,7 +3836,7 @@ static int cgroup_procs_show(struct seq_file *s, void *v) } /* cgroup core interface files for the default hierarchy */ -static struct cftype cgroup_dfl_base_files[] = { +static struct cftype cgroup_base_files[] = { { .name = "cgroup.procs", .file_offset = 
offsetof(struct cgroup, procs_file), @@ -3909,7 +3909,7 @@ static void css_free_work_fn(struct work_struct *work) } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); - cgroup_pidlist_destroy_all(cgrp); + cgroup1_pidlist_destroy_all(cgrp); cancel_work_sync(&cgrp->release_agent_work); if (cgroup_parent(cgrp)) { @@ -4411,7 +4411,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ kernfs_remove(cgrp->kn); - check_for_release(cgroup_parent(cgrp)); + cgroup1_check_for_release(cgroup_parent(cgrp)); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); @@ -4548,8 +4548,8 @@ int __init cgroup_init(void) BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); - BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); - BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); /* * The latency of the synchronize_sched() is too high for cgroups, @@ -4599,7 +4599,7 @@ int __init cgroup_init(void) continue; } - if (cgroup_ssid_no_v1(ssid)) + if (cgroup1_ssid_disabled(ssid)) printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n", ss->name); -- cgit v1.2.3 From dcfe149b9f45aaf89bb95e8b314210da626417d9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Dec 2016 14:49:09 -0500 Subject: cgroup: move namespace code to kernel/cgroup/namespace.c get/put_css_set() get exposed in cgroup-internal.h in the process. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup/Makefile | 2 +- kernel/cgroup/cgroup-internal.h | 32 ++++++++ kernel/cgroup/cgroup.c | 175 +--------------------------------------- kernel/cgroup/namespace.c | 155 +++++++++++++++++++++++++++++++++++ 4 files changed, 189 insertions(+), 175 deletions(-) create mode 100644 kernel/cgroup/namespace.c (limited to 'kernel') diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 719588cb18cd..6d42a3211164 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,4 +1,4 @@ -obj-y := cgroup.o cgroup-v1.o +obj-y := cgroup.o namespace.o cgroup-v1.o obj-$(CONFIG_CGROUP_FREEZER) += freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index a890c92cb688..589b0e7013ec 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -65,6 +65,33 @@ static inline bool notify_on_release(const struct cgroup *cgrp) return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } +void put_css_set_locked(struct css_set *cset); + +static inline void put_css_set(struct css_set *cset) +{ + unsigned long flags; + + /* + * Ensure that the refcount doesn't hit zero while any readers + * can see it.
Similar to atomic_dec_and_lock(), but for an + * rwlock + */ + if (atomic_add_unless(&cset->refcount, -1, 1)) + return; + + spin_lock_irqsave(&css_set_lock, flags); + put_css_set_locked(cset); + spin_unlock_irqrestore(&css_set_lock, flags); +} + +/* + * refcounted get/put for css_set objects + */ +static inline void get_css_set(struct css_set *cset) +{ + atomic_inc(&cset->refcount); +} + bool cgroup_ssid_enabled(int ssid); bool cgroup_on_dfl(const struct cgroup *cgrp); @@ -107,6 +134,11 @@ int cgroup_rmdir(struct kernfs_node *kn); int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, struct kernfs_root *kf_root); +/* + * namespace.c + */ +extern const struct proc_ns_operations cgroupns_operations; + /* * cgroup-v1.c */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a05a2dacf5dc..b6b9068ef468 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -718,7 +718,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) return key; } -static void put_css_set_locked(struct css_set *cset) +void put_css_set_locked(struct css_set *cset) { struct cgrp_cset_link *link, *tmp_link; struct cgroup_subsys *ss; @@ -748,31 +748,6 @@ static void put_css_set_locked(struct css_set *cset) kfree_rcu(cset, rcu_head); } -static void put_css_set(struct css_set *cset) -{ - unsigned long flags; - - /* - * Ensure that the refcount doesn't hit zero while any readers - * can see it. Similar to atomic_dec_and_lock(), but for an - * rwlock - */ - if (atomic_add_unless(&cset->refcount, -1, 1)) - return; - - spin_lock_irqsave(&css_set_lock, flags); - put_css_set_locked(cset); - spin_unlock_irqrestore(&css_set_lock, flags); -} - -/* - * refcounted get/put for css_set objects - */ -static inline void get_css_set(struct css_set *cset) -{ - atomic_inc(&cset->refcount); -} - /** * compare_css_sets - helper function for find_existing_css_set(). * @cset: candidate css_set being tested @@ -5109,154 +5084,6 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #endif /* CONFIG_SOCK_CGROUP_DATA */ -/* cgroup namespaces */ - -static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) -{ - return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); -} - -static void dec_cgroup_namespaces(struct ucounts *ucounts) -{ - dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); -} - -static struct cgroup_namespace *alloc_cgroup_ns(void) -{ - struct cgroup_namespace *new_ns; - int ret; - - new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); - if (!new_ns) - return ERR_PTR(-ENOMEM); - ret = ns_alloc_inum(&new_ns->ns); - if (ret) { - kfree(new_ns); - return ERR_PTR(ret); - } - atomic_set(&new_ns->count, 1); - new_ns->ns.ops = &cgroupns_operations; - return new_ns; -} - -void free_cgroup_ns(struct cgroup_namespace *ns) -{ - put_css_set(ns->root_cset); - dec_cgroup_namespaces(ns->ucounts); - put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); - kfree(ns); -} -EXPORT_SYMBOL(free_cgroup_ns); - -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, - struct user_namespace *user_ns, - struct cgroup_namespace *old_ns) -{ - struct cgroup_namespace *new_ns; - struct ucounts *ucounts; - struct css_set *cset; - - BUG_ON(!old_ns); - - if (!(flags & CLONE_NEWCGROUP)) { - get_cgroup_ns(old_ns); - return old_ns; - } - - /* Allow only sysadmin to create cgroup namespace. 
*/ - if (!ns_capable(user_ns, CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - - ucounts = inc_cgroup_namespaces(user_ns); - if (!ucounts) - return ERR_PTR(-ENOSPC); - - /* It is not safe to take cgroup_mutex here */ - spin_lock_irq(&css_set_lock); - cset = task_css_set(current); - get_css_set(cset); - spin_unlock_irq(&css_set_lock); - - new_ns = alloc_cgroup_ns(); - if (IS_ERR(new_ns)) { - put_css_set(cset); - dec_cgroup_namespaces(ucounts); - return new_ns; - } - - new_ns->user_ns = get_user_ns(user_ns); - new_ns->ucounts = ucounts; - new_ns->root_cset = cset; - - return new_ns; -} - -static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) -{ - return container_of(ns, struct cgroup_namespace, ns); -} - -static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns) -{ - struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); - - if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) || - !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) - return -EPERM; - - /* Don't need to do anything if we are attaching to our own cgroupns. */ - if (cgroup_ns == nsproxy->cgroup_ns) - return 0; - - get_cgroup_ns(cgroup_ns); - put_cgroup_ns(nsproxy->cgroup_ns); - nsproxy->cgroup_ns = cgroup_ns; - - return 0; -} - -static struct ns_common *cgroupns_get(struct task_struct *task) -{ - struct cgroup_namespace *ns = NULL; - struct nsproxy *nsproxy; - - task_lock(task); - nsproxy = task->nsproxy; - if (nsproxy) { - ns = nsproxy->cgroup_ns; - get_cgroup_ns(ns); - } - task_unlock(task); - - return ns ? &ns->ns : NULL; -} - -static void cgroupns_put(struct ns_common *ns) -{ - put_cgroup_ns(to_cg_ns(ns)); -} - -static struct user_namespace *cgroupns_owner(struct ns_common *ns) -{ - return to_cg_ns(ns)->user_ns; -} - -const struct proc_ns_operations cgroupns_operations = { - .name = "cgroup", - .type = CLONE_NEWCGROUP, - .get = cgroupns_get, - .put = cgroupns_put, - .install = cgroupns_install, - .owner = cgroupns_owner, -}; - -static __init int cgroup_namespaces_init(void) -{ - return 0; -} -subsys_initcall(cgroup_namespaces_init); - #ifdef CONFIG_CGROUP_BPF void cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c new file mode 100644 index 000000000000..cff7ea62c38f --- /dev/null +++ b/kernel/cgroup/namespace.c @@ -0,0 +1,155 @@ +#include "cgroup-internal.h" + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/nsproxy.h> +#include <linux/proc_ns.h> + + +/* cgroup namespaces */ + +static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); +} + +static void dec_cgroup_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); +} + +static struct cgroup_namespace *alloc_cgroup_ns(void) +{ + struct cgroup_namespace *new_ns; + int ret; + + new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); + if (!new_ns) + return ERR_PTR(-ENOMEM); + ret = ns_alloc_inum(&new_ns->ns); + if (ret) { + kfree(new_ns); + return ERR_PTR(ret); + } + atomic_set(&new_ns->count, 1); + new_ns->ns.ops = &cgroupns_operations; + return new_ns; +} + +void free_cgroup_ns(struct cgroup_namespace *ns) +{ + put_css_set(ns->root_cset); + dec_cgroup_namespaces(ns->ucounts); + put_user_ns(ns->user_ns); + ns_free_inum(&ns->ns); + kfree(ns); +} +EXPORT_SYMBOL(free_cgroup_ns); + +struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, + struct user_namespace *user_ns, + struct cgroup_namespace *old_ns) +{ + struct cgroup_namespace *new_ns; + struct ucounts *ucounts; + struct css_set *cset; + +
BUG_ON(!old_ns); + + if (!(flags & CLONE_NEWCGROUP)) { + get_cgroup_ns(old_ns); + return old_ns; + } + + /* Allow only sysadmin to create cgroup namespace. */ + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + ucounts = inc_cgroup_namespaces(user_ns); + if (!ucounts) + return ERR_PTR(-ENOSPC); + + /* It is not safe to take cgroup_mutex here */ + spin_lock_irq(&css_set_lock); + cset = task_css_set(current); + get_css_set(cset); + spin_unlock_irq(&css_set_lock); + + new_ns = alloc_cgroup_ns(); + if (IS_ERR(new_ns)) { + put_css_set(cset); + dec_cgroup_namespaces(ucounts); + return new_ns; + } + + new_ns->user_ns = get_user_ns(user_ns); + new_ns->ucounts = ucounts; + new_ns->root_cset = cset; + + return new_ns; +} + +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) +{ + return container_of(ns, struct cgroup_namespace, ns); +} + +static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns) +{ + struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); + + if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) || + !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + /* Don't need to do anything if we are attaching to our own cgroupns. */ + if (cgroup_ns == nsproxy->cgroup_ns) + return 0; + + get_cgroup_ns(cgroup_ns); + put_cgroup_ns(nsproxy->cgroup_ns); + nsproxy->cgroup_ns = cgroup_ns; + + return 0; +} + +static struct ns_common *cgroupns_get(struct task_struct *task) +{ + struct cgroup_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->cgroup_ns; + get_cgroup_ns(ns); + } + task_unlock(task); + + return ns ? &ns->ns : NULL; +} + +static void cgroupns_put(struct ns_common *ns) +{ + put_cgroup_ns(to_cg_ns(ns)); +} + +static struct user_namespace *cgroupns_owner(struct ns_common *ns) +{ + return to_cg_ns(ns)->user_ns; +} + +const struct proc_ns_operations cgroupns_operations = { + .name = "cgroup", + .type = CLONE_NEWCGROUP, + .get = cgroupns_get, + .put = cgroupns_put, + .install = cgroupns_install, + .owner = cgroupns_owner, +}; + +static __init int cgroup_namespaces_init(void) +{ + return 0; +} +subsys_initcall(cgroup_namespaces_init); -- cgit v1.2.3 From e0aed7c74f0bf6c643103f3a10ff988b7ee6545a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Dec 2016 14:49:09 -0500 Subject: cgroup: fix RCU related sparse warnings kn->priv, which is a void *, is used as an RCU pointer by cgroup. When dereferencing it, the code was passing kn->priv to rcu_dereference() without casting it into an RCU pointer, triggering an address space mismatch warning from sparse. Fix them. Signed-off-by: Tejun Heo Reported-by: Fengguang Wu Acked-by: Zefan Li --- kernel/cgroup/cgroup-v1.c | 2 +- kernel/cgroup/cgroup.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 37be09e09740..465b10111eaf 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -694,7 +694,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) * @kn->priv is RCU safe. Let's do the RCU dancing.
*/ rcu_read_lock(); - cgrp = rcu_dereference(kn->priv); + cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (!cgrp || cgroup_is_dead(cgrp)) { rcu_read_unlock(); mutex_unlock(&cgroup_mutex); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b6b9068ef468..d9d82e96d67f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4932,7 +4932,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, * have been or be removed at any point. @kn->priv is RCU * protected for this access. See css_release_work_fn() for details. */ - cgrp = rcu_dereference(kn->priv); + cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (cgrp) css = cgroup_css(cgrp, ss); -- cgit v1.2.3 From 40d9f82750044f846005d2ac4eec65e39c1c0f7c Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 7 Dec 2016 14:33:23 -0800 Subject: timekeeping: Remove unused timekeeping_{get,set}_tai_offset() The last caller to timekeeping_set_tai_offset() was in commit 0b5154fb9040 (timekeeping: Simplify tai updating from do_adjtimex, 2013-03-22) and the last caller to timekeeping_get_tai_offset() was in commit 76f4108892d9 (hrtimer: Cleanup hrtimer accessors to the timekepeing state, 2014-07-16). Remove these unused functions now that we handle TAI offsets differently. Cc: John Stultz Signed-off-by: Stephen Boyd Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 39 +-------------------------------------- kernel/time/timekeeping.h | 2 -- 2 files changed, 1 insertion(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index db087d7e106d..95b258dd75db 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1275,27 +1275,8 @@ error: /* even if we error out, we forwarded the time, so call update */ } EXPORT_SYMBOL(timekeeping_inject_offset); - -/** - * timekeeping_get_tai_offset - Returns current TAI offset from UTC - * - */ -s32 timekeeping_get_tai_offset(void) -{ - struct timekeeper *tk = &tk_core.timekeeper; - unsigned int seq; - s32 ret; - - do { - seq = read_seqcount_begin(&tk_core.seq); - ret = tk->tai_offset; - } while (read_seqcount_retry(&tk_core.seq, seq)); - - return ret; -} - /** - * __timekeeping_set_tai_offset - Lock free worker function + * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic * */ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) @@ -1304,24 +1285,6 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0)); } -/** - * timekeeping_set_tai_offset - Sets the current TAI offset from UTC - * - */ -void timekeeping_set_tai_offset(s32 tai_offset) -{ - struct timekeeper *tk = &tk_core.timekeeper; - unsigned long flags; - - raw_spin_lock_irqsave(&timekeeper_lock, flags); - write_seqcount_begin(&tk_core.seq); - __timekeeping_set_tai_offset(tk, tai_offset); - timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); - write_seqcount_end(&tk_core.seq); - raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - clock_was_set(); -} - /** * change_clocksource - Swaps clocksources if a new one is available * diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 704f595ce83f..d0914676d4c5 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -11,8 +11,6 @@ extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, extern int timekeeping_valid_for_hres(void); extern u64 timekeeping_max_deferment(void); 
extern int timekeeping_inject_offset(struct timespec *ts); -extern s32 timekeeping_get_tai_offset(void); -extern void timekeeping_set_tai_offset(s32 tai_offset); extern int timekeeping_suspend(void); extern void timekeeping_resume(void); -- cgit v1.2.3 From dbcfe5f76dd5266b8f308b5a8f9ef52f74b2d6e7 Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Mon, 9 Jan 2017 10:19:46 -0800 Subject: bpf: split check_mem_access logic for map values Move the logic to check memory accesses to a PTR_TO_MAP_VALUE_ADJ from check_mem_access() to a separate helper check_map_access_adj(). This makes it possible to use those checks in other parts of the verifier as well, where boundaries on PTR_TO_MAP_VALUE_ADJ might need to be checked, for example when checking helper function arguments. The same thing is already happening for other types such as PTR_TO_PACKET and its check_packet_access() helper. The code has been copied verbatim, the only difference being that the "off += reg->max_value" statement is removed and the sum is moved into the check_map_access() call, as that statement was only needed due to the earlier common check_map_access() call. Signed-off-by: Gianluca Borello Acked-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 88 ++++++++++++++++++++++++++++----------------------- 1 file changed, 49 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 83ed2f8f6f22..8333fbcfbfe7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -635,6 +635,51 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, return 0; } +/* check read/write into an adjusted map element */ +static int check_map_access_adj(struct bpf_verifier_env *env, u32 regno, + int off, int size) +{ + struct bpf_verifier_state *state = &env->cur_state; + struct bpf_reg_state *reg = &state->regs[regno]; + int err; + + /* We adjusted the register to this map value, so we + * need to change off and size to min_value and max_value + * respectively to make sure our theoretical access will be + * safe. + */ + if (log_level) + print_verifier_state(state); + env->varlen_map_value_access = true; + /* The minimum value is only important with signed + * comparisons where we can't assume the floor of a + * value is 0. If we are using signed variables for our + * index'es we need to make sure that whatever we use + * will have a set floor within our range. + */ + if (reg->min_value < 0) { + verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + regno); + return -EACCES; + } + err = check_map_access(env, regno, reg->min_value + off, size); + if (err) { + verbose("R%d min value is outside of the array range\n", + regno); + return err; + } + + /* If we haven't set a max value then we need to bail + * since we can't be sure we won't do bad things.
+ */ + if (reg->max_value == BPF_REGISTER_MAX_RANGE) { + verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", + regno); + return -EACCES; + } + return check_map_access(env, regno, reg->max_value + off, size); +} + #define MAX_PACKET_OFF 0xffff static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, @@ -775,45 +820,10 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, return -EACCES; } - /* If we adjusted the register to this map value at all then we - * need to change off and size to min_value and max_value - * respectively to make sure our theoretical access will be - * safe. - */ - if (reg->type == PTR_TO_MAP_VALUE_ADJ) { - if (log_level) - print_verifier_state(state); - env->varlen_map_value_access = true; - /* The minimum value is only important with signed - * comparisons where we can't assume the floor of a - * value is 0. If we are using signed variables for our - * index'es we need to make sure that whatever we use - * will have a set floor within our range. - */ - if (reg->min_value < 0) { - verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", - regno); - return -EACCES; - } - err = check_map_access(env, regno, reg->min_value + off, - size); - if (err) { - verbose("R%d min value is outside of the array range\n", - regno); - return err; - } - - /* If we haven't set a max value then we need to bail - * since we can't be sure we won't do bad things. - */ - if (reg->max_value == BPF_REGISTER_MAX_RANGE) { - verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", - regno); - return -EACCES; - } - off += reg->max_value; - } - err = check_map_access(env, regno, off, size); + if (reg->type == PTR_TO_MAP_VALUE_ADJ) + err = check_map_access_adj(env, regno, off, size); + else + err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown_value(state->regs, value_regno); -- cgit v1.2.3 From 5722569bb9c3bd922c4f10b5b2912fe88c255312 Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Mon, 9 Jan 2017 10:19:47 -0800 Subject: bpf: allow helpers access to map element values Enable helpers to directly access a map element value by passing a register type PTR_TO_MAP_VALUE (or PTR_TO_MAP_VALUE_ADJ) to helper arguments ARG_PTR_TO_STACK or ARG_PTR_TO_RAW_STACK. This enables several use cases. For example, a typical tracing program might want to capture pathnames passed to sys_open() with: struct trace_data { char pathname[PATHLEN]; }; SEC("kprobe/sys_open") void bpf_sys_open(struct pt_regs *ctx) { struct trace_data data; bpf_probe_read(data.pathname, sizeof(data.pathname), ctx->di); /* consume data.pathname, for example via * bpf_trace_printk() or bpf_perf_event_output() */ } Such a program could easily hit the stack limit in case PATHLEN needs to be large or more local variables need to exist, both of which are quite common scenarios. 
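For scale, the stack being exhausted here is the BPF stack, which is capped at 512 bytes (MAX_BPF_STACK in include/linux/filter.h), not the kernel thread stack. A minimal sketch of the pressure, using a hypothetical PATHLEN of 256 and made-up companion fields rather than anything from the patch:

	struct trace_data {
		char pathname[256];		/* half the 512-byte BPF stack */
		char comm[64];			/* plus 64 bytes */
		unsigned long long args[32];	/* plus 256 bytes: 576 > 512, so
						 * the verifier would reject the
						 * program */
	};

Every local variable and spilled register comes out of the same 512-byte budget, which is why parking the scratch buffer in a map element, as shown next, relieves the pressure.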
Allowing direct helper access to map element values, one could do: struct bpf_map_def SEC("maps") scratch_map = { .type = BPF_MAP_TYPE_PERCPU_ARRAY, .key_size = sizeof(u32), .value_size = sizeof(struct trace_data), .max_entries = 1, }; SEC("kprobe/sys_open") int bpf_sys_open(struct pt_regs *ctx) { int id = 0; struct trace_data *p = bpf_map_lookup_elem(&scratch_map, &id); if (!p) return 0; bpf_probe_read(p->pathname, sizeof(p->pathname), ctx->di); /* consume p->pathname, for example via * bpf_trace_printk() or bpf_perf_event_output() */ } And wouldn't risk exhausting the stack. Code changes are loosely modeled after commit 6841de8b0d03 ("bpf: allow helpers access the packet directly"). Unlike with PTR_TO_PACKET, these changes just work with ARG_PTR_TO_STACK and ARG_PTR_TO_RAW_STACK (not ARG_PTR_TO_MAP_KEY, ARG_PTR_TO_MAP_VALUE, ...): adding those would be trivial, but since there is not currently a use case for that, it's reasonable to limit the set of changes. Also, add new tests to make sure accesses to map element values from helpers never go out of boundary, even when adjusted. Signed-off-by: Gianluca Borello Acked-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 9 +- tools/testing/selftests/bpf/test_verifier.c | 491 ++++++++++++++++++++++++++++ 2 files changed, 498 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8333fbcfbfe7..b7014606745b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -627,7 +627,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, { struct bpf_map *map = env->cur_state.regs[regno].map_ptr; - if (off < 0 || off + size > map->value_size) { + if (off < 0 || size <= 0 || off + size > map->value_size) { verbose("invalid access to map value, value_size=%d off=%d size=%d\n", map->value_size, off, size); return -EACCES; @@ -1025,7 +1025,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (type == CONST_IMM && reg->imm == 0) /* final test in check_stack_boundary() */; - else if (type != PTR_TO_PACKET && type != expected_type) + else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE && + type != PTR_TO_MAP_VALUE_ADJ && type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK; } else { @@ -1088,6 +1089,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } if (regs[regno - 1].type == PTR_TO_PACKET) err = check_packet_access(env, regno - 1, 0, reg->imm); + else if (regs[regno - 1].type == PTR_TO_MAP_VALUE) + err = check_map_access(env, regno - 1, 0, reg->imm); + else if (regs[regno - 1].type == PTR_TO_MAP_VALUE_ADJ) + err = check_map_access_adj(env, regno - 1, 0, reg->imm); else err = check_stack_boundary(env, regno - 1, reg->imm, zero_size_allowed, meta); diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 853d7e43434a..b7732e557bf9 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -2905,6 +2905,497 @@ static struct bpf_test tests[] = { .result = REJECT, .errstr = "invalid bpf_context access", }, + { + "helper access to map: full range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_MOV64_REG(BPF_REG_1,
BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val)), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to map: partial range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to map: empty range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "invalid access to map value, value_size=48 off=0 size=0", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to map: out-of-bound range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val) + 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "invalid access to map value, value_size=48 off=0 size=56", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to map: negative range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_2, -8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "invalid access to map value, value_size=48 off=0 size=-8", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via const imm): full range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, + offsetof(struct test_val, foo)), + BPF_MOV64_IMM(BPF_REG_2, + sizeof(struct test_val) - + offsetof(struct test_val, foo)), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via const imm): partial range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, 
BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, + offsetof(struct test_val, foo)), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via const imm): empty range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, + offsetof(struct test_val, foo)), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "R1 min value is outside of the array range", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via const imm): out-of-bound range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, + offsetof(struct test_val, foo)), + BPF_MOV64_IMM(BPF_REG_2, + sizeof(struct test_val) - + offsetof(struct test_val, foo) + 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "invalid access to map value, value_size=48 off=4 size=52", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via const imm): negative range (> adjustment)", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, + offsetof(struct test_val, foo)), + BPF_MOV64_IMM(BPF_REG_2, -8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "invalid access to map value, value_size=48 off=4 size=-8", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via const imm): negative range (< adjustment)", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, + offsetof(struct test_val, foo)), + BPF_MOV64_IMM(BPF_REG_2, -1), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "R1 min value is outside of the array range", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via const reg): full range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + 
BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_3, + offsetof(struct test_val, foo)), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), + BPF_MOV64_IMM(BPF_REG_2, + sizeof(struct test_val) - + offsetof(struct test_val, foo)), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via const reg): partial range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_3, + offsetof(struct test_val, foo)), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via const reg): empty range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "R1 min value is outside of the array range", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via const reg): out-of-bound range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_3, + offsetof(struct test_val, foo)), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), + BPF_MOV64_IMM(BPF_REG_2, + sizeof(struct test_val) - + offsetof(struct test_val, foo) + 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "invalid access to map value, value_size=48 off=4 size=52", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via const reg): negative range (> adjustment)", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_3, + offsetof(struct test_val, foo)), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), + BPF_MOV64_IMM(BPF_REG_2, -8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "invalid access to map value, value_size=48 off=4 size=-8", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + 
"helper access to adjusted map (via const reg): negative range (< adjustment)", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_3, + offsetof(struct test_val, foo)), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), + BPF_MOV64_IMM(BPF_REG_2, -1), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "R1 min value is outside of the array range", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via variable): full range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JGT, BPF_REG_3, + offsetof(struct test_val, foo), 4), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), + BPF_MOV64_IMM(BPF_REG_2, + sizeof(struct test_val) - + offsetof(struct test_val, foo)), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via variable): partial range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JGT, BPF_REG_3, + offsetof(struct test_val, foo), 4), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via variable): empty range", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JGT, BPF_REG_3, + offsetof(struct test_val, foo), 4), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "R1 min value is outside of the array range", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via variable): no max check", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), + BPF_MOV64_IMM(BPF_REG_2, 0), + 
BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "R1 min value is negative, either use unsigned index or do a if (index >=0) check", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to adjusted map (via variable): wrong max check", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JGT, BPF_REG_3, + offsetof(struct test_val, foo), 4), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), + BPF_MOV64_IMM(BPF_REG_2, + sizeof(struct test_val) - + offsetof(struct test_val, foo) + 1), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "invalid access to map value, value_size=48 off=4 size=45", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, }; static int probe_filter_length(const struct bpf_insn *fp) -- cgit v1.2.3 From f0318d01b694485af9678a4e120328ae3555be6d Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Mon, 9 Jan 2017 10:19:48 -0800 Subject: bpf: allow adjusted map element values to spill commit 484611357c19 ("bpf: allow access into map value arrays") introduces the ability to do pointer math inside a map element value via the PTR_TO_MAP_VALUE_ADJ register type. The current support doesn't handle the case where a PTR_TO_MAP_VALUE_ADJ is spilled into the stack, limiting several use cases, especially when generating bpf code from a compiler. Handle this case by explicitly enabling the register type PTR_TO_MAP_VALUE_ADJ to be spilled. Also, make sure that min_value and max_value are reset just for BPF_LDX operations that don't result in a restore of a spilled register from stack. Signed-off-by: Gianluca Borello Acked-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 21 +++++++++---- tools/testing/selftests/bpf/test_verifier.c | 46 +++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b7014606745b..59ed07b9b4ea 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -481,6 +481,13 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) regs[regno].max_value = BPF_REGISTER_MAX_RANGE; } +static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs, + u32 regno) +{ + mark_reg_unknown_value(regs, regno); + reset_reg_range_values(regs, regno); +} + enum reg_arg_type { SRC_OP, /* register is used as source operand */ DST_OP, /* register is used as destination operand */ @@ -532,6 +539,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) switch (type) { case PTR_TO_MAP_VALUE: case PTR_TO_MAP_VALUE_OR_NULL: + case PTR_TO_MAP_VALUE_ADJ: case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: @@ -616,7 +624,8 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size, } if (value_regno >= 0) /* have read misc data from the stack */ - mark_reg_unknown_value(state->regs, value_regno); + mark_reg_unknown_value_and_range(state->regs, + value_regno); return 0; } } @@ -825,7 +834,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, else err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown_value(state->regs, value_regno); + mark_reg_unknown_value_and_range(state->regs, + value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = UNKNOWN_VALUE; @@ -837,7 +847,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, } err = check_ctx_access(env, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { - mark_reg_unknown_value(state->regs, value_regno); + mark_reg_unknown_value_and_range(state->regs, + value_regno); /* note that reg.[id|off|range] == 0 */ state->regs[value_regno].type = reg_type; } @@ -870,7 +881,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, } err = check_packet_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown_value(state->regs, value_regno); + mark_reg_unknown_value_and_range(state->regs, + value_regno); } else { verbose("R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -2744,7 +2756,6 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; - reset_reg_range_values(regs, insn->dst_reg); if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) { insn_idx++; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index b7732e557bf9..e7b075819c08 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -3396,6 +3396,52 @@ static struct bpf_test tests[] = { .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, + { + "map element value is preserved across register spilling", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 42), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -184), + 
BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0), + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1, 0), + BPF_ST_MEM(BPF_DW, BPF_REG_3, 0, 42), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr_unpriv = "R0 leaks addr", + .result = ACCEPT, + .result_unpriv = REJECT, + }, + { + "map element value (adjusted) is preserved across register spilling", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, + offsetof(struct test_val, foo)), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 42), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -184), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0), + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1, 0), + BPF_ST_MEM(BPF_DW, BPF_REG_3, 0, 42), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr_unpriv = "R0 pointer arithmetic prohibited", + .result = ACCEPT, + .result_unpriv = REJECT, + }, }; static int probe_filter_length(const struct bpf_insn *fp) -- cgit v1.2.3 From 06c1c049721a995dee2829ad13b24aaf5d7c5cce Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Mon, 9 Jan 2017 10:19:49 -0800 Subject: bpf: allow helpers access to variable memory Currently, helpers that read and write from/to the stack can do so using a pair of arguments of type ARG_PTR_TO_STACK and ARG_CONST_STACK_SIZE. ARG_CONST_STACK_SIZE accepts a constant register of type CONST_IMM, so that the verifier can safely check the memory access. However, requiring the argument to be a constant can be limiting in some circumstances. Since the current logic keeps track of the minimum and maximum value of a register throughout the simulated execution, ARG_CONST_STACK_SIZE can be changed to also accept an UNKNOWN_VALUE register in case its boundaries have been set and the range doesn't cause invalid memory accesses. One common situation when this is useful: int len; char buf[BUFSIZE]; /* BUFSIZE is 128 */ if (some_condition) len = 42; else len = 84; some_helper(..., buf, len & (BUFSIZE - 1)); The compiler can often decide to assign the constant values 42 or 84 into a variable on the stack, instead of keeping it in a register. When the variable is then read back from stack into the register in order to be passed to the helper, the verifier will not be able to recognize the register as constant (the verifier is not currently tracking all constant writes into memory), and the program won't be valid. However, by allowing the helper to accept an UNKNOWN_VALUE register, this program will work because the bitwise AND operation will set the range of possible values for the UNKNOWN_VALUE register to [0, BUFSIZE), so the verifier can guarantee the helper call will be safe (assuming the argument is of type ARG_CONST_STACK_SIZE_OR_ZERO, otherwise one more check against 0 would be needed). Custom ranges can be set not only with ALU operations, but also by explicitly comparing the UNKNOWN_VALUE register with constants. Another very common example happens when intercepting system call arguments and accessing user-provided data of variable size using bpf_probe_read(). One can load at runtime the user-provided length in an UNKNOWN_VALUE register, and then read that exact amount of data up to a compile-time determined limit in order to fit into the proper local storage allocated on the stack, without having to guess a suboptimal access size at compile time.
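Restating the bounding idiom above as one minimal C sketch (hypothetical program: the names, sizes, and x86-64 pt_regs fields are illustrative only, assuming the usual SEC() macro and helper declarations from the kernel samples):

	#define STORAGE 64	/* compile-time upper bound for the copy */

	SEC("kprobe/sys_write")
	int bpf_sys_write(struct pt_regs *ctx)
	{
		char storage[STORAGE] = {};	/* initialized: with a variable
						 * size, raw mode is disabled
						 * (see below) */
		long len = ctx->dx;		/* user-controlled count, an
						 * UNKNOWN_VALUE register */

		/* Explicit bounds, mirroring the JMP-based selftests added
		 * below: only the provably safe range (0, STORAGE] falls
		 * through to the helper call. */
		if (len <= 0 || len > STORAGE)
			return 0;

		bpf_probe_read(storage, len, (void *)ctx->si);
		return 0;
	}

If the mask form len & (STORAGE - 1) is used instead, zero stays in the range, so it relies on the size argument being ARG_CONST_STACK_SIZE_OR_ZERO; with plain ARG_CONST_STACK_SIZE an extra comparison against 0 is needed, as noted above.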
Also, in case the helpers accepting the UNKNOWN_VALUE register operate in raw mode, disable the raw mode so that the program is required to initialize all memory, since there is no guarantee the helper will fill it completely, leaving possibilities for data leak (just relevant when the memory used by the helper is the stack, not when using a pointer to map element value or packet). In other words, ARG_PTR_TO_RAW_STACK will be treated as ARG_PTR_TO_STACK. Signed-off-by: Gianluca Borello Acked-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 74 ++++- tools/testing/selftests/bpf/test_verifier.c | 410 ++++++++++++++++++++++++++++ 2 files changed, 474 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 59ed07b9b4ea..3d4f7bf32aaf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -980,6 +980,25 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return 0; } +static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, + int access_size, bool zero_size_allowed, + struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = env->cur_state.regs; + + switch (regs[regno].type) { + case PTR_TO_PACKET: + return check_packet_access(env, regno, 0, access_size); + case PTR_TO_MAP_VALUE: + return check_map_access(env, regno, 0, access_size); + case PTR_TO_MAP_VALUE_ADJ: + return check_map_access_adj(env, regno, 0, access_size); + default: /* const_imm|ptr_to_stack or invalid ptr */ + return check_stack_boundary(env, regno, access_size, + zero_size_allowed, meta); + } +} + static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) @@ -1018,7 +1037,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } else if (arg_type == ARG_CONST_STACK_SIZE || arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { expected_type = CONST_IMM; - if (type != expected_type) + /* One exception. Allow UNKNOWN_VALUE registers when the + * boundaries are known and don't cause unsafe memory accesses + */ + if (type != UNKNOWN_VALUE && type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_MAP_PTR) { expected_type = CONST_PTR_TO_MAP; @@ -1099,15 +1121,47 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); return -EACCES; } - if (regs[regno - 1].type == PTR_TO_PACKET) - err = check_packet_access(env, regno - 1, 0, reg->imm); - else if (regs[regno - 1].type == PTR_TO_MAP_VALUE) - err = check_map_access(env, regno - 1, 0, reg->imm); - else if (regs[regno - 1].type == PTR_TO_MAP_VALUE_ADJ) - err = check_map_access_adj(env, regno - 1, 0, reg->imm); - else - err = check_stack_boundary(env, regno - 1, reg->imm, - zero_size_allowed, meta); + + /* If the register is UNKNOWN_VALUE, the access check happens + * using its boundaries. Otherwise, just use its imm + */ + if (type == UNKNOWN_VALUE) { + /* For unprivileged variable accesses, disable raw + * mode so that the program is required to + * initialize all the memory that the helper could + * just partially fill up. 
+ */ + meta = NULL; + + if (reg->min_value < 0) { + verbose("R%d min value is negative, either use unsigned or 'var &= const'\n", + regno); + return -EACCES; + } + + if (reg->min_value == 0) { + err = check_helper_mem_access(env, regno - 1, 0, + zero_size_allowed, + meta); + if (err) + return err; + } + + if (reg->max_value == BPF_REGISTER_MAX_RANGE) { + verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", + regno); + return -EACCES; + } + err = check_helper_mem_access(env, regno - 1, + reg->max_value, + zero_size_allowed, meta); + if (err) + return err; + } else { + /* register is CONST_IMM */ + err = check_helper_mem_access(env, regno - 1, reg->imm, + zero_size_allowed, meta); + } } return err; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index e7b075819c08..9bb45346dc72 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -3442,6 +3442,416 @@ static struct bpf_test tests[] = { .result = ACCEPT, .result_unpriv = REJECT, }, + { + "helper access to variable memory: stack, bitwise AND + JMP, correct bounds", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -64), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -56), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -48), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -40), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -32), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -24), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), + BPF_MOV64_IMM(BPF_REG_2, 16), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), + BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 64), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: stack, bitwise AND, zero included", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), + BPF_MOV64_IMM(BPF_REG_2, 16), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), + BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 64), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .errstr = "invalid stack type R1 off=-64 access_size=0", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: stack, bitwise AND + JMP, wrong max", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), + BPF_MOV64_IMM(BPF_REG_2, 16), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), + BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 65), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "invalid stack type R1 off=-64 access_size=65", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: stack, JMP, correct bounds", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), 
+ BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -64), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -56), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -48), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -40), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -32), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -24), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), + BPF_MOV64_IMM(BPF_REG_2, 16), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), + BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 64, 4), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: stack, JMP (signed), correct bounds", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -64), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -56), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -48), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -40), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -32), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -24), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), + BPF_MOV64_IMM(BPF_REG_2, 16), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, 64, 4), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: stack, JMP, bounds + offset", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), + BPF_MOV64_IMM(BPF_REG_2, 16), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), + BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 64, 5), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 3), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "invalid stack type R1 off=-64 access_size=65", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: stack, JMP, wrong max", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), + BPF_MOV64_IMM(BPF_REG_2, 16), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), + BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 65, 4), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "invalid stack type R1 off=-64 access_size=65", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: stack, JMP, no max check", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), + BPF_MOV64_IMM(BPF_REG_2, 16), + BPF_STX_MEM(BPF_DW, BPF_REG_1, 
BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "R2 unbounded memory access", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: stack, JMP, no min check", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), + BPF_MOV64_IMM(BPF_REG_2, 16), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), + BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 64, 3), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "invalid stack type R1 off=-64 access_size=0", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: stack, JMP (signed), no min check", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), + BPF_MOV64_IMM(BPF_REG_2, 16), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, 64, 3), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "R2 min value is negative", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: map, JMP, correct bounds", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val)), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, + sizeof(struct test_val), 4), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: map, JMP, wrong max", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val)), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, + sizeof(struct test_val) + 1, 4), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "invalid access to map value, value_size=48 off=0 size=49", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: map adjusted, JMP, correct bounds", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, 
BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 20), + BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val)), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, + sizeof(struct test_val) - 20, 4), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: map adjusted, JMP, wrong max", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 20), + BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val)), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, + sizeof(struct test_val) - 19, 4), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map2 = { 3 }, + .errstr = "R1 min value is outside of the array range", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: size > 0 not allowed on NULL", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 64), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_MOV64_IMM(BPF_REG_5, 0), + BPF_EMIT_CALL(BPF_FUNC_csum_diff), + BPF_EXIT_INSN(), + }, + .errstr = "R1 type=imm expected=fp", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "helper access to variable memory: size = 0 not allowed on != NULL", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_MOV64_IMM(BPF_REG_5, 0), + BPF_EMIT_CALL(BPF_FUNC_csum_diff), + BPF_EXIT_INSN(), + }, + .errstr = "invalid stack type R1 off=-8 access_size=0", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "helper access to variable memory: 8 bytes leak", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -64), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -56), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -48), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -40), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -24), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 63), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), + BPF_EXIT_INSN(), + }, + .errstr = "invalid indirect read 
from stack off -64+32 size 64",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ },
+ {
+ "helper access to variable memory: 8 bytes no leak (init memory)",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -64),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -56),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -48),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -40),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -32),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -24),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64),
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 32),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 32),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ },
};
static int probe_filter_length(const struct bpf_insn *fp)
-- cgit v1.2.3
From 39f19ebbf57b403695f7b5f9cf322fe1ddb5d7fb Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Mon, 9 Jan 2017 10:19:50 -0800
Subject: bpf: rename ARG_PTR_TO_STACK
Since ARG_PTR_TO_STACK is no longer just a pointer to stack, rename it to ARG_PTR_TO_MEM and adjust the comment.
Signed-off-by: Alexei Starovoitov
Acked-by: Daniel Borkmann
Signed-off-by: David S. Miller
---
include/linux/bpf.h | 12 ++++++------
kernel/bpf/helpers.c | 4 ++--
kernel/bpf/verifier.c | 28 ++++++++++++++--------------
kernel/trace/bpf_trace.c | 20 ++++++++++----------
net/core/filter.c | 40 ++++++++++++++++++++--------------------
5 files changed, 52 insertions(+), 52 deletions(-)
(limited to 'kernel')
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f74ae68086dc..94ea8d2383e6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -69,14 +69,14 @@ enum bpf_arg_type {
/* the following constraints used to prototype bpf_memcmp() and other
* functions that access data on eBPF program stack
*/
- ARG_PTR_TO_STACK, /* any pointer to eBPF program stack */
- ARG_PTR_TO_RAW_STACK, /* any pointer to eBPF program stack, area does not
- * need to be initialized, helper function must fill
- * all bytes or clear them in error case.
+ ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */
+ ARG_PTR_TO_UNINIT_MEM, /* pointer to memory does not need to be initialized,
+ * helper function must fill all bytes or clear
+ * them in error case.
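+ * (For example, the destination buffer of bpf_probe_read()
+ * and bpf_get_current_comm(), converted below, is passed as
+ * ARG_PTR_TO_UNINIT_MEM.)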
*/ - ARG_CONST_STACK_SIZE, /* number of bytes accessed from stack */ - ARG_CONST_STACK_SIZE_OR_ZERO, /* number of bytes accessed from stack or 0 */ + ARG_CONST_SIZE, /* number of bytes accessed from memory */ + ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */ ARG_PTR_TO_CTX, /* pointer to context */ ARG_ANYTHING, /* any (initialized) argument is ok */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 045cbe673356..3d24e238221e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -176,6 +176,6 @@ const struct bpf_func_proto bpf_get_current_comm_proto = { .func = bpf_get_current_comm, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_RAW_STACK, - .arg2_type = ARG_CONST_STACK_SIZE, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3d4f7bf32aaf..2efdc9128e3c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1034,8 +1034,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_STACK; if (type != PTR_TO_PACKET && type != expected_type) goto err_type; - } else if (arg_type == ARG_CONST_STACK_SIZE || - arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { + } else if (arg_type == ARG_CONST_SIZE || + arg_type == ARG_CONST_SIZE_OR_ZERO) { expected_type = CONST_IMM; /* One exception. Allow UNKNOWN_VALUE registers when the * boundaries are known and don't cause unsafe memory accesses @@ -1050,8 +1050,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_CTX; if (type != expected_type) goto err_type; - } else if (arg_type == ARG_PTR_TO_STACK || - arg_type == ARG_PTR_TO_RAW_STACK) { + } else if (arg_type == ARG_PTR_TO_MEM || + arg_type == ARG_PTR_TO_UNINIT_MEM) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be * passed in as argument, it's a CONST_IMM type. Final test @@ -1062,7 +1062,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE && type != PTR_TO_MAP_VALUE_ADJ && type != expected_type) goto err_type; - meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK; + meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; } else { verbose("unsupported arg_type %d\n", arg_type); return -EFAULT; @@ -1108,9 +1108,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_stack_boundary(env, regno, meta->map_ptr->value_size, false, NULL); - } else if (arg_type == ARG_CONST_STACK_SIZE || - arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { - bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO); + } else if (arg_type == ARG_CONST_SIZE || + arg_type == ARG_CONST_SIZE_OR_ZERO) { + bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); /* bpf_xxx(..., buf, len) call will access 'len' bytes * from stack pointer 'buf'. 
Check it @@ -1118,7 +1118,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (regno == 0) { /* kernel subsystem misconfigured verifier */ - verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); + verbose("ARG_CONST_SIZE cannot be first argument\n"); return -EACCES; } @@ -1235,15 +1235,15 @@ static int check_raw_mode(const struct bpf_func_proto *fn) { int count = 0; - if (fn->arg1_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg1_type == ARG_PTR_TO_UNINIT_MEM) count++; - if (fn->arg2_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg2_type == ARG_PTR_TO_UNINIT_MEM) count++; - if (fn->arg3_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg3_type == ARG_PTR_TO_UNINIT_MEM) count++; - if (fn->arg4_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg4_type == ARG_PTR_TO_UNINIT_MEM) count++; - if (fn->arg5_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM) count++; return count > 1 ? -EINVAL : 0; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index fa77311dadb2..f883c43c96f3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -76,8 +76,8 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .func = bpf_probe_read, .gpl_only = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_RAW_STACK, - .arg2_type = ARG_CONST_STACK_SIZE, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, }; @@ -109,8 +109,8 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_PTR_TO_STACK, - .arg3_type = ARG_CONST_STACK_SIZE, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, }; static const struct bpf_func_proto *bpf_get_probe_write_proto(void) @@ -213,8 +213,8 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { .func = bpf_trace_printk, .gpl_only = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_STACK, - .arg2_type = ARG_CONST_STACK_SIZE, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, }; const struct bpf_func_proto *bpf_get_trace_printk_proto(void) @@ -329,8 +329,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_STACK, - .arg5_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, }; static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); @@ -492,8 +492,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_STACK, - .arg5_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map, diff --git a/net/core/filter.c b/net/core/filter.c index 1969b3f118c1..f4d16a905754 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1416,8 +1416,8 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_STACK, - .arg4_type = ARG_CONST_STACK_SIZE, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; @@ -1447,8 +1447,8 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_RAW_STACK, - 
.arg4_type = ARG_CONST_STACK_SIZE,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
};
BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
@@ -1601,10 +1601,10 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
.gpl_only = false,
.pkt_access = true,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_STACK,
- .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO,
- .arg3_type = ARG_PTR_TO_STACK,
- .arg4_type = ARG_CONST_STACK_SIZE_OR_ZERO,
+ .arg1_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_CONST_SIZE_OR_ZERO,
.arg5_type = ARG_ANYTHING,
};
@@ -2306,8 +2306,8 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_STACK,
- .arg5_type = ARG_CONST_STACK_SIZE,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE,
};
static unsigned short bpf_tunnel_key_af(u64 flags)
@@ -2377,8 +2377,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_RAW_STACK,
- .arg3_type = ARG_CONST_STACK_SIZE,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_ANYTHING,
};
@@ -2412,8 +2412,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_RAW_STACK,
- .arg3_type = ARG_CONST_STACK_SIZE,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
};
static struct metadata_dst __percpu *md_dst;
@@ -2483,8 +2483,8 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_STACK,
- .arg3_type = ARG_CONST_STACK_SIZE,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_ANYTHING,
};
@@ -2509,8 +2509,8 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_STACK,
- .arg3_type = ARG_CONST_STACK_SIZE,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
};
static const struct bpf_func_proto *
@@ -2593,8 +2593,8 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_STACK,
- .arg5_type = ARG_CONST_STACK_SIZE,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE,
};
static const struct bpf_func_proto *
-- cgit v1.2.3
From 39d3e7584a686541a3295ff1624d341e669e1afc Mon Sep 17 00:00:00 2001
From: Parav Pandit
Date: Tue, 10 Jan 2017 00:02:13 +0000
Subject: rdmacg: Added rdma cgroup controller
Added an rdma cgroup controller that does accounting and limit enforcement on rdma/IB resources. Added an rdma cgroup header file which defines its APIs to perform the charging/uncharging functionality. It also defines APIs for the RDMA/IB stack for device registration. Devices which are registered will participate in the controller's accounting and limit enforcement functions. It defines the rdmacg_device structure to bind the IB stack and the RDMA cgroup controller. RDMA resources are tracked using resource pools. A resource pool is a per-device, per-cgroup entity which allows setting up accounting limits on a per-device basis. Currently the resources are defined by the RDMA cgroup.
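As an illustrative sketch (not part of this patch; the device name is hypothetical and error handling is abbreviated), an IB device driver would use these APIs along the following lines:

	static struct rdmacg_device my_dev = { .name = "hca0" };
	struct rdma_cgroup *cg;

	rdmacg_register_device(&my_dev);
	...
	/* when a task creates an RDMA resource: charge its cgroup */
	if (rdmacg_try_charge(&cg, &my_dev, RDMACG_RESOURCE_HCA_OBJECT))
		goto fail;	/* would exceed a configured rdma.max limit */
	...
	/* when the resource is destroyed: return the charge */
	rdmacg_uncharge(cg, &my_dev, RDMACG_RESOURCE_HCA_OBJECT);
	...
	rdmacg_unregister_device(&my_dev);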
A resource pool is created/destroyed dynamically whenever charging/uncharging occurs and whenever user configuration is done. It's a tradeoff of memory vs. a little more code space: resource pool objects are created whenever necessary, instead of being created at cgroup creation and device registration time.
Signed-off-by: Parav Pandit
Signed-off-by: Tejun Heo
---
include/linux/cgroup_rdma.h | 53 ++++
include/linux/cgroup_subsys.h | 4 +
init/Kconfig | 10 +
kernel/cgroup/Makefile | 1 +
kernel/cgroup/rdma.c | 617 ++++++++++++++++++++++++++++++++++++++++++
5 files changed, 685 insertions(+)
create mode 100644 include/linux/cgroup_rdma.h
create mode 100644 kernel/cgroup/rdma.c
(limited to 'kernel')
diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h
new file mode 100644
index 000000000000..e94290b29e99
--- /dev/null
+++ b/include/linux/cgroup_rdma.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2016 Parav Pandit
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#ifndef _CGROUP_RDMA_H
+#define _CGROUP_RDMA_H
+
+#include
+
+enum rdmacg_resource_type {
+ RDMACG_RESOURCE_HCA_HANDLE,
+ RDMACG_RESOURCE_HCA_OBJECT,
+ RDMACG_RESOURCE_MAX,
+};
+
+#ifdef CONFIG_CGROUP_RDMA
+
+struct rdma_cgroup {
+ struct cgroup_subsys_state css;
+
+ /*
+ * head to keep track of all resource pools
+ * that belongs to this cgroup.
+ */
+ struct list_head rpools;
+};
+
+struct rdmacg_device {
+ struct list_head dev_node;
+ struct list_head rpools;
+ char *name;
+};
+
+/*
+ * APIs for RDMA/IB stack to publish when a device wants to
+ * participate in resource accounting
+ */
+int rdmacg_register_device(struct rdmacg_device *device);
+void rdmacg_unregister_device(struct rdmacg_device *device);
+
+/* APIs for RDMA/IB stack to charge/uncharge pool specific resources */
+int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
+ struct rdmacg_device *device,
+ enum rdmacg_resource_type index);
+void rdmacg_uncharge(struct rdma_cgroup *cg,
+ struct rdmacg_device *device,
+ enum rdmacg_resource_type index);
+#endif /* CONFIG_CGROUP_RDMA */
+#endif /* _CGROUP_RDMA_H */
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 0df0336acee9..d0e597c44585 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -56,6 +56,10 @@ SUBSYS(hugetlb)
SUBSYS(pids)
#endif
+#if IS_ENABLED(CONFIG_CGROUP_RDMA)
+SUBSYS(rdma)
+#endif
+
/*
* The following subsystems are not supported on the default hierarchy.
*/
diff --git a/init/Kconfig b/init/Kconfig
index 223b734abccd..ef80d46a32b6 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1090,6 +1090,16 @@ config CGROUP_PIDS
since the PIDs limit only affects a process's ability to fork, not to attach to a cgroup.
+config CGROUP_RDMA
+ bool "RDMA controller"
+ help
+ Provides enforcement of RDMA resources defined by IB stack.
+ It is fairly easy for consumers to exhaust RDMA resources, which
+ can result into resource unavailability to other consumers.
+ RDMA controller is designed to stop this from happening.
+ Attaching processes with active RDMA resources to the cgroup
+ hierarchy is allowed even if can cross the hierarchy's limit.
+ config CGROUP_FREEZER bool "Freezer controller" help diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 6d42a3211164..387348a40c64 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -2,4 +2,5 @@ obj-y := cgroup.o namespace.o cgroup-v1.o obj-$(CONFIG_CGROUP_FREEZER) += freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o +obj-$(CONFIG_CGROUP_RDMA) += rdma.o obj-$(CONFIG_CPUSETS) += cpuset.o diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c new file mode 100644 index 000000000000..021bee7a9692 --- /dev/null +++ b/kernel/cgroup/rdma.c @@ -0,0 +1,617 @@ +/* + * RDMA resource limiting controller for cgroups. + * + * Used to allow a cgroup hierarchy to stop processes from consuming + * additional RDMA resources after a certain limit is reached. + * + * Copyright (C) 2016 Parav Pandit + * + * This file is subject to the terms and conditions of version 2 of the GNU + * General Public License. See the file COPYING in the main directory of the + * Linux distribution for more details. + */ + +#include +#include +#include +#include +#include +#include + +#define RDMACG_MAX_STR "max" + +/* + * Protects list of resource pools maintained on per cgroup basis + * and rdma device list. + */ +static DEFINE_MUTEX(rdmacg_mutex); +static LIST_HEAD(rdmacg_devices); + +enum rdmacg_file_type { + RDMACG_RESOURCE_TYPE_MAX, + RDMACG_RESOURCE_TYPE_STAT, +}; + +/* + * resource table definition as to be seen by the user. + * Need to add entries to it when more resources are + * added/defined at IB verb/core layer. + */ +static char const *rdmacg_resource_names[] = { + [RDMACG_RESOURCE_HCA_HANDLE] = "hca_handle", + [RDMACG_RESOURCE_HCA_OBJECT] = "hca_object", +}; + +/* resource tracker for each resource of rdma cgroup */ +struct rdmacg_resource { + int max; + int usage; +}; + +/* + * resource pool object which represents per cgroup, per device + * resources. There are multiple instances of this object per cgroup, + * therefore it cannot be embedded within rdma_cgroup structure. It + * is maintained as list. 
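+ * Each pool is linked twice: on the owning cgroup's ->rpools via
+ * cg_node and on the device's ->rpools via dev_node, so it can be
+ * found and freed from either direction.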
+ */ +struct rdmacg_resource_pool { + struct rdmacg_device *device; + struct rdmacg_resource resources[RDMACG_RESOURCE_MAX]; + + struct list_head cg_node; + struct list_head dev_node; + + /* count active user tasks of this pool */ + u64 usage_sum; + /* total number counts which are set to max */ + int num_max_cnt; +}; + +static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css) +{ + return container_of(css, struct rdma_cgroup, css); +} + +static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg) +{ + return css_rdmacg(cg->css.parent); +} + +static inline struct rdma_cgroup *get_current_rdmacg(void) +{ + return css_rdmacg(task_get_css(current, rdma_cgrp_id)); +} + +static void set_resource_limit(struct rdmacg_resource_pool *rpool, + int index, int new_max) +{ + if (new_max == S32_MAX) { + if (rpool->resources[index].max != S32_MAX) + rpool->num_max_cnt++; + } else { + if (rpool->resources[index].max == S32_MAX) + rpool->num_max_cnt--; + } + rpool->resources[index].max = new_max; +} + +static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool) +{ + int i; + + for (i = 0; i < RDMACG_RESOURCE_MAX; i++) + set_resource_limit(rpool, i, S32_MAX); +} + +static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool) +{ + lockdep_assert_held(&rdmacg_mutex); + + list_del(&rpool->cg_node); + list_del(&rpool->dev_node); + kfree(rpool); +} + +static struct rdmacg_resource_pool * +find_cg_rpool_locked(struct rdma_cgroup *cg, + struct rdmacg_device *device) + +{ + struct rdmacg_resource_pool *pool; + + lockdep_assert_held(&rdmacg_mutex); + + list_for_each_entry(pool, &cg->rpools, cg_node) + if (pool->device == device) + return pool; + + return NULL; +} + +static struct rdmacg_resource_pool * +get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device) +{ + struct rdmacg_resource_pool *rpool; + + rpool = find_cg_rpool_locked(cg, device); + if (rpool) + return rpool; + + rpool = kzalloc(sizeof(*rpool), GFP_KERNEL); + if (!rpool) + return ERR_PTR(-ENOMEM); + + rpool->device = device; + set_all_resource_max_limit(rpool); + + INIT_LIST_HEAD(&rpool->cg_node); + INIT_LIST_HEAD(&rpool->dev_node); + list_add_tail(&rpool->cg_node, &cg->rpools); + list_add_tail(&rpool->dev_node, &device->rpools); + return rpool; +} + +/** + * uncharge_cg_locked - uncharge resource for rdma cgroup + * @cg: pointer to cg to uncharge and all parents in hierarchy + * @device: pointer to rdmacg device + * @index: index of the resource to uncharge in cg (resource pool) + * + * It also frees the resource pool which was created as part of + * charging operation when there are no resources attached to + * resource pool. + */ +static void +uncharge_cg_locked(struct rdma_cgroup *cg, + struct rdmacg_device *device, + enum rdmacg_resource_type index) +{ + struct rdmacg_resource_pool *rpool; + + rpool = find_cg_rpool_locked(cg, device); + + /* + * rpool cannot be null at this stage. Let kernel operate in case + * if there a bug in IB stack or rdma controller, instead of crashing + * the system. + */ + if (unlikely(!rpool)) { + pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device); + return; + } + + rpool->resources[index].usage--; + + /* + * A negative count (or overflow) is invalid, + * it indicates a bug in the rdma controller. + */ + WARN_ON_ONCE(rpool->resources[index].usage < 0); + rpool->usage_sum--; + if (rpool->usage_sum == 0 && + rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { + /* + * No user of the rpool and all entries are set to max, so + * safe to delete this rpool. 
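+ * Both counters are only updated under rdmacg_mutex, which is
+ * held here, so this check cannot race with a new charge.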
+ */ + free_cg_rpool_locked(rpool); + } +} + +/** + * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count + * @device: pointer to rdmacg device + * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup + * stop uncharging + * @index: index of the resource to uncharge in cg in given resource pool + */ +static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg, + struct rdmacg_device *device, + struct rdma_cgroup *stop_cg, + enum rdmacg_resource_type index) +{ + struct rdma_cgroup *p; + + mutex_lock(&rdmacg_mutex); + + for (p = cg; p != stop_cg; p = parent_rdmacg(p)) + uncharge_cg_locked(p, device, index); + + mutex_unlock(&rdmacg_mutex); + + css_put(&cg->css); +} + +/** + * rdmacg_uncharge - hierarchically uncharge rdma resource count + * @device: pointer to rdmacg device + * @index: index of the resource to uncharge in cgroup in given resource pool + */ +void rdmacg_uncharge(struct rdma_cgroup *cg, + struct rdmacg_device *device, + enum rdmacg_resource_type index) +{ + if (index >= RDMACG_RESOURCE_MAX) + return; + + rdmacg_uncharge_hierarchy(cg, device, NULL, index); +} +EXPORT_SYMBOL(rdmacg_uncharge); + +/** + * rdmacg_try_charge - hierarchically try to charge the rdma resource + * @rdmacg: pointer to rdma cgroup which will own this resource + * @device: pointer to rdmacg device + * @index: index of the resource to charge in cgroup (resource pool) + * + * This function follows charging resource in hierarchical way. + * It will fail if the charge would cause the new value to exceed the + * hierarchical limit. + * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL. + * Returns pointer to rdmacg for this resource when charging is successful. + * + * Charger needs to account resources on two criteria. + * (a) per cgroup & (b) per device resource usage. + * Per cgroup resource usage ensures that tasks of cgroup doesn't cross + * the configured limits. Per device provides granular configuration + * in multi device usage. It allocates resource pool in the hierarchy + * for each parent it come across for first resource. Later on resource + * pool will be available. Therefore it will be much faster thereon + * to charge/uncharge. + */ +int rdmacg_try_charge(struct rdma_cgroup **rdmacg, + struct rdmacg_device *device, + enum rdmacg_resource_type index) +{ + struct rdma_cgroup *cg, *p; + struct rdmacg_resource_pool *rpool; + s64 new; + int ret = 0; + + if (index >= RDMACG_RESOURCE_MAX) + return -EINVAL; + + /* + * hold on to css, as cgroup can be removed but resource + * accounting happens on css. + */ + cg = get_current_rdmacg(); + + mutex_lock(&rdmacg_mutex); + for (p = cg; p; p = parent_rdmacg(p)) { + rpool = get_cg_rpool_locked(p, device); + if (IS_ERR(rpool)) { + ret = PTR_ERR(rpool); + goto err; + } else { + new = rpool->resources[index].usage + 1; + if (new > rpool->resources[index].max) { + ret = -EAGAIN; + goto err; + } else { + rpool->resources[index].usage = new; + rpool->usage_sum++; + } + } + } + mutex_unlock(&rdmacg_mutex); + + *rdmacg = cg; + return 0; + +err: + mutex_unlock(&rdmacg_mutex); + rdmacg_uncharge_hierarchy(cg, device, p, index); + return ret; +} +EXPORT_SYMBOL(rdmacg_try_charge); + +/** + * rdmacg_register_device - register rdmacg device to rdma controller. + * @device: pointer to rdmacg device whose resources need to be accounted. 
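+ * The caller must set @device->name before registering; writes to
+ * the "rdma.max" file look the device up by this name.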
+ * + * If IB stack wish a device to participate in rdma cgroup resource + * tracking, it must invoke this API to register with rdma cgroup before + * any user space application can start using the RDMA resources. + * Returns 0 on success or EINVAL when table length given is beyond + * supported size. + */ +int rdmacg_register_device(struct rdmacg_device *device) +{ + INIT_LIST_HEAD(&device->dev_node); + INIT_LIST_HEAD(&device->rpools); + + mutex_lock(&rdmacg_mutex); + list_add_tail(&device->dev_node, &rdmacg_devices); + mutex_unlock(&rdmacg_mutex); + return 0; +} +EXPORT_SYMBOL(rdmacg_register_device); + +/** + * rdmacg_unregister_device - unregister rdmacg device from rdma controller. + * @device: pointer to rdmacg device which was previously registered with rdma + * controller using rdmacg_register_device(). + * + * IB stack must invoke this after all the resources of the IB device + * are destroyed and after ensuring that no more resources will be created + * when this API is invoked. + */ +void rdmacg_unregister_device(struct rdmacg_device *device) +{ + struct rdmacg_resource_pool *rpool, *tmp; + + /* + * Synchronize with any active resource settings, + * usage query happening via configfs. + */ + mutex_lock(&rdmacg_mutex); + list_del_init(&device->dev_node); + + /* + * Now that this device is off the cgroup list, its safe to free + * all the rpool resources. + */ + list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node) + free_cg_rpool_locked(rpool); + + mutex_unlock(&rdmacg_mutex); +} +EXPORT_SYMBOL(rdmacg_unregister_device); + +static int parse_resource(char *c, int *intval) +{ + substring_t argstr; + const char **table = &rdmacg_resource_names[0]; + char *name, *value = c; + size_t len; + int ret, i = 0; + + name = strsep(&value, "="); + if (!name || !value) + return -EINVAL; + + len = strlen(value); + + for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { + if (strcmp(table[i], name)) + continue; + + argstr.from = value; + argstr.to = value + len; + + ret = match_int(&argstr, intval); + if (ret >= 0) { + if (*intval < 0) + break; + return i; + } + if (strncmp(value, RDMACG_MAX_STR, len) == 0) { + *intval = S32_MAX; + return i; + } + break; + } + return -EINVAL; +} + +static int rdmacg_parse_limits(char *options, + int *new_limits, unsigned long *enables) +{ + char *c; + int err = -EINVAL; + + /* parse resource options */ + while ((c = strsep(&options, " ")) != NULL) { + int index, intval; + + index = parse_resource(c, &intval); + if (index < 0) + goto err; + + new_limits[index] = intval; + *enables |= BIT(index); + } + return 0; + +err: + return err; +} + +static struct rdmacg_device *rdmacg_get_device_locked(const char *name) +{ + struct rdmacg_device *device; + + lockdep_assert_held(&rdmacg_mutex); + + list_for_each_entry(device, &rdmacg_devices, dev_node) + if (!strcmp(name, device->name)) + return device; + + return NULL; +} + +static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdma_cgroup *cg = css_rdmacg(of_css(of)); + const char *dev_name; + struct rdmacg_resource_pool *rpool; + struct rdmacg_device *device; + char *options = strstrip(buf); + int *new_limits; + unsigned long enables = 0; + int i = 0, ret = 0; + + /* extract the device name first */ + dev_name = strsep(&options, " "); + if (!dev_name) { + ret = -EINVAL; + goto err; + } + + new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL); + if (!new_limits) { + ret = -ENOMEM; + goto err; + } + + ret = rdmacg_parse_limits(options, 
new_limits, &enables); + if (ret) + goto parse_err; + + /* acquire lock to synchronize with hot plug devices */ + mutex_lock(&rdmacg_mutex); + + device = rdmacg_get_device_locked(dev_name); + if (!device) { + ret = -ENODEV; + goto dev_err; + } + + rpool = get_cg_rpool_locked(cg, device); + if (IS_ERR(rpool)) { + ret = PTR_ERR(rpool); + goto dev_err; + } + + /* now set the new limits of the rpool */ + for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX) + set_resource_limit(rpool, i, new_limits[i]); + + if (rpool->usage_sum == 0 && + rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { + /* + * No user of the rpool and all entries are set to max, so + * safe to delete this rpool. + */ + free_cg_rpool_locked(rpool); + } + +dev_err: + mutex_unlock(&rdmacg_mutex); + +parse_err: + kfree(new_limits); + +err: + return ret ?: nbytes; +} + +static void print_rpool_values(struct seq_file *sf, + struct rdmacg_resource_pool *rpool) +{ + enum rdmacg_file_type sf_type; + int i; + u32 value; + + sf_type = seq_cft(sf)->private; + + for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { + seq_puts(sf, rdmacg_resource_names[i]); + seq_putc(sf, '='); + if (sf_type == RDMACG_RESOURCE_TYPE_MAX) { + if (rpool) + value = rpool->resources[i].max; + else + value = S32_MAX; + } else { + if (rpool) + value = rpool->resources[i].usage; + } + + if (value == S32_MAX) + seq_puts(sf, RDMACG_MAX_STR); + else + seq_printf(sf, "%d", value); + seq_putc(sf, ' '); + } +} + +static int rdmacg_resource_read(struct seq_file *sf, void *v) +{ + struct rdmacg_device *device; + struct rdmacg_resource_pool *rpool; + struct rdma_cgroup *cg = css_rdmacg(seq_css(sf)); + + mutex_lock(&rdmacg_mutex); + + list_for_each_entry(device, &rdmacg_devices, dev_node) { + seq_printf(sf, "%s ", device->name); + + rpool = find_cg_rpool_locked(cg, device); + print_rpool_values(sf, rpool); + + seq_putc(sf, '\n'); + } + + mutex_unlock(&rdmacg_mutex); + return 0; +} + +static struct cftype rdmacg_files[] = { + { + .name = "max", + .write = rdmacg_resource_set_max, + .seq_show = rdmacg_resource_read, + .private = RDMACG_RESOURCE_TYPE_MAX, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "current", + .seq_show = rdmacg_resource_read, + .private = RDMACG_RESOURCE_TYPE_STAT, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { } /* terminate */ +}; + +static struct cgroup_subsys_state * +rdmacg_css_alloc(struct cgroup_subsys_state *parent) +{ + struct rdma_cgroup *cg; + + cg = kzalloc(sizeof(*cg), GFP_KERNEL); + if (!cg) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&cg->rpools); + return &cg->css; +} + +static void rdmacg_css_free(struct cgroup_subsys_state *css) +{ + struct rdma_cgroup *cg = css_rdmacg(css); + + kfree(cg); +} + +/** + * rdmacg_css_offline - cgroup css_offline callback + * @css: css of interest + * + * This function is called when @css is about to go away and responsible + * for shooting down all rdmacg associated with @css. As part of that it + * marks all the resource pool entries to max value, so that when resources are + * uncharged, associated resource pool can be freed as well. 
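+ * (The actual freeing happens in uncharge_cg_locked() once usage_sum
+ * drops to zero and every limit is back at max.)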
+ */
+static void rdmacg_css_offline(struct cgroup_subsys_state *css)
+{
+ struct rdma_cgroup *cg = css_rdmacg(css);
+ struct rdmacg_resource_pool *rpool;
+
+ mutex_lock(&rdmacg_mutex);
+
+ list_for_each_entry(rpool, &cg->rpools, cg_node)
+ set_all_resource_max_limit(rpool);
+
+ mutex_unlock(&rdmacg_mutex);
+}
+
+struct cgroup_subsys rdma_cgrp_subsys = {
+ .css_alloc = rdmacg_css_alloc,
+ .css_free = rdmacg_css_free,
+ .css_offline = rdmacg_css_offline,
+ .legacy_cftypes = rdmacg_files,
+ .dfl_cftypes = rdmacg_files,
+};
-- cgit v1.2.3
From 7896dfb0a6180beb51bf34b32c9427984cb052c4 Mon Sep 17 00:00:00 2001
From: Parav Pandit
Date: Tue, 10 Jan 2017 17:51:48 +0000
Subject: rdmacg: Fixed uninitialized current resource usage
Fixed warning reported by kbuild test robot. When reading the current resource usage value while no resources are allocated, it's possible to report an uninitialized value for the current resource usage. This fix avoids that by initializing the value to zero, since no resource is allocated.
Signed-off-by: Parav Pandit
Signed-off-by: Tejun Heo
---
kernel/cgroup/rdma.c | 2 ++
1 file changed, 2 insertions(+)
(limited to 'kernel')
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index 021bee7a9692..defad3c5e7dc 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -518,6 +518,8 @@ static void print_rpool_values(struct seq_file *sf,
} else {
if (rpool)
value = rpool->resources[i].usage;
+ else
+ value = 0;
}
if (value == S32_MAX)
-- cgit v1.2.3
From a5ef01aaac245d37edf113d65b0c146e96d841d1 Mon Sep 17 00:00:00 2001
From: Tobias Klauser
Date: Tue, 10 Jan 2017 15:02:07 +0100
Subject: bpf: Remove unused but set variable in __bpf_lru_list_shrink_inactive()
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit
Remove the unused but set variable 'first_node' in __bpf_lru_list_shrink_inactive() to fix the following GCC warning when building with 'W=1':
kernel/bpf/bpf_lru_list.c:216:41: warning: variable ‘first_node’ set but not used [-Wunused-but-set-variable]
Cc: Martin KaFai Lau
Signed-off-by: Tobias Klauser
Acked-by: Martin KaFai Lau
Signed-off-by: David S.
Miller
---
kernel/bpf/bpf_lru_list.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
(limited to 'kernel')
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index 89b7ef41c86b..d78501ee0609 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -213,11 +213,10 @@ __bpf_lru_list_shrink_inactive(struct bpf_lru *lru,
enum bpf_lru_list_type tgt_free_type)
{
struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE];
- struct bpf_lru_node *node, *tmp_node, *first_node;
+ struct bpf_lru_node *node, *tmp_node;
unsigned int nshrinked = 0;
unsigned int i = 0;
- first_node = list_first_entry(inactive, struct bpf_lru_node, list);
list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) {
if (bpf_lru_node_is_ref(node)) {
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
-- cgit v1.2.3
From 3bf003335ba356aac5a43e28640159d4ae8a2a60 Mon Sep 17 00:00:00 2001
From: Tobias Klauser
Date: Tue, 10 Jan 2017 15:02:16 +0100
Subject: bpf: Make unnecessarily global functions static
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit
Make the functions __local_list_pop_free(), __local_list_pop_pending(), bpf_common_lru_populate() and bpf_percpu_lru_populate() static as they are not used outside of bpf_lru_list.c.
This fixes the following GCC warnings when building with 'W=1':
kernel/bpf/bpf_lru_list.c:363:22: warning: no previous prototype for ‘__local_list_pop_free’ [-Wmissing-prototypes]
kernel/bpf/bpf_lru_list.c:376:22: warning: no previous prototype for ‘__local_list_pop_pending’ [-Wmissing-prototypes]
kernel/bpf/bpf_lru_list.c:560:6: warning: no previous prototype for ‘bpf_common_lru_populate’ [-Wmissing-prototypes]
kernel/bpf/bpf_lru_list.c:577:6: warning: no previous prototype for ‘bpf_percpu_lru_populate’ [-Wmissing-prototypes]
Cc: Martin KaFai Lau
Signed-off-by: Tobias Klauser
Acked-by: Alexei Starovoitov
Acked-by: Martin KaFai Lau
Signed-off-by: David S.
Miller --- kernel/bpf/bpf_lru_list.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index d78501ee0609..f62d1d56f41d 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -360,7 +360,8 @@ static void __local_list_add_pending(struct bpf_lru *lru, list_add(&node->list, local_pending_list(loc_l)); } -struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l) +static struct bpf_lru_node * +__local_list_pop_free(struct bpf_lru_locallist *loc_l) { struct bpf_lru_node *node; @@ -373,8 +374,8 @@ struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l) return node; } -struct bpf_lru_node *__local_list_pop_pending(struct bpf_lru *lru, - struct bpf_lru_locallist *loc_l) +static struct bpf_lru_node * +__local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l) { struct bpf_lru_node *node; bool force = false; @@ -557,8 +558,9 @@ void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) bpf_common_lru_push_free(lru, node); } -void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, - u32 elem_size, u32 nr_elems) +static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, + u32 node_offset, u32 elem_size, + u32 nr_elems) { struct bpf_lru_list *l = &lru->common_lru.lru_list; u32 i; @@ -574,8 +576,9 @@ void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, } } -void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, - u32 elem_size, u32 nr_elems) +static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, + u32 node_offset, u32 elem_size, + u32 nr_elems) { u32 i, pcpu_entries; int cpu; -- cgit v1.2.3 From b6e92aa81038ce57d298a87b4c44285a1d456c3e Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 10 Jan 2017 13:35:43 -0800 Subject: kexec: Switch to __pa_symbol __pa_symbol is the correct api to get the physical address of kernel symbols. Switch to it to allow for better debug checking. Reviewed-by: Mark Rutland Tested-by: Mark Rutland Acked-by: "Eric W. Biederman" Signed-off-by: Laura Abbott Signed-off-by: Will Deacon --- kernel/kexec_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 5617cc412444..a01974e1bf6b 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1399,7 +1399,7 @@ void __weak arch_crash_save_vmcoreinfo(void) phys_addr_t __weak paddr_vmcoreinfo_note(void) { - return __pa((unsigned long)(char *)&vmcoreinfo_note); + return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); } static int __init crash_save_vmcoreinfo_init(void) -- cgit v1.2.3 From 607904c357c61adf20b8fd18af765e501d61a385 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 9 Jan 2017 10:26:52 -0500 Subject: locking/spinlocks: Remove the unused spin_lock_bh_nested() API The spin_lock_bh_nested() API is defined but is not used anywhere in the kernel. So all spin_lock_bh_nested() and related APIs are now removed. 
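For reference, a hypothetical caller (none exists in-tree, which is why the API is being removed) written as

	spin_lock_bh_nested(&lock, SINGLE_DEPTH_NESTING);

would simply become spin_lock_bh(&lock): in the !CONFIG_DEBUG_LOCK_ALLOC case the nested variant was already defined as the plain BH-disabling lock, as the removed lines below show.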
Signed-off-by: Waiman Long Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1483975612-16447-1-git-send-email-longman@redhat.com Signed-off-by: Ingo Molnar --- include/linux/spinlock.h | 8 -------- include/linux/spinlock_api_smp.h | 2 -- include/linux/spinlock_api_up.h | 1 - kernel/locking/spinlock.c | 8 -------- 4 files changed, 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 47dd0cebd204..59248dcc6ef3 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -180,8 +180,6 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) #ifdef CONFIG_DEBUG_LOCK_ALLOC # define raw_spin_lock_nested(lock, subclass) \ _raw_spin_lock_nested(lock, subclass) -# define raw_spin_lock_bh_nested(lock, subclass) \ - _raw_spin_lock_bh_nested(lock, subclass) # define raw_spin_lock_nest_lock(lock, nest_lock) \ do { \ @@ -197,7 +195,6 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) # define raw_spin_lock_nested(lock, subclass) \ _raw_spin_lock(((void)(subclass), (lock))) # define raw_spin_lock_nest_lock(lock, nest_lock) _raw_spin_lock(lock) -# define raw_spin_lock_bh_nested(lock, subclass) _raw_spin_lock_bh(lock) #endif #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) @@ -317,11 +314,6 @@ do { \ raw_spin_lock_nested(spinlock_check(lock), subclass); \ } while (0) -#define spin_lock_bh_nested(lock, subclass) \ -do { \ - raw_spin_lock_bh_nested(spinlock_check(lock), subclass);\ -} while (0) - #define spin_lock_nest_lock(lock, nest_lock) \ do { \ raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock); \ diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index 5344268e6e62..42dfab89e740 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -22,8 +22,6 @@ int in_lock_functions(unsigned long addr); void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) __acquires(lock); void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) __acquires(lock); -void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass) - __acquires(lock); void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock, struct lockdep_map *map) __acquires(lock); diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h index d3afef9d8dbe..d0d188861ad6 100644 --- a/include/linux/spinlock_api_up.h +++ b/include/linux/spinlock_api_up.h @@ -57,7 +57,6 @@ #define _raw_spin_lock(lock) __LOCK(lock) #define _raw_spin_lock_nested(lock, subclass) __LOCK(lock) -#define _raw_spin_lock_bh_nested(lock, subclass) __LOCK(lock) #define _raw_read_lock(lock) __LOCK(lock) #define _raw_write_lock(lock) __LOCK(lock) #define _raw_spin_lock_bh(lock) __LOCK_BH(lock) diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index db3ccb1dd614..4b082b5cac9e 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -363,14 +363,6 @@ void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) } EXPORT_SYMBOL(_raw_spin_lock_nested); -void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass) -{ - __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); - spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); -} -EXPORT_SYMBOL(_raw_spin_lock_bh_nested); - unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, int subclass) { -- cgit v1.2.3 From 
75437bb304b20a2b350b9a8e9f9238d5e24e12ba Mon Sep 17 00:00:00 2001
From: Pan Xinhui
Date: Tue, 10 Jan 2017 02:56:46 -0500
Subject: locking/pvqspinlock: Don't wait if vCPU is preempted
If the prev node is not in running state or its vCPU is preempted, we can give up our vCPU slices in pv_wait_node() ASAP.
Signed-off-by: Pan Xinhui
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: longman@redhat.com
Link: http://lkml.kernel.org/r/1484035006-6787-1-git-send-email-xinhui.pan@linux.vnet.ibm.com
[ Fixed typos in the changelog, removed ugly linebreak from the code. ]
Signed-off-by: Ingo Molnar
---
kernel/locking/qspinlock_paravirt.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'kernel')
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index e3b5520005db..e6b2f7ad3e51 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -263,7 +263,7 @@ pv_wait_early(struct pv_node *prev, int loop)
if ((loop & PV_PREV_CHECK_MASK) != 0)
return false;
- return READ_ONCE(prev->state) != vcpu_running;
+ return READ_ONCE(prev->state) != vcpu_running || vcpu_is_preempted(prev->cpu);
}
/*
-- cgit v1.2.3
From 6b8cc1d11ef75c5b9c530b3d0d148f3c2dd25f93 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Thu, 12 Jan 2017 11:51:32 +0100
Subject: bpf: pass original insn directly to convert_ctx_access
Currently, when calling the convert_ctx_access() callback for the various program types, we pass in insn->dst_reg, insn->src_reg, insn->off from the original instruction. This information is needed to rewrite the instruction that is based on the user ctx structure into a kernel representation for the ctx. As we'd like to allow access sizes beyond just BPF_W, we'd also need insn->code for that in order to decode the original access size. Given that, let's just pass insn directly to the convert_ctx_access() callback and work on that to not clutter the callback with even more arguments we need to pass when everything is already contained in insn. So let's go through that once, no functional change.
Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S.
Miller --- include/linux/bpf.h | 7 ++- kernel/bpf/verifier.c | 3 +- kernel/trace/bpf_trace.c | 15 ++--- net/core/filter.c | 139 +++++++++++++++++++++++++---------------------- 4 files changed, 87 insertions(+), 77 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 94ea8d2383e6..f8c3560b01db 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -161,9 +161,10 @@ struct bpf_verifier_ops { enum bpf_reg_type *reg_type); int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, const struct bpf_prog *prog); - u32 (*convert_ctx_access)(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, - struct bpf_insn *insn, struct bpf_prog *prog); + u32 (*convert_ctx_access)(enum bpf_access_type type, + const struct bpf_insn *src, + struct bpf_insn *dst, + struct bpf_prog *prog); }; struct bpf_prog_type_list { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2efdc9128e3c..df7e47244e75 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3177,8 +3177,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX) continue; - cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg, - insn->off, insn_buf, env->prog); + cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose("bpf verifier is misconfigured\n"); return -EINVAL; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f883c43c96f3..1860e7f1e5a8 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -572,28 +572,29 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type return true; } -static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, +static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; - switch (ctx_off) { + switch (si->off) { case offsetof(struct bpf_perf_event_data, sample_period): BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, - data), dst_reg, src_reg, + data), si->dst_reg, si->src_reg, offsetof(struct bpf_perf_event_data_kern, data)); - *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg, + *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, offsetof(struct perf_sample_data, period)); break; default: *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, - regs), dst_reg, src_reg, + regs), si->dst_reg, si->src_reg, offsetof(struct bpf_perf_event_data_kern, regs)); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), si->dst_reg, si->dst_reg, + si->off); break; } diff --git a/net/core/filter.c b/net/core/filter.c index f4d16a905754..8cfbdefbfb1c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2972,32 +2972,33 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); -static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, +static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; + int off; - switch (ctx_off) { + switch (si->off) { case offsetof(struct __sk_buff, len): 
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, len)); break; case offsetof(struct __sk_buff, protocol): BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); - *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, offsetof(struct sk_buff, protocol)); break; case offsetof(struct __sk_buff, vlan_proto): BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); - *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, offsetof(struct sk_buff, vlan_proto)); break; @@ -3005,17 +3006,17 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, priority)); else - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, priority)); break; case offsetof(struct __sk_buff, ingress_ifindex): BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4); - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, skb_iif)); break; @@ -3023,17 +3024,17 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), - dst_reg, src_reg, + si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); - *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg, + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct net_device, ifindex)); break; case offsetof(struct __sk_buff, hash): BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, hash)); break; @@ -3041,63 +3042,74 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, mark)); else - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, mark)); break; case offsetof(struct __sk_buff, pkt_type): - return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn); + return convert_skb_access(SKF_AD_PKTTYPE, si->dst_reg, + si->src_reg, insn); case offsetof(struct __sk_buff, queue_mapping): - return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn); + return convert_skb_access(SKF_AD_QUEUE, si->dst_reg, + si->src_reg, insn); case offsetof(struct __sk_buff, vlan_present): return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, - dst_reg, src_reg, insn); + si->dst_reg, si->src_reg, insn); case offsetof(struct __sk_buff, vlan_tci): return convert_skb_access(SKF_AD_VLAN_TAG, - dst_reg, src_reg, insn); + si->dst_reg, si->src_reg, insn); case offsetof(struct __sk_buff, cb[0]) ... 
offsetof(struct __sk_buff, cb[4]): BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); prog->cb_access = 1; - ctx_off -= offsetof(struct __sk_buff, cb[0]); - ctx_off += offsetof(struct sk_buff, cb); - ctx_off += offsetof(struct qdisc_skb_cb, data); + off = si->off; + off -= offsetof(struct __sk_buff, cb[0]); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct qdisc_skb_cb, data); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off); + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, + si->src_reg, off); else - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, + si->src_reg, off); break; case offsetof(struct __sk_buff, tc_classid): - ctx_off -= offsetof(struct __sk_buff, tc_classid); - ctx_off += offsetof(struct sk_buff, cb); - ctx_off += offsetof(struct qdisc_skb_cb, tc_classid); + BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, tc_classid) != 2); + + off = si->off; + off -= offsetof(struct __sk_buff, tc_classid); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct qdisc_skb_cb, tc_classid); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off); + *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, + si->src_reg, off); else - *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, + si->src_reg, off); break; case offsetof(struct __sk_buff, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), - dst_reg, src_reg, + si->dst_reg, si->src_reg, offsetof(struct sk_buff, data)); break; case offsetof(struct __sk_buff, data_end): - ctx_off -= offsetof(struct __sk_buff, data_end); - ctx_off += offsetof(struct sk_buff, cb); - ctx_off += offsetof(struct bpf_skb_data_end, data_end); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg, - ctx_off); + off = si->off; + off -= offsetof(struct __sk_buff, data_end); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct bpf_skb_data_end, data_end); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, + si->src_reg, off); break; case offsetof(struct __sk_buff, tc_index): @@ -3105,110 +3117,107 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, + *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, offsetof(struct sk_buff, tc_index)); else - *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, offsetof(struct sk_buff, tc_index)); - break; #else if (type == BPF_WRITE) - *insn++ = BPF_MOV64_REG(dst_reg, dst_reg); + *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg); else - *insn++ = BPF_MOV64_IMM(dst_reg, 0); - break; + *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #endif + break; } return insn - insn_buf; } static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, - int dst_reg, int src_reg, - int ctx_off, + const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; - switch (ctx_off) { + switch (si->off) { case offsetof(struct bpf_sock, bound_dev_if): BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_bound_dev_if)); else - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, 
offsetof(struct sock, sk_bound_dev_if)); break; case offsetof(struct bpf_sock, family): BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2); - *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, offsetof(struct sock, sk_family)); break; case offsetof(struct bpf_sock, type): - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, __sk_flags_offset)); - *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_TYPE_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_TYPE_SHIFT); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); break; case offsetof(struct bpf_sock, protocol): - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, __sk_flags_offset)); - *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_PROTO_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_PROTO_SHIFT); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); break; } return insn - insn_buf; } -static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, +static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; - switch (ctx_off) { + switch (si->off) { case offsetof(struct __sk_buff, ifindex): BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), - dst_reg, src_reg, + si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct net_device, ifindex)); break; default: - return sk_filter_convert_ctx_access(type, dst_reg, src_reg, - ctx_off, insn_buf, prog); + return sk_filter_convert_ctx_access(type, si, insn_buf, prog); } return insn - insn_buf; } -static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, +static u32 xdp_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; - switch (ctx_off) { + switch (si->off) { case offsetof(struct xdp_md, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data), - dst_reg, src_reg, + si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data)); break; case offsetof(struct xdp_md, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), - dst_reg, src_reg, + si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data_end)); break; } -- cgit v1.2.3 From 62c7989b24dbd348c2507ee6458ebf5637d6ddb5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 12 Jan 2017 11:51:33 +0100 Subject: bpf: allow b/h/w/dw access for bpf's cb in ctx When structs are used to store temporary state in the cb[] buffer, shared within a program and across tail calls, the generated code will not always access the buffer in BPF_W chunks. We can ease programming and make this behave more naturally by allowing aligned b/h/w/dw sized accesses to the cb[] ctx member. Various test cases for the selftest suite are attached as well. Potentially, this can also be reused by other program types to pass data around.
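To illustrate what this enables from the program side, here is a minimal sketch, assuming a clang BPF target and a hand-rolled SEC() macro; the struct layout and all names below are hypothetical, only the mixed b/h/w/dw access sizes to the 20-byte cb[] area are the point. Accesses must be naturally aligned and must not extend beyond cb[4], which in practice limits 8-byte accesses to cb[0] and cb[2]:

#include <linux/bpf.h>

#define SEC(name) __attribute__((section(name), used))

/* hypothetical scratch state overlaid on skb->cb[] (16 of 20 bytes) */
struct scratch {
	__u8  flags;	/* 1-byte (BPF_B) accesses */
	__u8  unused;
	__u16 proto;	/* 2-byte (BPF_H) accesses, 2-byte aligned */
	__u32 mark;	/* 4-byte (BPF_W) accesses, 4-byte aligned */
	__u64 cookie;	/* 8-byte (BPF_DW) accesses, lands on cb[2] */
};

SEC("classifier")
int cb_sizes_example(struct __sk_buff *skb)
{
	struct scratch *s = (struct scratch *)&skb->cb[0];

	/* register stores of b/h/w/dw size into the ctx cb[] area */
	s->flags  = (__u8)skb->len;
	s->proto  = (__u16)skb->protocol;
	s->mark   = skb->mark;
	s->cookie = ((__u64)skb->mark << 32) | skb->len;

	/* and a byte-sized load back out of it */
	return s->flags;
}

Before this change, all but the 4-byte accesses above would have been rejected with "invalid bpf_context access", the errstr the new selftests below check for.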
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 8 +- net/core/filter.c | 41 ++- tools/testing/selftests/bpf/test_verifier.c | 442 +++++++++++++++++++++++++++- 3 files changed, 478 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index df7e47244e75..d60e12c67266 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3165,10 +3165,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn = env->prog->insnsi + delta; for (i = 0; i < insn_cnt; i++, insn++) { - if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) || + if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || + insn->code == (BPF_LDX | BPF_MEM | BPF_H) || + insn->code == (BPF_LDX | BPF_MEM | BPF_W) || insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) type = BPF_READ; - else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) || + else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || + insn->code == (BPF_STX | BPF_MEM | BPF_H) || + insn->code == (BPF_STX | BPF_MEM | BPF_W) || insn->code == (BPF_STX | BPF_MEM | BPF_DW)) type = BPF_WRITE; else diff --git a/net/core/filter.c b/net/core/filter.c index 8cfbdefbfb1c..90383860e224 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2776,11 +2776,33 @@ static bool __is_valid_access(int off, int size) { if (off < 0 || off >= sizeof(struct __sk_buff)) return false; + /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; - if (size != sizeof(__u32)) - return false; + + switch (off) { + case offsetof(struct __sk_buff, cb[0]) ... + offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: + if (size == sizeof(__u16) && + off > offsetof(struct __sk_buff, cb[4]) + sizeof(__u16)) + return false; + if (size == sizeof(__u32) && + off > offsetof(struct __sk_buff, cb[4])) + return false; + if (size == sizeof(__u64) && + off > offsetof(struct __sk_buff, cb[2])) + return false; + if (size != sizeof(__u8) && + size != sizeof(__u16) && + size != sizeof(__u32) && + size != sizeof(__u64)) + return false; + break; + default: + if (size != sizeof(__u32)) + return false; + } return true; } @@ -2799,7 +2821,7 @@ static bool sk_filter_is_valid_access(int off, int size, if (type == BPF_WRITE) { switch (off) { case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: break; default: return false; @@ -2823,7 +2845,7 @@ static bool lwt_is_valid_access(int off, int size, case offsetof(struct __sk_buff, mark): case offsetof(struct __sk_buff, priority): case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: break; default: return false; @@ -2915,7 +2937,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, case offsetof(struct __sk_buff, tc_index): case offsetof(struct __sk_buff, priority): case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: case offsetof(struct __sk_buff, tc_classid): break; default: @@ -3066,8 +3088,11 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, insn); case offsetof(struct __sk_buff, cb[0]) ... 
- offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); + BUILD_BUG_ON((offsetof(struct sk_buff, cb) + + offsetof(struct qdisc_skb_cb, data)) % + sizeof(__u64)); prog->cb_access = 1; off = si->off; @@ -3075,10 +3100,10 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, off += offsetof(struct sk_buff, cb); off += offsetof(struct qdisc_skb_cb, data); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, + *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg, si->src_reg, off); else - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, + *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, si->src_reg, off); break; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 9bb45346dc72..1aa73241c999 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -859,15 +859,451 @@ static struct bpf_test tests[] = { .result = REJECT, }, { - "check non-u32 access to cb", + "check cb access: byte", .insns = { - BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_1, + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0])), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0]) + 1), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0]) + 2), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0]) + 3), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[1])), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[1]) + 1), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[1]) + 2), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[1]) + 3), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[2])), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[2]) + 1), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[2]) + 2), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[2]) + 3), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[3])), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[3]) + 1), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[3]) + 2), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[3]) + 3), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4])), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4]) + 1), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4]) + 2), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4]) + 3), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0])), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0]) + 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0]) + 2), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0]) + 3), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[1])), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[1]) + 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[1]) + 2), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, 
cb[1]) + 3), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[2])), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[2]) + 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[2]) + 2), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[2]) + 3), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[3])), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[3]) + 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[3]) + 2), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[3]) + 3), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[4])), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[4]) + 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[4]) + 2), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[4]) + 3), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "check cb access: byte, oob 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4]) + 4), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: byte, oob 2", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0]) - 1), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: byte, oob 3", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[4]) + 4), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: byte, oob 4", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0]) - 1), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: byte, wrong type", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0])), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + }, + { + "check cb access: half", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0])), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0]) + 2), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[1])), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[1]) + 2), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[2])), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[2]) + 2), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[3])), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[3]) + 2), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4])), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4]) + 2), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0])), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0]) + 2), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[1])), 
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[1]) + 2), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[2])), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[2]) + 2), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[3])), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[3]) + 2), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[4])), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[4]) + 2), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "check cb access: half, unaligned", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0]) + 1), + BPF_EXIT_INSN(), + }, + .errstr = "misaligned access", + .result = REJECT, + }, + { + "check cb access: half, oob 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4]) + 4), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: half, oob 2", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0]) - 2), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: half, oob 3", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[4]) + 4), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: half, oob 4", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0]) - 2), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: half, wrong type", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0])), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + }, + { + "check cb access: word", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0])), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[1])), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[2])), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[3])), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4])), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0])), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[1])), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[2])), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[3])), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[4])), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "check cb access: word, unaligned 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0]) + 2), + BPF_EXIT_INSN(), + }, + .errstr = "misaligned access", + .result = REJECT, + }, + { + "check cb access: word, unaligned 2", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + 
offsetof(struct __sk_buff, cb[4]) + 1), + BPF_EXIT_INSN(), + }, + .errstr = "misaligned access", + .result = REJECT, + }, + { + "check cb access: word, unaligned 3", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4]) + 2), + BPF_EXIT_INSN(), + }, + .errstr = "misaligned access", + .result = REJECT, + }, + { + "check cb access: word, unaligned 4", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4]) + 3), + BPF_EXIT_INSN(), + }, + .errstr = "misaligned access", + .result = REJECT, + }, + { + "check cb access: double", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0])), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[2])), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0])), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[2])), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "check cb access: double, unaligned 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[1])), + BPF_EXIT_INSN(), + }, + .errstr = "misaligned access", + .result = REJECT, + }, + { + "check cb access: double, unaligned 2", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[3])), + BPF_EXIT_INSN(), + }, + .errstr = "misaligned access", + .result = REJECT, + }, + { + "check cb access: double, oob 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4])), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: double, oob 2", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[4]) + 8), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: double, oob 3", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0]) - 8), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: double, oob 4", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[4])), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: double, oob 5", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[4]) + 8), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: double, oob 6", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0]) - 8), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check cb access: double, wrong type", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, offsetof(struct __sk_buff, cb[0])), BPF_EXIT_INSN(), }, .errstr = "invalid bpf_context access", - .errstr_unpriv = "R1 leaks addr", .result = REJECT, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, }, { "check out of range skb->cb access", -- cgit v1.2.3 From 
3a2f5a59a695a73e0cde9a61e0feae5fa730e936 Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Tue, 10 Jan 2017 12:28:32 -0500 Subject: security,selinux,smack: kill security_task_wait hook As reported by yangshukui, a permission denial from security_task_wait() can lead to a soft lockup in zap_pid_ns_processes() since it only expects sys_wait4() to return 0 or -ECHILD. Further, security_task_wait() can in general lead to zombies; in the absence of some way to automatically reparent a child process upon a denial, the hook is not useful. Remove the security hook and its implementations in SELinux and Smack. Smack already removed its check from its hook. Reported-by: yangshukui Signed-off-by: Stephen Smalley Acked-by: Casey Schaufler Acked-by: Oleg Nesterov Signed-off-by: Paul Moore --- include/linux/lsm_hooks.h | 7 ------- include/linux/security.h | 6 ------ kernel/exit.c | 19 ++----------------- security/security.c | 6 ------ security/selinux/hooks.c | 7 ------- security/smack/smack_lsm.c | 20 -------------------- 6 files changed, 2 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 0dde95900196..6fe7a5cb0be1 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -666,11 +666,6 @@ * @sig contains the signal value. * @secid contains the sid of the process where the signal originated * Return 0 if permission is granted. - * @task_wait: - * Check permission before allowing a process to reap a child process @p - * and collect its status information. - * @p contains the task_struct for process. - * Return 0 if permission is granted. * @task_prctl: * Check permission before performing a process control operation on the * current process. @@ -1507,7 +1502,6 @@ union security_list_options { int (*task_movememory)(struct task_struct *p); int (*task_kill)(struct task_struct *p, struct siginfo *info, int sig, u32 secid); - int (*task_wait)(struct task_struct *p); int (*task_prctl)(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5); void (*task_to_inode)(struct task_struct *p, struct inode *inode); @@ -1767,7 +1761,6 @@ struct security_hook_heads { struct list_head task_getscheduler; struct list_head task_movememory; struct list_head task_kill; - struct list_head task_wait; struct list_head task_prctl; struct list_head task_to_inode; struct list_head ipc_permission; diff --git a/include/linux/security.h b/include/linux/security.h index f4ebac117fa6..d3868f2ebada 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -332,7 +332,6 @@ int security_task_getscheduler(struct task_struct *p); int security_task_movememory(struct task_struct *p); int security_task_kill(struct task_struct *p, struct siginfo *info, int sig, u32 secid); -int security_task_wait(struct task_struct *p); int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5); void security_task_to_inode(struct task_struct *p, struct inode *inode); @@ -980,11 +979,6 @@ static inline int security_task_kill(struct task_struct *p, return 0; } -static inline int security_task_wait(struct task_struct *p) -{ - return 0; -} - static inline int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, diff --git a/kernel/exit.c b/kernel/exit.c index 8f14b866f9f6..60f245190571 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -1360,7 
+1359,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; * then ->notask_error is 0 if @p is an eligible child, - * or another error from security_task_wait(), or still -ECHILD. + * or still -ECHILD. */ static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) @@ -1380,20 +1379,6 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, if (!ret) return ret; - ret = security_task_wait(p); - if (unlikely(ret < 0)) { - /* - * If we have not yet seen any eligible child, - * then let this error code replace -ECHILD. - * A permission error will give the user a clue - * to look for security policy problems, rather - * than for mysterious wait bugs. - */ - if (wo->notask_error) - wo->notask_error = ret; - return 0; - } - if (unlikely(exit_state == EXIT_TRACE)) { /* * ptrace == 0 means we are the natural parent. In this case @@ -1486,7 +1471,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; then * ->notask_error is 0 if there were any eligible children, - * or another error from security_task_wait(), or still -ECHILD. + * or still -ECHILD. */ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) { diff --git a/security/security.c b/security/security.c index 32052f5c76b2..8c9fee59e60a 100644 --- a/security/security.c +++ b/security/security.c @@ -1025,11 +1025,6 @@ int security_task_kill(struct task_struct *p, struct siginfo *info, return call_int_hook(task_kill, 0, p, info, sig, secid); } -int security_task_wait(struct task_struct *p) -{ - return call_int_hook(task_wait, 0, p); -} - int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { @@ -1769,7 +1764,6 @@ struct security_hook_heads security_hook_heads = { .task_movememory = LIST_HEAD_INIT(security_hook_heads.task_movememory), .task_kill = LIST_HEAD_INIT(security_hook_heads.task_kill), - .task_wait = LIST_HEAD_INIT(security_hook_heads.task_wait), .task_prctl = LIST_HEAD_INIT(security_hook_heads.task_prctl), .task_to_inode = LIST_HEAD_INIT(security_hook_heads.task_to_inode), diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 55ad878f1146..a5398fea0966 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3963,12 +3963,6 @@ static int selinux_task_kill(struct task_struct *p, struct siginfo *info, return avc_has_perm(secid, task_sid(p), SECCLASS_PROCESS, perm, NULL); } -static int selinux_task_wait(struct task_struct *p) -{ - return avc_has_perm(task_sid(p), current_sid(), SECCLASS_PROCESS, - PROCESS__SIGCHLD, NULL); -} - static void selinux_task_to_inode(struct task_struct *p, struct inode *inode) { @@ -6211,7 +6205,6 @@ static struct security_hook_list selinux_hooks[] = { LSM_HOOK_INIT(task_getscheduler, selinux_task_getscheduler), LSM_HOOK_INIT(task_movememory, selinux_task_movememory), LSM_HOOK_INIT(task_kill, selinux_task_kill), - LSM_HOOK_INIT(task_wait, selinux_task_wait), LSM_HOOK_INIT(task_to_inode, selinux_task_to_inode), LSM_HOOK_INIT(ipc_permission, selinux_ipc_permission), diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 8da4a6b9ca4d..2166373ea5a4 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -2271,25 +2271,6 @@ 
static int smack_task_kill(struct task_struct *p, struct siginfo *info, return rc; } -/** - * smack_task_wait - Smack access check for waiting - * @p: task to wait for - * - * Returns 0 - */ -static int smack_task_wait(struct task_struct *p) -{ - /* - * Allow the operation to succeed. - * Zombies are bad. - * In userless environments (e.g. phones) programs - * get marked with SMACK64EXEC and even if the parent - * and child shouldn't be talking the parent still - * may expect to know when the child exits. - */ - return 0; -} - /** * smack_task_to_inode - copy task smack into the inode blob * @p: task to copy from @@ -4658,7 +4639,6 @@ static struct security_hook_list smack_hooks[] = { LSM_HOOK_INIT(task_getscheduler, smack_task_getscheduler), LSM_HOOK_INIT(task_movememory, smack_task_movememory), LSM_HOOK_INIT(task_kill, smack_task_kill), - LSM_HOOK_INIT(task_wait, smack_task_wait), LSM_HOOK_INIT(task_to_inode, smack_task_to_inode), LSM_HOOK_INIT(ipc_permission, smack_ipc_permission), -- cgit v1.2.3 From 5b485629ba0d5d027880769ff467c587b24b4bde Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sun, 8 Jan 2017 23:58:09 +0900 Subject: kprobes, extable: Identify kprobes trampolines as kernel text area Improve __kernel_text_address()/kernel_text_address() to return true if the given address is on a kprobe's instruction slot trampoline. This helps stack traces determine whether an address is in a text area or not. To implement this atomically in is_kprobe_*_slot(), also change the insn_cache page list to an RCU list. This changes timings a bit (it delays page freeing to the RCU garbage collection phase), but none of that is in the hot path. Note: this change can add a small overhead to stack unwinders because it adds 2 additional checks to __kernel_text_address(). However, the impact should be very small, because the kprobe_insn_pages list has 1 entry per 256 probes (on x86; on arm/arm64 it will be 1024 probes), and kprobe_optinsn_pages has 1 entry per 32 probes (on x86). In most use cases, the number of kprobe events will be less than 20, which means that is_kprobe_*_slot() will check just one entry. Tested-by: Josh Poimboeuf Signed-off-by: Masami Hiramatsu Acked-by: Peter Zijlstra Cc: Alexander Shishkin Cc: Ananth N Mavinakayanahalli Cc: Andrew Morton Cc: Andrey Konovalov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/148388747896.6869.6354262871751682264.stgit@devbox [ Improved the changelog and coding style.
] Signed-off-by: Ingo Molnar --- include/linux/kprobes.h | 30 +++++++++++++++++++- kernel/extable.c | 9 +++++- kernel/kprobes.c | 73 ++++++++++++++++++++++++++++++++++++------------- 3 files changed, 91 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 8f6849084248..16ddfb8b304a 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -278,9 +278,13 @@ struct kprobe_insn_cache { int nr_garbage; }; +#ifdef __ARCH_WANT_KPROBES_INSN_SLOT extern kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c); extern void __free_insn_slot(struct kprobe_insn_cache *c, kprobe_opcode_t *slot, int dirty); +/* sleep-less address checking routine */ +extern bool __is_insn_slot_addr(struct kprobe_insn_cache *c, + unsigned long addr); #define DEFINE_INSN_CACHE_OPS(__name) \ extern struct kprobe_insn_cache kprobe_##__name##_slots; \ @@ -294,6 +298,18 @@ static inline void free_##__name##_slot(kprobe_opcode_t *slot, int dirty)\ { \ __free_insn_slot(&kprobe_##__name##_slots, slot, dirty); \ } \ + \ +static inline bool is_kprobe_##__name##_slot(unsigned long addr) \ +{ \ + return __is_insn_slot_addr(&kprobe_##__name##_slots, addr); \ +} +#else /* __ARCH_WANT_KPROBES_INSN_SLOT */ +#define DEFINE_INSN_CACHE_OPS(__name) \ +static inline bool is_kprobe_##__name##_slot(unsigned long addr) \ +{ \ + return 0; \ +} +#endif DEFINE_INSN_CACHE_OPS(insn); @@ -330,7 +346,6 @@ extern int proc_kprobes_optimization_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); #endif - #endif /* CONFIG_OPTPROBES */ #ifdef CONFIG_KPROBES_ON_FTRACE extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, @@ -481,6 +496,19 @@ static inline int enable_jprobe(struct jprobe *jp) return enable_kprobe(&jp->kp); } +#ifndef CONFIG_KPROBES +static inline bool is_kprobe_insn_slot(unsigned long addr) +{ + return false; +} +#endif +#ifndef CONFIG_OPTPROBES +static inline bool is_kprobe_optinsn_slot(unsigned long addr) +{ + return false; +} +#endif + #ifdef CONFIG_KPROBES /* * Blacklist ganerating macro. Specify functions which is not probed diff --git a/kernel/extable.c b/kernel/extable.c index e3beec4a2339..e1359474baa5 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -104,6 +105,8 @@ int __kernel_text_address(unsigned long addr) return 1; if (is_ftrace_trampoline(addr)) return 1; + if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) + return 1; /* * There might be init symbols in saved stacktraces. 
* Give those symbols a chance to be printed in @@ -123,7 +126,11 @@ int kernel_text_address(unsigned long addr) return 1; if (is_module_text_address(addr)) return 1; - return is_ftrace_trampoline(addr); + if (is_ftrace_trampoline(addr)) + return 1; + if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) + return 1; + return 0; } /* diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 43460104f119..ebb4dadca66b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -149,9 +149,11 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) struct kprobe_insn_page *kip; kprobe_opcode_t *slot = NULL; + /* Since the slot array is not protected by rcu, we need a mutex */ mutex_lock(&c->mutex); retry: - list_for_each_entry(kip, &c->pages, list) { + rcu_read_lock(); + list_for_each_entry_rcu(kip, &c->pages, list) { if (kip->nused < slots_per_page(c)) { int i; for (i = 0; i < slots_per_page(c); i++) { @@ -159,6 +161,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) kip->slot_used[i] = SLOT_USED; kip->nused++; slot = kip->insns + (i * c->insn_size); + rcu_read_unlock(); goto out; } } @@ -167,6 +170,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) WARN_ON(1); } } + rcu_read_unlock(); /* If there are any garbage slots, collect it and try again. */ if (c->nr_garbage && collect_garbage_slots(c) == 0) @@ -193,7 +197,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) kip->nused = 1; kip->ngarbage = 0; kip->cache = c; - list_add(&kip->list, &c->pages); + list_add_rcu(&kip->list, &c->pages); slot = kip->insns; out: mutex_unlock(&c->mutex); @@ -213,7 +217,8 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx) * next time somebody inserts a probe. */ if (!list_is_singular(&kip->list)) { - list_del(&kip->list); + list_del_rcu(&kip->list); + synchronize_rcu(); kip->cache->free(kip->insns); kfree(kip); } @@ -235,8 +240,7 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c) continue; kip->ngarbage = 0; /* we will collect all garbages */ for (i = 0; i < slots_per_page(c); i++) { - if (kip->slot_used[i] == SLOT_DIRTY && - collect_one_slot(kip, i)) + if (kip->slot_used[i] == SLOT_DIRTY && collect_one_slot(kip, i)) break; } } @@ -248,29 +252,60 @@ void __free_insn_slot(struct kprobe_insn_cache *c, kprobe_opcode_t *slot, int dirty) { struct kprobe_insn_page *kip; + long idx; mutex_lock(&c->mutex); - list_for_each_entry(kip, &c->pages, list) { - long idx = ((long)slot - (long)kip->insns) / - (c->insn_size * sizeof(kprobe_opcode_t)); - if (idx >= 0 && idx < slots_per_page(c)) { - WARN_ON(kip->slot_used[idx] != SLOT_USED); - if (dirty) { - kip->slot_used[idx] = SLOT_DIRTY; - kip->ngarbage++; - if (++c->nr_garbage > slots_per_page(c)) - collect_garbage_slots(c); - } else - collect_one_slot(kip, idx); + rcu_read_lock(); + list_for_each_entry_rcu(kip, &c->pages, list) { + idx = ((long)slot - (long)kip->insns) / + (c->insn_size * sizeof(kprobe_opcode_t)); + if (idx >= 0 && idx < slots_per_page(c)) goto out; - } } - /* Could not free this slot. */ + /* Could not find this slot. 
*/ WARN_ON(1); + kip = NULL; out: + rcu_read_unlock(); + /* Mark and sweep: this may sleep */ + if (kip) { + /* Check double free */ + WARN_ON(kip->slot_used[idx] != SLOT_USED); + if (dirty) { + kip->slot_used[idx] = SLOT_DIRTY; + kip->ngarbage++; + if (++c->nr_garbage > slots_per_page(c)) + collect_garbage_slots(c); + } else { + collect_one_slot(kip, idx); + } + } mutex_unlock(&c->mutex); } +/* + * Check given address is on the page of kprobe instruction slots. + * This will be used for checking whether the address on a stack + * is on a text area or not. + */ +bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr) +{ + struct kprobe_insn_page *kip; + bool ret = false; + + rcu_read_lock(); + list_for_each_entry_rcu(kip, &c->pages, list) { + if (addr >= (unsigned long)kip->insns && + addr < (unsigned long)kip->insns + PAGE_SIZE) { + ret = true; + break; + } + } + rcu_read_unlock(); + + return ret; +} + #ifdef CONFIG_OPTPROBES /* For optimized_kprobe buffer */ struct kprobe_insn_cache kprobe_optinsn_slots = { -- cgit v1.2.3 From c31cc6a5187e8b09ccee34f81728a90f80e872e7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 5 Jan 2017 18:11:43 +0100 Subject: sched/cputime: Allow accounting system time using cpustat index In order to prepare for CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y to delay cputime accounting to the tick, let's provide APIs to account system time to precise contexts: hardirq, softirq, pure system, ... Inspired-by: Martin Schwidefsky Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Benjamin Herrenschmidt Cc: Christian Borntraeger Cc: Fenghua Yu Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Tony Luck Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1483636310-6557-4-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/kernel_stat.h | 2 ++ kernel/sched/cputime.c | 10 +++++----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 00f776816aa3..14b63bb714d4 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -80,6 +80,8 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu) extern void account_user_time(struct task_struct *, cputime_t); extern void account_system_time(struct task_struct *, int, cputime_t); +extern void account_system_index_time(struct task_struct *, cputime_t, + enum cpu_usage_stat); extern void account_steal_time(cputime_t); extern void account_idle_time(cputime_t); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 7700a9cba335..aad42835938c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -176,8 +176,8 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime) * @cputime: the cpu time spent in kernel space since the last update * @index: pointer to cpustat field that has to be updated */ -static inline -void __account_system_time(struct task_struct *p, cputime_t cputime, int index) +void account_system_index_time(struct task_struct *p, + cputime_t cputime, enum cpu_usage_stat index) { /* Add system time to process. 
*/ p->stime += cputime; @@ -213,7 +213,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, else index = CPUTIME_SYSTEM; - __account_system_time(p, cputime, index); + account_system_index_time(p, cputime, index); } /* @@ -400,7 +400,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, * So, we have to handle it separately here. * Also, p->stime needs to be updated for ksoftirqd. */ - __account_system_time(p, cputime, CPUTIME_SOFTIRQ); + account_system_index_time(p, cputime, CPUTIME_SOFTIRQ); } else if (user_tick) { account_user_time(p, cputime); } else if (p == rq->idle) { @@ -408,7 +408,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, } else if (p->flags & PF_VCPU) { /* System time or guest time */ account_guest_time(p, cputime); } else { - __account_system_time(p, cputime, CPUTIME_SYSTEM); + account_system_index_time(p, cputime, CPUTIME_SYSTEM); } } -- cgit v1.2.3 From 1213699ab426608ff1925ab263dd6925102bb92a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 5 Jan 2017 18:11:44 +0100 Subject: sched/cputime: Export account_guest_time() In order to prepare for CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y to delay cputime accounting to the tick, let's allow archs to account cputime directly to gtime. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Benjamin Herrenschmidt Cc: Christian Borntraeger Cc: Fenghua Yu Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Tony Luck Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1483636310-6557-5-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/kernel_stat.h | 1 + kernel/sched/cputime.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 14b63bb714d4..cfd6c0c6d4e8 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -79,6 +79,7 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu) } extern void account_user_time(struct task_struct *, cputime_t); +extern void account_guest_time(struct task_struct *, cputime_t); extern void account_system_time(struct task_struct *, int, cputime_t); extern void account_system_index_time(struct task_struct *, cputime_t, enum cpu_usage_stat); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index aad42835938c..5813ee4a5168 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -151,7 +151,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime) * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in virtual machine since the last update */ -static void account_guest_time(struct task_struct *p, cputime_t cputime) +void account_guest_time(struct task_struct *p, cputime_t cputime) { u64 *cpustat = kcpustat_this_cpu->cpustat; -- cgit v1.2.3 From c8d7dabf8f91fadd265e6eb87afb201d14ea299b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 5 Jan 2017 18:11:50 +0100 Subject: sched/cputime: Rename vtime_account_user() to vtime_flush() CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y used to accumulate user time and account it on ticks and context switches only through the vtime_account_user() function. Now this model has been generalized on the 3 archs for all kind of cputime (system, irq, ...) and all the cputime flushing happens under vtime_account_user(). 
So let's rename this function to better reflect its new role. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Acked-by: Martin Schwidefsky Cc: Benjamin Herrenschmidt Cc: Christian Borntraeger Cc: Fenghua Yu Cc: Heiko Carstens Cc: Linus Torvalds Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Tony Luck Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1483636310-6557-11-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- arch/ia64/kernel/time.c | 2 +- arch/powerpc/kernel/time.c | 6 ++---- arch/s390/kernel/vtime.c | 2 +- include/linux/kernel_stat.h | 2 +- include/linux/vtime.h | 7 +++++-- kernel/sched/cputime.c | 4 +--- 6 files changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 37f1b315d9b6..d040f12ea9f9 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -61,7 +61,7 @@ static struct clocksource *itc_clocksource; extern cputime_t cycle_to_cputime(u64 cyc); -void vtime_account_user(struct task_struct *tsk) +void vtime_flush(struct task_struct *tsk) { struct thread_info *ti = task_thread_info(tsk); cputime_t delta; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 4255e6930ac1..02e97305d22b 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -382,15 +382,13 @@ void vtime_account_idle(struct task_struct *tsk) } /* - * Transfer the user time accumulated in the paca - * by the exception entry and exit code to the generic - * process user time records. + * Account the whole cputime accumulated in the paca * Must be called with interrupts disabled. * Assumes that vtime_account_system/idle() has been called * recently (i.e. since the last entry from usermode) so that * get_paca()->user_time_scaled is up to date. */ -void vtime_account_user(struct task_struct *tsk) +void vtime_flush(struct task_struct *tsk) { struct cpu_accounting_data *acct = get_accounting(tsk); diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 1a53e0bdc90a..0a9e5d67547d 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -214,7 +214,7 @@ void vtime_task_switch(struct task_struct *prev) * accounting system time in order to correctly compute * the stolen time accounting. 
*/ -void vtime_account_user(struct task_struct *tsk) +void vtime_flush(struct task_struct *tsk) { if (do_account_vtime(tsk)) virt_timer_expire(); diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index cfd6c0c6d4e8..c3e38ded2d73 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -89,7 +89,7 @@ extern void account_idle_time(cputime_t); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE static inline void account_process_tick(struct task_struct *tsk, int user) { - vtime_account_user(tsk); + vtime_flush(tsk); } #else extern void account_process_tick(struct task_struct *, int user); diff --git a/include/linux/vtime.h b/include/linux/vtime.h index aa9bfea8804a..0681fe25abeb 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -58,27 +58,28 @@ static inline void vtime_task_switch(struct task_struct *prev) extern void vtime_account_system(struct task_struct *tsk); extern void vtime_account_idle(struct task_struct *tsk); -extern void vtime_account_user(struct task_struct *tsk); #else /* !CONFIG_VIRT_CPU_ACCOUNTING */ static inline void vtime_task_switch(struct task_struct *prev) { } static inline void vtime_account_system(struct task_struct *tsk) { } -static inline void vtime_account_user(struct task_struct *tsk) { } #endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN extern void arch_vtime_task_switch(struct task_struct *tsk); +extern void vtime_account_user(struct task_struct *tsk); extern void vtime_user_enter(struct task_struct *tsk); static inline void vtime_user_exit(struct task_struct *tsk) { vtime_account_user(tsk); } + extern void vtime_guest_enter(struct task_struct *tsk); extern void vtime_guest_exit(struct task_struct *tsk); extern void vtime_init_idle(struct task_struct *tsk, int cpu); #else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */ +static inline void vtime_account_user(struct task_struct *tsk) { } static inline void vtime_user_enter(struct task_struct *tsk) { } static inline void vtime_user_exit(struct task_struct *tsk) { } static inline void vtime_guest_enter(struct task_struct *tsk) { } @@ -93,9 +94,11 @@ static inline void vtime_account_irq_exit(struct task_struct *tsk) /* On hard|softirq exit we always account to hard|softirq cputime */ vtime_account_system(tsk); } +extern void vtime_flush(struct task_struct *tsk); #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ static inline void vtime_account_irq_enter(struct task_struct *tsk) { } static inline void vtime_account_irq_exit(struct task_struct *tsk) { } +static inline void vtime_flush(struct task_struct *tsk) { } #endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5813ee4a5168..f7c14cc71d06 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -437,9 +437,7 @@ void vtime_common_task_switch(struct task_struct *prev) else vtime_account_system(prev); -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - vtime_account_user(prev); -#endif + vtime_flush(prev); arch_vtime_task_switch(prev); } #endif -- cgit v1.2.3 From 0039962a1473f07fd5c8355bd8264be1eb87eb3e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 3 Jan 2017 13:43:11 -0800 Subject: kernel/exit: Compute 'current' directly This patch effectively replaces the tsk pointer dereference (which is obviously == current), to directly use get_current() macro. In this case, do_exit() always passes current to exit_mm(), hence we can simply get rid of the argument. This is also a performance win on some archs such as x86-64 and ppc64 -- arm64 is no longer an issue. 
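Schematically, the conversion is the following; a minimal before/after sketch with a hypothetical helper, not the patch itself (the real exit_mm() conversion is in the diff below):

#include <linux/sched.h>

/* before: the helper takes a task pointer that is always 'current' */
static void reset_fault_counters(struct task_struct *tsk)
{
	tsk->min_flt = 0;
	tsk->maj_flt = 0;
}
/* call site: reset_fault_counters(current); */

/* after: drop the argument and compute 'current' directly */
static void reset_fault_counters(void)
{
	current->min_flt = 0;
	current->maj_flt = 0;
}
/* call site: reset_fault_counters(); */

Where 'current' is a cheap per-CPU or register read, the compiler no longer has to keep a task pointer live across the function, which is presumably where the cited win comes from.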
Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: mark.rutland@arm.com Link: http://lkml.kernel.org/r/1483479794-14013-2-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/exit.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 8f14b866f9f6..2385d434a46e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -468,12 +468,12 @@ assign_new_owner: * Turn us into a lazy TLB process if we * aren't already.. */ -static void exit_mm(struct task_struct *tsk) +static void exit_mm(void) { - struct mm_struct *mm = tsk->mm; + struct mm_struct *mm = current->mm; struct core_state *core_state; - mm_release(tsk, mm); + mm_release(current, mm); if (!mm) return; sync_mm_rss(mm); @@ -491,7 +491,7 @@ static void exit_mm(struct task_struct *tsk) up_read(&mm->mmap_sem); - self.task = tsk; + self.task = current; self.next = xchg(&core_state->dumper.next, &self); /* * Implies mb(), the result of xchg() must be visible @@ -501,22 +501,22 @@ static void exit_mm(struct task_struct *tsk) complete(&core_state->startup); for (;;) { - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_task_state(current, TASK_UNINTERRUPTIBLE); if (!self.task) /* see coredump_finish() */ break; freezable_schedule(); } - __set_task_state(tsk, TASK_RUNNING); + __set_task_state(current, TASK_RUNNING); down_read(&mm->mmap_sem); } atomic_inc(&mm->mm_count); - BUG_ON(mm != tsk->active_mm); + BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ - task_lock(tsk); - tsk->mm = NULL; + task_lock(current); + current->mm = NULL; up_read(&mm->mmap_sem); enter_lazy_tlb(mm, current); - task_unlock(tsk); + task_unlock(current); mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) @@ -823,7 +823,7 @@ void __noreturn do_exit(long code) tsk->exit_code = code; taskstats_exit(tsk, group_dead); - exit_mm(tsk); + exit_mm(); if (group_dead) acct_process(); -- cgit v1.2.3 From d269a8b8c57523a2e328c1ff44fe791e13df3d37 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 3 Jan 2017 13:43:13 -0800 Subject: kernel/locking: Compute 'current' directly This patch effectively replaces the tsk pointer dereference (which is obviously == current), to directly use get_current() macro. This is to make the removal of setting foreign task states smoother and painfully obvious. Performance win on some archs such as x86-64 and ppc64. On a microbenchmark that calls set_task_state() vs set_current_state() and an inode rwsem pounding benchmark doing unlink: == 1. 
x86-64 == Avg runtime set_task_state(): 601 msecs Avg runtime set_current_state(): 552 msecs vanilla dirty Hmean unlink1-processes-2 36089.26 ( 0.00%) 38977.33 ( 8.00%) Hmean unlink1-processes-5 28555.01 ( 0.00%) 29832.55 ( 4.28%) Hmean unlink1-processes-8 37323.75 ( 0.00%) 44974.57 ( 20.50%) Hmean unlink1-processes-12 43571.88 ( 0.00%) 44283.01 ( 1.63%) Hmean unlink1-processes-21 34431.52 ( 0.00%) 38284.45 ( 11.19%) Hmean unlink1-processes-30 34813.26 ( 0.00%) 37975.17 ( 9.08%) Hmean unlink1-processes-48 37048.90 ( 0.00%) 39862.78 ( 7.59%) Hmean unlink1-processes-79 35630.01 ( 0.00%) 36855.30 ( 3.44%) Hmean unlink1-processes-110 36115.85 ( 0.00%) 39843.91 ( 10.32%) Hmean unlink1-processes-141 32546.96 ( 0.00%) 35418.52 ( 8.82%) Hmean unlink1-processes-172 34674.79 ( 0.00%) 36899.21 ( 6.42%) Hmean unlink1-processes-203 37303.11 ( 0.00%) 36393.04 ( -2.44%) Hmean unlink1-processes-224 35712.13 ( 0.00%) 36685.96 ( 2.73%) == 2. ppc64le == Avg runtime set_task_state(): 938 msecs Avg runtime set_current_state: 940 msecs vanilla dirty Hmean unlink1-processes-2 19269.19 ( 0.00%) 30704.50 ( 59.35%) Hmean unlink1-processes-5 20106.15 ( 0.00%) 21804.15 ( 8.45%) Hmean unlink1-processes-8 17496.97 ( 0.00%) 17243.28 ( -1.45%) Hmean unlink1-processes-12 14224.15 ( 0.00%) 17240.21 ( 21.20%) Hmean unlink1-processes-21 14155.66 ( 0.00%) 15681.23 ( 10.78%) Hmean unlink1-processes-30 14450.70 ( 0.00%) 15995.83 ( 10.69%) Hmean unlink1-processes-48 16945.57 ( 0.00%) 16370.42 ( -3.39%) Hmean unlink1-processes-79 15788.39 ( 0.00%) 14639.27 ( -7.28%) Hmean unlink1-processes-110 14268.48 ( 0.00%) 14377.40 ( 0.76%) Hmean unlink1-processes-141 14023.65 ( 0.00%) 16271.69 ( 16.03%) Hmean unlink1-processes-172 13417.62 ( 0.00%) 16067.55 ( 19.75%) Hmean unlink1-processes-203 15293.08 ( 0.00%) 15440.40 ( 0.96%) Hmean unlink1-processes-234 13719.32 ( 0.00%) 16190.74 ( 18.01%) Hmean unlink1-processes-265 16400.97 ( 0.00%) 16115.22 ( -1.74%) Hmean unlink1-processes-296 14388.60 ( 0.00%) 16216.13 ( 12.70%) Hmean unlink1-processes-320 15771.85 ( 0.00%) 15905.96 ( 0.85%) Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: mark.rutland@arm.com Link: http://lkml.kernel.org/r/1483479794-14013-4-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 19 +++++++++---------- kernel/locking/rwsem-spinlock.c | 18 +++++++----------- kernel/locking/rwsem-xadd.c | 7 +++---- kernel/locking/semaphore.c | 7 +++---- 4 files changed, 22 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 9b349619f431..4c7d04362c95 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -622,7 +622,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip, struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) { - struct task_struct *task = current; struct mutex_waiter waiter; unsigned long flags; bool first = false; @@ -656,18 +655,18 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto skip_wait; debug_mutex_lock_common(lock, &waiter); - debug_mutex_add_waiter(lock, &waiter, task); + debug_mutex_add_waiter(lock, &waiter, current); /* add waiting tasks to the end of the waitqueue (FIFO): */ list_add_tail(&waiter.list, &lock->wait_list); - waiter.task = task; + waiter.task = current; if (__mutex_waiter_is_first(lock, &waiter)) __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); lock_contended(&lock->dep_map, ip); - set_task_state(task, state); + set_task_state(current, state); for (;;) { /* * Once we hold wait_lock, we're serialized against @@ -683,7 +682,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * wait_lock. This ensures the lock cancellation is ordered * against mutex_unlock() and wake-ups do not go missing. 
*/ - if (unlikely(signal_pending_state(state, task))) { + if (unlikely(signal_pending_state(state, current))) { ret = -EINTR; goto err; } @@ -702,7 +701,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); } - set_task_state(task, state); + set_task_state(current, state); /* * Here we order against unlock; we must either see it change * state back to RUNNING and fall through the next schedule(), @@ -716,9 +715,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } spin_lock_mutex(&lock->wait_lock, flags); acquired: - __set_task_state(task, TASK_RUNNING); + __set_task_state(current, TASK_RUNNING); - mutex_remove_waiter(lock, &waiter, task); + mutex_remove_waiter(lock, &waiter, current); if (likely(list_empty(&lock->wait_list))) __mutex_clear_flag(lock, MUTEX_FLAGS); @@ -736,8 +735,8 @@ skip_wait: return 0; err: - __set_task_state(task, TASK_RUNNING); - mutex_remove_waiter(lock, &waiter, task); + __set_task_state(current, TASK_RUNNING); + mutex_remove_waiter(lock, &waiter, current); spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, 1, ip); diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 1591f6b3539f..60d15d3bb2a8 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -128,7 +128,6 @@ __rwsem_wake_one_writer(struct rw_semaphore *sem) void __sched __down_read(struct rw_semaphore *sem) { struct rwsem_waiter waiter; - struct task_struct *tsk; unsigned long flags; raw_spin_lock_irqsave(&sem->wait_lock, flags); @@ -140,13 +139,12 @@ void __sched __down_read(struct rw_semaphore *sem) goto out; } - tsk = current; - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_task_state(current, TASK_UNINTERRUPTIBLE); /* set up my own style of waitqueue */ - waiter.task = tsk; + waiter.task = current; waiter.type = RWSEM_WAITING_FOR_READ; - get_task_struct(tsk); + get_task_struct(current); list_add_tail(&waiter.list, &sem->wait_list); @@ -158,10 +156,10 @@ void __sched __down_read(struct rw_semaphore *sem) if (!waiter.task) break; schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_task_state(current, TASK_UNINTERRUPTIBLE); } - __set_task_state(tsk, TASK_RUNNING); + __set_task_state(current, TASK_RUNNING); out: ; } @@ -194,15 +192,13 @@ int __down_read_trylock(struct rw_semaphore *sem) int __sched __down_write_common(struct rw_semaphore *sem, int state) { struct rwsem_waiter waiter; - struct task_struct *tsk; unsigned long flags; int ret = 0; raw_spin_lock_irqsave(&sem->wait_lock, flags); /* set up my own style of waitqueue */ - tsk = current; - waiter.task = tsk; + waiter.task = current; waiter.type = RWSEM_WAITING_FOR_WRITE; list_add_tail(&waiter.list, &sem->wait_list); @@ -220,7 +216,7 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state) ret = -EINTR; goto out; } - set_task_state(tsk, state); + set_task_state(current, state); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); schedule(); raw_spin_lock_irqsave(&sem->wait_lock, flags); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 631506004f9e..d3b819c7f07f 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -224,10 +224,9 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) { long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; - struct task_struct *tsk = current; DEFINE_WAKE_Q(wake_q); - waiter.task = 
tsk;
+	waiter.task = current;
 	waiter.type = RWSEM_WAITING_FOR_READ;
 	raw_spin_lock_irq(&sem->wait_lock);
@@ -254,13 +253,13 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
 	/* wait to be given the lock */
 	while (true) {
-		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+		set_task_state(current, TASK_UNINTERRUPTIBLE);
 		if (!waiter.task)
 			break;
 		schedule();
 	}
-	__set_task_state(tsk, TASK_RUNNING);
+	__set_task_state(current, TASK_RUNNING);
 	return sem;
 }
 EXPORT_SYMBOL(rwsem_down_read_failed);
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index b8120abe594b..29aac9f9e5e4 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -204,19 +204,18 @@ struct semaphore_waiter {
 static inline int __sched __down_common(struct semaphore *sem, long state, long timeout)
 {
-	struct task_struct *task = current;
 	struct semaphore_waiter waiter;
 	list_add_tail(&waiter.list, &sem->wait_list);
-	waiter.task = task;
+	waiter.task = current;
 	waiter.up = false;
 	for (;;) {
-		if (signal_pending_state(state, task))
+		if (signal_pending_state(state, current))
 			goto interrupted;
 		if (unlikely(timeout <= 0))
 			goto timed_out;
-		__set_task_state(task, state);
+		__set_task_state(current, state);
 		raw_spin_unlock_irq(&sem->lock);
 		timeout = schedule_timeout(timeout);
 		raw_spin_lock_irq(&sem->lock);
-- cgit v1.2.3

From 642fa448ae6b3a4e5e8737054a094173405b7643 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso
Date: Tue, 3 Jan 2017 13:43:14 -0800
Subject: sched/core: Remove set_task_state()

This is a nasty interface: setting the state of a foreign task must
not be done. As of the following commit:

  be628be0956 ("bcache: Make gc wakeup sane, remove set_task_state()")

... everyone in the kernel calls set_task_state() with current,
allowing the helper to be removed. However, as the comment indicates,
it is still around for those archs where computing current is more
expensive than using a pointer, at least in theory. An important arch
that is affected is arm64; however, this has been addressed now [1]
and performance is up to par, making no difference between the two
calls.

Of all the callers, if any, it's the locking bits that would care most
about this -- ie: we end up passing a tsk pointer to a lot of the lock
slowpaths, and setting ->state on that.

The following numbers are based on two tests: a custom ad-hoc
microbenchmark that just measures latencies (for ~65 million calls)
between set_task_state() and set_current_state(); and secondly, for a
higher-level overview, an unlink microbenchmark that pounds on a
single file with open, close, unlink combos with increasing thread
counts (up to 4x ncpus). While the workload is quite unrealistic, it
does contend a lot on the inode mutex, or now rwsem.

[1] https://lkml.kernel.org/r/1483468021-8237-1-git-send-email-mark.rutland@arm.com

== 1. x86-64 ==

Avg runtime set_task_state():    601 msecs
Avg runtime set_current_state(): 552 msecs

                                        vanilla              dirty
Hmean  unlink1-processes-2     36089.26 (  0.00%)  38977.33 (  8.00%)
Hmean  unlink1-processes-5     28555.01 (  0.00%)  29832.55 (  4.28%)
Hmean  unlink1-processes-8     37323.75 (  0.00%)  44974.57 ( 20.50%)
Hmean  unlink1-processes-12    43571.88 (  0.00%)  44283.01 (  1.63%)
Hmean  unlink1-processes-21    34431.52 (  0.00%)  38284.45 ( 11.19%)
Hmean  unlink1-processes-30    34813.26 (  0.00%)  37975.17 (  9.08%)
Hmean  unlink1-processes-48    37048.90 (  0.00%)  39862.78 (  7.59%)
Hmean  unlink1-processes-79    35630.01 (  0.00%)  36855.30 (  3.44%)
Hmean  unlink1-processes-110   36115.85 (  0.00%)  39843.91 ( 10.32%)
Hmean  unlink1-processes-141   32546.96 (  0.00%)  35418.52 (  8.82%)
Hmean  unlink1-processes-172   34674.79 (  0.00%)  36899.21 (  6.42%)
Hmean  unlink1-processes-203   37303.11 (  0.00%)  36393.04 ( -2.44%)
Hmean  unlink1-processes-224   35712.13 (  0.00%)  36685.96 (  2.73%)

== 2. ppc64le ==

Avg runtime set_task_state():    938 msecs
Avg runtime set_current_state(): 940 msecs

                                        vanilla              dirty
Hmean  unlink1-processes-2     19269.19 (  0.00%)  30704.50 ( 59.35%)
Hmean  unlink1-processes-5     20106.15 (  0.00%)  21804.15 (  8.45%)
Hmean  unlink1-processes-8     17496.97 (  0.00%)  17243.28 ( -1.45%)
Hmean  unlink1-processes-12    14224.15 (  0.00%)  17240.21 ( 21.20%)
Hmean  unlink1-processes-21    14155.66 (  0.00%)  15681.23 ( 10.78%)
Hmean  unlink1-processes-30    14450.70 (  0.00%)  15995.83 ( 10.69%)
Hmean  unlink1-processes-48    16945.57 (  0.00%)  16370.42 ( -3.39%)
Hmean  unlink1-processes-79    15788.39 (  0.00%)  14639.27 ( -7.28%)
Hmean  unlink1-processes-110   14268.48 (  0.00%)  14377.40 (  0.76%)
Hmean  unlink1-processes-141   14023.65 (  0.00%)  16271.69 ( 16.03%)
Hmean  unlink1-processes-172   13417.62 (  0.00%)  16067.55 ( 19.75%)
Hmean  unlink1-processes-203   15293.08 (  0.00%)  15440.40 (  0.96%)
Hmean  unlink1-processes-234   13719.32 (  0.00%)  16190.74 ( 18.01%)
Hmean  unlink1-processes-265   16400.97 (  0.00%)  16115.22 ( -1.74%)
Hmean  unlink1-processes-296   14388.60 (  0.00%)  16216.13 ( 12.70%)
Hmean  unlink1-processes-320   15771.85 (  0.00%)  15905.96 (  0.85%)

x86-64 (known to be fast for get_current()/this_cpu_read_stable()
caching) and ppc64 (with paca) show similar improvements in the unlink
microbenchmarks. The small delta for ppc64 (2 ms) does not reflect the
gains on the unlink runs. In the case of x86, there was a decent
amount of variation in the latency runs, but always within a 20 to
50 ms increase; ppc was more constant.

Signed-off-by: Davidlohr Bueso
Signed-off-by: Peter Zijlstra (Intel)
Cc: Andrew Morton
Cc: Linus Torvalds
Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: mark.rutland@arm.com Link: http://lkml.kernel.org/r/1483479794-14013-5-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- arch/um/drivers/random.c | 2 +- drivers/md/dm-bufio.c | 2 +- drivers/md/dm-crypt.c | 4 ++-- drivers/md/persistent-data/dm-block-manager.c | 4 ++-- .../staging/lustre/lnet/libcfs/linux/linux-debug.c | 2 +- drivers/tty/tty_ldsem.c | 10 ++++---- include/linux/sched.h | 27 +--------------------- kernel/exit.c | 4 ++-- kernel/locking/mutex.c | 8 +++---- kernel/locking/rwsem-spinlock.c | 8 +++---- kernel/locking/rwsem-xadd.c | 4 ++-- kernel/locking/semaphore.c | 2 +- 12 files changed, 26 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c index 05523f14d7b2..57f03050c850 100644 --- a/arch/um/drivers/random.c +++ b/arch/um/drivers/random.c @@ -76,7 +76,7 @@ static ssize_t rng_dev_read (struct file *filp, char __user *buf, size_t size, add_sigio_fd(random_fd); add_wait_queue(&host_read_wait, &wait); - set_task_state(current, TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE); schedule(); remove_wait_queue(&host_read_wait, &wait); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 84d2f0e4c754..d36d427a9efb 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -794,7 +794,7 @@ static void __wait_for_free_buffer(struct dm_bufio_client *c) DECLARE_WAITQUEUE(wait, current); add_wait_queue(&c->free_buffer_wait, &wait); - set_task_state(current, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); dm_bufio_unlock(c); io_schedule(); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 7c6c57216bf2..96692d13a6e4 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1210,14 +1210,14 @@ continue_locked: spin_unlock_irq(&cc->write_thread_wait.lock); if (unlikely(kthread_should_stop())) { - set_task_state(current, TASK_RUNNING); + set_current_state(TASK_RUNNING); remove_wait_queue(&cc->write_thread_wait, &wait); break; } schedule(); - set_task_state(current, TASK_RUNNING); + set_current_state(TASK_RUNNING); spin_lock_irq(&cc->write_thread_wait.lock); __remove_wait_queue(&cc->write_thread_wait, &wait); goto continue_locked; diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index a6dde7cab458..758d90cc2733 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -120,7 +120,7 @@ static int __check_holder(struct block_lock *lock) static void __wait(struct waiter *w) { for (;;) { - set_task_state(current, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); if (!w->task) break; @@ -128,7 +128,7 @@ static void __wait(struct waiter *w) schedule(); } - set_task_state(current, TASK_RUNNING); + set_current_state(TASK_RUNNING); } static void __wake_waiter(struct waiter *w) diff --git a/drivers/staging/lustre/lnet/libcfs/linux/linux-debug.c b/drivers/staging/lustre/lnet/libcfs/linux/linux-debug.c index 39a72e3f0c18..7035356e56b3 100644 --- a/drivers/staging/lustre/lnet/libcfs/linux/linux-debug.c +++ b/drivers/staging/lustre/lnet/libcfs/linux/linux-debug.c @@ -107,7 +107,7 @@ void __noreturn lbug_with_loc(struct libcfs_debug_msg_data *msgdata) libcfs_debug_dumplog(); if (libcfs_panic_on_lbug) panic("LBUG"); - set_task_state(current, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); while (1) schedule(); } diff --git 
a/drivers/tty/tty_ldsem.c b/drivers/tty/tty_ldsem.c index 3e6722954f29..9229de43e19d 100644 --- a/drivers/tty/tty_ldsem.c +++ b/drivers/tty/tty_ldsem.c @@ -231,7 +231,7 @@ down_read_failed(struct ld_semaphore *sem, long count, long timeout) /* wait to be given the lock */ for (;;) { - set_task_state(current, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); if (!waiter.task) break; @@ -240,7 +240,7 @@ down_read_failed(struct ld_semaphore *sem, long count, long timeout) timeout = schedule_timeout(timeout); } - __set_task_state(current, TASK_RUNNING); + __set_current_state(TASK_RUNNING); if (!timeout) { /* lock timed out but check if this task was just @@ -289,14 +289,14 @@ down_write_failed(struct ld_semaphore *sem, long count, long timeout) waiter.task = current; - set_task_state(current, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); for (;;) { if (!timeout) break; raw_spin_unlock_irq(&sem->wait_lock); timeout = schedule_timeout(timeout); raw_spin_lock_irq(&sem->wait_lock); - set_task_state(current, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); locked = writer_trylock(sem); if (locked) break; @@ -307,7 +307,7 @@ down_write_failed(struct ld_semaphore *sem, long count, long timeout) list_del(&waiter.list); raw_spin_unlock_irq(&sem->wait_lock); - __set_task_state(current, TASK_RUNNING); + __set_current_state(TASK_RUNNING); /* lock wait may have timed out */ if (!locked) diff --git a/include/linux/sched.h b/include/linux/sched.h index ad3ec9ec61f7..f4f9d32f12b3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -227,7 +227,7 @@ extern void proc_sched_set_task(struct task_struct *p); extern char ___assert_task_state[1 - 2*!!( sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; -/* Convenience macros for the sake of set_task_state */ +/* Convenience macros for the sake of set_current_state */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) #define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED) @@ -254,17 +254,6 @@ extern char ___assert_task_state[1 - 2*!!( #ifdef CONFIG_DEBUG_ATOMIC_SLEEP -#define __set_task_state(tsk, state_value) \ - do { \ - (tsk)->task_state_change = _THIS_IP_; \ - (tsk)->state = (state_value); \ - } while (0) -#define set_task_state(tsk, state_value) \ - do { \ - (tsk)->task_state_change = _THIS_IP_; \ - smp_store_mb((tsk)->state, (state_value)); \ - } while (0) - #define __set_current_state(state_value) \ do { \ current->task_state_change = _THIS_IP_; \ @@ -277,20 +266,6 @@ extern char ___assert_task_state[1 - 2*!!( } while (0) #else - -/* - * @tsk had better be current, or you get to keep the pieces. - * - * The only reason is that computing current can be more expensive than - * using a pointer that's already available. - * - * Therefore, see set_current_state(). 
- */ -#define __set_task_state(tsk, state_value) \ - do { (tsk)->state = (state_value); } while (0) -#define set_task_state(tsk, state_value) \ - smp_store_mb((tsk)->state, (state_value)) - /* * set_current_state() includes a barrier so that the write of current->state * is correctly serialised wrt the caller's subsequent test of whether to diff --git a/kernel/exit.c b/kernel/exit.c index 2385d434a46e..27c68653e2fc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -501,12 +501,12 @@ static void exit_mm(void) complete(&core_state->startup); for (;;) { - set_task_state(current, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); if (!self.task) /* see coredump_finish() */ break; freezable_schedule(); } - __set_task_state(current, TASK_RUNNING); + __set_current_state(TASK_RUNNING); down_read(&mm->mmap_sem); } atomic_inc(&mm->mm_count); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 4c7d04362c95..97d142486f93 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -666,7 +666,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, lock_contended(&lock->dep_map, ip); - set_task_state(current, state); + set_current_state(state); for (;;) { /* * Once we hold wait_lock, we're serialized against @@ -701,7 +701,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); } - set_task_state(current, state); + set_current_state(state); /* * Here we order against unlock; we must either see it change * state back to RUNNING and fall through the next schedule(), @@ -715,7 +715,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } spin_lock_mutex(&lock->wait_lock, flags); acquired: - __set_task_state(current, TASK_RUNNING); + __set_current_state(TASK_RUNNING); mutex_remove_waiter(lock, &waiter, current); if (likely(list_empty(&lock->wait_list))) @@ -735,7 +735,7 @@ skip_wait: return 0; err: - __set_task_state(current, TASK_RUNNING); + __set_current_state(TASK_RUNNING); mutex_remove_waiter(lock, &waiter, current); spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 60d15d3bb2a8..5eacab880f67 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -139,7 +139,7 @@ void __sched __down_read(struct rw_semaphore *sem) goto out; } - set_task_state(current, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); /* set up my own style of waitqueue */ waiter.task = current; @@ -156,10 +156,10 @@ void __sched __down_read(struct rw_semaphore *sem) if (!waiter.task) break; schedule(); - set_task_state(current, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); } - __set_task_state(current, TASK_RUNNING); + __set_current_state(TASK_RUNNING); out: ; } @@ -216,7 +216,7 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state) ret = -EINTR; goto out; } - set_task_state(current, state); + set_current_state(state); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); schedule(); raw_spin_lock_irqsave(&sem->wait_lock, flags); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index d3b819c7f07f..a3a381aee24b 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -253,13 +253,13 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* wait to be given the lock */ while (true) { - set_task_state(current, 
TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); if (!waiter.task) break; schedule(); } - __set_task_state(current, TASK_RUNNING); + __set_current_state(TASK_RUNNING); return sem; } EXPORT_SYMBOL(rwsem_down_read_failed); diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 29aac9f9e5e4..9512e37637dc 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -215,7 +215,7 @@ static inline int __sched __down_common(struct semaphore *sem, long state, goto interrupted; if (unlikely(timeout <= 0)) goto timed_out; - __set_task_state(current, state); + __set_current_state(state); raw_spin_unlock_irq(&sem->lock); timeout = schedule_timeout(timeout); raw_spin_lock_irq(&sem->lock); -- cgit v1.2.3 From 8f95c90ceb541a38ac16fec48c05142ef1450c25 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Jan 2017 07:22:25 -0800 Subject: sched/wait, RCU: Introduce rcuwait machinery rcuwait provides support for (single) RCU-safe task wait/wake functionality, with the caveat that it must not be called after exit_notify(), such that we avoid racing with rcu delayed_put_task_struct callbacks, task_struct being rcu unaware in this context -- for which we similarly have task_rcu_dereference() magic, but with different return semantics, which can conflict with the wakeup side. The interfaces are quite straightforward: rcuwait_wait_event() rcuwait_wake_up() More details are in the comments, but it's perhaps worth mentioning at least, that users must provide proper serialization when waiting on a condition, and avoid corrupting a concurrent waiter. Also care must be taken between the task and the condition for when calling the wakeup -- we cannot miss wakeups. When porting users, this is for example, a given when using waitqueues in that everything is done under the q->lock. As such, it can remove sources of non preemptable unbounded work for realtime. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Link: http://lkml.kernel.org/r/1484148146-14210-2-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- include/linux/rcuwait.h | 63 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/exit.c | 30 +++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 include/linux/rcuwait.h (limited to 'kernel') diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h new file mode 100644 index 000000000000..0e93d56c7ab2 --- /dev/null +++ b/include/linux/rcuwait.h @@ -0,0 +1,63 @@ +#ifndef _LINUX_RCUWAIT_H_ +#define _LINUX_RCUWAIT_H_ + +#include + +/* + * rcuwait provides a way of blocking and waking up a single + * task in an rcu-safe manner; where it is forbidden to use + * after exit_notify(). task_struct is not properly rcu protected, + * unless dealing with rcu-aware lists, ie: find_task_by_*(). + * + * Alternatively we have task_rcu_dereference(), but the return + * semantics have different implications which would break the + * wakeup side. The only time @task is non-nil is when a user is + * blocked (or checking if it needs to) on a condition, and reset + * as soon as we know that the condition has succeeded and are + * awoken. 
+ */
+struct rcuwait {
+	struct task_struct *task;
+};
+
+#define __RCUWAIT_INITIALIZER(name)	\
+	{ .task = NULL, }
+
+static inline void rcuwait_init(struct rcuwait *w)
+{
+	w->task = NULL;
+}
+
+extern void rcuwait_wake_up(struct rcuwait *w);
+
+/*
+ * The caller is responsible for locking around rcuwait_wait_event(),
+ * such that writes to @task are properly serialized.
+ */
+#define rcuwait_wait_event(w, condition)				\
+({									\
+	/*								\
+	 * Complain if we are called after do_exit()/exit_notify(),	\
+	 * as we cannot rely on the rcu critical region for the	\
+	 * wakeup side.							\
+	 */								\
+	WARN_ON(current->exit_state);					\
+									\
+	rcu_assign_pointer((w)->task, current);				\
+	for (;;) {							\
+		/*							\
+		 * Implicit barrier (A) pairs with (B) in		\
+		 * rcuwait_wake_up().					\
+		 */							\
+		set_current_state(TASK_UNINTERRUPTIBLE);		\
+		if (condition)						\
+			break;						\
+									\
+		schedule();						\
+	}								\
+									\
+	WRITE_ONCE((w)->task, NULL);					\
+	__set_current_state(TASK_RUNNING);				\
+})
+
+#endif /* _LINUX_RCUWAIT_H_ */
diff --git a/kernel/exit.c b/kernel/exit.c
index 27c68653e2fc..a9441da69e29 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -55,6 +55,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -282,6 +283,35 @@ retry:
 	return task;
 }
+void rcuwait_wake_up(struct rcuwait *w)
+{
+	struct task_struct *task;
+
+	rcu_read_lock();
+
+	/*
+	 * Order condition vs @task, such that everything prior to the load
+	 * of @task is visible. This is the condition as to why the user
+	 * called rcuwait_wake_up() in the first place. Pairs with the
+	 * set_current_state() barrier (A) in rcuwait_wait_event().
+	 *
+	 *    WAIT                WAKE
+	 *    [S] tsk = current   [S] cond = true
+	 *        MB (A)              MB (B)
+	 *    [L] cond            [L] tsk
+	 */
+	smp_rmb(); /* (B) */
+
+	/*
+	 * Avoid using task_rcu_dereference() magic as long as we are careful,
+	 * see comment in rcuwait_wait_event() regarding ->exit_state.
+	 */
+	task = rcu_dereference(w->task);
+	if (task)
+		wake_up_process(task);
+	rcu_read_unlock();
+}
+
 struct task_struct *try_get_task_struct(struct task_struct **ptask)
 {
 	struct task_struct *task;
-- cgit v1.2.3

From 52b94129f274937e4c25dd17b76697664a3c43c9 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso
Date: Wed, 11 Jan 2017 07:22:26 -0800
Subject: locking/percpu-rwsem: Replace waitqueue with rcuwait

The use of any kind of wait queue is overkill for pcpu-rwsems. While
one option would be to use the less heavy simple (swait) flavor, this
is still too much for what pcpu-rwsems need. For one, we do not care
about any sort of queuing in that the only (rare) time writers (and
readers, for that matter) are queued is when trying to acquire the
regular contended rw_sem. There cannot be any further queuing as
writers are serialized by the rw_sem in the first place.

Given that percpu_down_write() must not be called after exit_notify(),
we can replace the bulky waitqueue with rcuwait such that a writer can
wait for its turn to take the lock. As such, we can avoid the queue
handling and locking overhead. (A minimal usage sketch of the rcuwait
API appears at the end of this section.)

Signed-off-by: Davidlohr Bueso
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Oleg Nesterov
Cc: Andrew Morton
Cc: Linus Torvalds
Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Link: http://lkml.kernel.org/r/1484148146-14210-3-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- include/linux/percpu-rwsem.h | 8 ++++---- kernel/locking/percpu-rwsem.c | 7 +++---- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 5b2e6159b744..93664f022ecf 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -4,15 +4,15 @@ #include #include #include -#include +#include #include #include struct percpu_rw_semaphore { struct rcu_sync rss; unsigned int __percpu *read_count; - struct rw_semaphore rw_sem; - wait_queue_head_t writer; + struct rw_semaphore rw_sem; /* slowpath */ + struct rcuwait writer; /* blocked writer */ int readers_block; }; @@ -22,7 +22,7 @@ static struct percpu_rw_semaphore name = { \ .rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC), \ .read_count = &__percpu_rwsem_rc_##name, \ .rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \ - .writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer), \ + .writer = __RCUWAIT_INITIALIZER(name.writer), \ } extern int __percpu_down_read(struct percpu_rw_semaphore *, int); diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index ce182599cf2e..883cf1b92d90 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -1,7 +1,6 @@ #include #include #include -#include #include #include #include @@ -18,7 +17,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); __init_rwsem(&sem->rw_sem, name, rwsem_key); - init_waitqueue_head(&sem->writer); + rcuwait_init(&sem->writer); sem->readers_block = 0; return 0; } @@ -103,7 +102,7 @@ void __percpu_up_read(struct percpu_rw_semaphore *sem) __this_cpu_dec(*sem->read_count); /* Prod writer to recheck readers_active */ - wake_up(&sem->writer); + rcuwait_wake_up(&sem->writer); } EXPORT_SYMBOL_GPL(__percpu_up_read); @@ -160,7 +159,7 @@ void percpu_down_write(struct percpu_rw_semaphore *sem) */ /* Wait for all now active readers to complete. */ - wait_event(sem->writer, readers_active_check(sem)); + rcuwait_wait_event(&sem->writer, readers_active_check(sem)); } EXPORT_SYMBOL_GPL(percpu_down_write); -- cgit v1.2.3 From e274795ea7b7caa0fd74ef651594382a69e2a951 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Jan 2017 14:17:48 +0100 Subject: locking/mutex: Fix mutex handoff While reviewing the ww_mutex patches, I noticed that it was still possible to (incorrectly) succeed for (incorrect) code like: mutex_lock(&a); mutex_lock(&a); This was possible if the second mutex_lock() would block (as expected) but then receive a spurious wakeup. At that point it would find itself at the front of the queue, request a handoff and instantly claim ownership and continue, since owner would point to itself. Avoid this scenario and simplify the code by introducing a third low bit to signal handoff pickup. So once we request handoff, unlock clears the handoff bit and sets the pickup bit along with the new owner. This also removes the need for the .handoff argument to __mutex_trylock(), since that becomes superfluous with PICKUP. In order to guarantee enough low bits, ensure task_struct alignment is at least L1_CACHE_BYTES (which seems a good ideal regardless). Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 9d659ae14b54 ("locking/mutex: Add lock handoff to avoid starvation") Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 2 +- kernel/fork.c | 6 ++- kernel/locking/mutex.c | 108 ++++++++++++++++++++++++------------------------- 3 files changed, 57 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index b97870f2debd..3e1fccb47f11 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -65,7 +65,7 @@ struct mutex { static inline struct task_struct *__mutex_owner(struct mutex *lock) { - return (struct task_struct *)(atomic_long_read(&lock->owner) & ~0x03); + return (struct task_struct *)(atomic_long_read(&lock->owner) & ~0x07); } /* diff --git a/kernel/fork.c b/kernel/fork.c index 11c5c8ab827c..a90510d0bbf8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -432,11 +432,13 @@ void __init fork_init(void) int i; #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN -#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES +#define ARCH_MIN_TASKALIGN 0 #endif + int align = min_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN); + /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", - arch_task_struct_size, ARCH_MIN_TASKALIGN, + arch_task_struct_size, align, SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL); #endif diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 97d142486f93..24284497c425 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -50,16 +50,17 @@ EXPORT_SYMBOL(__mutex_init); /* * @owner: contains: 'struct task_struct *' to the current lock owner, * NULL means not owned. Since task_struct pointers are aligned at - * ARCH_MIN_TASKALIGN (which is at least sizeof(void *)), we have low - * bits to store extra state. + * at least L1_CACHE_BYTES, we have low bits to store extra state. * * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup. * Bit1 indicates unlock needs to hand the lock to the top-waiter + * Bit2 indicates handoff has been done and we're waiting for pickup. */ #define MUTEX_FLAG_WAITERS 0x01 #define MUTEX_FLAG_HANDOFF 0x02 +#define MUTEX_FLAG_PICKUP 0x04 -#define MUTEX_FLAGS 0x03 +#define MUTEX_FLAGS 0x07 static inline struct task_struct *__owner_task(unsigned long owner) { @@ -72,38 +73,29 @@ static inline unsigned long __owner_flags(unsigned long owner) } /* - * Actual trylock that will work on any unlocked state. - * - * When setting the owner field, we must preserve the low flag bits. - * - * Be careful with @handoff, only set that in a wait-loop (where you set - * HANDOFF) to avoid recursive lock attempts. + * Trylock variant that retuns the owning task on failure. */ -static inline bool __mutex_trylock(struct mutex *lock, const bool handoff) +static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock) { unsigned long owner, curr = (unsigned long)current; owner = atomic_long_read(&lock->owner); for (;;) { /* must loop, can race against a flag */ unsigned long old, flags = __owner_flags(owner); + unsigned long task = owner & ~MUTEX_FLAGS; + + if (task) { + if (likely(task != curr)) + break; + + if (likely(!(flags & MUTEX_FLAG_PICKUP))) + break; - if (__owner_task(owner)) { - if (handoff && unlikely(__owner_task(owner) == current)) { - /* - * Provide ACQUIRE semantics for the lock-handoff. - * - * We cannot easily use load-acquire here, since - * the actual load is a failed cmpxchg, which - * doesn't imply any barriers. 
- * - * Also, this is a fairly unlikely scenario, and - * this contains the cost. - */ - smp_mb(); /* ACQUIRE */ - return true; - } - - return false; + flags &= ~MUTEX_FLAG_PICKUP; + } else { +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(flags & MUTEX_FLAG_PICKUP); +#endif } /* @@ -111,15 +103,24 @@ static inline bool __mutex_trylock(struct mutex *lock, const bool handoff) * past the point where we acquire it. This would be possible * if we (accidentally) set the bit on an unlocked mutex. */ - if (handoff) - flags &= ~MUTEX_FLAG_HANDOFF; + flags &= ~MUTEX_FLAG_HANDOFF; old = atomic_long_cmpxchg_acquire(&lock->owner, owner, curr | flags); if (old == owner) - return true; + return NULL; owner = old; } + + return __owner_task(owner); +} + +/* + * Actual trylock that will work on any unlocked state. + */ +static inline bool __mutex_trylock(struct mutex *lock) +{ + return !__mutex_trylock_or_owner(lock); } #ifndef CONFIG_DEBUG_LOCK_ALLOC @@ -171,9 +172,9 @@ static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_wait /* * Give up ownership to a specific task, when @task = NULL, this is equivalent - * to a regular unlock. Clears HANDOFF, preserves WAITERS. Provides RELEASE - * semantics like a regular unlock, the __mutex_trylock() provides matching - * ACQUIRE semantics for the handoff. + * to a regular unlock. Sets PICKUP on a handoff, clears HANDOF, preserves + * WAITERS. Provides RELEASE semantics like a regular unlock, the + * __mutex_trylock() provides a matching ACQUIRE semantics for the handoff. */ static void __mutex_handoff(struct mutex *lock, struct task_struct *task) { @@ -184,10 +185,13 @@ static void __mutex_handoff(struct mutex *lock, struct task_struct *task) #ifdef CONFIG_DEBUG_MUTEXES DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); + DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP); #endif new = (owner & MUTEX_FLAG_WAITERS); new |= (unsigned long)task; + if (task) + new |= MUTEX_FLAG_PICKUP; old = atomic_long_cmpxchg_release(&lock->owner, owner, new); if (old == owner) @@ -435,8 +439,6 @@ static bool mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx, const bool waiter) { - struct task_struct *task = current; - if (!waiter) { /* * The purpose of the mutex_can_spin_on_owner() function is @@ -476,24 +478,17 @@ static bool mutex_optimistic_spin(struct mutex *lock, goto fail_unlock; } + /* Try to acquire the mutex... */ + owner = __mutex_trylock_or_owner(lock); + if (!owner) + break; + /* - * If there's an owner, wait for it to either + * There's an owner, wait for it to either * release the lock or go to sleep. */ - owner = __mutex_owner(lock); - if (owner) { - if (waiter && owner == task) { - smp_mb(); /* ACQUIRE */ - break; - } - - if (!mutex_spin_on_owner(lock, owner)) - goto fail_unlock; - } - - /* Try to acquire the mutex if it is unlocked. */ - if (__mutex_trylock(lock, waiter)) - break; + if (!mutex_spin_on_owner(lock, owner)) + goto fail_unlock; /* * The cpu_relax() call is a compiler barrier which forces @@ -637,7 +632,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); - if (__mutex_trylock(lock, false) || + if (__mutex_trylock(lock) || mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, false)) { /* got the lock, yay! 
*/ lock_acquired(&lock->dep_map, ip); @@ -651,7 +646,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, /* * After waiting to acquire the wait_lock, try again. */ - if (__mutex_trylock(lock, false)) + if (__mutex_trylock(lock)) goto skip_wait; debug_mutex_lock_common(lock, &waiter); @@ -674,7 +669,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * before testing the error conditions to make sure we pick up * the handoff. */ - if (__mutex_trylock(lock, first)) + if (__mutex_trylock(lock)) goto acquired; /* @@ -707,8 +702,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * state back to RUNNING and fall through the next schedule(), * or we must see its unlock and acquire. */ - if ((first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, true)) || - __mutex_trylock(lock, first)) + if (__mutex_trylock(lock) || + (first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, true))) break; spin_lock_mutex(&lock->wait_lock, flags); @@ -865,6 +860,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne #ifdef CONFIG_DEBUG_MUTEXES DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); + DEBUG_LOCKS_WARN_ON(owner & MUTEX_FLAG_PICKUP); #endif if (owner & MUTEX_FLAG_HANDOFF) @@ -1003,7 +999,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, */ int __sched mutex_trylock(struct mutex *lock) { - bool locked = __mutex_trylock(lock, false); + bool locked = __mutex_trylock(lock); if (locked) mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); -- cgit v1.2.3 From 3822da3ed0676e01f83fe0518c333c8e9ba249bf Mon Sep 17 00:00:00 2001 From: Nicolai Hähnle Date: Wed, 21 Dec 2016 19:46:31 +0100 Subject: locking/ww_mutex: Extract stamp comparison to __ww_mutex_stamp_after() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function will be re-used in subsequent patches. Signed-off-by: Nicolai Hähnle Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chris Wilson Cc: Andrew Morton Cc: Daniel Vetter Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dri-devel@lists.freedesktop.org Link: http://lkml.kernel.org/r/1482346000-9927-4-git-send-email-nhaehnle@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 24284497c425..9ad03b9a5f7f 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -281,6 +281,13 @@ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, ww_ctx->acquired++; } +static inline bool __sched +__ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) +{ + return a->stamp - b->stamp <= LONG_MAX && + (a->stamp != b->stamp || a > b); +} + /* * After acquiring lock with fastpath or when we lost out in contested * slowpath, set ctx and wake up any waiters so they can recheck. 
@@ -597,8 +604,7 @@ __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) if (!hold_ctx) return 0; - if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && - (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { + if (__ww_ctx_stamp_after(ctx, hold_ctx)) { #ifdef CONFIG_DEBUG_MUTEXES DEBUG_LOCKS_WARN_ON(ctx->contending_lock); ctx->contending_lock = ww; -- cgit v1.2.3 From ea9e0fb8fe1bdfca81bd76052a5cce70bb053430 Mon Sep 17 00:00:00 2001 From: Nicolai Hähnle Date: Wed, 21 Dec 2016 19:46:32 +0100 Subject: locking/ww_mutex: Set use_ww_ctx even when locking without a context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We will add a new field to struct mutex_waiter. This field must be initialized for all waiters if any waiter uses the ww_use_ctx path. So there is a trade-off: Keep ww_mutex locking without a context on the faster non-use_ww_ctx path, at the cost of adding the initialization to all mutex locks (including non-ww_mutexes), or avoid the additional cost for non-ww_mutex locks, at the cost of adding additional checks to the use_ww_ctx path. We take the latter choice. It may be worth eliminating the users of ww_mutex_lock(lock, NULL), but there are a lot of them. Signed-off-by: Nicolai Hähnle Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Chris Wilson Cc: Daniel Vetter Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dri-devel@lists.freedesktop.org Link: http://lkml.kernel.org/r/1482346000-9927-5-git-send-email-nhaehnle@gmail.com Signed-off-by: Ingo Molnar --- include/linux/ww_mutex.h | 11 ++--------- kernel/locking/mutex.c | 29 +++++++++++++++++------------ 2 files changed, 19 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h index 7b0066814fa0..5f2e8379baff 100644 --- a/include/linux/ww_mutex.h +++ b/include/linux/ww_mutex.h @@ -222,11 +222,7 @@ extern int __must_check __ww_mutex_lock_interruptible(struct ww_mutex *lock, */ static inline int ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - if (ctx) - return __ww_mutex_lock(lock, ctx); - - mutex_lock(&lock->base); - return 0; + return __ww_mutex_lock(lock, ctx); } /** @@ -262,10 +258,7 @@ static inline int ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ct static inline int __must_check ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - if (ctx) - return __ww_mutex_lock_interruptible(lock, ctx); - else - return mutex_lock_interruptible(&lock->base); + return __ww_mutex_lock_interruptible(lock, ctx); } /** diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 9ad03b9a5f7f..44a64c0a851a 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -469,7 +469,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, for (;;) { struct task_struct *owner; - if (use_ww_ctx && ww_ctx->acquired > 0) { + if (use_ww_ctx && ww_ctx && ww_ctx->acquired > 0) { struct ww_mutex *ww; ww = container_of(lock, struct ww_mutex, base); @@ -629,8 +629,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct ww_mutex *ww; int ret; - if (use_ww_ctx) { - ww = container_of(lock, struct ww_mutex, base); + ww = container_of(lock, struct ww_mutex, base); + + if (use_ww_ctx && ww_ctx) { if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) return -EALREADY; } @@ -642,7 +643,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, 
mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, false)) { /* got the lock, yay! */ lock_acquired(&lock->dep_map, ip); - if (use_ww_ctx) + if (use_ww_ctx && ww_ctx) ww_mutex_set_context_fastpath(ww, ww_ctx); preempt_enable(); return 0; @@ -688,7 +689,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto err; } - if (use_ww_ctx && ww_ctx->acquired > 0) { + if (use_ww_ctx && ww_ctx && ww_ctx->acquired > 0) { ret = __ww_mutex_lock_check_stamp(lock, ww_ctx); if (ret) goto err; @@ -728,7 +729,7 @@ skip_wait: /* got the lock - cleanup and rejoice! */ lock_acquired(&lock->dep_map, ip); - if (use_ww_ctx) + if (use_ww_ctx && ww_ctx) ww_mutex_set_context_slowpath(ww, ww_ctx); spin_unlock_mutex(&lock->wait_lock, flags); @@ -816,8 +817,9 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) might_sleep(); ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, - 0, &ctx->dep_map, _RET_IP_, ctx, 1); - if (!ret && ctx->acquired > 1) + 0, ctx ? &ctx->dep_map : NULL, _RET_IP_, + ctx, 1); + if (!ret && ctx && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); return ret; @@ -831,9 +833,10 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) might_sleep(); ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, - 0, &ctx->dep_map, _RET_IP_, ctx, 1); + 0, ctx ? &ctx->dep_map : NULL, _RET_IP_, + ctx, 1); - if (!ret && ctx->acquired > 1) + if (!ret && ctx && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); return ret; @@ -1021,7 +1024,8 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) might_sleep(); if (__mutex_trylock_fast(&lock->base)) { - ww_mutex_set_context_fastpath(lock, ctx); + if (ctx) + ww_mutex_set_context_fastpath(lock, ctx); return 0; } @@ -1035,7 +1039,8 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) might_sleep(); if (__mutex_trylock_fast(&lock->base)) { - ww_mutex_set_context_fastpath(lock, ctx); + if (ctx) + ww_mutex_set_context_fastpath(lock, ctx); return 0; } -- cgit v1.2.3 From c5470b22d1833241cae996d23a8a346ff8ec4d58 Mon Sep 17 00:00:00 2001 From: Nicolai Hähnle Date: Wed, 21 Dec 2016 19:46:33 +0100 Subject: locking/ww_mutex: Remove the __ww_mutex_lock*() inline wrappers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Keep the documentation in the header file since there is no good place for it in mutex.c: there are two rather different implementations with different EXPORT_SYMBOLs for each function. Signed-off-by: Nicolai Hähnle Signed-off-by: Peter Zijlstra (Intel) Cc: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= Cc: Andrew Morton Cc: Chris Wilson Cc: Daniel Vetter Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dri-devel@lists.freedesktop.org Link: http://lkml.kernel.org/r/1482346000-9927-6-git-send-email-nhaehnle@gmail.com Signed-off-by: Ingo Molnar --- include/linux/ww_mutex.h | 18 ++++-------------- kernel/locking/mutex.c | 16 ++++++++-------- 2 files changed, 12 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h index 5f2e8379baff..c07528522bbc 100644 --- a/include/linux/ww_mutex.h +++ b/include/linux/ww_mutex.h @@ -186,11 +186,6 @@ static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx) #endif } -extern int __must_check __ww_mutex_lock(struct ww_mutex *lock, - struct ww_acquire_ctx *ctx); -extern int __must_check __ww_mutex_lock_interruptible(struct ww_mutex *lock, - struct ww_acquire_ctx *ctx); - /** * ww_mutex_lock - acquire the w/w mutex * @lock: the mutex to be acquired @@ -220,10 +215,8 @@ extern int __must_check __ww_mutex_lock_interruptible(struct ww_mutex *lock, * * A mutex acquired with this function must be released with ww_mutex_unlock. */ -static inline int ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) -{ - return __ww_mutex_lock(lock, ctx); -} +extern int __must_check ww_mutex_lock(struct ww_mutex *lock, + struct ww_acquire_ctx *ctx); /** * ww_mutex_lock_interruptible - acquire the w/w mutex, interruptible @@ -255,11 +248,8 @@ static inline int ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ct * * A mutex acquired with this function must be released with ww_mutex_unlock. */ -static inline int __must_check ww_mutex_lock_interruptible(struct ww_mutex *lock, - struct ww_acquire_ctx *ctx) -{ - return __ww_mutex_lock_interruptible(lock, ctx); -} +extern int __must_check ww_mutex_lock_interruptible(struct ww_mutex *lock, + struct ww_acquire_ctx *ctx); /** * ww_mutex_lock_slow - slowpath acquiring of the w/w mutex diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 44a64c0a851a..c696614a6b8b 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -811,7 +811,7 @@ ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) } int __sched -__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { int ret; @@ -824,10 +824,10 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) return ret; } -EXPORT_SYMBOL_GPL(__ww_mutex_lock); +EXPORT_SYMBOL_GPL(ww_mutex_lock); int __sched -__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { int ret; @@ -841,7 +841,7 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) return ret; } -EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); +EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible); #endif @@ -1019,7 +1019,7 @@ EXPORT_SYMBOL(mutex_trylock); #ifndef CONFIG_DEBUG_LOCK_ALLOC int __sched -__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { might_sleep(); @@ -1031,10 +1031,10 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) return __ww_mutex_lock_slowpath(lock, ctx); } -EXPORT_SYMBOL(__ww_mutex_lock); +EXPORT_SYMBOL(ww_mutex_lock); int __sched -__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { might_sleep(); @@ -1046,7 
+1046,7 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) return __ww_mutex_lock_interruptible_slowpath(lock, ctx); } -EXPORT_SYMBOL(__ww_mutex_lock_interruptible); +EXPORT_SYMBOL(ww_mutex_lock_interruptible); #endif -- cgit v1.2.3 From 6baa5c60a97d7f1a37a4b5ab5fb3a450e56e8b06 Mon Sep 17 00:00:00 2001 From: Nicolai Hähnle Date: Wed, 21 Dec 2016 19:46:34 +0100 Subject: locking/ww_mutex: Add waiters in stamp order MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add regular waiters in stamp order. Keep adding waiters that have no context in FIFO order and take care not to starve them. While adding our task as a waiter, back off if we detect that there is a waiter with a lower stamp in front of us. Make sure to call lock_contended even when we back off early. For w/w mutexes, being first in the wait list is only stable when taking the lock without a context. Therefore, the purpose of the first flag is split into two: 'first' remains to indicate whether we want to spin optimistically, while 'handoff' indicates that we should be prepared to accept a handoff. For w/w locking with a context, we always accept handoffs after the first schedule(), to handle the following sequence of events: 1. Task #0 unlocks and hands off to Task #2 which is first in line 2. Task #1 adds itself in front of Task #2 3. Task #2 wakes up and must accept the handoff even though it is no longer first in line Signed-off-by: Nicolai Hähnle Signed-off-by: Peter Zijlstra (Intel) Cc: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= Cc: Andrew Morton Cc: Chris Wilson Cc: Daniel Vetter Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dri-devel@lists.freedesktop.org Link: http://lkml.kernel.org/r/1482346000-9927-7-git-send-email-nhaehnle@gmail.com Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 3 ++ kernel/locking/mutex.c | 76 +++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 72 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 3e1fccb47f11..e17942ffb3fc 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -20,6 +20,8 @@ #include #include +struct ww_acquire_ctx; + /* * Simple, straightforward mutexes with strict semantics: * @@ -75,6 +77,7 @@ static inline struct task_struct *__mutex_owner(struct mutex *lock) struct mutex_waiter { struct list_head list; struct task_struct *task; + struct ww_acquire_ctx *ww_ctx; #ifdef CONFIG_DEBUG_MUTEXES void *magic; #endif diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index c696614a6b8b..d0f7628b5a3d 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -615,6 +615,52 @@ __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) return 0; } +static inline int __sched +__ww_mutex_add_waiter(struct mutex_waiter *waiter, + struct mutex *lock, + struct ww_acquire_ctx *ww_ctx) +{ + struct mutex_waiter *cur; + struct list_head *pos; + + if (!ww_ctx) { + list_add_tail(&waiter->list, &lock->wait_list); + return 0; + } + + /* + * Add the waiter before the first waiter with a higher stamp. + * Waiters without a context are skipped to avoid starving + * them. + */ + pos = &lock->wait_list; + list_for_each_entry_reverse(cur, &lock->wait_list, list) { + if (!cur->ww_ctx) + continue; + + if (__ww_ctx_stamp_after(ww_ctx, cur->ww_ctx)) { + /* Back off immediately if necessary. 
*/ + if (ww_ctx->acquired > 0) { +#ifdef CONFIG_DEBUG_MUTEXES + struct ww_mutex *ww; + + ww = container_of(lock, struct ww_mutex, base); + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock); + ww_ctx->contending_lock = ww; +#endif + return -EDEADLK; + } + + break; + } + + pos = &cur->list; + } + + list_add_tail(&waiter->list, pos); + return 0; +} + /* * Lock a mutex (possibly interruptible), slowpath: */ @@ -659,15 +705,25 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, debug_mutex_lock_common(lock, &waiter); debug_mutex_add_waiter(lock, &waiter, current); - /* add waiting tasks to the end of the waitqueue (FIFO): */ - list_add_tail(&waiter.list, &lock->wait_list); + lock_contended(&lock->dep_map, ip); + + if (!use_ww_ctx) { + /* add waiting tasks to the end of the waitqueue (FIFO): */ + list_add_tail(&waiter.list, &lock->wait_list); + } else { + /* Add in stamp order, waking up waiters that must back off. */ + ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx); + if (ret) + goto err_early_backoff; + + waiter.ww_ctx = ww_ctx; + } + waiter.task = current; if (__mutex_waiter_is_first(lock, &waiter)) __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); - lock_contended(&lock->dep_map, ip); - set_current_state(state); for (;;) { /* @@ -698,9 +754,14 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, spin_unlock_mutex(&lock->wait_lock, flags); schedule_preempt_disabled(); - if (!first && __mutex_waiter_is_first(lock, &waiter)) { - first = true; - __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); + /* + * ww_mutex needs to always recheck its position since its waiter + * list is not FIFO ordered. + */ + if ((use_ww_ctx && ww_ctx) || !first) { + first = __mutex_waiter_is_first(lock, &waiter); + if (first) + __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); } set_current_state(state); @@ -739,6 +800,7 @@ skip_wait: err: __set_current_state(TASK_RUNNING); mutex_remove_waiter(lock, &waiter, current); +err_early_backoff: spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, 1, ip); -- cgit v1.2.3 From 200b1874401555e9b36010aa318e75cf57005f36 Mon Sep 17 00:00:00 2001 From: Nicolai Hähnle Date: Wed, 21 Dec 2016 19:46:35 +0100 Subject: locking/ww_mutex: Notify waiters that have to back off while adding tasks to wait list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While adding our task as a waiter, detect if another task should back off because of us. With this patch, we establish the invariant that the wait list contains at most one (sleeping) waiter with ww_ctx->acquired > 0, and this waiter will be the first waiter with a context. Since only waiters with ww_ctx->acquired > 0 have to back off, this allows us to be much more economical with wakeups. Signed-off-by: Nicolai Hähnle Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Chris Wilson Cc: Daniel Vetter Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dri-devel@lists.freedesktop.org Link: http://lkml.kernel.org/r/1482346000-9927-8-git-send-email-nhaehnle@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index d0f7628b5a3d..66967219477d 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -596,23 +596,34 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock) EXPORT_SYMBOL(ww_mutex_unlock); static inline int __sched -__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) +__ww_mutex_lock_check_stamp(struct mutex *lock, struct mutex_waiter *waiter, + struct ww_acquire_ctx *ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); + struct mutex_waiter *cur; - if (!hold_ctx) - return 0; + if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx)) + goto deadlock; - if (__ww_ctx_stamp_after(ctx, hold_ctx)) { -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(ctx->contending_lock); - ctx->contending_lock = ww; -#endif - return -EDEADLK; + /* + * If there is a waiter in front of us that has a context, then its + * stamp is earlier than ours and we must back off. + */ + cur = waiter; + list_for_each_entry_continue_reverse(cur, &lock->wait_list, list) { + if (cur->ww_ctx) + goto deadlock; } return 0; + +deadlock: +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(ctx->contending_lock); + ctx->contending_lock = ww; +#endif + return -EDEADLK; } static inline int __sched @@ -655,6 +666,15 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter, } pos = &cur->list; + + /* + * Wake up the waiter so that it gets a chance to back + * off. + */ + if (cur->ww_ctx->acquired > 0) { + debug_mutex_wake_waiter(lock, cur); + wake_up_process(cur->task); + } } list_add_tail(&waiter->list, pos); @@ -746,7 +766,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } if (use_ww_ctx && ww_ctx && ww_ctx->acquired > 0) { - ret = __ww_mutex_lock_check_stamp(lock, ww_ctx); + ret = __ww_mutex_lock_check_stamp(lock, &waiter, ww_ctx); if (ret) goto err; } -- cgit v1.2.3 From 659cf9f5824a12e6b50785e4e9cf1adf8a3adbd0 Mon Sep 17 00:00:00 2001 From: Nicolai Hähnle Date: Wed, 21 Dec 2016 19:46:36 +0100 Subject: locking/ww_mutex: Optimize ww-mutexes by waking at most one waiter for backoff when acquiring the lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The wait list is sorted by stamp order, and the only waiting task that may have to back off is the first waiter with a context. The regular slow path does not have to wake any other tasks at all, since all other waiters that would have to back off were either woken up when the waiter was added to the list, or detected the condition before they added themselves. Median timings taken of a contention-heavy GPU workload: Without this series: real 0m59.900s user 0m7.516s sys 2m16.076s With changes up to and including this patch: real 0m52.946s user 0m7.272s sys 1m55.964s Signed-off-by: Nicolai Hähnle Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Chris Wilson Cc: Daniel Vetter Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dri-devel@lists.freedesktop.org Link: http://lkml.kernel.org/r/1482346000-9927-9-git-send-email-nhaehnle@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 59 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 66967219477d..51e2ba557e2a 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -288,6 +288,36 @@ __ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) (a->stamp != b->stamp || a > b); } +/* + * Wake up any waiters that may have to back off when the lock is held by the + * given context. + * + * Due to the invariants on the wait list, this can only affect the first + * waiter with a context. + * + * The current task must not be on the wait list. + */ +static void __sched +__ww_mutex_wakeup_for_backoff(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) +{ + struct mutex_waiter *cur; + + lockdep_assert_held(&lock->wait_lock); + + list_for_each_entry(cur, &lock->wait_list, list) { + if (!cur->ww_ctx) + continue; + + if (cur->ww_ctx->acquired > 0 && + __ww_ctx_stamp_after(cur->ww_ctx, ww_ctx)) { + debug_mutex_wake_waiter(lock, cur); + wake_up_process(cur->task); + } + + break; + } +} + /* * After acquiring lock with fastpath or when we lost out in contested * slowpath, set ctx and wake up any waiters so they can recheck. @@ -297,7 +327,6 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { unsigned long flags; - struct mutex_waiter *cur; ww_mutex_lock_acquired(lock, ctx); @@ -323,16 +352,15 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, * so they can see the new lock->ctx. */ spin_lock_mutex(&lock->base.wait_lock, flags); - list_for_each_entry(cur, &lock->base.wait_list, list) { - debug_mutex_wake_waiter(&lock->base, cur); - wake_up_process(cur->task); - } + __ww_mutex_wakeup_for_backoff(&lock->base, ctx); spin_unlock_mutex(&lock->base.wait_lock, flags); } /* - * After acquiring lock in the slowpath set ctx and wake up any - * waiters so they can recheck. + * After acquiring lock in the slowpath set ctx. + * + * Unlike for the fast path, the caller ensures that waiters are woken up where + * necessary. * * Callers must hold the mutex wait_lock. */ @@ -340,19 +368,8 @@ static __always_inline void ww_mutex_set_context_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - struct mutex_waiter *cur; - ww_mutex_lock_acquired(lock, ctx); lock->ctx = ctx; - - /* - * Give any possible sleeping processes the chance to wake up, - * so they can recheck if they have to back off. - */ - list_for_each_entry(cur, &lock->base.wait_list, list) { - debug_mutex_wake_waiter(&lock->base, cur); - wake_up_process(cur->task); - } } #ifdef CONFIG_MUTEX_SPIN_ON_OWNER @@ -719,8 +736,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, /* * After waiting to acquire the wait_lock, try again. */ - if (__mutex_trylock(lock)) + if (__mutex_trylock(lock)) { + if (use_ww_ctx && ww_ctx) + __ww_mutex_wakeup_for_backoff(lock, ww_ctx); + goto skip_wait; + } debug_mutex_lock_common(lock, &waiter); debug_mutex_add_waiter(lock, &waiter, current); -- cgit v1.2.3 From 427b18207a87f6306bd53a74e03dbe17392b0045 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 23 Dec 2016 10:36:00 +0100 Subject: locking/mutex: Improve inlining Instead of inlining __mutex_lock_common() 5 times, once for each {state,ww} variant. 
Reduce this to two, ww and !ww. Then add __always_inline to mutex_optimistic_spin(), so that that will get inlined all 4 remaining times, for all {waiter,ww} variants. text data bss dec hex filename 6301 0 0 6301 189d defconfig-build/kernel/locking/mutex.o 4053 0 0 4053 fd5 defconfig-build/kernel/locking/mutex.o 4257 0 0 4257 10a1 defconfig-build/kernel/locking/mutex.o This reduces total text size and better separates the ww and !ww mutex code generation. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 85 ++++++++++++++++++++++++++------------------------ 1 file changed, 44 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 51e2ba557e2a..43ff6110014c 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -241,8 +241,8 @@ void __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); #endif -static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, - struct ww_acquire_ctx *ww_ctx) +static __always_inline void +ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) { #ifdef CONFIG_DEBUG_MUTEXES /* @@ -323,8 +323,7 @@ __ww_mutex_wakeup_for_backoff(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) * slowpath, set ctx and wake up any waiters so they can recheck. */ static __always_inline void -ww_mutex_set_context_fastpath(struct ww_mutex *lock, - struct ww_acquire_ctx *ctx) +ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { unsigned long flags; @@ -365,8 +364,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, * Callers must hold the mutex wait_lock. */ static __always_inline void -ww_mutex_set_context_slowpath(struct ww_mutex *lock, - struct ww_acquire_ctx *ctx) +ww_mutex_set_context_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { ww_mutex_lock_acquired(lock, ctx); lock->ctx = ctx; @@ -459,9 +457,9 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) * with the spinner at the head of the OSQ, if present, until the owner is * changed to itself. 
*/ -static bool mutex_optimistic_spin(struct mutex *lock, - struct ww_acquire_ctx *ww_ctx, - const bool use_ww_ctx, const bool waiter) +static __always_inline bool +mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, + const bool use_ww_ctx, const bool waiter) { if (!waiter) { /* @@ -551,9 +549,9 @@ fail: return false; } #else -static bool mutex_optimistic_spin(struct mutex *lock, - struct ww_acquire_ctx *ww_ctx, - const bool use_ww_ctx, const bool waiter) +static __always_inline bool +mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, + const bool use_ww_ctx, const bool waiter) { return false; } @@ -712,8 +710,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct ww_mutex *ww; int ret; - ww = container_of(lock, struct ww_mutex, base); + might_sleep(); + ww = container_of(lock, struct ww_mutex, base); if (use_ww_ctx && ww_ctx) { if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) return -EALREADY; @@ -849,13 +848,26 @@ err_early_backoff: return ret; } +static int __sched +__mutex_lock(struct mutex *lock, long state, unsigned int subclass, + struct lockdep_map *nest_lock, unsigned long ip) +{ + return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false); +} + +static int __sched +__ww_mutex_lock(struct mutex *lock, long state, unsigned int subclass, + struct lockdep_map *nest_lock, unsigned long ip, + struct ww_acquire_ctx *ww_ctx) +{ + return __mutex_lock_common(lock, state, subclass, nest_lock, ip, ww_ctx, true); +} + #ifdef CONFIG_DEBUG_LOCK_ALLOC void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) { - might_sleep(); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, - subclass, NULL, _RET_IP_, NULL, 0); + __mutex_lock(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_nested); @@ -863,27 +875,21 @@ EXPORT_SYMBOL_GPL(mutex_lock_nested); void __sched _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) { - might_sleep(); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, - 0, nest, _RET_IP_, NULL, 0); + __mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_); } EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); int __sched mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) { - might_sleep(); - return __mutex_lock_common(lock, TASK_KILLABLE, - subclass, NULL, _RET_IP_, NULL, 0); + return __mutex_lock(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); int __sched mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) { - might_sleep(); - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, - subclass, NULL, _RET_IP_, NULL, 0); + return __mutex_lock(lock, TASK_INTERRUPTIBLE, subclass, NULL, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); @@ -919,9 +925,9 @@ ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) int ret; might_sleep(); - ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, - 0, ctx ? &ctx->dep_map : NULL, _RET_IP_, - ctx, 1); + ret = __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, + 0, ctx ? &ctx->dep_map : NULL, _RET_IP_, + ctx); if (!ret && ctx && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); @@ -935,9 +941,9 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) int ret; might_sleep(); - ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, - 0, ctx ? 
&ctx->dep_map : NULL, _RET_IP_, - ctx, 1); + ret = __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, + 0, ctx ? &ctx->dep_map : NULL, _RET_IP_, + ctx); if (!ret && ctx && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); @@ -1060,37 +1066,34 @@ EXPORT_SYMBOL(mutex_lock_killable); static noinline void __sched __mutex_lock_slowpath(struct mutex *lock) { - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, - NULL, _RET_IP_, NULL, 0); + __mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); } static noinline int __sched __mutex_lock_killable_slowpath(struct mutex *lock) { - return __mutex_lock_common(lock, TASK_KILLABLE, 0, - NULL, _RET_IP_, NULL, 0); + return __mutex_lock(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); } static noinline int __sched __mutex_lock_interruptible_slowpath(struct mutex *lock) { - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, - NULL, _RET_IP_, NULL, 0); + return __mutex_lock(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); } static noinline int __sched __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, - NULL, _RET_IP_, ctx, 1); + return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0, NULL, + _RET_IP_, ctx); } static noinline int __sched __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, - NULL, _RET_IP_, ctx, 1); + return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0, NULL, + _RET_IP_, ctx); } #endif -- cgit v1.2.3 From 25f13b4040b68dfc5a2a22e7234894e718e3f4c5 Mon Sep 17 00:00:00 2001 From: Nicolai Hähnle Date: Wed, 21 Dec 2016 19:46:37 +0100 Subject: locking/ww_mutex: Re-check ww->ctx in the inner optimistic spin loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the following scenario, thread #1 should back off its attempt to lock ww1 and unlock ww2 (assuming the acquire context stamps are ordered accordingly). Thread #0 Thread #1 --------- --------- successfully lock ww2 set ww1->base.owner attempt to lock ww1 confirm ww1->ctx == NULL enter mutex_spin_on_owner set ww1->ctx What was likely to happen previously is: attempt to lock ww2 refuse to spin because ww2->ctx != NULL schedule() detect thread #0 is off CPU stop optimistic spin return -EDEADLK unlock ww2 wakeup thread #0 lock ww2 Now, we are more likely to see: detect ww1->ctx != NULL stop optimistic spin return -EDEADLK unlock ww2 successfully lock ww2 ... because thread #1 will stop its optimistic spin as soon as possible. The whole scenario is quite unlikely, since it requires thread #1 to get between thread #0 setting the owner and setting the ctx. But since we're idling here anyway, the additional check is basically free. Found by inspection. Signed-off-by: Nicolai Hähnle Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Chris Wilson Cc: Daniel Vetter Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dri-devel@lists.freedesktop.org Link: http://lkml.kernel.org/r/1482346000-9927-10-git-send-email-nhaehnle@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 49 +++++++++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 43ff6110014c..41b0406069e8 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -372,11 +372,14 @@ ww_mutex_set_context_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) #ifdef CONFIG_MUTEX_SPIN_ON_OWNER /* - * Look out! "owner" is an entirely speculative pointer - * access and not reliable. + * Look out! "owner" is an entirely speculative pointer access and not + * reliable. + * + * "noinline" so that this function shows up on perf profiles. */ static noinline -bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner, + struct ww_acquire_ctx *ww_ctx) { bool ret = true; @@ -399,6 +402,28 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) break; } + if (ww_ctx && ww_ctx->acquired > 0) { + struct ww_mutex *ww; + + ww = container_of(lock, struct ww_mutex, base); + + /* + * If ww->ctx is set the contents are undefined, only + * by acquiring wait_lock there is a guarantee that + * they are not invalid when reading. + * + * As such, when deadlock detection needs to be + * performed the optimistic spinning cannot be done. + * + * Check this in every inner iteration because we may + * be racing against another thread's ww_mutex_lock. + */ + if (READ_ONCE(ww->ctx)) { + ret = false; + break; + } + } + cpu_relax(); } rcu_read_unlock(); @@ -484,22 +509,6 @@ mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, for (;;) { struct task_struct *owner; - if (use_ww_ctx && ww_ctx && ww_ctx->acquired > 0) { - struct ww_mutex *ww; - - ww = container_of(lock, struct ww_mutex, base); - /* - * If ww->ctx is set the contents are undefined, only - * by acquiring wait_lock there is a guarantee that - * they are not invalid when reading. - * - * As such, when deadlock detection needs to be - * performed the optimistic spinning cannot be done. - */ - if (READ_ONCE(ww->ctx)) - goto fail_unlock; - } - /* Try to acquire the mutex... */ owner = __mutex_trylock_or_owner(lock); if (!owner) @@ -509,7 +518,7 @@ mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, * There's an owner, wait for it to either * release the lock or go to sleep. */ - if (!mutex_spin_on_owner(lock, owner)) + if (!mutex_spin_on_owner(lock, owner, ww_ctx)) goto fail_unlock; /* -- cgit v1.2.3 From c516df978df1471793aaa4809b390ecd40fa93c2 Mon Sep 17 00:00:00 2001 From: Nicolai Hähnle Date: Wed, 21 Dec 2016 19:46:38 +0100 Subject: locking/ww_mutex: Optimize ww-mutexes by yielding to other waiters from optimistic spin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lock stealing is less beneficial for w/w mutexes since we may just end up backing off if we stole from a thread with an earlier acquire stamp that already holds another w/w mutex that we also need. So don't spin optimistically unless we are sure that there is no other waiter that might cause us to back off. 
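For reference, the caller-side protocol whose backoff path all of these patches tune is the standard ww_mutex acquire/backoff loop. A minimal sketch against the public <linux/ww_mutex.h> API follows; lock_pair() and sketch_ww_class are illustrative names, not code from this series:

	static DEFINE_WW_CLASS(sketch_ww_class);

	/* Take two ww_mutexes, backing off on -EDEADLK. */
	static int lock_pair(struct ww_mutex *a, struct ww_mutex *b,
			     struct ww_acquire_ctx *ctx)
	{
		int ret;

		/* With ctx->acquired == 0, deadlock detection cannot fire. */
		ret = ww_mutex_lock(a, ctx);
		if (ret)
			return ret;

		ret = ww_mutex_lock(b, ctx);
		if (ret != -EDEADLK)
			return ret;

		/* Wounded: drop what we hold, then sleep on the contended lock. */
		ww_mutex_unlock(a);
		ww_mutex_lock_slow(b, ctx);	/* uninterruptible, cannot fail */

		/* Retake a with b held; a further -EDEADLK goes to the caller. */
		ret = ww_mutex_lock(a, ctx);
		if (ret)
			ww_mutex_unlock(b);
		return ret;
	}

A caller brackets this with ww_acquire_init(&ctx, &sketch_ww_class), calls ww_acquire_done(&ctx) once all locks are held, and ww_acquire_fini(&ctx) after unlocking. It is the wakeups and spinning on this -EDEADLK backoff path that the series trims.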
Median timings taken of a contention-heavy GPU workload: Before: real 0m52.946s user 0m7.272s sys 1m55.964s After: real 0m53.086s user 0m7.360s sys 1m46.204s This particular workload still spends 20%-25% of CPU in mutex_spin_on_owner according to perf, but my attempts to further reduce this spinning based on various heuristics all lead to an increase in measured wall time despite the decrease in sys time. Signed-off-by: Nicolai Hähnle Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Chris Wilson Cc: Daniel Vetter Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dri-devel@lists.freedesktop.org Link: http://lkml.kernel.org/r/1482346000-9927-11-git-send-email-nhaehnle@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 78 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 41b0406069e8..cd8598aa0426 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -371,6 +371,49 @@ ww_mutex_set_context_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) } #ifdef CONFIG_MUTEX_SPIN_ON_OWNER + +static inline +bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, + struct mutex_waiter *waiter) +{ + struct ww_mutex *ww; + + ww = container_of(lock, struct ww_mutex, base); + + /* + * If ww->ctx is set the contents are undefined, only + * by acquiring wait_lock there is a guarantee that + * they are not invalid when reading. + * + * As such, when deadlock detection needs to be + * performed the optimistic spinning cannot be done. + * + * Check this in every inner iteration because we may + * be racing against another thread's ww_mutex_lock. + */ + if (ww_ctx->acquired > 0 && READ_ONCE(ww->ctx)) + return false; + + /* + * If we aren't on the wait list yet, cancel the spin + * if there are waiters. We want to avoid stealing the + * lock from a waiter with an earlier stamp, since the + * other thread may already own a lock that we also + * need. + */ + if (!waiter && (atomic_long_read(&lock->owner) & MUTEX_FLAG_WAITERS)) + return false; + + /* + * Similarly, stop spinning if we are no longer the + * first waiter. + */ + if (waiter && !__mutex_waiter_is_first(lock, waiter)) + return false; + + return true; +} + /* * Look out! "owner" is an entirely speculative pointer access and not * reliable. @@ -379,7 +422,7 @@ ww_mutex_set_context_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) */ static noinline bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner, - struct ww_acquire_ctx *ww_ctx) + struct ww_acquire_ctx *ww_ctx, struct mutex_waiter *waiter) { bool ret = true; @@ -402,26 +445,9 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner, break; } - if (ww_ctx && ww_ctx->acquired > 0) { - struct ww_mutex *ww; - - ww = container_of(lock, struct ww_mutex, base); - - /* - * If ww->ctx is set the contents are undefined, only - * by acquiring wait_lock there is a guarantee that - * they are not invalid when reading. - * - * As such, when deadlock detection needs to be - * performed the optimistic spinning cannot be done. - * - * Check this in every inner iteration because we may - * be racing against another thread's ww_mutex_lock. 
- */ - if (READ_ONCE(ww->ctx)) { - ret = false; - break; - } + if (ww_ctx && !ww_mutex_spin_on_owner(lock, ww_ctx, waiter)) { + ret = false; + break; } cpu_relax(); @@ -484,7 +510,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) */ static __always_inline bool mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, - const bool use_ww_ctx, const bool waiter) + const bool use_ww_ctx, struct mutex_waiter *waiter) { if (!waiter) { /* @@ -518,7 +544,7 @@ mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, * There's an owner, wait for it to either * release the lock or go to sleep. */ - if (!mutex_spin_on_owner(lock, owner, ww_ctx)) + if (!mutex_spin_on_owner(lock, owner, ww_ctx, waiter)) goto fail_unlock; /* @@ -560,7 +586,7 @@ fail: #else static __always_inline bool mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, - const bool use_ww_ctx, const bool waiter) + const bool use_ww_ctx, struct mutex_waiter *waiter) { return false; } @@ -731,7 +757,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); if (__mutex_trylock(lock) || - mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, false)) { + mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, NULL)) { /* got the lock, yay! */ lock_acquired(&lock->dep_map, ip); if (use_ww_ctx && ww_ctx) @@ -820,7 +846,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * or we must see its unlock and acquire. */ if (__mutex_trylock(lock) || - (first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, true))) + (first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, &waiter))) break; spin_lock_mutex(&lock->wait_lock, flags); -- cgit v1.2.3 From 977625a693283dbb35b2a3f674bc0237f7347348 Mon Sep 17 00:00:00 2001 From: Nicolai Hähnle Date: Wed, 21 Dec 2016 19:46:39 +0100 Subject: locking/mutex: Initialize mutex_waiter::ww_ctx with poison when debugging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Help catch cases where mutex_lock is used directly on w/w mutexes, which otherwise result in the w/w tasks reading uninitialized data. Signed-off-by: Nicolai Hähnle Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Chris Wilson Cc: Daniel Vetter Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dri-devel@lists.freedesktop.org Link: http://lkml.kernel.org/r/1482346000-9927-12-git-send-email-nhaehnle@gmail.com Signed-off-by: Ingo Molnar --- include/linux/poison.h | 1 + kernel/locking/mutex.c | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/include/linux/poison.h b/include/linux/poison.h index 51334edec506..a39540326417 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -80,6 +80,7 @@ /********** kernel/mutexes **********/ #define MUTEX_DEBUG_INIT 0x11 #define MUTEX_DEBUG_FREE 0x22 +#define MUTEX_POISON_WW_CTX ((void *) 0x500 + POISON_POINTER_DELTA) /********** lib/flex_array.c **********/ #define FLEX_ARRAY_FREE 0x6c /* for use-after-free poisoning */ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index cd8598aa0426..935116723a3d 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -785,6 +785,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if (!use_ww_ctx) { /* add waiting tasks to the end of the waitqueue (FIFO): */ list_add_tail(&waiter.list, &lock->wait_list); + +#ifdef CONFIG_DEBUG_MUTEXES + waiter.ww_ctx = MUTEX_POISON_WW_CTX; +#endif } else { /* Add in stamp order, waking up waiters that must back off. */ ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx); -- cgit v1.2.3 From d8ac897137a230ec351269f6378017f2decca512 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Wed, 21 Sep 2016 14:38:10 +0100 Subject: sched/core: Add wrappers for lockdep_(un)pin_lock() In preparation for adding diagnostic checks to catch missing calls to update_rq_clock(), provide wrappers for (re)pinning and unpinning rq->lock. Because the pending diagnostic checks allow state to be maintained in rq_flags across pin contexts, swap the 'struct pin_cookie' arguments for 'struct rq_flags *'. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Byungchul Park Cc: Frederic Weisbecker Cc: Jan Kara Cc: Linus Torvalds Cc: Luca Abeni Cc: Mel Gorman Cc: Mike Galbraith Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Petr Mladek Cc: Rik van Riel Cc: Sergey Senozhatsky Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yuyang Du Link: http://lkml.kernel.org/r/20160921133813.31976-5-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 80 ++++++++++++++++++++++++------------------------ kernel/sched/deadline.c | 10 +++--- kernel/sched/fair.c | 6 ++-- kernel/sched/idle_task.c | 2 +- kernel/sched/rt.c | 6 ++-- kernel/sched/sched.h | 31 ++++++++++++++----- kernel/sched/stop_task.c | 2 +- 7 files changed, 76 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c56fb57f2991..41df935180ea 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -185,7 +185,7 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) rq = task_rq(p); raw_spin_lock(&rq->lock); if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { - rf->cookie = lockdep_pin_lock(&rq->lock); + rq_pin_lock(rq, rf); return rq; } raw_spin_unlock(&rq->lock); @@ -225,7 +225,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) * pair with the WMB to ensure we must then also see migrating. 
*/ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { - rf->cookie = lockdep_pin_lock(&rq->lock); + rq_pin_lock(rq, rf); return rq; } raw_spin_unlock(&rq->lock); @@ -1195,9 +1195,9 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, * OK, since we're going to drop the lock immediately * afterwards anyway. */ - lockdep_unpin_lock(&rq->lock, rf.cookie); + rq_unpin_lock(rq, &rf); rq = move_queued_task(rq, p, dest_cpu); - lockdep_repin_lock(&rq->lock, rf.cookie); + rq_repin_lock(rq, &rf); } out: task_rq_unlock(rq, p, &rf); @@ -1690,7 +1690,7 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl * Mark the task runnable and perform wakeup-preemption. */ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, - struct pin_cookie cookie) + struct rq_flags *rf) { check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; @@ -1702,9 +1702,9 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, * Our task @p is fully woken up and running; so its safe to * drop the rq->lock, hereafter rq is only used for statistics. */ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, rf); p->sched_class->task_woken(rq, p); - lockdep_repin_lock(&rq->lock, cookie); + rq_repin_lock(rq, rf); } if (rq->idle_stamp) { @@ -1723,7 +1723,7 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, static void ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, - struct pin_cookie cookie) + struct rq_flags *rf) { int en_flags = ENQUEUE_WAKEUP; @@ -1738,7 +1738,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, #endif ttwu_activate(rq, p, en_flags); - ttwu_do_wakeup(rq, p, wake_flags, cookie); + ttwu_do_wakeup(rq, p, wake_flags, rf); } /* @@ -1757,7 +1757,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) if (task_on_rq_queued(p)) { /* check_preempt_curr() may use rq clock */ update_rq_clock(rq); - ttwu_do_wakeup(rq, p, wake_flags, rf.cookie); + ttwu_do_wakeup(rq, p, wake_flags, &rf); ret = 1; } __task_rq_unlock(rq, &rf); @@ -1770,15 +1770,15 @@ void sched_ttwu_pending(void) { struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); - struct pin_cookie cookie; struct task_struct *p; unsigned long flags; + struct rq_flags rf; if (!llist) return; raw_spin_lock_irqsave(&rq->lock, flags); - cookie = lockdep_pin_lock(&rq->lock); + rq_pin_lock(rq, &rf); while (llist) { int wake_flags = 0; @@ -1789,10 +1789,10 @@ void sched_ttwu_pending(void) if (p->sched_remote_wakeup) wake_flags = WF_MIGRATED; - ttwu_do_activate(rq, p, wake_flags, cookie); + ttwu_do_activate(rq, p, wake_flags, &rf); } - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, &rf); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -1881,7 +1881,7 @@ bool cpus_share_cache(int this_cpu, int that_cpu) static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); - struct pin_cookie cookie; + struct rq_flags rf; #if defined(CONFIG_SMP) if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { @@ -1892,9 +1892,9 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) #endif raw_spin_lock(&rq->lock); - cookie = lockdep_pin_lock(&rq->lock); - ttwu_do_activate(rq, p, wake_flags, cookie); - lockdep_unpin_lock(&rq->lock, cookie); + rq_pin_lock(rq, &rf); + ttwu_do_activate(rq, p, wake_flags, &rf); + rq_unpin_lock(rq, &rf); raw_spin_unlock(&rq->lock); } @@ -2111,7 +2111,7 
@@ out: * ensure that this_rq() is locked, @p is bound to this_rq() and not * the current task. */ -static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie) +static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) { struct rq *rq = task_rq(p); @@ -2128,11 +2128,11 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie * disabled avoiding further scheduler activity on it and we've * not yet picked a replacement task. */ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, rf); raw_spin_unlock(&rq->lock); raw_spin_lock(&p->pi_lock); raw_spin_lock(&rq->lock); - lockdep_repin_lock(&rq->lock, cookie); + rq_repin_lock(rq, rf); } if (!(p->state & TASK_NORMAL)) @@ -2143,7 +2143,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie if (!task_on_rq_queued(p)) ttwu_activate(rq, p, ENQUEUE_WAKEUP); - ttwu_do_wakeup(rq, p, 0, cookie); + ttwu_do_wakeup(rq, p, 0, rf); ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); @@ -2590,9 +2590,9 @@ void wake_up_new_task(struct task_struct *p) * Nothing relies on rq->lock after this, so its fine to * drop it. */ - lockdep_unpin_lock(&rq->lock, rf.cookie); + rq_unpin_lock(rq, &rf); p->sched_class->task_woken(rq, p); - lockdep_repin_lock(&rq->lock, rf.cookie); + rq_repin_lock(rq, &rf); } #endif task_rq_unlock(rq, p, &rf); @@ -2861,7 +2861,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) */ static __always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next, struct pin_cookie cookie) + struct task_struct *next, struct rq_flags *rf) { struct mm_struct *mm, *oldmm; @@ -2893,7 +2893,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * of the scheduler it's an obvious special-case), so we * do an early lockdep release here: */ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, rf); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); /* Here we just switch the register state and the stack. 
*/ @@ -3257,7 +3257,7 @@ static inline void schedule_debug(struct task_struct *prev) * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { const struct sched_class *class = &fair_sched_class; struct task_struct *p; @@ -3268,20 +3268,20 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie */ if (likely(prev->sched_class == class && rq->nr_running == rq->cfs.h_nr_running)) { - p = fair_sched_class.pick_next_task(rq, prev, cookie); + p = fair_sched_class.pick_next_task(rq, prev, rf); if (unlikely(p == RETRY_TASK)) goto again; /* assumes fair_sched_class->next == idle_sched_class */ if (unlikely(!p)) - p = idle_sched_class.pick_next_task(rq, prev, cookie); + p = idle_sched_class.pick_next_task(rq, prev, rf); return p; } again: for_each_class(class) { - p = class->pick_next_task(rq, prev, cookie); + p = class->pick_next_task(rq, prev, rf); if (p) { if (unlikely(p == RETRY_TASK)) goto again; @@ -3335,7 +3335,7 @@ static void __sched notrace __schedule(bool preempt) { struct task_struct *prev, *next; unsigned long *switch_count; - struct pin_cookie cookie; + struct rq_flags rf; struct rq *rq; int cpu; @@ -3358,7 +3358,7 @@ static void __sched notrace __schedule(bool preempt) */ smp_mb__before_spinlock(); raw_spin_lock(&rq->lock); - cookie = lockdep_pin_lock(&rq->lock); + rq_pin_lock(rq, &rf); rq->clock_skip_update <<= 1; /* promote REQ to ACT */ @@ -3380,7 +3380,7 @@ static void __sched notrace __schedule(bool preempt) to_wakeup = wq_worker_sleeping(prev); if (to_wakeup) - try_to_wake_up_local(to_wakeup, cookie); + try_to_wake_up_local(to_wakeup, &rf); } } switch_count = &prev->nvcsw; @@ -3389,7 +3389,7 @@ static void __sched notrace __schedule(bool preempt) if (task_on_rq_queued(prev)) update_rq_clock(rq); - next = pick_next_task(rq, prev, cookie); + next = pick_next_task(rq, prev, &rf); clear_tsk_need_resched(prev); clear_preempt_need_resched(); rq->clock_skip_update = 0; @@ -3400,9 +3400,9 @@ static void __sched notrace __schedule(bool preempt) ++*switch_count; trace_sched_switch(preempt, prev, next); - rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */ + rq = context_switch(rq, prev, next, &rf); /* unlocks the rq */ } else { - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, &rf); raw_spin_unlock_irq(&rq->lock); } @@ -5521,7 +5521,7 @@ static void migrate_tasks(struct rq *dead_rq) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; - struct pin_cookie cookie; + struct rq_flags rf; int dest_cpu; /* @@ -5553,8 +5553,8 @@ static void migrate_tasks(struct rq *dead_rq) /* * pick_next_task assumes pinned rq->lock. */ - cookie = lockdep_pin_lock(&rq->lock); - next = pick_next_task(rq, &fake_task, cookie); + rq_pin_lock(rq, &rf); + next = pick_next_task(rq, &fake_task, &rf); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); @@ -5567,7 +5567,7 @@ static void migrate_tasks(struct rq *dead_rq) * because !cpu_active at this point, which means load-balance * will not interfere. Also, stop-machine. 
*/ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, &rf); raw_spin_unlock(&rq->lock); raw_spin_lock(&next->pi_lock); raw_spin_lock(&rq->lock); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 70ef2b1901e4..491ff663e1b6 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -663,9 +663,9 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * Nothing relies on rq->lock after this, so its safe to drop * rq->lock. */ - lockdep_unpin_lock(&rq->lock, rf.cookie); + rq_unpin_lock(rq, &rf); push_dl_task(rq); - lockdep_repin_lock(&rq->lock, rf.cookie); + rq_repin_lock(rq, &rf); } #endif @@ -1118,7 +1118,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, } struct task_struct * -pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct sched_dl_entity *dl_se; struct task_struct *p; @@ -1133,9 +1133,9 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie coo * disabled avoiding further scheduler activity on it and we're * being very careful to re-start the picking loop. */ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, rf); pull_dl_task(rq); - lockdep_repin_lock(&rq->lock, cookie); + rq_repin_lock(rq, rf); /* * pull_dl_task() can drop (and re-acquire) rq->lock; this * means a stop task can slip in, in which case we need to diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6559d197e08a..490441255c56 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6213,7 +6213,7 @@ preempt: } static struct task_struct * -pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; @@ -6326,9 +6326,9 @@ idle: * further scheduler activity on it and we're being very careful to * re-start the picking loop. */ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, rf); new_tasks = idle_balance(rq); - lockdep_repin_lock(&rq->lock, cookie); + rq_repin_lock(rq, rf); /* * Because idle_balance() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 5405d3feb112..0c00172db63e 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -24,7 +24,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl } static struct task_struct * -pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { put_prev_task(rq, prev); update_idle_core(rq); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2516b8df6dbb..88254be118b0 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1523,7 +1523,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) } static struct task_struct * -pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct task_struct *p; struct rt_rq *rt_rq = &rq->rt; @@ -1535,9 +1535,9 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie coo * disabled avoiding further scheduler activity on it and we're * being very careful to re-start the picking loop. 
*/ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, rf); pull_rt_task(rq); - lockdep_repin_lock(&rq->lock, cookie); + rq_repin_lock(rq, rf); /* * pull_rt_task() can drop (and re-acquire) rq->lock; this * means a dl or stop task can slip in, in which case we need diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7b34c7826ca5..98e7eee07237 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -792,6 +792,26 @@ static inline void rq_clock_skip_update(struct rq *rq, bool skip) rq->clock_skip_update &= ~RQCF_REQ_SKIP; } +struct rq_flags { + unsigned long flags; + struct pin_cookie cookie; +}; + +static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) +{ + rf->cookie = lockdep_pin_lock(&rq->lock); +} + +static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) +{ + lockdep_unpin_lock(&rq->lock, rf->cookie); +} + +static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) +{ + lockdep_repin_lock(&rq->lock, rf->cookie); +} + #ifdef CONFIG_NUMA enum numa_topology_type { NUMA_DIRECT, @@ -1245,7 +1265,7 @@ struct sched_class { */ struct task_struct * (*pick_next_task) (struct rq *rq, struct task_struct *prev, - struct pin_cookie cookie); + struct rq_flags *rf); void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP @@ -1501,11 +1521,6 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } static inline void sched_avg_update(struct rq *rq) { } #endif -struct rq_flags { - unsigned long flags; - struct pin_cookie cookie; -}; - struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(rq->lock); struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) @@ -1515,7 +1530,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { - lockdep_unpin_lock(&rq->lock, rf->cookie); + rq_unpin_lock(rq, rf); raw_spin_unlock(&rq->lock); } @@ -1524,7 +1539,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) __releases(rq->lock) __releases(p->pi_lock) { - lockdep_unpin_lock(&rq->lock, rf->cookie); + rq_unpin_lock(rq, rf); raw_spin_unlock(&rq->lock); raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); } diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 604297a08b3a..9f69fb630853 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -24,7 +24,7 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) } static struct task_struct * -pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct task_struct *stop = rq->stop; -- cgit v1.2.3 From 92509b732baf14c59ca702307270cfaa3a585ae7 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Wed, 21 Sep 2016 14:38:11 +0100 Subject: sched/core: Reset RQCF_ACT_SKIP before unpinning rq->lock rq_clock() is called from sched_info_{depart,arrive}() after resetting RQCF_ACT_SKIP but prior to a call to update_rq_clock(). In preparation for pending patches that check whether the rq clock has been updated inside of a pin context before rq_clock() is called, move the reset of rq->clock_skip_update immediately before unpinning the rq lock. This will avoid the new warnings which check if update_rq_clock() is being actively skipped. 
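In sketch form, the drop-and-retake convention that the rq_*pin_lock() wrappers (added two patches up) establish for code that must release rq->lock mid-operation, mirroring try_to_wake_up_local(); sketch_drop_and_retake() is a hypothetical name:

	/* Caller holds rq->lock, pinned into *rf by rq_pin_lock(). */
	static void sketch_drop_and_retake(struct rq *rq, struct task_struct *p,
					   struct rq_flags *rf)
	{
		lockdep_assert_held(&rq->lock);

		rq_unpin_lock(rq, rf);		/* stash pin state in *rf */
		raw_spin_unlock(&rq->lock);

		/* p->pi_lock nests outside rq->lock, so retake in order. */
		raw_spin_lock(&p->pi_lock);
		raw_spin_lock(&rq->lock);
		rq_repin_lock(rq, rf);		/* restore the lockdep cookie */

		/* ... caller eventually drops p->pi_lock and rq->lock ... */
	}

Because the pin state (and, once the debugging patch later in this series lands, the clock-update state) rides in struct rq_flags, the retaken lock is correctly re-pinned without touching lockdep internals at each call site.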
Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Byungchul Park Cc: Frederic Weisbecker Cc: Jan Kara Cc: Linus Torvalds Cc: Luca Abeni Cc: Mel Gorman Cc: Mike Galbraith Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Petr Mladek Cc: Rik van Riel Cc: Sergey Senozhatsky Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yuyang Du Link: http://lkml.kernel.org/r/20160921133813.31976-6-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 41df935180ea..311460b46d68 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2887,6 +2887,9 @@ context_switch(struct rq *rq, struct task_struct *prev, prev->active_mm = NULL; rq->prev_mm = oldmm; } + + rq->clock_skip_update = 0; + /* * Since the runqueue lock will be released by the next * task (which is an invalid locking op but in the case @@ -3392,7 +3395,6 @@ static void __sched notrace __schedule(bool preempt) next = pick_next_task(rq, prev, &rf); clear_tsk_need_resched(prev); clear_preempt_need_resched(); - rq->clock_skip_update = 0; if (likely(prev != next)) { rq->nr_switches++; @@ -3402,6 +3404,7 @@ static void __sched notrace __schedule(bool preempt) trace_sched_switch(preempt, prev, next); rq = context_switch(rq, prev, next, &rf); /* unlocks the rq */ } else { + rq->clock_skip_update = 0; rq_unpin_lock(rq, &rf); raw_spin_unlock_irq(&rq->lock); } -- cgit v1.2.3 From 46f69fa33712ad12ccaa723e46ed5929ee93589b Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Wed, 21 Sep 2016 14:38:12 +0100 Subject: sched/fair: Push rq lock pin/unpin into idle_balance() Future patches will emit warnings if rq_clock() is called before update_rq_clock() inside a rq_pin_lock()/rq_unpin_lock() pair. Since there is only one caller of idle_balance() we can push the unpin/repin there. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Byungchul Park Cc: Frederic Weisbecker Cc: Jan Kara Cc: Linus Torvalds Cc: Luca Abeni Cc: Mel Gorman Cc: Mike Galbraith Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Petr Mladek Cc: Rik van Riel Cc: Sergey Senozhatsky Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yuyang Du Link: http://lkml.kernel.org/r/20160921133813.31976-7-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 490441255c56..faf80e10d662 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3424,7 +3424,7 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return cfs_rq->avg.load_avg; } -static int idle_balance(struct rq *this_rq); +static int idle_balance(struct rq *this_rq, struct rq_flags *rf); #else /* CONFIG_SMP */ @@ -3453,7 +3453,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline int idle_balance(struct rq *rq) +static inline int idle_balance(struct rq *rq, struct rq_flags *rf) { return 0; } @@ -6320,15 +6320,8 @@ simple: return p; idle: - /* - * This is OK, because current is on_cpu, which avoids it being picked - * for load-balance and preemption/IRQs are still disabled avoiding - * further scheduler activity on it and we're being very careful to - * re-start the picking loop. 
- */ - rq_unpin_lock(rq, rf); - new_tasks = idle_balance(rq); - rq_repin_lock(rq, rf); + new_tasks = idle_balance(rq, rf); + /* * Because idle_balance() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we @@ -8297,7 +8290,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance) * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ -static int idle_balance(struct rq *this_rq) +static int idle_balance(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; @@ -8311,6 +8304,14 @@ static int idle_balance(struct rq *this_rq) */ this_rq->idle_stamp = rq_clock(this_rq); + /* + * This is OK, because current is on_cpu, which avoids it being picked + * for load-balance and preemption/IRQs are still disabled avoiding + * further scheduler activity on it and we're being very careful to + * re-start the picking loop. + */ + rq_unpin_lock(this_rq, rf); + if (this_rq->avg_idle < sysctl_sched_migration_cost || !this_rq->rd->overload) { rcu_read_lock(); @@ -8388,6 +8389,8 @@ out: if (pulled_task) this_rq->idle_stamp = 0; + rq_repin_lock(this_rq, rf); + return pulled_task; } -- cgit v1.2.3 From 4126bad6717336abe5d666440ae15555563ca53f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Oct 2016 16:20:59 +0200 Subject: sched/core: Add missing update_rq_clock() in post_init_entity_util_avg() Address this rq-clock update bug: WARNING: CPU: 0 PID: 0 at ../kernel/sched/sched.h:797 post_init_entity_util_avg() rq->clock_update_flags < RQCF_ACT_SKIP Call Trace: __warn() post_init_entity_util_avg() wake_up_new_task() _do_fork() kernel_thread() rest_init() start_kernel() Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + kernel/sched/fair.c | 1 + 2 files changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 311460b46d68..9217c3221b0d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2578,6 +2578,7 @@ void wake_up_new_task(struct task_struct *p) __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p, &rf); + update_rq_clock(rq); post_init_entity_util_avg(&p->se); activate_task(rq, p, 0); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index faf80e10d662..972b67622922 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9267,6 +9267,7 @@ void online_fair_sched_group(struct task_group *tg) se = tg->se[i]; raw_spin_lock_irq(&rq->lock); + update_rq_clock(rq); attach_entity_cfs_rq(se); sync_throttle(tg, i); raw_spin_unlock_irq(&rq->lock); -- cgit v1.2.3 From 80f5c1b84baa8180c3c27b7e227429712cd967b6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Oct 2016 16:28:37 +0200 Subject: sched/core: Add missing update_rq_clock() in detach_task_cfs_rq() Instead of adding the update_rq_clock() all the way at the bottom of the callstack, add one at the top, this to aid later effort to minimize update_rq_clock() calls.
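The fix pattern used here and in the next few patches, in sketch form (mirroring the hunks; sketch_adjust_task() is a hypothetical name), is to pair taking rq->lock with one clock update at the top of the call chain:

	static void sketch_adjust_task(struct task_struct *p)
	{
		struct rq_flags rf;
		struct rq *rq;

		rq = task_rq_lock(p, &rf);
		update_rq_clock(rq);	/* once, up front */

		/* ... callees may now read rq_clock()/rq_clock_task() ... */

		task_rq_unlock(rq, p, &rf);
	}

The warning trace below shows the kind of splat this addresses: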
WARNING: CPU: 0 PID: 1 at ../kernel/sched/sched.h:797 detach_task_cfs_rq() rq->clock_update_flags < RQCF_ACT_SKIP Call Trace: dump_stack() __warn() warn_slowpath_fmt() detach_task_cfs_rq() switched_from_fair() __sched_setscheduler() _sched_setscheduler() sched_set_stop_task() cpu_stop_create() __smpboot_create_thread.part.2() smpboot_register_percpu_thread_cpumask() cpu_stop_init() do_one_initcall() ? print_cpu_info() kernel_init_freeable() ? rest_init() kernel_init() ret_from_fork() Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9217c3221b0d..b7f7f215b51b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3655,6 +3655,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) BUG_ON(prio > MAX_PRIO); rq = __task_rq_lock(p, &rf); + update_rq_clock(rq); /* * Idle task boosting is a nono in general. There is one @@ -4183,6 +4184,7 @@ recheck: * runqueue lock must be held. */ rq = task_rq_lock(p, &rf); + update_rq_clock(rq); /* * Changing the policy of the stop threads its a very bad idea @@ -8435,6 +8437,7 @@ static void cpu_cgroup_fork(struct task_struct *task) rq = task_rq_lock(task, &rf); + update_rq_clock(rq); sched_change_group(task, TASK_SET_GROUP); task_rq_unlock(rq, task, &rf); -- cgit v1.2.3 From 3bed5e2166a5e433bf62162f3cd3c5174d335934 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Oct 2016 16:35:32 +0200 Subject: sched/core: Add missing update_rq_clock() call for task_hot() Add the update_rq_clock() call at the top of the callstack instead of at the bottom where we find it missing, this to aid later effort to minimize the number of update_rq_clock() calls. WARNING: CPU: 30 PID: 194 at ../kernel/sched/sched.h:797 assert_clock_updated() rq->clock_update_flags < RQCF_ACT_SKIP Call Trace: dump_stack() __warn() warn_slowpath_fmt() assert_clock_updated.isra.63.part.64() can_migrate_task() load_balance() pick_next_task_fair() __schedule() schedule() worker_thread() kthread() Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 972b67622922..b3bfe3fb4e13 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8070,6 +8070,7 @@ redo: more_balance: raw_spin_lock_irqsave(&busiest->lock, flags); + update_rq_clock(busiest); /* * cur_ld_moved - load moved in current iteration @@ -8446,6 +8447,7 @@ static int active_load_balance_cpu_stop(void *data) }; schedstat_inc(sd->alb_count); + update_rq_clock(busiest_rq); p = detach_one_task(&env); if (p) { -- cgit v1.2.3 From 2fb8d36787affe26f3536c3d8ec094995a48037d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Oct 2016 16:44:25 +0200 Subject: sched/core: Add missing update_rq_clock() call in set_user_nice() Address this rq-clock update bug: WARNING: CPU: 30 PID: 195 at ../kernel/sched/sched.h:797 set_next_entity() rq->clock_update_flags < RQCF_ACT_SKIP Call Trace: dump_stack() __warn() warn_slowpath_fmt() set_next_entity() ? 
_raw_spin_lock() set_curr_task_fair() set_user_nice.part.85() set_user_nice() create_worker() worker_thread() kthread() ret_from_fork() Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b7f7f215b51b..d2338927773a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3752,6 +3752,8 @@ void set_user_nice(struct task_struct *p, long nice) * the task might be in the middle of scheduling on another CPU. */ rq = task_rq_lock(p, &rf); + update_rq_clock(rq); + /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected -- cgit v1.2.3 From cb42c9a3ebbbb23448c3f9a25417fae6309b1a92 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Wed, 21 Sep 2016 14:38:13 +0100 Subject: sched/core: Add debugging code to catch missing update_rq_clock() calls There are no diagnostic checks for figuring out when we've accidentally missed update_rq_clock() calls. Let's add some by piggybacking on the rq_*pin_lock() wrappers. The idea behind the diagnostic checks is that upon pinning the rq lock the rq clock should be updated, via update_rq_clock(), before anybody reads the clock with rq_clock() or rq_clock_task(). The exception to this rule is when updates have explicitly been disabled with the rq_clock_skip_update() optimisation. There are some functions that only unpin the rq lock in order to grab some other lock and avoid deadlock. In that case we don't need to update the clock again and the previous diagnostic state can be carried over in rq_repin_lock() by saving the state in the rq_flags context. Since this patch adds a new clock update flag and some already exist in rq::clock_skip_update, that field has now been renamed. An attempt has been made to keep the flag manipulation code small and fast since it's used in the heart of the __schedule() fast path.
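To make the flag manipulation concrete, here is a stand-alone model (a sketch; the values match the sched.h hunk below, but the helper names do not exist in the tree):

	#define RQCF_REQ_SKIP	0x01	/* skip requested for next __schedule() */
	#define RQCF_ACT_SKIP	0x02	/* skip currently in effect */
	#define RQCF_UPDATED	0x04	/* debug: clock updated since pinning */

	/* __schedule() promotes a requested skip with a single shift. */
	static unsigned int model_promote(unsigned int flags)
	{
		return flags << 1;	/* 0x01 -> 0x02; rq_pin_lock() masks strays */
	}

	/* Model of assert_clock_updated(): anything below ACT_SKIP is a bug. */
	static bool model_clock_ok(unsigned int flags)
	{
		return flags >= RQCF_ACT_SKIP;
	}

The numeric ordering REQ_SKIP < ACT_SKIP < UPDATED is what makes the single unsigned comparison in the assert sufficient.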
For the !CONFIG_SCHED_DEBUG case the only object code change (other than addresses) is the following change to reset RQCF_ACT_SKIP inside of __schedule(), - c7 83 38 09 00 00 00 movl $0x0,0x938(%rbx) - 00 00 00 + 83 a3 38 09 00 00 fc andl $0xfffffffc,0x938(%rbx) Suggested-by: Peter Zijlstra Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Byungchul Park Cc: Frederic Weisbecker Cc: Jan Kara Cc: Linus Torvalds Cc: Luca Abeni Cc: Mel Gorman Cc: Mike Galbraith Cc: Mike Galbraith Cc: Petr Mladek Cc: Rik van Riel Cc: Sergey Senozhatsky Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yuyang Du Link: http://lkml.kernel.org/r/20160921133813.31976-8-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 11 +++++--- kernel/sched/sched.h | 74 +++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 75 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d2338927773a..a129b34b8206 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -102,9 +102,12 @@ void update_rq_clock(struct rq *rq) lockdep_assert_held(&rq->lock); - if (rq->clock_skip_update & RQCF_ACT_SKIP) + if (rq->clock_update_flags & RQCF_ACT_SKIP) return; +#ifdef CONFIG_SCHED_DEBUG + rq->clock_update_flags |= RQCF_UPDATED; +#endif delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; if (delta < 0) return; @@ -2889,7 +2892,7 @@ context_switch(struct rq *rq, struct task_struct *prev, rq->prev_mm = oldmm; } - rq->clock_skip_update = 0; + rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); /* * Since the runqueue lock will be released by the next @@ -3364,7 +3367,7 @@ static void __sched notrace __schedule(bool preempt) raw_spin_lock(&rq->lock); rq_pin_lock(rq, &rf); - rq->clock_skip_update <<= 1; /* promote REQ to ACT */ + rq->clock_update_flags <<= 1; /* promote REQ to ACT */ switch_count = &prev->nivcsw; if (!preempt && prev->state) { @@ -3405,7 +3408,7 @@ static void __sched notrace __schedule(bool preempt) trace_sched_switch(preempt, prev, next); rq = context_switch(rq, prev, next, &rf); /* unlocks the rq */ } else { - rq->clock_skip_update = 0; + rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); rq_unpin_lock(rq, &rf); raw_spin_unlock_irq(&rq->lock); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 98e7eee07237..6eeae7ebd99b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -644,7 +644,7 @@ struct rq { unsigned long next_balance; struct mm_struct *prev_mm; - unsigned int clock_skip_update; + unsigned int clock_update_flags; u64 clock; u64 clock_task; @@ -768,48 +768,110 @@ static inline u64 __rq_clock_broken(struct rq *rq) return READ_ONCE(rq->clock); } +/* + * rq::clock_update_flags bits + * + * %RQCF_REQ_SKIP - will request skipping of clock update on the next + * call to __schedule(). This is an optimisation to avoid + * neighbouring rq clock updates. + * + * %RQCF_ACT_SKIP - is set from inside of __schedule() when skipping is + * in effect and calls to update_rq_clock() are being ignored. + * + * %RQCF_UPDATED - is a debug flag that indicates whether a call has been + * made to update_rq_clock() since the last time rq::lock was pinned. + * + * If inside of __schedule(), clock_update_flags will have been + * shifted left (a left shift is a cheap operation for the fast path + * to promote %RQCF_REQ_SKIP to %RQCF_ACT_SKIP), so you must use, + * + * if (rq->clock_update_flags >= RQCF_UPDATED) + * + * to check if %RQCF_UPDATED is set. 
It'll never be shifted more than + * one position though, because the next rq_unpin_lock() will shift it + * back. + */ +#define RQCF_REQ_SKIP 0x01 +#define RQCF_ACT_SKIP 0x02 +#define RQCF_UPDATED 0x04 + +static inline void assert_clock_updated(struct rq *rq) +{ + /* + * The only reason for not seeing a clock update since the + * last rq_pin_lock() is if we're currently skipping updates. + */ + SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP); +} + static inline u64 rq_clock(struct rq *rq) { lockdep_assert_held(&rq->lock); + assert_clock_updated(rq); + return rq->clock; } static inline u64 rq_clock_task(struct rq *rq) { lockdep_assert_held(&rq->lock); + assert_clock_updated(rq); + return rq->clock_task; } -#define RQCF_REQ_SKIP 0x01 -#define RQCF_ACT_SKIP 0x02 - static inline void rq_clock_skip_update(struct rq *rq, bool skip) { lockdep_assert_held(&rq->lock); if (skip) - rq->clock_skip_update |= RQCF_REQ_SKIP; + rq->clock_update_flags |= RQCF_REQ_SKIP; else - rq->clock_skip_update &= ~RQCF_REQ_SKIP; + rq->clock_update_flags &= ~RQCF_REQ_SKIP; } struct rq_flags { unsigned long flags; struct pin_cookie cookie; +#ifdef CONFIG_SCHED_DEBUG + /* + * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the + * current pin context is stashed here in case it needs to be + * restored in rq_repin_lock(). + */ + unsigned int clock_update_flags; +#endif }; static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) { rf->cookie = lockdep_pin_lock(&rq->lock); + +#ifdef CONFIG_SCHED_DEBUG + rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); + rf->clock_update_flags = 0; +#endif } static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) { +#ifdef CONFIG_SCHED_DEBUG + if (rq->clock_update_flags > RQCF_ACT_SKIP) + rf->clock_update_flags = RQCF_UPDATED; +#endif + lockdep_unpin_lock(&rq->lock, rf->cookie); } static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) { lockdep_repin_lock(&rq->lock, rf->cookie); + +#ifdef CONFIG_SCHED_DEBUG + /* + * Restore the value we stashed in @rf for this pin context. + */ + rq->clock_update_flags |= rf->clock_update_flags; +#endif } #ifdef CONFIG_NUMA -- cgit v1.2.3 From 12907fbb1a691807bb0420a27126e15934cb7954 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Dec 2016 11:44:28 +0100 Subject: sched/clock, clocksource: Add optional cs::mark_unstable() method PeterZ reported that we'd fail to mark the TSC unstable when the clocksource watchdog finds it unsuitable. Allow a clocksource to run a custom action when it's being marked unstable and hook up the TSC unstable code. Reported-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 11 +++++++++++ include/linux/clocksource.h | 3 +++ kernel/time/clocksource.c | 4 ++++ 3 files changed, 18 insertions(+) (limited to 'kernel') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index be3a49ee0356..c8174c815d83 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1106,6 +1106,16 @@ static u64 read_tsc(struct clocksource *cs) return (u64)rdtsc_ordered(); } +static void tsc_cs_mark_unstable(struct clocksource *cs) +{ + if (tsc_unstable) + return; + tsc_unstable = 1; + clear_sched_clock_stable(); + disable_sched_clock_irqtime(); + pr_info("Marking TSC unstable due to clocksource watchdog\n"); +} + /* * .mask MUST be CLOCKSOURCE_MASK(64).
See comment above read_tsc() */ @@ -1118,6 +1128,7 @@ static struct clocksource clocksource_tsc = { CLOCK_SOURCE_MUST_VERIFY, .archdata = { .vclock_mode = VCLOCK_TSC }, .resume = tsc_resume, + .mark_unstable = tsc_cs_mark_unstable, }; void mark_tsc_unstable(char *reason) diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index e315d04a2fd9..cfc75848a35d 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -62,6 +62,8 @@ struct module; * @archdata: arch-specific data * @suspend: suspend function for the clocksource, if necessary * @resume: resume function for the clocksource, if necessary + * @mark_unstable: Optional function to inform the clocksource driver that + * the watchdog marked the clocksource unstable * @owner: module reference, must be set by clocksource in modules * * Note: This struct is not used in hotpathes of the timekeeping code @@ -93,6 +95,7 @@ struct clocksource { unsigned long flags; void (*suspend)(struct clocksource *cs); void (*resume)(struct clocksource *cs); + void (*mark_unstable)(struct clocksource *cs); /* private: */ #ifdef CONFIG_CLOCKSOURCE_WATCHDOG diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 665985b0a89a..93621ae718d3 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -141,6 +141,10 @@ static void __clocksource_unstable(struct clocksource *cs) { cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); cs->flags |= CLOCK_SOURCE_UNSTABLE; + + if (cs->mark_unstable) + cs->mark_unstable(cs); + if (finished_booting) schedule_work(&watchdog_work); } -- cgit v1.2.3 From 555570d744f8150d3fce6083f144026cd1e63627 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Dec 2016 13:21:58 +0100 Subject: sched/clock: Update static_key usage sched_clock was still using the deprecated static_key interface. 
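[Ed. note: for readers unfamiliar with the two interfaces, a hedged side-by-side sketch; the key name is illustrative. Note the patch below also turns sched_clock_stable() from an unlikely into a likely branch.]

	/* Deprecated interface, removed below: */
	static struct static_key key = STATIC_KEY_INIT;

	if (static_key_false(&key))		/* out-of-line when false */
		rare_path();
	static_key_slow_inc(&key);		/* reference-counted enable */
	static_key_slow_dec(&key);

	/* Current interface, used below: */
	static DEFINE_STATIC_KEY_FALSE(key);

	if (static_branch_likely(&key))		/* inline fast path when true */
		common_path();
	static_branch_enable(&key);		/* plain on/off, no refcount */
	static_branch_disable(&key);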
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/clock.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e85a725e5c34..5d6dd38b449c 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -78,19 +78,17 @@ EXPORT_SYMBOL_GPL(sched_clock); __read_mostly int sched_clock_running; #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -static struct static_key __sched_clock_stable = STATIC_KEY_INIT; +static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable); static int __sched_clock_stable_early; int sched_clock_stable(void) { - return static_key_false(&__sched_clock_stable); + return static_branch_likely(&__sched_clock_stable); } static void __set_sched_clock_stable(void) { - if (!sched_clock_stable()) - static_key_slow_inc(&__sched_clock_stable); - + static_branch_enable(&__sched_clock_stable); tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); } @@ -109,9 +107,7 @@ void set_sched_clock_stable(void) static void __clear_sched_clock_stable(struct work_struct *work) { /* XXX worry about clock continuity */ - if (sched_clock_stable()) - static_key_slow_dec(&__sched_clock_stable); - + static_branch_disable(&__sched_clock_stable); tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); } -- cgit v1.2.3 From 9881b024b7d7671f6a014091bc96506b89081802 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Dec 2016 13:35:52 +0100 Subject: sched/clock: Delay switching sched_clock to stable Currently we switch to the stable sched_clock if we guess the TSC is usable, and then switch back to the unstable path if it turns out TSC isn't stable during SMP bringup after all. Delay switching to the stable path until after SMP bringup is complete. This way we'll avoid switching during the time we detect the worst of the TSC offences. 
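[Ed. note: roughly, the bring-up ordering that results from the diff below; sched_clock_running becomes a tri-state: 0 = not yet running, 1 = early init done, 2 = late init done (post SMP bringup).]

	start_kernel()
	    sched_init()
	        sched_clock_init();		/* sched_clock_running = 1 */
	    ...
	kernel_init()				/* after SMP bringup */
	    sched_init_smp()
	        sched_clock_init_late();	/* sched_clock_running = 2; an early
						 * set_sched_clock_stable() guess is
						 * only committed from here on */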
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 +++++ init/main.c | 1 - kernel/sched/clock.c | 50 ++++++++++++++++++++++---------------------------- kernel/sched/core.c | 4 ++++ 4 files changed, 31 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index ad3ec9ec61f7..94a48bb58297 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2515,6 +2515,10 @@ extern u64 sched_clock_cpu(int cpu); extern void sched_clock_init(void); #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline void sched_clock_init_late(void) +{ +} + static inline void sched_clock_tick(void) { } @@ -2537,6 +2541,7 @@ static inline u64 local_clock(void) return sched_clock(); } #else +extern void sched_clock_init_late(void); /* * Architectures can set this to 1 if they have specified * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig, diff --git a/init/main.c b/init/main.c index b0c9d6facef9..19228149386c 100644 --- a/init/main.c +++ b/init/main.c @@ -625,7 +625,6 @@ asmlinkage __visible void __init start_kernel(void) numa_policy_init(); if (late_time_init) late_time_init(); - sched_clock_init(); calibrate_delay(); pidmap_init(); anon_vma_init(); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 5d6dd38b449c..b3466d4e0cc2 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -77,6 +77,11 @@ EXPORT_SYMBOL_GPL(sched_clock); __read_mostly int sched_clock_running; +void sched_clock_init(void) +{ + sched_clock_running = 1; +} + #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable); static int __sched_clock_stable_early; @@ -96,12 +101,18 @@ void set_sched_clock_stable(void) { __sched_clock_stable_early = 1; - smp_mb(); /* matches sched_clock_init() */ - - if (!sched_clock_running) - return; + smp_mb(); /* matches sched_clock_init_late() */ - __set_sched_clock_stable(); + /* + * This really should only be called early (before + * sched_clock_init_late()) when guestimating our sched_clock() is + * solid. + * + * After that we test stability and we can negate our guess using + * clear_sched_clock_stable, possibly from a watchdog. + */ + if (WARN_ON_ONCE(sched_clock_running == 2)) + __set_sched_clock_stable(); } static void __clear_sched_clock_stable(struct work_struct *work) @@ -117,12 +128,10 @@ void clear_sched_clock_stable(void) { __sched_clock_stable_early = 0; - smp_mb(); /* matches sched_clock_init() */ - - if (!sched_clock_running) - return; + smp_mb(); /* matches sched_clock_init_late() */ - schedule_work(&sched_clock_work); + if (sched_clock_running == 2) + schedule_work(&sched_clock_work); } struct sched_clock_data { @@ -143,20 +152,9 @@ static inline struct sched_clock_data *cpu_sdc(int cpu) return &per_cpu(sched_clock_data, cpu); } -void sched_clock_init(void) +void sched_clock_init_late(void) { - u64 ktime_now = ktime_to_ns(ktime_get()); - int cpu; - - for_each_possible_cpu(cpu) { - struct sched_clock_data *scd = cpu_sdc(cpu); - - scd->tick_raw = 0; - scd->tick_gtod = ktime_now; - scd->clock = ktime_now; - } - - sched_clock_running = 1; + sched_clock_running = 2; /* * Ensure that it is impossible to not do a static_key update. 
@@ -362,11 +360,6 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ -void sched_clock_init(void) -{ - sched_clock_running = 1; -} - u64 sched_clock_cpu(int cpu) { if (unlikely(!sched_clock_running)) @@ -374,6 +367,7 @@ u64 sched_clock_cpu(int cpu) return sched_clock(); } + #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a129b34b8206..96a4267e6020 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7498,6 +7498,7 @@ void __init sched_init_smp(void) init_sched_dl_class(); sched_init_smt(); + sched_clock_init_late(); sched_smp_initialized = true; } @@ -7513,6 +7514,7 @@ early_initcall(migration_init); void __init sched_init_smp(void) { sched_init_granularity(); + sched_clock_init_late(); } #endif /* CONFIG_SMP */ @@ -7556,6 +7558,8 @@ void __init sched_init(void) int i, j; unsigned long alloc_size = 0, ptr; + sched_clock_init(); + for (i = 0; i < WAIT_TABLE_SIZE; i++) init_waitqueue_head(bit_wait_table + i); -- cgit v1.2.3 From 5680d8094ffa9e5cfc81afdd865027ee6417c263 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Dec 2016 13:36:17 +0100 Subject: sched/clock: Provide better clock continuity When switching between the unstable and stable variants it is currently possible that clock discontinuities occur. And while these will mostly be 'small', attempt to do better. As observed on my IVB-EP, the sched_clock() is ~1.5s ahead of the ktime_get_ns() based timeline at the point of switchover (sched_clock_init_late()) after SMP bringup. Equally, when the TSC is later found to be unstable -- typically because SMM tries to hide its SMI latencies by mucking with the TSC -- we want to avoid large jumps. Since the clocksource watchdog reports the issue after the fact we cannot exactly fix up time, but since SMI latencies are typically small (~10ns range), the discontinuity is mainly due to drift between sched_clock() and ktime_get_ns() (which on my desktop is ~79s over 24 days). I dislike this patch because it adds overhead to the good case in favour of dealing with badness. But given the widespread failure of TSC stability this is worth it. Note that in case the TSC makes drastic jumps after SMP bringup we're still hosed. There's just not much we can do in that case without stupid overhead. If we were to somehow expose tsc_clocksource_reliable (which is hard because this code is also used on ia64 and parisc) we could avoid some of the newly introduced overhead.
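[Ed. note: the continuity arithmetic of this patch, restated from the diff below. The invariant is ktime_get_ns() + gtod_offset == sched_clock() + raw_offset, re-established at each switch so the published clock does not jump:]

	/* unstable -> stable: the stable clock (sched_clock() + raw_offset)
	 * must continue the gtod-based timeline published so far */
	raw_offset = (scd->tick_gtod + gtod_offset) - scd->tick_raw;

	/* stable -> unstable: the gtod-based clock must continue the
	 * raw timeline published so far */
	gtod_offset = (scd->tick_raw + raw_offset) - scd->tick_gtod;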
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/clock.c | 99 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 65 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index b3466d4e0cc2..7713b2b53f61 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -86,6 +86,30 @@ void sched_clock_init(void) static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable); static int __sched_clock_stable_early; +/* + * We want: ktime_get_ns() + gtod_offset == sched_clock() + raw_offset + */ +static __read_mostly u64 raw_offset; +static __read_mostly u64 gtod_offset; + +struct sched_clock_data { + u64 tick_raw; + u64 tick_gtod; + u64 clock; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); + +static inline struct sched_clock_data *this_scd(void) +{ + return this_cpu_ptr(&sched_clock_data); +} + +static inline struct sched_clock_data *cpu_sdc(int cpu) +{ + return &per_cpu(sched_clock_data, cpu); +} + int sched_clock_stable(void) { return static_branch_likely(&__sched_clock_stable); @@ -93,6 +117,17 @@ int sched_clock_stable(void) static void __set_sched_clock_stable(void) { + struct sched_clock_data *scd = this_scd(); + + /* + * Attempt to make the (initial) unstable->stable transition continuous. + */ + raw_offset = (scd->tick_gtod + gtod_offset) - (scd->tick_raw); + + printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n", + scd->tick_gtod, gtod_offset, + scd->tick_raw, raw_offset); + static_branch_enable(&__sched_clock_stable); tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); } @@ -117,7 +152,23 @@ void set_sched_clock_stable(void) static void __clear_sched_clock_stable(struct work_struct *work) { - /* XXX worry about clock continuity */ + struct sched_clock_data *scd = this_scd(); + + /* + * Attempt to make the stable->unstable transition continuous. + * + * Trouble is, this is typically called from the TSC watchdog + * timer, which is late per definition. This means the tick + * values can already be screwy. + * + * Still do what we can. + */ + gtod_offset = (scd->tick_raw + raw_offset) - (scd->tick_gtod); + + printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", + scd->tick_gtod, gtod_offset, + scd->tick_raw, raw_offset); + static_branch_disable(&__sched_clock_stable); tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); } @@ -134,28 +185,9 @@ void clear_sched_clock_stable(void) schedule_work(&sched_clock_work); } -struct sched_clock_data { - u64 tick_raw; - u64 tick_gtod; - u64 clock; -}; - -static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); - -static inline struct sched_clock_data *this_scd(void) -{ - return this_cpu_ptr(&sched_clock_data); -} - -static inline struct sched_clock_data *cpu_sdc(int cpu) -{ - return &per_cpu(sched_clock_data, cpu); -} - void sched_clock_init_late(void) { sched_clock_running = 2; - /* * Ensure that it is impossible to not do a static_key update. 
* @@ -210,7 +242,7 @@ again: * scd->tick_gtod + TICK_NSEC); */ - clock = scd->tick_gtod + delta; + clock = scd->tick_gtod + gtod_offset + delta; min_clock = wrap_max(scd->tick_gtod, old_clock); max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC); @@ -296,7 +328,7 @@ u64 sched_clock_cpu(int cpu) u64 clock; if (sched_clock_stable()) - return sched_clock(); + return sched_clock() + raw_offset; if (unlikely(!sched_clock_running)) return 0ull; @@ -317,23 +349,22 @@ EXPORT_SYMBOL_GPL(sched_clock_cpu); void sched_clock_tick(void) { struct sched_clock_data *scd; - u64 now, now_gtod; - - if (sched_clock_stable()) - return; - - if (unlikely(!sched_clock_running)) - return; WARN_ON_ONCE(!irqs_disabled()); + /* + * Update these values even if sched_clock_stable(), because it can + * become unstable at any point in time at which point we need some + * values to fall back on. + * + * XXX arguably we can skip this if we expose tsc_clocksource_reliable + */ scd = this_scd(); - now_gtod = ktime_to_ns(ktime_get()); - now = sched_clock(); + scd->tick_raw = sched_clock(); + scd->tick_gtod = ktime_get_ns(); - scd->tick_raw = now; - scd->tick_gtod = now_gtod; - sched_clock_local(scd); + if (!sched_clock_stable() && likely(sched_clock_running)) + sched_clock_local(scd); } /* -- cgit v1.2.3 From 59f8c2989283bbd3df9fcfb22494d84f4852e536 Mon Sep 17 00:00:00 2001 From: Tommaso Cucinotta Date: Wed, 26 Oct 2016 11:17:17 +0200 Subject: sched/deadline: Show leftover runtime and abs deadline in /proc/*/sched This patch allows for reading the current (leftover) runtime and absolute deadline of a SCHED_DEADLINE task through /proc/*/sched (entries dl.runtime and dl.deadline), while debugging/testing. Signed-off-by: Tommaso Cucinotta Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juri Lelli Reviewed-by: Luca Abeni Acked-by: Daniel Bistrot de Oliveira Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1477473437-10346-2-git-send-email-tommaso.cucinotta@sssup.it Signed-off-by: Ingo Molnar --- Documentation/scheduler/sched-deadline.txt | 6 ++++++ kernel/sched/debug.c | 4 ++++ 2 files changed, 10 insertions(+) (limited to 'kernel') diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt index 8e37b0ba2c9d..cbc1b46cbf70 100644 --- a/Documentation/scheduler/sched-deadline.txt +++ b/Documentation/scheduler/sched-deadline.txt @@ -408,6 +408,11 @@ CONTENTS * the new scheduling related syscalls that manipulate it, i.e., sched_setattr() and sched_getattr() are implemented. + For debugging purposes, the leftover runtime and absolute deadline of a + SCHED_DEADLINE task can be retrieved through /proc/<pid>/sched (entries + dl.runtime and dl.deadline, both values in ns). A programmatic way to + retrieve these values from production code is under discussion. + 4.3 Default behavior --------------------- @@ -476,6 +481,7 @@ CONTENTS Still missing: + - programmatic way to retrieve current runtime and absolute deadline - refinements to deadline inheritance, especially regarding the possibility of retaining bandwidth isolation among non-interacting tasks.
This is being studied from both theoretical and practical points of view, and diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index fa178b62ea79..109adc0e9cb9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -953,6 +953,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) #endif P(policy); P(prio); + if (p->policy == SCHED_DEADLINE) { + P(dl.runtime); + P(dl.deadline); + } #undef PN_SCHEDSTAT #undef PN #undef __PN -- cgit v1.2.3 From da9647e076d440190fff4b4ee0f4b829dd6e8c4f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 21 Dec 2016 09:05:02 +0100 Subject: sched/completions: Fix complete_all() semantics Documentation/scheduler/completion.txt says this about complete_all(): "calls complete_all() to signal all current and future waiters." This doesn't strictly match the current semantics. Currently complete_all() is equivalent to UINT_MAX/2 complete() invocations, which is distinctly less than 'all current and future waiters' (enumerable vs innumerable), although it has worked in practice. However, Dmitry had a weird case where it might matter, so change completions to use saturation semantics for complete()/complete_all(). Once done hits UINT_MAX (and complete_all() sets it there) it will never again be decremented. Requested-by: Dmitry Torokhov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: der.herr@hofr.at Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/completion.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 8d0f35debf35..f063a25d4449 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -31,7 +31,8 @@ void complete(struct completion *x) unsigned long flags; spin_lock_irqsave(&x->wait.lock, flags); - x->done++; + if (x->done != UINT_MAX) + x->done++; __wake_up_locked(&x->wait, TASK_NORMAL, 1); spin_unlock_irqrestore(&x->wait.lock, flags); } @@ -51,7 +52,7 @@ void complete_all(struct completion *x) unsigned long flags; spin_lock_irqsave(&x->wait.lock, flags); - x->done += UINT_MAX/2; + x->done = UINT_MAX; __wake_up_locked(&x->wait, TASK_NORMAL, 0); spin_unlock_irqrestore(&x->wait.lock, flags); } @@ -79,7 +80,8 @@ do_wait_for_common(struct completion *x, if (!x->done) return timeout; } - x->done--; + if (x->done != UINT_MAX) + x->done--; return timeout ?: 1; } @@ -280,7 +282,7 @@ bool try_wait_for_completion(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; - else + else if (x->done != UINT_MAX) x->done--; spin_unlock_irqrestore(&x->wait.lock, flags); return ret; -- cgit v1.2.3 From 89ee048f3cc796db6f26906c6bef4edf0bee70fd Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 21 Dec 2016 16:50:26 +0100 Subject: sched/core: Fix group_entity's share update The update of the share of a cfs_rq is done when its load_avg is updated but before the group_entity's load_avg has been updated for the past time slot. This generates wrong load_avg accounting which can be significant when small tasks are involved in the scheduling. Let's take the example of a task "a" that is dequeued from its task group A: root (cfs_rq) \ (se) A (cfs_rq) \ (se) a Task "a" was the only task in task group A, which therefore becomes idle when "a" is dequeued.
We have the sequence: - dequeue_entity a->se - update_load_avg(a->se) - dequeue_entity_load_avg(A->cfs_rq, a->se) - update_cfs_shares(A->cfs_rq) A->cfs_rq->load.weight == 0 A->se->load.weight is updated with the new share (0 in this case) - dequeue_entity A->se - update_load_avg(A->se) but its weight is now null so the last time slot (up to a tick) will be accounted with a weight of 0 instead of its real weight during the time slot. The last time slot will be accounted as an idle one whereas it was a running one. If the running time of task a is short enough that no tick happens when it runs, all running time of group entity A->se will be accounted as idle time. Instead, we should update the share of a cfs_rq (in fact the weight of its group entity) only after having updated the load_avg of the group_entity. update_cfs_shares() now takes the sched_entity as a parameter instead of the cfs_rq, and the weight of the group_entity is updated only once its load_avg has been synced with current time. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: pjt@google.com Link: http://lkml.kernel.org/r/1482335426-7664-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 50 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b3bfe3fb4e13..2b866a279bdf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2689,16 +2689,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); -static void update_cfs_shares(struct cfs_rq *cfs_rq) +static void update_cfs_shares(struct sched_entity *se) { + struct cfs_rq *cfs_rq = group_cfs_rq(se); struct task_group *tg; - struct sched_entity *se; long shares; - tg = cfs_rq->tg; - se = tg->se[cpu_of(rq_of(cfs_rq))]; - if (!se || throttled_hierarchy(cfs_rq)) + if (!cfs_rq) + return; + + if (throttled_hierarchy(cfs_rq)) return; + + tg = cfs_rq->tg; + #ifndef CONFIG_SMP if (likely(se->load.weight == tg->shares)) return; @@ -2707,8 +2711,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) reweight_entity(cfs_rq_of(se), se, shares); } + #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +static inline void update_cfs_shares(struct sched_entity *se) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -3582,10 +3587,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (renorm && !curr) se->vruntime += cfs_rq->min_vruntime; + /* + * When enqueuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. + * - Add its load to cfs_rq->runnable_avg + * - For group_entity, update its weight to reflect the new share of + * its group cfs_rq + * - Add its new weight to cfs_rq->load.weight + */ update_load_avg(se, UPDATE_TG); enqueue_entity_load_avg(cfs_rq, se); + update_cfs_shares(se); account_entity_enqueue(cfs_rq, se); - update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) place_entity(cfs_rq, se, 0); @@ -3657,6 +3670,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + + /* + * When dequeuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. 
+ * - Subtract its load from the cfs_rq->runnable_avg. + * - Subtract its previous weight from cfs_rq->load.weight. + * - For group entity, update its weight to reflect the new share + * of its group cfs_rq. + */ update_load_avg(se, UPDATE_TG); dequeue_entity_load_avg(cfs_rq, se); @@ -3681,7 +3703,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); /* * Now advance min_vruntime if @se was the entity holding it back, @@ -3864,7 +3886,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * Ensure that runnable average is periodically updated. */ update_load_avg(curr, UPDATE_TG); - update_cfs_shares(cfs_rq); + update_cfs_shares(curr); #ifdef CONFIG_SCHED_HRTICK /* @@ -4761,7 +4783,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; update_load_avg(se, UPDATE_TG); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); } if (!se) @@ -4820,7 +4842,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; update_load_avg(se, UPDATE_TG); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); } if (!se) @@ -9362,8 +9384,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) /* Possible calls to update_curr() need rq clock */ update_rq_clock(rq); - for_each_sched_entity(se) - update_cfs_shares(group_cfs_rq(se)); + for_each_sched_entity(se) { + update_load_avg(se, UPDATE_TG); + update_cfs_shares(se); + } raw_spin_unlock_irqrestore(&rq->lock, flags); } -- cgit v1.2.3 From b8fd8423697b9ec729c5bb91737faad84ae19985 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 11 Jan 2017 11:29:47 +0000 Subject: sched/fair: Explain why MIN_SHARES isn't scaled in calc_cfs_shares() Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Turner Cc: Peter Zijlstra Cc: Samuel Thibault Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e9a4d858-bcf3-36b9-e3a9-449953e34569@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2b866a279bdf..274c747a01ce 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2657,6 +2657,18 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) if (tg_weight) shares /= tg_weight; + /* + * MIN_SHARES has to be unscaled here to support per-CPU partitioning + * of a group with small tg->shares value. It is a floor value which is + * assigned as a minimum load.weight to the sched_entity representing + * the group on a CPU. + * + * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024 + * on an 8-core system with 8 tasks each runnable on one CPU shares has + * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In + * case no task is runnable on a CPU MIN_SHARES=2 should be returned + * instead of 0. + */ if (shares < MIN_SHARES) shares = MIN_SHARES; if (shares > tg->shares) -- cgit v1.2.3 From e33a9bba85a869b85fd0a7f16e21a5ec8977e325 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 7 Dec 2016 15:48:41 -0500 Subject: sched/core: move IO scheduling accounting from io_schedule_timeout() into scheduler For an interface to support blocking for IOs, it must call io_schedule() instead of schedule().
This makes it tedious to add IO blocking to existing interfaces as the switching between schedule() and io_schedule() is often buried deep. As we already have a way to mark the task as IO scheduling, this can be made easier by separating out io_schedule() into multiple steps so that IO schedule preparation can be performed before invoking a blocking interface and the actual accounting happens inside the scheduler. io_schedule_timeout() does the following three things prior to calling schedule_timeout(). 1. Mark the task as scheduling for IO. 2. Flush out plugged IOs. 3. Account the IO scheduling. While #1 and #2 can be done in the prepare step, #3 should be done close to the actual scheduling. This patch moves #3 into the scheduler so that later patches can separate out preparation and finish steps from io_schedule(). Patch-originally-by: Peter Zijlstra Signed-off-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: adilger.kernel@dilger.ca Cc: akpm@linux-foundation.org Cc: axboe@kernel.dk Cc: jack@suse.com Cc: kernel-team@fb.com Cc: mingbo@fb.com Cc: tytso@mit.edu Link: http://lkml.kernel.org/r/20161207204841.GA22296@htj.duckdns.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 61 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 96a4267e6020..9fd37169b302 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2089,11 +2089,24 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; + if (p->in_iowait) { + delayacct_blkio_end(); + atomic_dec(&task_rq(p)->nr_iowait); + } + cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); } + +#else /* CONFIG_SMP */ + + if (p->in_iowait) { + delayacct_blkio_end(); + atomic_dec(&task_rq(p)->nr_iowait); + } + #endif /* CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); @@ -2143,8 +2156,13 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) trace_sched_waking(p); - if (!task_on_rq_queued(p)) + if (!task_on_rq_queued(p)) { + if (p->in_iowait) { + delayacct_blkio_end(); + atomic_dec(&rq->nr_iowait); + } ttwu_activate(rq, p, ENQUEUE_WAKEUP); + } ttwu_do_wakeup(rq, p, 0, rf); ttwu_stat(p, smp_processor_id(), 0); @@ -2956,6 +2974,36 @@ unsigned long long nr_context_switches(void) return sum; } +/* + * IO-wait accounting, and how it's mostly bollocks (on SMP). + * + * The idea behind IO-wait accounting is to account the idle time that we could + * have spent running if it were not for IO. That is, if we were to improve the + * storage performance, we'd have a proportional reduction in IO-wait time. + * + * This all works nicely on UP, where, when a task blocks on IO, we account + * idle time as IO-wait, because if the storage were faster, it could've been + * running and we'd not be idle. + * + * This has been extended to SMP, by doing the same for each CPU. This however + * is broken. + * + * Imagine for instance the case where two tasks block on one CPU; only that + * CPU will have IO-wait accounted, while the other has regular idle. Even + * though, if the storage were faster, both could've run at the same time, + * utilising both CPUs. + * + * This means that, when looking globally, the current IO-wait accounting on + * SMP is a lower bound, by reason of under accounting.
+ * Worse, since the numbers are provided per CPU, they are sometimes + * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly + * associated with any one particular CPU; it can wake up on a different CPU + * than the one it blocked on. This means the per CPU IO-wait number is meaningless. + * + * Task CPU affinities can make all that even more 'interesting'. + */ + unsigned long nr_iowait(void) { unsigned long i, sum = 0; @@ -2966,6 +3014,13 @@ unsigned long nr_iowait(void) return sum; } +/* + * Consumers of these two interfaces, like for example the cpufreq menu + * governor, are using nonsensical data. Boosting frequency for a CPU that has + * IO-wait is dubious, as it might not even end up running the task when it + * does become runnable. + */ + unsigned long nr_iowait_cpu(int cpu) { struct rq *this = cpu_rq(cpu); @@ -3377,6 +3432,11 @@ static void __sched notrace __schedule(bool preempt) deactivate_task(rq, prev, DEQUEUE_SLEEP); prev->on_rq = 0; + if (prev->in_iowait) { + atomic_inc(&rq->nr_iowait); + delayacct_blkio_start(); + } + /* * If a worker went to sleep, notify and ask workqueue * whether it wants to wake up a task to maintain @@ -5075,19 +5135,13 @@ EXPORT_SYMBOL_GPL(yield_to); long __sched io_schedule_timeout(long timeout) { int old_iowait = current->in_iowait; - struct rq *rq; long ret; current->in_iowait = 1; blk_schedule_flush_plug(current); - delayacct_blkio_start(); - rq = raw_rq(); - atomic_inc(&rq->nr_iowait); ret = schedule_timeout(timeout); current->in_iowait = old_iowait; - atomic_dec(&rq->nr_iowait); - delayacct_blkio_end(); return ret; } -- cgit v1.2.3 From 10ab56434f2f633a51e432ee8b7c29e12438e163 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Oct 2016 12:58:10 -0400 Subject: sched/core: Separate out io_schedule_prepare() and io_schedule_finish() Now that IO schedule accounting is done inside __schedule(), io_schedule() can be split into three steps - prep, schedule, and finish - where the schedule part doesn't need any special annotation. This allows marking a sleep as iowait by simply wrapping an existing blocking function with io_schedule_prepare() and io_schedule_finish(). Because task_struct->in_iowait is a single bit, the caller of io_schedule_prepare() needs to record and then pass its state to io_schedule_finish() to be safe regarding nesting. While this isn't the prettiest, these functions are mostly gonna be used by core functions and we don't want to use more space for ->in_iowait. While at it, as it's simple to do now, reimplement io_schedule() without unnecessarily going through io_schedule_timeout().
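[Ed. note: a hedged sketch of the intended use; any existing blocking primitive can now be bracketed so its sleep is accounted as iowait. wait_event() merely stands in for "some blocking interface".]

	int token;

	token = io_schedule_prepare();	/* mark iowait, flush plugged IO */
	wait_event(wq, condition);	/* plain schedule() inside, now accounted */
	io_schedule_finish(token);	/* restore previous ->in_iowait */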
Signed-off-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Jens Axboe Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: adilger.kernel@dilger.ca Cc: jack@suse.com Cc: kernel-team@fb.com Cc: mingbo@fb.com Cc: tytso@mit.edu Link: http://lkml.kernel.org/r/1477673892-28940-3-git-send-email-tj@kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 +++----- kernel/sched/core.c | 33 ++++++++++++++++++++++++++++----- 2 files changed, 31 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 94a48bb58297..a8daed914eef 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -461,12 +461,10 @@ extern signed long schedule_timeout_idle(signed long timeout); asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); +extern int __must_check io_schedule_prepare(void); +extern void io_schedule_finish(int token); extern long io_schedule_timeout(long timeout); - -static inline void io_schedule(void) -{ - io_schedule_timeout(MAX_SCHEDULE_TIMEOUT); -} +extern void io_schedule(void); void __noreturn do_task_dead(void); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9fd37169b302..49ce1cb3d320 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5128,25 +5128,48 @@ out_irq: } EXPORT_SYMBOL_GPL(yield_to); +int io_schedule_prepare(void) +{ + int old_iowait = current->in_iowait; + + current->in_iowait = 1; + blk_schedule_flush_plug(current); + + return old_iowait; +} + +void io_schedule_finish(int token) +{ + current->in_iowait = token; +} + /* * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. */ long __sched io_schedule_timeout(long timeout) { - int old_iowait = current->in_iowait; + int token; long ret; - current->in_iowait = 1; - blk_schedule_flush_plug(current); - + token = io_schedule_prepare(); ret = schedule_timeout(timeout); - current->in_iowait = old_iowait; + io_schedule_finish(token); return ret; } EXPORT_SYMBOL(io_schedule_timeout); +void io_schedule(void) +{ + int token; + + token = io_schedule_prepare(); + schedule(); + io_schedule_finish(token); +} +EXPORT_SYMBOL(io_schedule); + /** * sys_sched_get_priority_max - return maximum RT priority. * @policy: scheduling class. -- cgit v1.2.3 From 1460cb65a10f6c7a6e3a1c76513338861a0a43b6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Oct 2016 12:58:11 -0400 Subject: locking/mutex, sched/wait: Add mutex_lock_io() We sometimes end up propagating IO blocking through mutexes; however, because there currently is no way of annotating mutex sleeps as iowait, there are cases where iowait and /proc/stat:procs_blocked report misleading numbers obscuring the actual state of the system. This patch adds mutex_lock_io() so that mutex sleeps can be marked as iowait in those cases. 
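[Ed. note: usage sketch under the assumption of a mutex known to be held across IO; the lock name is hypothetical.]

	static DEFINE_MUTEX(io_path_lock);	/* hypothetical */

	mutex_lock_io(&io_path_lock);	/* sleeping here counts as iowait */
	/* ... submit and wait for IO under the lock ... */
	mutex_unlock(&io_path_lock);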
Signed-off-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Jens Axboe Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: adilger.kernel@dilger.ca Cc: jack@suse.com Cc: kernel-team@fb.com Cc: mingbo@fb.com Cc: tytso@mit.edu Link: http://lkml.kernel.org/r/1477673892-28940-4-git-send-email-tj@kernel.org Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 4 ++++ kernel/locking/mutex.c | 24 ++++++++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'kernel') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index b97870f2debd..980ba16b8749 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -156,10 +156,12 @@ extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass); extern int __must_check mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass); +extern void mutex_lock_io_nested(struct mutex *lock, unsigned int subclass); #define mutex_lock(lock) mutex_lock_nested(lock, 0) #define mutex_lock_interruptible(lock) mutex_lock_interruptible_nested(lock, 0) #define mutex_lock_killable(lock) mutex_lock_killable_nested(lock, 0) #define mutex_lock_io(lock) mutex_lock_io_nested(lock, 0) #define mutex_lock_nest_lock(lock, nest_lock) \ do { \ @@ -171,11 +173,13 @@ do { \ extern void mutex_lock(struct mutex *lock); extern int __must_check mutex_lock_interruptible(struct mutex *lock); extern int __must_check mutex_lock_killable(struct mutex *lock); +extern void mutex_lock_io(struct mutex *lock); # define mutex_lock_nested(lock, subclass) mutex_lock(lock) # define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock) # define mutex_lock_killable_nested(lock, subclass) mutex_lock_killable(lock) # define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock) # define mutex_lock_io_nested(lock, subclass) mutex_lock_io(lock) #endif /* diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 9b349619f431..8464a5cbab97 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -783,6 +783,20 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); +void __sched +mutex_lock_io_nested(struct mutex *lock, unsigned int subclass) +{ + int token; + + might_sleep(); + + token = io_schedule_prepare(); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, + subclass, NULL, _RET_IP_, NULL, 0); + io_schedule_finish(token); +} +EXPORT_SYMBOL_GPL(mutex_lock_io_nested); + static inline int ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { @@ -950,6 +964,16 @@ int __sched mutex_lock_killable(struct mutex *lock) } EXPORT_SYMBOL(mutex_lock_killable); +void __sched mutex_lock_io(struct mutex *lock) +{ + int token; + + token = io_schedule_prepare(); + mutex_lock(lock); + io_schedule_finish(token); +} +EXPORT_SYMBOL_GPL(mutex_lock_io); + static noinline void __sched __mutex_lock_slowpath(struct mutex *lock) { -- cgit v1.2.3 From 0186a6cbdc6287fde65858e5d9c714dc167b8ace Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 1 Dec 2016 11:47:05 +0000 Subject: locking/ww_mutex: Add ww_mutex to locktorture test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Although ww_mutexes degenerate into mutexes, it would be useful to torture the deadlock handling between multiple ww_mutexes in addition to torturing the regular mutexes.
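[Ed. note: for context, a hedged sketch of the canonical wait/wound acquire pattern that the new torture writelock below generalises to three locks; on -EDEADLK the wounded context backs off and reacquires the contended lock in the slow path. Lock and class names are illustrative.]

	ww_acquire_init(&ctx, &ww_class);

	err = ww_mutex_lock(&a, &ctx);
	/* ... */
	err = ww_mutex_lock(&b, &ctx);
	if (err == -EDEADLK) {
		ww_mutex_unlock(&a);
		ww_mutex_lock_slow(&b, &ctx);	/* sleeps until b is free */
		err = ww_mutex_lock(&a, &ctx);	/* then retake the rest */
	}
	/* ... */
	ww_acquire_fini(&ctx);

The new torture type can be selected at runtime with locktorture.torture_type=ww_mutex_lock, as in the LOCK07.boot fragment added below.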
Signed-off-by: Chris Wilson Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Nicolai Hähnle Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161201114711.28697-3-chris@chris-wilson.co.uk Signed-off-by: Ingo Molnar --- kernel/locking/locktorture.c | 73 ++++++++++++++++++++++ .../selftests/rcutorture/configs/lock/CFLIST | 1 + .../selftests/rcutorture/configs/lock/LOCK07 | 6 ++ .../selftests/rcutorture/configs/lock/LOCK07.boot | 1 + 4 files changed, 81 insertions(+) create mode 100644 tools/testing/selftests/rcutorture/configs/lock/LOCK07 create mode 100644 tools/testing/selftests/rcutorture/configs/lock/LOCK07.boot (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index f8c5af52a131..9bffedd82884 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -372,6 +372,78 @@ static struct lock_torture_ops mutex_lock_ops = { .name = "mutex_lock" }; +#include +static DEFINE_WW_CLASS(torture_ww_class); +static DEFINE_WW_MUTEX(torture_ww_mutex_0, &torture_ww_class); +static DEFINE_WW_MUTEX(torture_ww_mutex_1, &torture_ww_class); +static DEFINE_WW_MUTEX(torture_ww_mutex_2, &torture_ww_class); + +static int torture_ww_mutex_lock(void) +__acquires(torture_ww_mutex_0) +__acquires(torture_ww_mutex_1) +__acquires(torture_ww_mutex_2) +{ + LIST_HEAD(list); + struct reorder_lock { + struct list_head link; + struct ww_mutex *lock; + } locks[3], *ll, *ln; + struct ww_acquire_ctx ctx; + + locks[0].lock = &torture_ww_mutex_0; + list_add(&locks[0].link, &list); + + locks[1].lock = &torture_ww_mutex_1; + list_add(&locks[1].link, &list); + + locks[2].lock = &torture_ww_mutex_2; + list_add(&locks[2].link, &list); + + ww_acquire_init(&ctx, &torture_ww_class); + + list_for_each_entry(ll, &list, link) { + int err; + + err = ww_mutex_lock(ll->lock, &ctx); + if (!err) + continue; + + ln = ll; + list_for_each_entry_continue_reverse(ln, &list, link) + ww_mutex_unlock(ln->lock); + + if (err != -EDEADLK) + return err; + + ww_mutex_lock_slow(ll->lock, &ctx); + list_move(&ll->link, &list); + } + + ww_acquire_fini(&ctx); + return 0; +} + +static void torture_ww_mutex_unlock(void) +__releases(torture_ww_mutex_0) +__releases(torture_ww_mutex_1) +__releases(torture_ww_mutex_2) +{ + ww_mutex_unlock(&torture_ww_mutex_0); + ww_mutex_unlock(&torture_ww_mutex_1); + ww_mutex_unlock(&torture_ww_mutex_2); +} + +static struct lock_torture_ops ww_mutex_lock_ops = { + .writelock = torture_ww_mutex_lock, + .write_delay = torture_mutex_delay, + .task_boost = torture_boost_dummy, + .writeunlock = torture_ww_mutex_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "ww_mutex_lock" +}; + #ifdef CONFIG_RT_MUTEXES static DEFINE_RT_MUTEX(torture_rtmutex); @@ -793,6 +865,7 @@ static int __init lock_torture_init(void) &spin_lock_ops, &spin_lock_irq_ops, &rw_lock_ops, &rw_lock_irq_ops, &mutex_lock_ops, + &ww_mutex_lock_ops, #ifdef CONFIG_RT_MUTEXES &rtmutex_lock_ops, #endif diff --git a/tools/testing/selftests/rcutorture/configs/lock/CFLIST b/tools/testing/selftests/rcutorture/configs/lock/CFLIST index b9611c523723..41bae5824339 100644 --- a/tools/testing/selftests/rcutorture/configs/lock/CFLIST +++ b/tools/testing/selftests/rcutorture/configs/lock/CFLIST @@ -4,3 +4,4 @@ LOCK03 LOCK04 LOCK05 LOCK06 +LOCK07 diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK07 b/tools/testing/selftests/rcutorture/configs/lock/LOCK07 new file mode 100644 index 
000000000000..1d1da1477fc3 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK07 @@ -0,0 +1,6 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=4 +CONFIG_HOTPLUG_CPU=y +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK07.boot b/tools/testing/selftests/rcutorture/configs/lock/LOCK07.boot new file mode 100644 index 000000000000..97dadd1a9e45 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK07.boot @@ -0,0 +1 @@ +locktorture.torture_type=ww_mutex_lock -- cgit v1.2.3 From f2a5fec17395f259d54daa8833d81b00cceb15c3 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 1 Dec 2016 11:47:06 +0000 Subject: locking/ww_mutex: Begin kselftests for ww_mutex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Chris Wilson Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Nicolai Hähnle Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161201114711.28697-4-chris@chris-wilson.co.uk Signed-off-by: Ingo Molnar --- kernel/locking/Makefile | 1 + kernel/locking/test-ww_mutex.c | 140 +++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 12 ++++ 3 files changed, 153 insertions(+) create mode 100644 kernel/locking/test-ww_mutex.c (limited to 'kernel') diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 6f88e352cd4f..760158d9d98d 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -28,3 +28,4 @@ obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o +obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c new file mode 100644 index 000000000000..472d5b1f303f --- /dev/null +++ b/kernel/locking/test-ww_mutex.c @@ -0,0 +1,140 @@ +/* + * Module-based API test facility for ww_mutexes + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
+ */ + +#include + +#include +#include +#include +#include + +static DEFINE_WW_CLASS(ww_class); + +struct test_mutex { + struct work_struct work; + struct ww_mutex mutex; + struct completion ready, go, done; + unsigned int flags; +}; + +#define TEST_MTX_SPIN BIT(0) +#define TEST_MTX_TRY BIT(1) +#define TEST_MTX_CTX BIT(2) +#define __TEST_MTX_LAST BIT(3) + +static void test_mutex_work(struct work_struct *work) +{ + struct test_mutex *mtx = container_of(work, typeof(*mtx), work); + + complete(&mtx->ready); + wait_for_completion(&mtx->go); + + if (mtx->flags & TEST_MTX_TRY) { + while (!ww_mutex_trylock(&mtx->mutex)) + cpu_relax(); + } else { + ww_mutex_lock(&mtx->mutex, NULL); + } + complete(&mtx->done); + ww_mutex_unlock(&mtx->mutex); +} + +static int __test_mutex(unsigned int flags) +{ +#define TIMEOUT (HZ / 16) + struct test_mutex mtx; + struct ww_acquire_ctx ctx; + int ret; + + ww_mutex_init(&mtx.mutex, &ww_class); + ww_acquire_init(&ctx, &ww_class); + + INIT_WORK_ONSTACK(&mtx.work, test_mutex_work); + init_completion(&mtx.ready); + init_completion(&mtx.go); + init_completion(&mtx.done); + mtx.flags = flags; + + schedule_work(&mtx.work); + + wait_for_completion(&mtx.ready); + ww_mutex_lock(&mtx.mutex, (flags & TEST_MTX_CTX) ? &ctx : NULL); + complete(&mtx.go); + if (flags & TEST_MTX_SPIN) { + unsigned long timeout = jiffies + TIMEOUT; + + ret = 0; + do { + if (completion_done(&mtx.done)) { + ret = -EINVAL; + break; + } + cpu_relax(); + } while (time_before(jiffies, timeout)); + } else { + ret = wait_for_completion_timeout(&mtx.done, TIMEOUT); + } + ww_mutex_unlock(&mtx.mutex); + ww_acquire_fini(&ctx); + + if (ret) { + pr_err("%s(flags=%x): mutual exclusion failure\n", + __func__, flags); + ret = -EINVAL; + } + + flush_work(&mtx.work); + destroy_work_on_stack(&mtx.work); + return ret; +#undef TIMEOUT +} + +static int test_mutex(void) +{ + int ret; + int i; + + for (i = 0; i < __TEST_MTX_LAST; i++) { + ret = __test_mutex(i); + if (ret) + return ret; + } + + return 0; +} + +static int __init test_ww_mutex_init(void) +{ + int ret; + + ret = test_mutex(); + if (ret) + return ret; + + return 0; +} + +static void __exit test_ww_mutex_exit(void) +{ +} + +module_init(test_ww_mutex_init); +module_exit(test_ww_mutex_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Intel Corporation"); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index eb9e9a7870fa..5b37821632a2 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1180,6 +1180,18 @@ config LOCK_TORTURE_TEST Say M if you want these torture tests to build as a module. Say N if you are unsure. +config WW_MUTEX_SELFTEST + tristate "Wait/wound mutex selftests" + help + This option provides a kernel module that runs tests on the + struct ww_mutex locking API. + + It is recommended to enable DEBUG_WW_MUTEX_SLOWPATH in conjunction + with this test harness. + + Say M if you want these self tests to build as a module. + Say N if you are unsure. + endmenu # lock debugging config TRACE_IRQFLAGS -- cgit v1.2.3 From c22fb3807fd0a3bdd98c13c4d2190ab064e6c09d Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 1 Dec 2016 11:47:07 +0000 Subject: locking/ww_mutex: Add kselftests for ww_mutex AA deadlock detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Chris Wilson Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Nicolai Hähnle Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161201114711.28697-5-chris@chris-wilson.co.uk Signed-off-by: Ingo Molnar --- kernel/locking/test-ww_mutex.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 472d5b1f303f..835fa7a1036e 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -118,6 +118,41 @@ static int test_mutex(void) return 0; } +static int test_aa(void) +{ + struct ww_mutex mutex; + struct ww_acquire_ctx ctx; + int ret; + + ww_mutex_init(&mutex, &ww_class); + ww_acquire_init(&ctx, &ww_class); + + ww_mutex_lock(&mutex, &ctx); + + if (ww_mutex_trylock(&mutex)) { + pr_err("%s: trylocked itself!\n", __func__); + ww_mutex_unlock(&mutex); + ret = -EINVAL; + goto out; + } + + ret = ww_mutex_lock(&mutex, &ctx); + if (ret != -EALREADY) { + pr_err("%s: missed deadlock for recursing, ret=%d\n", + __func__, ret); + if (!ret) + ww_mutex_unlock(&mutex); + ret = -EINVAL; + goto out; + } + + ret = 0; +out: + ww_mutex_unlock(&mutex); + ww_acquire_fini(&ctx); + return ret; +} + static int __init test_ww_mutex_init(void) { int ret; @@ -126,6 +161,10 @@ static int __init test_ww_mutex_init(void) if (ret) return ret; + ret = test_aa(); + if (ret) + return ret; + return 0; } -- cgit v1.2.3 From 70207686e492fb97c11d88ae7d8ebce7d2d080aa Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 1 Dec 2016 11:47:08 +0000 Subject: locking/ww_mutex: Add kselftests for ww_mutex ABBA deadlock detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Chris Wilson Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Nicolai Hähnle Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161201114711.28697-6-chris@chris-wilson.co.uk Signed-off-by: Ingo Molnar --- kernel/locking/test-ww_mutex.c | 98 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 835fa7a1036e..5a643ba2b020 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -153,6 +153,96 @@ out: return ret; } +struct test_abba { + struct work_struct work; + struct ww_mutex a_mutex; + struct ww_mutex b_mutex; + struct completion a_ready; + struct completion b_ready; + bool resolve; + int result; +}; + +static void test_abba_work(struct work_struct *work) +{ + struct test_abba *abba = container_of(work, typeof(*abba), work); + struct ww_acquire_ctx ctx; + int err; + + ww_acquire_init(&ctx, &ww_class); + ww_mutex_lock(&abba->b_mutex, &ctx); + + complete(&abba->b_ready); + wait_for_completion(&abba->a_ready); + + err = ww_mutex_lock(&abba->a_mutex, &ctx); + if (abba->resolve && err == -EDEADLK) { + ww_mutex_unlock(&abba->b_mutex); + ww_mutex_lock_slow(&abba->a_mutex, &ctx); + err = ww_mutex_lock(&abba->b_mutex, &ctx); + } + + if (!err) + ww_mutex_unlock(&abba->a_mutex); + ww_mutex_unlock(&abba->b_mutex); + ww_acquire_fini(&ctx); + + abba->result = err; +} + +static int test_abba(bool resolve) +{ + struct test_abba abba; + struct ww_acquire_ctx ctx; + int err, ret; + + ww_mutex_init(&abba.a_mutex, &ww_class); + ww_mutex_init(&abba.b_mutex, &ww_class); + INIT_WORK_ONSTACK(&abba.work, test_abba_work); + init_completion(&abba.a_ready); + init_completion(&abba.b_ready); + abba.resolve = resolve; + + schedule_work(&abba.work); + + ww_acquire_init(&ctx, &ww_class); + ww_mutex_lock(&abba.a_mutex, &ctx); + + complete(&abba.a_ready); + wait_for_completion(&abba.b_ready); + + err = ww_mutex_lock(&abba.b_mutex, &ctx); + if (resolve && err == -EDEADLK) { + ww_mutex_unlock(&abba.a_mutex); + ww_mutex_lock_slow(&abba.b_mutex, &ctx); + err = ww_mutex_lock(&abba.a_mutex, &ctx); + } + + if (!err) + ww_mutex_unlock(&abba.b_mutex); + ww_mutex_unlock(&abba.a_mutex); + ww_acquire_fini(&ctx); + + flush_work(&abba.work); + destroy_work_on_stack(&abba.work); + + ret = 0; + if (resolve) { + if (err || abba.result) { + pr_err("%s: failed to resolve ABBA deadlock, A err=%d, B err=%d\n", + __func__, err, abba.result); + ret = -EINVAL; + } + } else { + if (err != -EDEADLK && abba.result != -EDEADLK) { + pr_err("%s: missed ABBA deadlock, A err=%d, B err=%d\n", + __func__, err, abba.result); + ret = -EINVAL; + } + } + return ret; +} + static int __init test_ww_mutex_init(void) { int ret; @@ -165,6 +255,14 @@ static int __init test_ww_mutex_init(void) if (ret) return ret; + ret = test_abba(false); + if (ret) + return ret; + + ret = test_abba(true); + if (ret) + return ret; + return 0; } -- cgit v1.2.3 From d1b42b800e5d09dcee52812b4396aca3a3696ba9 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 1 Dec 2016 11:47:09 +0000 Subject: locking/ww_mutex: Add kselftests for resolving ww_mutex cyclic deadlocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check that ww_mutexes can detect cyclic deadlocks (generalised ABBA cycles) and resolve them by lock reordering. Signed-off-by: Chris Wilson Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Nicolai Hähnle Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161201114711.28697-7-chris@chris-wilson.co.uk Signed-off-by: Ingo Molnar --- kernel/locking/test-ww_mutex.c | 115 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 5a643ba2b020..84da738e57d1 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -21,9 +21,11 @@ #include #include #include +#include #include static DEFINE_WW_CLASS(ww_class); +struct workqueue_struct *wq; struct test_mutex { struct work_struct work; @@ -243,10 +245,118 @@ static int test_abba(bool resolve) return ret; } +struct test_cycle { + struct work_struct work; + struct ww_mutex a_mutex; + struct ww_mutex *b_mutex; + struct completion *a_signal; + struct completion b_signal; + int result; +}; + +static void test_cycle_work(struct work_struct *work) +{ + struct test_cycle *cycle = container_of(work, typeof(*cycle), work); + struct ww_acquire_ctx ctx; + int err; + + ww_acquire_init(&ctx, &ww_class); + ww_mutex_lock(&cycle->a_mutex, &ctx); + + complete(cycle->a_signal); + wait_for_completion(&cycle->b_signal); + + err = ww_mutex_lock(cycle->b_mutex, &ctx); + if (err == -EDEADLK) { + ww_mutex_unlock(&cycle->a_mutex); + ww_mutex_lock_slow(cycle->b_mutex, &ctx); + err = ww_mutex_lock(&cycle->a_mutex, &ctx); + } + + if (!err) + ww_mutex_unlock(cycle->b_mutex); + ww_mutex_unlock(&cycle->a_mutex); + ww_acquire_fini(&ctx); + + cycle->result = err; +} + +static int __test_cycle(unsigned int nthreads) +{ + struct test_cycle *cycles; + unsigned int n, last = nthreads - 1; + int ret; + + cycles = kmalloc_array(nthreads, sizeof(*cycles), GFP_KERNEL); + if (!cycles) + return -ENOMEM; + + for (n = 0; n < nthreads; n++) { + struct test_cycle *cycle = &cycles[n]; + + ww_mutex_init(&cycle->a_mutex, &ww_class); + if (n == last) + cycle->b_mutex = &cycles[0].a_mutex; + else + cycle->b_mutex = &cycles[n + 1].a_mutex; + + if (n == 0) + cycle->a_signal = &cycles[last].b_signal; + else + cycle->a_signal = &cycles[n - 1].b_signal; + init_completion(&cycle->b_signal); + + INIT_WORK(&cycle->work, test_cycle_work); + cycle->result = 0; + } + + for (n = 0; n < nthreads; n++) + queue_work(wq, &cycles[n].work); + + flush_workqueue(wq); + + ret = 0; + for (n = 0; n < nthreads; n++) { + struct test_cycle *cycle = &cycles[n]; + + if (!cycle->result) + continue; + + pr_err("cyclic deadlock not resolved, ret[%d/%d] = %d\n", + n, nthreads, cycle->result); + ret = -EINVAL; + break; + } + + for (n = 0; n < nthreads; n++) + ww_mutex_destroy(&cycles[n].a_mutex); + kfree(cycles); + return ret; +} + +static int test_cycle(unsigned int ncpus) +{ + unsigned int n; + int ret; + + for (n = 2; n <= ncpus + 1; n++) { + ret = __test_cycle(n); + if (ret) + return ret; + } + + return 0; +} + static int __init test_ww_mutex_init(void) { + int ncpus = num_online_cpus(); int ret; + wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0); + if (!wq) + return -ENOMEM; + ret = test_mutex(); if (ret) return ret; @@ -263,11 +373,16 @@ static int __init test_ww_mutex_init(void) if (ret) return ret; + ret = test_cycle(ncpus); + if (ret) + return ret; + return 0; } static void __exit test_ww_mutex_exit(void) { + destroy_workqueue(wq); } module_init(test_ww_mutex_init); -- cgit v1.2.3 From 2a0c11282881875dc44f166a20eedf0d866dd0ef Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 1 Dec 2016 11:47:10 +0000 Subject: locking/ww_mutex: Add kselftests
for ww_mutex stress MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Chris Wilson Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Nicolai Hähnle Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161201114711.28697-8-chris@chris-wilson.co.uk Signed-off-by: Ingo Molnar --- kernel/locking/test-ww_mutex.c | 254 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 254 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 84da738e57d1..da6c9a34f62f 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -19,8 +19,10 @@ #include #include +#include #include #include +#include #include #include @@ -348,6 +350,246 @@ static int test_cycle(unsigned int ncpus) return 0; } +struct stress { + struct work_struct work; + struct ww_mutex *locks; + int nlocks; + int nloops; +}; + +static int *get_random_order(int count) +{ + int *order; + int n, r, tmp; + + order = kmalloc_array(count, sizeof(*order), GFP_TEMPORARY); + if (!order) + return order; + + for (n = 0; n < count; n++) + order[n] = n; + + for (n = count - 1; n > 1; n--) { + r = get_random_int() % (n + 1); + if (r != n) { + tmp = order[n]; + order[n] = order[r]; + order[r] = tmp; + } + } + + return order; +} + +static void dummy_load(struct stress *stress) +{ + usleep_range(1000, 2000); +} + +static void stress_inorder_work(struct work_struct *work) +{ + struct stress *stress = container_of(work, typeof(*stress), work); + const int nlocks = stress->nlocks; + struct ww_mutex *locks = stress->locks; + struct ww_acquire_ctx ctx; + int *order; + + order = get_random_order(nlocks); + if (!order) + return; + + ww_acquire_init(&ctx, &ww_class); + + do { + int contended = -1; + int n, err; + +retry: + err = 0; + for (n = 0; n < nlocks; n++) { + if (n == contended) + continue; + + err = ww_mutex_lock(&locks[order[n]], &ctx); + if (err < 0) + break; + } + if (!err) + dummy_load(stress); + + if (contended > n) + ww_mutex_unlock(&locks[order[contended]]); + contended = n; + while (n--) + ww_mutex_unlock(&locks[order[n]]); + + if (err == -EDEADLK) { + ww_mutex_lock_slow(&locks[order[contended]], &ctx); + goto retry; + } + + if (err) { + pr_err_once("stress (%s) failed with %d\n", + __func__, err); + break; + } + } while (--stress->nloops); + + ww_acquire_fini(&ctx); + + kfree(order); + kfree(stress); +} + +struct reorder_lock { + struct list_head link; + struct ww_mutex *lock; +}; + +static void stress_reorder_work(struct work_struct *work) +{ + struct stress *stress = container_of(work, typeof(*stress), work); + LIST_HEAD(locks); + struct ww_acquire_ctx ctx; + struct reorder_lock *ll, *ln; + int *order; + int n, err; + + order = get_random_order(stress->nlocks); + if (!order) + return; + + for (n = 0; n < stress->nlocks; n++) { + ll = kmalloc(sizeof(*ll), GFP_KERNEL); + if (!ll) + goto out; + + ll->lock = &stress->locks[order[n]]; + list_add(&ll->link, &locks); + } + kfree(order); + order = NULL; + + ww_acquire_init(&ctx, &ww_class); + + do { + list_for_each_entry(ll, &locks, link) { + err = ww_mutex_lock(ll->lock, &ctx); + if (!err) + continue; + + ln = ll; + list_for_each_entry_continue_reverse(ln, &locks, link) + ww_mutex_unlock(ln->lock); + + if (err != -EDEADLK) { + pr_err_once("stress (%s) failed with %d\n", + __func__, err); + break; + } + + ww_mutex_lock_slow(ll->lock, &ctx); + list_move(&ll->link, &locks); 
/* restarts iteration */ + } + + dummy_load(stress); + list_for_each_entry(ll, &locks, link) + ww_mutex_unlock(ll->lock); + } while (--stress->nloops); + + ww_acquire_fini(&ctx); + +out: + list_for_each_entry_safe(ll, ln, &locks, link) + kfree(ll); + kfree(order); + kfree(stress); +} + +static void stress_one_work(struct work_struct *work) +{ + struct stress *stress = container_of(work, typeof(*stress), work); + const int nlocks = stress->nlocks; + struct ww_mutex *lock = stress->locks + (get_random_int() % nlocks); + int err; + + do { + err = ww_mutex_lock(lock, NULL); + if (!err) { + dummy_load(stress); + ww_mutex_unlock(lock); + } else { + pr_err_once("stress (%s) failed with %d\n", + __func__, err); + break; + } + } while (--stress->nloops); + + kfree(stress); +} + +#define STRESS_INORDER BIT(0) +#define STRESS_REORDER BIT(1) +#define STRESS_ONE BIT(2) +#define STRESS_ALL (STRESS_INORDER | STRESS_REORDER | STRESS_ONE) + +static int stress(int nlocks, int nthreads, int nloops, unsigned int flags) +{ + struct ww_mutex *locks; + int n; + + locks = kmalloc_array(nlocks, sizeof(*locks), GFP_KERNEL); + if (!locks) + return -ENOMEM; + + for (n = 0; n < nlocks; n++) + ww_mutex_init(&locks[n], &ww_class); + + for (n = 0; nthreads; n++) { + struct stress *stress; + void (*fn)(struct work_struct *work); + + fn = NULL; + switch (n & 3) { + case 0: + if (flags & STRESS_INORDER) + fn = stress_inorder_work; + break; + case 1: + if (flags & STRESS_REORDER) + fn = stress_reorder_work; + break; + case 2: + if (flags & STRESS_ONE) + fn = stress_one_work; + break; + } + + if (!fn) + continue; + + stress = kmalloc(sizeof(*stress), GFP_KERNEL); + if (!stress) + break; + + INIT_WORK(&stress->work, fn); + stress->locks = locks; + stress->nlocks = nlocks; + stress->nloops = nloops; + + queue_work(wq, &stress->work); + nthreads--; + } + + flush_workqueue(wq); + + for (n = 0; n < nlocks; n++) + ww_mutex_destroy(&locks[n]); + kfree(locks); + + return 0; +} + static int __init test_ww_mutex_init(void) { int ncpus = num_online_cpus(); @@ -377,6 +619,18 @@ static int __init test_ww_mutex_init(void) if (ret) return ret; + ret = stress(16, 2*ncpus, 1<<10, STRESS_INORDER); + if (ret) + return ret; + + ret = stress(16, 2*ncpus, 1<<10, STRESS_REORDER); + if (ret) + return ret; + + ret = stress(4096, hweight32(STRESS_ALL)*ncpus, 1<<12, STRESS_ALL); + if (ret) + return ret; + return 0; } -- cgit v1.2.3 From 1e24edca0557dba6486d39d3c24c288475432bcf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Nov 2016 17:12:23 +0100 Subject: locking/atomic, kref: Add KREF_INIT() Since we need to change the implementation, stop exposing internals. Provide KREF_INIT() to allow static initialization of struct kref. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- drivers/block/drbd/drbd_bitmap.c | 2 +- fs/fuse/fuse_i.h | 2 +- include/linux/kref.h | 2 ++ init/version.c | 4 +--- kernel/pid.c | 4 +--- 5 files changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index ab62b81c2ca7..dece26f119d4 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -1070,7 +1070,7 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned .done = 0, .flags = flags, .error = 0, - .kref = { ATOMIC_INIT(2) }, + .kref = KREF_INIT(2), }; if (!get_ldev_if_state(device, D_ATTACHING)) { /* put is in drbd_bm_aio_ctx_destroy() */ diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 91307940c8ac..052f8d3c41cb 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -256,7 +256,7 @@ struct fuse_io_priv { #define FUSE_IO_PRIV_SYNC(f) \ { \ - .refcnt = { ATOMIC_INIT(1) }, \ + .refcnt = KREF_INIT(1), \ .async = 0, \ .file = f, \ } diff --git a/include/linux/kref.h b/include/linux/kref.h index e15828fd71f1..9af255ad1e2f 100644 --- a/include/linux/kref.h +++ b/include/linux/kref.h @@ -24,6 +24,8 @@ struct kref { atomic_t refcount; }; +#define KREF_INIT(n) { .refcount = ATOMIC_INIT(n), } + /** * kref_init - initialize object. * @kref: object in question. diff --git a/init/version.c b/init/version.c index fe41a63efed6..5606341e9efd 100644 --- a/init/version.c +++ b/init/version.c @@ -23,9 +23,7 @@ int version_string(LINUX_VERSION_CODE); #endif struct uts_namespace init_uts_ns = { - .kref = { - .refcount = ATOMIC_INIT(2), - }, + .kref = KREF_INIT(2), .name = { .sysname = UTS_SYSNAME, .nodename = UTS_NODENAME, diff --git a/kernel/pid.c b/kernel/pid.c index f66162f2359b..0291804151b5 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -68,9 +68,7 @@ static inline int mk_pid(struct pid_namespace *pid_ns, * the scheme scales to up to 4 million PIDs, runtime. */ struct pid_namespace init_pid_ns = { - .kref = { - .refcount = ATOMIC_INIT(2), - }, + .kref = KREF_INIT(2), .pidmap = { [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }, -- cgit v1.2.3 From f4dbba591945dc301c302672adefba9e2ec08dc5 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 10 Nov 2016 13:06:39 -0800 Subject: locktorture: Fix potential memory leak with rw lock test When running locktorture module with the below commands with kmemleak enabled: $ modprobe locktorture torture_type=rw_lock_irq $ rmmod locktorture The below kmemleak got caught: root@10:~# echo scan > /sys/kernel/debug/kmemleak [ 323.197029] kmemleak: 2 new suspected memory leaks (see /sys/kernel/debug/kmemleak) root@10:~# cat /sys/kernel/debug/kmemleak unreferenced object 0xffffffc07592d500 (size 128): comm "modprobe", pid 368, jiffies 4294924118 (age 205.824s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 c3 7b 02 00 00 00 00 00 .........{...... 00 00 00 00 00 00 00 00 d7 9b 02 00 00 00 00 00 ................ backtrace: [] create_object+0x110/0x288 [] kmemleak_alloc+0x58/0xa0 [] __kmalloc+0x234/0x318 [] 0xffffff80006fa130 [] do_one_initcall+0x44/0x138 [] do_init_module+0x68/0x1cc [] load_module+0x1a68/0x22e0 [] SyS_finit_module+0xe0/0xf0 [] el0_svc_naked+0x24/0x28 [] 0xffffffffffffffff unreferenced object 0xffffffc07592d480 (size 128): comm "modprobe", pid 368, jiffies 4294924118 (age 205.824s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 3b 6f 01 00 00 00 00 00 ........;o...... 
00 00 00 00 00 00 00 00 23 6a 01 00 00 00 00 00 ........#j...... backtrace: [] create_object+0x110/0x288 [] kmemleak_alloc+0x58/0xa0 [] __kmalloc+0x234/0x318 [] 0xffffff80006fa22c [] do_one_initcall+0x44/0x138 [] do_init_module+0x68/0x1cc [] load_module+0x1a68/0x22e0 [] SyS_finit_module+0xe0/0xf0 [] el0_svc_naked+0x24/0x28 [] 0xffffffffffffffff It is because cxt.lwsa and cxt.lrsa don't get freed in module_exit, so free them in lock_torture_cleanup() and free writer_tasks if the allocation of reader_tasks fails. Signed-off-by: Yang Shi Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/locking/locktorture.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index f8c5af52a131..d3de04b12f8c 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -780,6 +780,10 @@ static void lock_torture_cleanup(void) else lock_torture_print_module_parms(cxt.cur_ops, "End of test: SUCCESS"); + + kfree(cxt.lwsa); + kfree(cxt.lrsa); + end: torture_cleanup_end(); } @@ -924,6 +928,8 @@ static int __init lock_torture_init(void) GFP_KERNEL); if (reader_tasks == NULL) { VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory"); + kfree(writer_tasks); + writer_tasks = NULL; firsterr = -ENOMEM; goto unwind; } -- cgit v1.2.3 From d8ebf5191d7fdf81ba34a7c3d726b99c34918030 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 15 Jan 2017 19:03:40 -0500 Subject: cgroup: cosmetic update to cgroup_taskset_add() cgroup_taskset_add() was using list_add_tail() for source csets but list_move_tail() for destination. As the operations are gated by a list_empty() test, list_move_tail() is equivalent to list_add_tail() here. Use list_add_tail() for destination csets too. This doesn't cause any functional changes. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d9d82e96d67f..aed492e907c1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1980,8 +1980,8 @@ static void cgroup_taskset_add(struct task_struct *task, if (list_empty(&cset->mg_node)) list_add_tail(&cset->mg_node, &tset->src_csets); if (list_empty(&cset->mg_dst_cset->mg_node)) - list_move_tail(&cset->mg_dst_cset->mg_node, - &tset->dst_csets); + list_add_tail(&cset->mg_dst_cset->mg_node, + &tset->dst_csets); } /** -- cgit v1.2.3 From e595cd706982bff0211e6fafe5a108421e747fbc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 15 Jan 2017 19:03:41 -0500 Subject: cgroup: track migration context in cgroup_mgctx cgroup migration is performed in four steps - css_set preloading, addition of target tasks, actual migration, and clean up. A list named preloaded_csets is used to track the preloading. This is a bit too restricted and the code is already depending on the subtlety that all source css_sets appear before destination ones. Let's create struct cgroup_mgctx which keeps track of everything during migration. Currently, it has separate preload lists for source and destination csets and also embeds cgroup_taskset which is used during the actual migration. This moves struct cgroup_taskset definition to cgroup-internal.h. This patch doesn't cause any functional changes.
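[Editorial note: for illustration only, not part of the patch. A hedged sketch of how a caller is expected to drive the four migration steps with the new cgroup_mgctx API, modelled on cgroup_attach_task() in the diff below; the function name example_move_task is made up, and error paths plus the outer locking (cgroup_mutex, threadgroup_rwsem, rcu_read_lock) are elided. Note the root argument of cgroup_migrate() as of this patch; a later patch in this series drops it.]

	static int example_move_task(struct cgroup *dst_cgrp, struct task_struct *task)
	{
		DEFINE_CGROUP_MGCTX(mgctx);	/* preload lists + embedded taskset */
		int ret;

		/* step 1: preload the task's current css_set as a migration source */
		spin_lock_irq(&css_set_lock);
		cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
		spin_unlock_irq(&css_set_lock);

		/* step 2: look up and pin the matching destination css_sets */
		ret = cgroup_migrate_prepare_dst(&mgctx);
		if (!ret)
			/* step 3: add the target task and commit the migration */
			ret = cgroup_migrate(task, false, &mgctx, dst_cgrp->root);

		/* step 4: clean up; drops the preloaded css_set references */
		cgroup_migrate_finish(&mgctx);
		return ret;
	}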
Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup/cgroup-internal.h | 66 ++++++++++++++++-- kernel/cgroup/cgroup-v1.c | 10 +-- kernel/cgroup/cgroup.c | 144 ++++++++++++++++------------------------ 3 files changed, 122 insertions(+), 98 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 589b0e7013ec..5f8c8ac5ab88 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -26,6 +26,61 @@ struct cgrp_cset_link { struct list_head cgrp_link; }; +/* used to track tasks and csets during migration */ +struct cgroup_taskset { + /* the src and dst cset list running through cset->mg_node */ + struct list_head src_csets; + struct list_head dst_csets; + + /* the subsys currently being processed */ + int ssid; + + /* + * Fields for cgroup_taskset_*() iteration. + * + * Before migration is committed, the target migration tasks are on + * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of + * the csets on ->dst_csets. ->csets point to either ->src_csets + * or ->dst_csets depending on whether migration is committed. + * + * ->cur_csets and ->cur_task point to the current task position + * during iteration. + */ + struct list_head *csets; + struct css_set *cur_cset; + struct task_struct *cur_task; +}; + +/* migration context also tracks preloading */ +struct cgroup_mgctx { + /* + * Preloaded source and destination csets. Used to guarantee + * atomic success or failure on actual migration. + */ + struct list_head preloaded_src_csets; + struct list_head preloaded_dst_csets; + + /* tasks and csets to migrate */ + struct cgroup_taskset tset; +}; + +#define CGROUP_TASKSET_INIT(tset) \ +{ \ + .src_csets = LIST_HEAD_INIT(tset.src_csets), \ + .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \ + .csets = &tset.src_csets, \ +} + +#define CGROUP_MGCTX_INIT(name) \ +{ \ + LIST_HEAD_INIT(name.preloaded_src_csets), \ + LIST_HEAD_INIT(name.preloaded_dst_csets), \ + CGROUP_TASKSET_INIT(name.tset), \ +} + +#define DEFINE_CGROUP_MGCTX(name) \ + struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) + struct cgroup_sb_opts { u16 subsys_mask; unsigned int flags; @@ -112,13 +167,12 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, struct cgroup_namespace *ns); bool cgroup_may_migrate_to(struct cgroup *dst_cgrp); -void cgroup_migrate_finish(struct list_head *preloaded_csets); -void cgroup_migrate_add_src(struct css_set *src_cset, - struct cgroup *dst_cgrp, - struct list_head *preloaded_csets); -int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets); +void cgroup_migrate_finish(struct cgroup_mgctx *mgctx); +void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, + struct cgroup_mgctx *mgctx); +int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx); int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup_root *root); + struct cgroup_mgctx *mgctx, struct cgroup_root *root); int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 465b10111eaf..7a965f460faf 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -87,7 +87,7 @@ EXPORT_SYMBOL_GPL(cgroup_attach_task_all); */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { - LIST_HEAD(preloaded_csets); + DEFINE_CGROUP_MGCTX(mgctx); struct cgrp_cset_link *link; struct css_task_iter it; struct task_struct *task; @@ -106,10 +106,10 @@ int 
cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) /* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock); list_for_each_entry(link, &from->cset_links, cset_link) - cgroup_migrate_add_src(link->cset, to, &preloaded_csets); + cgroup_migrate_add_src(link->cset, to, &mgctx); spin_unlock_irq(&css_set_lock); - ret = cgroup_migrate_prepare_dst(&preloaded_csets); + ret = cgroup_migrate_prepare_dst(&mgctx); if (ret) goto out_err; @@ -125,14 +125,14 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) css_task_iter_end(&it); if (task) { - ret = cgroup_migrate(task, false, to->root); + ret = cgroup_migrate(task, false, &mgctx, to->root); if (!ret) trace_cgroup_transfer_tasks(to, task, false); put_task_struct(task); } } while (task && !ret); out_err: - cgroup_migrate_finish(&preloaded_csets); + cgroup_migrate_finish(&mgctx); percpu_up_write(&cgroup_threadgroup_rwsem); mutex_unlock(&cgroup_mutex); return ret; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index aed492e907c1..f90554d24e59 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1916,49 +1916,18 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) } EXPORT_SYMBOL_GPL(task_cgroup_path); -/* used to track tasks and other necessary states during migration */ -struct cgroup_taskset { - /* the src and dst cset list running through cset->mg_node */ - struct list_head src_csets; - struct list_head dst_csets; - - /* the subsys currently being processed */ - int ssid; - - /* - * Fields for cgroup_taskset_*() iteration. - * - * Before migration is committed, the target migration tasks are on - * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of - * the csets on ->dst_csets. ->csets point to either ->src_csets - * or ->dst_csets depending on whether migration is committed. - * - * ->cur_csets and ->cur_task point to the current task position - * during iteration. - */ - struct list_head *csets; - struct css_set *cur_cset; - struct task_struct *cur_task; -}; - -#define CGROUP_TASKSET_INIT(tset) (struct cgroup_taskset){ \ - .src_csets = LIST_HEAD_INIT(tset.src_csets), \ - .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \ - .csets = &tset.src_csets, \ -} - /** - * cgroup_taskset_add - try to add a migration target task to a taskset + * cgroup_migrate_add_task - add a migration target task to a migration context * @task: target task - * @tset: target taskset + * @mgctx: target migration context * - * Add @task, which is a migration target, to @tset. This function becomes - * noop if @task doesn't need to be migrated. @task's css_set should have - * been added as a migration source and @task->cg_list will be moved from - * the css_set's tasks list to mg_tasks one. + * Add @task, which is a migration target, to @mgctx->tset. This function + * becomes noop if @task doesn't need to be migrated. @task's css_set + * should have been added as a migration source and @task->cg_list will be + * moved from the css_set's tasks list to mg_tasks one. 
*/ -static void cgroup_taskset_add(struct task_struct *task, - struct cgroup_taskset *tset) +static void cgroup_migrate_add_task(struct task_struct *task, + struct cgroup_mgctx *mgctx) { struct css_set *cset; @@ -1978,10 +1947,11 @@ static void cgroup_taskset_add(struct task_struct *task, list_move_tail(&task->cg_list, &cset->mg_tasks); if (list_empty(&cset->mg_node)) - list_add_tail(&cset->mg_node, &tset->src_csets); + list_add_tail(&cset->mg_node, + &mgctx->tset.src_csets); if (list_empty(&cset->mg_dst_cset->mg_node)) list_add_tail(&cset->mg_dst_cset->mg_node, - &tset->dst_csets); + &mgctx->tset.dst_csets); } /** @@ -2048,17 +2018,18 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, /** * cgroup_taskset_migrate - migrate a taskset - * @tset: taget taskset + * @mgctx: migration context * @root: cgroup root the migration is taking place on * - * Migrate tasks in @tset as setup by migration preparation functions. + * Migrate tasks in @mgctx as setup by migration preparation functions. * This function fails iff one of the ->can_attach callbacks fails and - * guarantees that either all or none of the tasks in @tset are migrated. - * @tset is consumed regardless of success. + * guarantees that either all or none of the tasks in @mgctx are migrated. + * @mgctx is consumed regardless of success. */ -static int cgroup_taskset_migrate(struct cgroup_taskset *tset, +static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx, struct cgroup_root *root) { + struct cgroup_taskset *tset = &mgctx->tset; struct cgroup_subsys *ss; struct task_struct *task, *tmp_task; struct css_set *cset, *tmp_cset; @@ -2151,25 +2122,31 @@ bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) /** * cgroup_migrate_finish - cleanup after attach - * @preloaded_csets: list of preloaded css_sets + * @mgctx: migration context * * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See * those functions for details. */ -void cgroup_migrate_finish(struct list_head *preloaded_csets) +void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) { + LIST_HEAD(preloaded); struct css_set *cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { + + list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded); + list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded); + + list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_preload_node); put_css_set_locked(cset); } + spin_unlock_irq(&css_set_lock); } @@ -2177,10 +2154,10 @@ void cgroup_migrate_finish(struct list_head *preloaded_csets) * cgroup_migrate_add_src - add a migration source css_set * @src_cset: the source css_set to add * @dst_cgrp: the destination cgroup - * @preloaded_csets: list of preloaded css_sets + * @mgctx: migration context * * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin - * @src_cset and add it to @preloaded_csets, which should later be cleaned + * @src_cset and add it to @mgctx->src_csets, which should later be cleaned * up by cgroup_migrate_finish(). 
* * This function may be called without holding cgroup_threadgroup_rwsem @@ -2191,7 +2168,7 @@ void cgroup_migrate_finish(struct list_head *preloaded_csets) */ void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, - struct list_head *preloaded_csets) + struct cgroup_mgctx *mgctx) { struct cgroup *src_cgrp; @@ -2219,32 +2196,32 @@ void cgroup_migrate_add_src(struct css_set *src_cset, src_cset->mg_src_cgrp = src_cgrp; src_cset->mg_dst_cgrp = dst_cgrp; get_css_set(src_cset); - list_add(&src_cset->mg_preload_node, preloaded_csets); + list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets); } /** * cgroup_migrate_prepare_dst - prepare destination css_sets for migration - * @preloaded_csets: list of preloaded source css_sets + * @mgctx: migration context * * Tasks are about to be moved and all the source css_sets have been - * preloaded to @preloaded_csets. This function looks up and pins all - * destination css_sets, links each to its source, and append them to - * @preloaded_csets. + * preloaded to @mgctx->preloaded_src_csets. This function looks up and + * pins all destination css_sets, links each to its source, and append them + * to @mgctx->preloaded_dst_csets. * * This function must be called after cgroup_migrate_add_src() has been * called on each migration source css_set. After migration is performed * using cgroup_migrate(), cgroup_migrate_finish() must be called on - * @preloaded_csets. + * @mgctx. */ -int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) +int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) { - LIST_HEAD(csets); struct css_set *src_cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); /* look up the dst cset for each src cset and link it to src */ - list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { + list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets, + mg_preload_node) { struct css_set *dst_cset; dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); @@ -2270,15 +2247,15 @@ int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) src_cset->mg_dst_cset = dst_cset; if (list_empty(&dst_cset->mg_preload_node)) - list_add(&dst_cset->mg_preload_node, &csets); + list_add_tail(&dst_cset->mg_preload_node, + &mgctx->preloaded_dst_csets); else put_css_set(dst_cset); } - list_splice_tail(&csets, preloaded_csets); return 0; err: - cgroup_migrate_finish(&csets); + cgroup_migrate_finish(mgctx); return -ENOMEM; } @@ -2287,6 +2264,7 @@ err: * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task * @root: cgroup root migration is taking place on + * @mgctx: migration context * * Migrate a process or task denoted by @leader. If migrating a process, * the caller must be holding cgroup_threadgroup_rwsem. The caller is also @@ -2301,9 +2279,8 @@ err: * actually starting migrating. 
*/ int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup_root *root) + struct cgroup_mgctx *mgctx, struct cgroup_root *root) { - struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); struct task_struct *task; /* @@ -2315,14 +2292,14 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, rcu_read_lock(); task = leader; do { - cgroup_taskset_add(task, &tset); + cgroup_migrate_add_task(task, mgctx); if (!threadgroup) break; } while_each_thread(leader, task); rcu_read_unlock(); spin_unlock_irq(&css_set_lock); - return cgroup_taskset_migrate(&tset, root); + return cgroup_migrate_execute(mgctx, root); } /** @@ -2336,7 +2313,7 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup) { - LIST_HEAD(preloaded_csets); + DEFINE_CGROUP_MGCTX(mgctx); struct task_struct *task; int ret; @@ -2348,8 +2325,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, rcu_read_lock(); task = leader; do { - cgroup_migrate_add_src(task_css_set(task), dst_cgrp, - &preloaded_csets); + cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx); if (!threadgroup) break; } while_each_thread(leader, task); @@ -2357,11 +2333,11 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, spin_unlock_irq(&css_set_lock); /* prepare dst csets and commit */ - ret = cgroup_migrate_prepare_dst(&preloaded_csets); + ret = cgroup_migrate_prepare_dst(&mgctx); if (!ret) - ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); + ret = cgroup_migrate(leader, threadgroup, &mgctx, dst_cgrp->root); - cgroup_migrate_finish(&preloaded_csets); + cgroup_migrate_finish(&mgctx); if (!ret) trace_cgroup_attach_task(dst_cgrp, leader, threadgroup); @@ -2528,8 +2504,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) */ static int cgroup_update_dfl_csses(struct cgroup *cgrp) { - LIST_HEAD(preloaded_csets); - struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); + DEFINE_CGROUP_MGCTX(mgctx); struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; @@ -2545,33 +2520,28 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) struct cgrp_cset_link *link; list_for_each_entry(link, &dsct->cset_links, cset_link) - cgroup_migrate_add_src(link->cset, dsct, - &preloaded_csets); + cgroup_migrate_add_src(link->cset, dsct, &mgctx); } spin_unlock_irq(&css_set_lock); /* NULL dst indicates self on default hierarchy */ - ret = cgroup_migrate_prepare_dst(&preloaded_csets); + ret = cgroup_migrate_prepare_dst(&mgctx); if (ret) goto out_finish; spin_lock_irq(&css_set_lock); - list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { + list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) { struct task_struct *task, *ntask; - /* src_csets precede dst_csets, break on the first dst_cset */ - if (!src_cset->mg_src_cgrp) - break; - /* all tasks in src_csets need to be migrated */ list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) - cgroup_taskset_add(task, &tset); + cgroup_migrate_add_task(task, &mgctx); } spin_unlock_irq(&css_set_lock); - ret = cgroup_taskset_migrate(&tset, cgrp->root); + ret = cgroup_migrate_execute(&mgctx, cgrp->root); out_finish: - cgroup_migrate_finish(&preloaded_csets); + cgroup_migrate_finish(&mgctx); percpu_up_write(&cgroup_threadgroup_rwsem); return ret; } -- cgit v1.2.3 From bfc2cf6f61fceac42235345081eb713329baa2a2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: 
Sun, 15 Jan 2017 19:03:41 -0500 Subject: cgroup: call subsys->*attach() only for subsystems which are actually affected by migration Currently, subsys->*attach() callbacks are called for all subsystems which are attached to the hierarchy on which the migration is taking place. With cgroup_migrate_prepare_dst() filtering out identity migrations, v1 hierarchies can avoid spurious ->*attach() callback invocations where the source and destination csses are identical; however, this isn't enough on v2 as only a subset of the attached controllers can be affected on controller enable/disable. While spurious ->*attach() invocations aren't critically broken, they're unnecessary overhead and can lead to temporary overcharges on certain controllers. Fix it by tracking which subsystems are affected by a migration and invoking ->*attach() callbacks only on those subsystems. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup/cgroup-internal.h | 5 ++++- kernel/cgroup/cgroup-v1.c | 2 +- kernel/cgroup/cgroup.c | 25 ++++++++++++++----------- 3 files changed, 19 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 5f8c8ac5ab88..9203bfb05603 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -62,6 +62,9 @@ struct cgroup_mgctx { /* tasks and csets to migrate */ struct cgroup_taskset tset; + + /* subsystems affected by migration */ + u16 ss_mask; }; #define CGROUP_TASKSET_INIT(tset) \ @@ -172,7 +175,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, struct cgroup_mgctx *mgctx); int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx); int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup_mgctx *mgctx, struct cgroup_root *root); + struct cgroup_mgctx *mgctx); int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 7a965f460faf..fc34bcf2329f 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -125,7 +125,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) css_task_iter_end(&it); if (task) { - ret = cgroup_migrate(task, false, &mgctx, to->root); + ret = cgroup_migrate(task, false, &mgctx); if (!ret) trace_cgroup_transfer_tasks(to, task, false); put_task_struct(task); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f90554d24e59..69ad5b3de0c1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2019,15 +2019,13 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, /** * cgroup_taskset_migrate - migrate a taskset * @mgctx: migration context - * @root: cgroup root the migration is taking place on * * Migrate tasks in @mgctx as setup by migration preparation functions. * This function fails iff one of the ->can_attach callbacks fails and * guarantees that either all or none of the tasks in @mgctx are migrated. * @mgctx is consumed regardless of success. 
*/ -static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx, - struct cgroup_root *root) +static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) { struct cgroup_taskset *tset = &mgctx->tset; struct cgroup_subsys *ss; @@ -2040,7 +2038,7 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx, return 0; /* check that we can legitimately attach to the cgroup */ - do_each_subsys_mask(ss, ssid, root->subsys_mask) { + do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ss->can_attach) { tset->ssid = ssid; ret = ss->can_attach(tset); @@ -2076,7 +2074,7 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx, */ tset->csets = &tset->dst_csets; - do_each_subsys_mask(ss, ssid, root->subsys_mask) { + do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ss->attach) { tset->ssid = ssid; ss->attach(tset); @@ -2087,7 +2085,7 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx, goto out_release_tset; out_cancel_attach: - do_each_subsys_mask(ss, ssid, root->subsys_mask) { + do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ssid == failed_ssid) break; if (ss->cancel_attach) { @@ -2223,6 +2221,8 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets, mg_preload_node) { struct css_set *dst_cset; + struct cgroup_subsys *ss; + int ssid; dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); if (!dst_cset) @@ -2251,6 +2251,10 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) &mgctx->preloaded_dst_csets); else put_css_set(dst_cset); + + for_each_subsys(ss, ssid) + if (src_cset->subsys[ssid] != dst_cset->subsys[ssid]) + mgctx->ss_mask |= 1 << ssid; } return 0; @@ -2263,7 +2267,6 @@ err: * cgroup_migrate - migrate a process or task to a cgroup * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task - * @root: cgroup root migration is taking place on * @mgctx: migration context * * Migrate a process or task denoted by @leader. If migrating a process, @@ -2279,7 +2282,7 @@ err: * actually starting migrating. */ int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup_mgctx *mgctx, struct cgroup_root *root) + struct cgroup_mgctx *mgctx) { struct task_struct *task; @@ -2299,7 +2302,7 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, rcu_read_unlock(); spin_unlock_irq(&css_set_lock); - return cgroup_migrate_execute(mgctx, root); + return cgroup_migrate_execute(mgctx); } /** @@ -2335,7 +2338,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, /* prepare dst csets and commit */ ret = cgroup_migrate_prepare_dst(&mgctx); if (!ret) - ret = cgroup_migrate(leader, threadgroup, &mgctx, dst_cgrp->root); + ret = cgroup_migrate(leader, threadgroup, &mgctx); cgroup_migrate_finish(&mgctx); @@ -2539,7 +2542,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) } spin_unlock_irq(&css_set_lock); - ret = cgroup_migrate_execute(&mgctx, cgrp->root); + ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); percpu_up_write(&cgroup_threadgroup_rwsem); -- cgit v1.2.3 From 2d071c643f1cd15a24172de4b5b7ae2adb93abbb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 15 Jan 2017 01:34:25 +0100 Subject: bpf, trace: make ctx access checks more robust Make sure that ctx cannot potentially be accessed oob by asserting explicitly that ctx access size into pt_regs for BPF_PROG_TYPE_KPROBE programs must be within limits. 
If pt_regs on some 32-bit arch is not a multiple of 8 bytes in size, a BPF_DW access at the last aligned offset could otherwise read past the end of pt_regs. BPF_PROG_TYPE_KPROBE progs don't have a ctx conversion function since there's no extra mapping needed. kprobe_prog_is_valid_access() didn't enforce sizeof(long) as the only allowed access size, since LLVM can generate non BPF_W/BPF_DW access to regs from time to time. For BPF_PROG_TYPE_TRACEPOINT we don't have a ctx conversion either, so add a BUILD_BUG_ON() check to make sure that BPF_DW access will not be a similar issue in future (ctx works on event buffer as opposed to pt_regs there). Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1860e7f1e5a8..c22a961d1a42 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -459,6 +459,13 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type return false; if (off % size != 0) return false; + /* + * Assertion for 32 bit to make sure last 8 byte access + * (BPF_DW) to the last 4 byte member is disallowed. + */ + if (off + size > sizeof(struct pt_regs)) + return false; + return true; } @@ -540,6 +547,8 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type return false; if (off % size != 0) return false; + + BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64)); return true; } -- cgit v1.2.3
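[Editorial note: a worked example of the case the bounds check above rejects. The 68-byte pt_regs is hypothetical; no real architecture is implied, and the earlier range check referenced below is the function's existing off < 0 || off >= sizeof(struct pt_regs) test, not fully visible in this hunk.]

	/*
	 * Suppose a 32-bit arch had sizeof(struct pt_regs) == 68, i.e. not
	 * a multiple of 8.  A BPF_DW (8-byte) access at the last aligned
	 * offset passes the pre-existing checks yet overruns pt_regs:
	 *
	 *   off = 64, size = 8
	 *   off >= 0 && off < 68   -> accepted by the old range check
	 *   off % size == 0        -> accepted by the alignment check
	 *   off + size = 72 > 68   -> rejected by the new bounds check
	 */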
From 6563de9d6f1336157c9861bcd9864e0b47d65f9d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Nov 2016 13:33:57 -0700 Subject: rcu: Abstract the dynticks momentary-idle operation This commit is the first step towards full abstraction of all accesses to the ->dynticks counter, implementing the previously open-coded atomic add of two in a new rcu_dynticks_momentary_idle() function. This abstraction will ease changes to the ->dynticks counter operation. Note that this commit gets rid of the smp_mb__before_atomic() and the smp_mb__after_atomic() calls that were previously present. The reason that this is OK from a memory-ordering perspective is that the atomic operation is now atomic_add_return(), which, as a value-returning atomic, guarantees full ordering. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cb4e2056ccf3..14e283c351f6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -281,6 +281,19 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; +/* + * Do a double-increment of the ->dynticks counter to emulate a + * momentary idle-CPU quiescent state. + */ +static void rcu_dynticks_momentary_idle(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + int special = atomic_add_return(2, &rdtp->dynticks); + + /* It is illegal to call this from idle state. */ + WARN_ON_ONCE(!(special & 0x1)); +} + DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); @@ -300,7 +313,6 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); static void rcu_momentary_dyntick_idle(void) { struct rcu_data *rdp; - struct rcu_dynticks *rdtp; int resched_mask; struct rcu_state *rsp; @@ -327,10 +339,7 @@ static void rcu_momentary_dyntick_idle(void) * quiescent state, with no need for this CPU to do anything * further. */ - rdtp = this_cpu_ptr(&rcu_dynticks); - smp_mb__before_atomic(); /* Earlier stuff before QS. */ - atomic_add(2, &rdtp->dynticks); /* QS. */ - smp_mb__after_atomic(); /* Later stuff after QS. */ + rcu_dynticks_momentary_idle(); break; } } -- cgit v1.2.3
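[Editorial note: an illustrative aside, not from either patch. The helpers above and below rely on the ->dynticks parity convention visible in these diffs, roughly as sketched here; the function name example_cpu_is_watching is made up.]

	/*
	 * An odd ->dynticks value means the CPU is non-idle (RCU is
	 * watching); an even value means dynticks-idle.  Adding 2 thus
	 * advances the counter -- signalling a quiescent state to anyone
	 * comparing snapshots -- without changing the idle/non-idle
	 * parity, and atomic_add_return(0, ...) is a fully ordered read
	 * used by the snapshot helper introduced in the next patch.
	 */
	static bool example_cpu_is_watching(struct rcu_dynticks *rdtp)
	{
		return atomic_read(&rdtp->dynticks) & 0x1;	/* odd = non-idle */
	}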
From 8b2f63ab05eb233b2b396e133889ce3d1d30d944 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Nov 2016 14:12:05 -0700 Subject: rcu: Abstract the dynticks snapshot operation This commit is the second step towards full abstraction of all accesses to the ->dynticks counter, implementing the previously open-coded atomic add of zero in a new rcu_dynticks_snap() function. This abstraction will ease changes to the ->dynticks counter operation. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 19 ++++++++++++++++--- kernel/rcu/tree_exp.h | 6 ++---- 2 files changed, 18 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 14e283c351f6..805d55ee0b2a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -281,6 +281,17 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; +/* + * Snapshot the ->dynticks counter with full ordering so as to allow + * stable comparison of this counter with past and future snapshots. + */ +static int rcu_dynticks_snap(struct rcu_dynticks *rdtp) +{ + int snap = atomic_add_return(0, &rdtp->dynticks); + + return snap; +} + /* * Do a double-increment of the ->dynticks counter to emulate a * momentary idle-CPU quiescent state. */ @@ -1049,7 +1060,9 @@ void rcu_nmi_exit(void) */ bool notrace __rcu_is_watching(void) { - return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1; + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + + return atomic_read(&rdtp->dynticks) & 0x1; } /** @@ -1132,7 +1145,7 @@ static int rcu_is_cpu_rrupt_from_idle(void) static int dyntick_save_progress_counter(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { - rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); + rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); rcu_sysidle_check_cpu(rdp, isidle, maxj); if ((rdp->dynticks_snap & 0x1) == 0) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); @@ -1157,7 +1170,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, int *rcrmp; unsigned int snap; - curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); + curr = (unsigned int)rcu_dynticks_snap(rdp->dynticks); snap = (unsigned int)rdp->dynticks_snap; /* diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index e59e1849b89a..011f626b2fd8 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -356,10 +356,9 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, mask_ofl_test = 0; for_each_leaf_node_possible_cpu(rnp, cpu) { struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); rdp->exp_dynticks_snap = - atomic_add_return(0, &rdtp->dynticks); + rcu_dynticks_snap(rdp->dynticks); if (raw_smp_processor_id() == cpu || !(rdp->exp_dynticks_snap & 0x1) || !(rnp->qsmaskinitnext & rdp->grpmask)) @@ -380,12 +379,11 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, for_each_leaf_node_possible_cpu(rnp, cpu) { unsigned long mask = leaf_node_cpu_bit(rnp, cpu); struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); if (!(mask_ofl_ipi & mask)) continue; retry_ipi: - if (atomic_add_return(0, &rdtp->dynticks) != + if (rcu_dynticks_snap(rdp->dynticks) != rdp->exp_dynticks_snap) { mask_ofl_test |= mask; continue; -- cgit v1.2.3 From 7c6094db591b320332441e5f169156a4255b2180 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 2 Nov 2016 17:30:02 +0100 Subject: rcu: update: Make RCU_EXPEDITE_BOOT be the default RCU_EXPEDITE_BOOT should speed up the boot process by enforcing synchronize_rcu_expedited() instead of synchronize_rcu() during the boot process. There should be no reason why one does not want this and there is no need to worry about real time latency at this point. Therefore make it default. Note that users wishing to avoid expediting entirely, for example when bringing up new hardware possibly having flaky IPIs, can use the rcu_normal boot parameter to override boot-time expediting. Signed-off-by: Sebastian Andrzej Siewior [ paulmck: Reworded commit log. ] Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- init/Kconfig | 13 ------------- kernel/rcu/update.c | 6 ++---- 2 files changed, 2 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index 223b734abccd..96e6d56acd50 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -781,19 +781,6 @@ config RCU_NOCB_CPU_ALL endchoice -config RCU_EXPEDITE_BOOT - bool - default n - help - This option enables expedited grace periods at boot time, - as if rcu_expedite_gp() had been invoked early in boot.
- The corresponding rcu_unexpedite_gp() is invoked from - rcu_end_inkernel_boot(), which is intended to be invoked - at the end of the kernel-only boot sequence, just before - init is exec'ed. - - Accept the default if unsure. - endmenu # "RCU Subsystem" config BUILD_BIN2C diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 4f6db7e6a117..9e03db9ea9c0 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -132,8 +132,7 @@ bool rcu_gp_is_normal(void) } EXPORT_SYMBOL_GPL(rcu_gp_is_normal); -static atomic_t rcu_expedited_nesting = - ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); +static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1); /* * Should normal grace-period primitives be expedited? Intended for @@ -182,8 +181,7 @@ EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); */ void rcu_end_inkernel_boot(void) { - if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT)) - rcu_unexpedite_gp(); + rcu_unexpedite_gp(); if (rcu_normal_after_boot) WRITE_ONCE(rcu_normal, 1); } -- cgit v1.2.3 From 6496bb72bf20c1c7e4d6be44dfa663163e709116 Mon Sep 17 00:00:00 2001 From: Kenny Yu Date: Fri, 13 Jan 2017 08:58:34 -0800 Subject: uprobe: Find last occurrence of ':' when parsing uprobe PATH:OFFSET Previously, `create_trace_uprobe` found the *first* occurrence of the ':' character when parsing `PATH:OFFSET` for a uprobe. However, if the path contains a ':' character, then the function would parse the path incorrectly. Even worse, if the path does not exist, the subsequent call to `kern_path()` would set `ret` to `ENOENT`, leading to very cryptic errno values in user space. The fix is to find the *last* occurrence of ':'. How to repro (the failing write below reports "No such file or directory", suggesting incorrectly that the `uprobe_events` file does not exist): $ mkdir testing && cd testing $ cp /bin/bash . $ cp /bin/bash ./bash:with:colon $ echo "p:uprobes/p__root_testing_bash_0x6 /root/testing/bash:0x6" > /sys/kernel/debug/tracing/uprobe_events # this works $ echo "p:uprobes/p__root_testing_bash_with_colon_0x6 /root/testing/bash:with:colon:0x6" >> /sys/kernel/debug/tracing/uprobe_events # this doesn't -bash: echo: write error: No such file or directory With the patch: $ echo "p:uprobes/p__root_testing_bash_0x6 /root/testing/bash:0x6" > /sys/kernel/debug/tracing/uprobe_events # this still works $ echo "p:uprobes/p__root_testing_bash_with_colon_0x6 /root/testing/bash:with:colon:0x6" >> /sys/kernel/debug/tracing/uprobe_events # this works now too! $ cat /sys/kernel/debug/tracing/uprobe_events p:uprobes/p__root_testing_bash_0x6 /root/testing/bash:0x0000000000000006 p:uprobes/p__root_testing_bash_with_colon_0x6 /root/testing/bash:with:colon:0x0000000000000006 Link: http://lkml.kernel.org/r/20170113165834.4081016-1-kennyyu@fb.com Signed-off-by: Kenny Yu Reviewed-by: Omar Sandoval Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 0913693caf6e..4f2ba2bb11e0 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -431,7 +431,8 @@ static int create_trace_uprobe(int argc, char **argv) pr_info("Probe point is not specified.\n"); return -EINVAL; } - arg = strchr(argv[1], ':'); + /* Find the last occurrence, in case the path contains ':' too. */ + arg = strrchr(argv[1], ':'); if (!arg) { ret = -EINVAL; goto fail_address_parse; } -- cgit v1.2.3
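[Editorial note: a standalone userspace illustration of the strchr()/strrchr() difference the fix above relies on; a sketch, not part of the patch.]

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		const char spec[] = "/root/testing/bash:with:colon:0x6";

		/* Old parsing: the first ':' splits inside the path itself. */
		printf("strchr : %s\n", strchr(spec, ':'));	/* ":with:colon:0x6" */
		/* New parsing: the last ':' leaves the path intact. */
		printf("strrchr: %s\n", strrchr(spec, ':'));	/* ":0x6" */
		return 0;
	}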
From d45ae1f7041ac52ade6c5ec76d96bbed765d67aa Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 17 Jan 2017 12:29:35 -0500 Subject: tracing: Process constants for (un)likely() profiler When running the likely/unlikely profiler, one of the results did not look accurate. It noted that the unlikely() in link_path_walk() was 100% incorrect. When I added a trace_printk() to see what was happening there, it became 80% correct! Looking deeper into what was happening, I found that gcc split that if statement into two paths. One where the if statement became a constant, the other path a variable. The other path had the if statement always hit (making the unlikely there, always false), but since the #define unlikely() has: #define unlikely() (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0)) Where constants are ignored by the branch profiler, the "constant" path made by the compiler was ignored, even though it was hit 80% of the time. By just passing the constant value to the __branch_check__() function and tracing it out of line (as always correct, as likely/unlikely isn't a factor for constants), we get back the accurate readings of branches that were optimized by gcc, causing part of the execution to become constant. Signed-off-by: Steven Rostedt (VMware) --- include/linux/compiler.h | 14 ++++++++------ kernel/trace/trace_branch.c | 6 +++++- 2 files changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index cf0fa5d86059..bbbe1570de1c 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -107,12 +107,13 @@ struct ftrace_branch_data { */ #if defined(CONFIG_TRACE_BRANCH_PROFILING) \ && !defined(DISABLE_BRANCH_PROFILING) && !defined(__CHECKER__) -void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); +void ftrace_likely_update(struct ftrace_branch_data *f, int val, + int expect, int is_constant); #define likely_notrace(x) __builtin_expect(!!(x), 1) #define unlikely_notrace(x) __builtin_expect(!!(x), 0) -#define __branch_check__(x, expect) ({ \ +#define __branch_check__(x, expect, is_constant) ({ \ int ______r; \ static struct ftrace_branch_data \ __attribute__((__aligned__(4))) \ @@ -122,8 +123,9 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); .file = __FILE__, \ .line = __LINE__, \ }; \ - ______r = likely_notrace(x); \ - ftrace_likely_update(&______f, ______r, expect); \ + ______r = __builtin_expect(!!(x), expect); \ + ftrace_likely_update(&______f, ______r, \ + expect, is_constant); \ ______r; \ }) @@ -133,10 +135,10 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); * written by Daniel Walker. */ # ifndef likely -# define likely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 1)) +# define likely(x) (__branch_check__(x, 1, __builtin_constant_p(x))) # endif # ifndef unlikely -# define unlikely(x) (__builtin_constant_p(x) ?
!!(x) : __branch_check__(x, 0)) +# define unlikely(x) (__branch_check__(x, 0, __builtin_constant_p(x))) # endif #ifdef CONFIG_PROFILE_ALL_BRANCHES diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 75489de546b6..7afe426ea528 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -200,8 +200,12 @@ void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) } #endif /* CONFIG_BRANCH_TRACER */ -void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect) +void ftrace_likely_update(struct ftrace_branch_data *f, int val, + int expect, int is_constant) { + /* A constant is always correct */ + if (is_constant) + val = expect; /* * I would love to have a trace point here instead, but the * trace point code is so inundated with unlikely and likely -- cgit v1.2.3 From 92c82e8a322b32a6cabe7d8800dc10401157a623 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Fri, 13 Jan 2017 03:26:29 -0500 Subject: audit: add feature audit_lost reset Add a method to reset the audit_lost value. An AUDIT_SET message with the AUDIT_STATUS_LOST flag set by itself will return a positive value representing the current audit_lost value and reset the counter to zero. If AUDIT_STATUS_LOST is not the only flag set, the reset command will be ignored. The value sent with the command is ignored. The return value will be the positive lost value at reset time. An AUDIT_CONFIG_CHANGE message will be queued to the listening audit daemon. The message will be a standard CONFIG_CHANGE message with the fields "lost=0" and "old=" with the latter containing the value of audit_lost at reset time. See: https://github.com/linux-audit/audit-kernel/issues/3 Signed-off-by: Richard Guy Briggs Acked-by: Steve Grubb Signed-off-by: Paul Moore --- include/uapi/linux/audit.h | 6 +++++- kernel/audit.c | 8 +++++++- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index c8dc97bc2c1b..3f24110ae63c 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -326,15 +326,19 @@ enum { #define AUDIT_STATUS_RATE_LIMIT 0x0008 #define AUDIT_STATUS_BACKLOG_LIMIT 0x0010 #define AUDIT_STATUS_BACKLOG_WAIT_TIME 0x0020 +#define AUDIT_STATUS_LOST 0x0040 #define AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT 0x00000001 #define AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME 0x00000002 #define AUDIT_FEATURE_BITMAP_EXECUTABLE_PATH 0x00000004 #define AUDIT_FEATURE_BITMAP_SESSIONID_FILTER 0x00000010 +#define AUDIT_FEATURE_BITMAP_LOST_RESET 0x00000020 + #define AUDIT_FEATURE_BITMAP_ALL (AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT | \ AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME | \ AUDIT_FEATURE_BITMAP_EXECUTABLE_PATH | \ - AUDIT_FEATURE_BITMAP_SESSIONID_FILTER) + AUDIT_FEATURE_BITMAP_SESSIONID_FILTER | \ + AUDIT_FEATURE_BITMAP_LOST_RESET) /* deprecated: AUDIT_VERSION_* */ #define AUDIT_VERSION_LATEST AUDIT_FEATURE_BITMAP_ALL diff --git a/kernel/audit.c b/kernel/audit.c index 57acf2541fdd..25dd70a588b2 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -121,7 +121,7 @@ u32 audit_sig_sid = 0; 3) suppressed due to audit_rate_limit 4) suppressed due to audit_backlog_limit */ -static atomic_t audit_lost = ATOMIC_INIT(0); +static atomic_t audit_lost = ATOMIC_INIT(0); /* The netlink socket.
*/ static struct sock *audit_sock; @@ -1052,6 +1052,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err < 0) return err; } + if (s.mask == AUDIT_STATUS_LOST) { + u32 lost = atomic_xchg(&audit_lost, 0); + + audit_log_config_change("lost", 0, lost, 1); + return lost; + } break; } case AUDIT_GET_FEATURE: -- cgit v1.2.3 From 6d2c5d6c46dd3d9924831efd6c913fdf4d484985 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 16 Jan 2017 17:50:02 +0100 Subject: kmod: make usermodehelper path a const string This is in preparation for making it so that usermode helper programs can't be changed, if desired, by userspace. We will tackle the mess of cleaning up the write-ability of argv and env later; that's going to take more work, for much less gain... Signed-off-by: Greg Kroah-Hartman --- include/linux/kmod.h | 7 ++++--- kernel/kmod.c | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/kmod.h b/include/linux/kmod.h index fcfd2bf14d3f..c4e441e00db5 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -56,7 +56,7 @@ struct file; struct subprocess_info { struct work_struct work; struct completion *complete; - char *path; + const char *path; char **argv; char **envp; int wait; @@ -67,10 +67,11 @@ struct subprocess_info { }; extern int -call_usermodehelper(char *path, char **argv, char **envp, int wait); +call_usermodehelper(const char *path, char **argv, char **envp, int wait); extern struct subprocess_info * -call_usermodehelper_setup(char *path, char **argv, char **envp, gfp_t gfp_mask, +call_usermodehelper_setup(const char *path, char **argv, char **envp, + gfp_t gfp_mask, int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *), void *data); diff --git a/kernel/kmod.c b/kernel/kmod.c index d45c96073afb..426a614e97fe 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -516,7 +516,7 @@ static void helper_unlock(void) * Function must be runnable in either a process context or the * context in which call_usermodehelper_exec is called. */ -struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, +struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, char **envp, gfp_t gfp_mask, int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *info), @@ -613,7 +613,7 @@ EXPORT_SYMBOL(call_usermodehelper_exec); * This function is the equivalent to use call_usermodehelper_setup() and * call_usermodehelper_exec(). */ -int call_usermodehelper(char *path, char **argv, char **envp, int wait) +int call_usermodehelper(const char *path, char **argv, char **envp, int wait) { struct subprocess_info *info; gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; -- cgit v1.2.3 From 64e90a8acb8590c2468c919f803652f081e3a4bf Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 16 Jan 2017 16:22:39 +0100 Subject: Introduce STATIC_USERMODEHELPER to mediate call_usermodehelper() Some usermode helper applications are defined at kernel build time, while others can be changed at runtime. To provide a sane way to filter these, add a new kernel option "STATIC_USERMODEHELPER". This option routes all call_usermodehelper() calls through a single binary, no matter what the caller wishes to have called.
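As a rough illustration of the idea (this sketch is not part of the patch; the allow-list contents and file name are invented for the example), the mediating binary can be little more than an exec wrapper that refuses anything not on a fixed list. The kernel leaves argv untouched, so the requested program's path is assumed here to arrive as the helper's first argument:

	/* usermode-helper.c: hypothetical mediating helper (sketch only). */
	#include <string.h>
	#include <unistd.h>

	int main(int argc, char *argv[])
	{
		/* Made-up allow-list of "real" helpers this system permits. */
		static const char * const allowed[] = {
			"/sbin/modprobe",
			"/sbin/hotplug",
		};
		unsigned int i;

		if (argc < 1 || !argv[0])
			return 1;

		for (i = 0; i < sizeof(allowed) / sizeof(allowed[0]); i++) {
			if (!strcmp(argv[0], allowed[i])) {
				/* Hand off to the real helper, argv intact. */
				execv(allowed[i], argv);
				return 1;	/* exec failed */
			}
		}
		return 1;	/* everything else is silently rejected */
	}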
The new binary (by default set to /sbin/usermode-helper, but can be changed through the STATIC_USERMODEHELPER_PATH option) can properly filter the requested programs to be run by the kernel by looking at the first argument that is passed to it. All other options should then be passed onto the proper program if so desired. To disable all call_usermodehelper() calls by the kernel, set STATIC_USERMODEHELPER_PATH to an empty string. Thanks to Neil Brown for the idea of this feature. Cc: NeilBrown Signed-off-by: Greg Kroah-Hartman --- kernel/kmod.c | 14 ++++++++++++++ security/Kconfig | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 426a614e97fe..0c407f905ca4 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -528,7 +528,12 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, goto out; INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); + +#ifdef CONFIG_STATIC_USERMODEHELPER + sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH; +#else sub_info->path = path; +#endif sub_info->argv = argv; sub_info->envp = envp; @@ -566,6 +571,15 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) retval = -EBUSY; goto out; } + + /* + * If there is no binary for us to call, then just return and get out of + * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and + * disable all call_usermodehelper() calls. + */ + if (strlen(sub_info->path) == 0) + goto out; + /* * Set the completion pointer only if there is a waiter. * This makes it possible to use umh_complete to free diff --git a/security/Kconfig b/security/Kconfig index 118f4549404e..d900f47eaa68 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -158,6 +158,41 @@ config HARDENED_USERCOPY_PAGESPAN been removed. This config is intended to be used only while trying to find such users. +config STATIC_USERMODEHELPER + bool "Force all usermode helper calls through a single binary" + help + By default, the kernel can call many different userspace + binary programs through the "usermode helper" kernel + interface. Some of these binaries are statically defined + either in the kernel code itself, or as a kernel configuration + option. However, some of these are dynamically created at + runtime, or can be modified after the kernel has started up. + To provide an additional layer of security, route all of these + calls through a single executable that can not have its name + changed. + + Note, it is up to this single binary to then call the relevant + "real" usermode helper binary, based on the first argument + passed to it. If desired, this program can filter and pick + and choose what real programs are called. + + If you wish for all usermode helper programs to be + disabled, choose this option and then set + STATIC_USERMODEHELPER_PATH to an empty string. + +config STATIC_USERMODEHELPER_PATH + string "Path to the static usermode helper binary" + depends on STATIC_USERMODEHELPER + default "/sbin/usermode-helper" + help + The binary called by the kernel when any usermode helper + program is to be run. The "real" application's name will + be in the first argument passed to this program on the command + line. + + If you wish for all usermode helper programs to be disabled, + specify an empty string here (i.e. "").
+ source security/selinux/Kconfig source security/smack/Kconfig source security/tomoyo/Kconfig -- cgit v1.2.3 From 134e6a034cb004ed5acd3048792de70ced1c6cf5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 19 Jan 2017 08:57:14 -0500 Subject: tracing: Show number of constants profiled in likely profiler Now that constants are traced, it is useful to see the number of constants that are traced in the likely/unlikely profiler in order to know if they should be ignored or not. The likely/unlikely will display a number after the "correct" number if a "constant" count exists. Signed-off-by: Steven Rostedt (VMware) --- include/linux/compiler.h | 15 ++++++---- kernel/trace/trace_branch.c | 68 ++++++++++++++++++++++++++++++++++++--------- 2 files changed, 65 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index bbbe1570de1c..a73cc9afa784 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -101,13 +101,18 @@ struct ftrace_branch_data { }; }; +struct ftrace_likely_data { + struct ftrace_branch_data data; + unsigned long constant; +}; + /* * Note: DISABLE_BRANCH_PROFILING can be used by special lowlevel code * to disable branch tracing on a per file basis. */ #if defined(CONFIG_TRACE_BRANCH_PROFILING) \ && !defined(DISABLE_BRANCH_PROFILING) && !defined(__CHECKER__) -void ftrace_likely_update(struct ftrace_branch_data *f, int val, +void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect, int is_constant); #define likely_notrace(x) __builtin_expect(!!(x), 1) @@ -115,13 +120,13 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, #define __branch_check__(x, expect, is_constant) ({ \ int ______r; \ - static struct ftrace_branch_data \ + static struct ftrace_likely_data \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_annotated_branch"))) \ ______f = { \ - .func = __func__, \ - .file = __FILE__, \ - .line = __LINE__, \ + .data.func = __func__, \ + .data.file = __FILE__, \ + .data.line = __LINE__, \ }; \ ______r = __builtin_expect(!!(x), expect); \ ftrace_likely_update(&______f, ______r, \ diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 7afe426ea528..fd483d74f8e1 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -200,25 +200,27 @@ void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) } #endif /* CONFIG_BRANCH_TRACER */ -void ftrace_likely_update(struct ftrace_branch_data *f, int val, +void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect, int is_constant) { /* A constant is always correct */ - if (is_constant) + if (is_constant) { + f->constant++; val = expect; + } /* * I would love to have a trace point here instead, but the * trace point code is so inundated with unlikely and likely * conditions that the recursive nightmare that exists is too * much to try to get working. At least for now. */ - trace_likely_condition(f, val, expect); + trace_likely_condition(&f->data, val, expect); /* FIXME: Make this atomic! 
*/ if (val == expect) - f->correct++; + f->data.correct++; else - f->incorrect++; + f->data.incorrect++; } EXPORT_SYMBOL(ftrace_likely_update); @@ -249,29 +251,60 @@ static inline long get_incorrect_percent(struct ftrace_branch_data *p) return percent; } -static int branch_stat_show(struct seq_file *m, void *v) +static const char *branch_stat_process_file(struct ftrace_branch_data *p) { - struct ftrace_branch_data *p = v; const char *f; - long percent; /* Only print the file, not the path */ f = p->file + strlen(p->file); while (f >= p->file && *f != '/') f--; - f++; + return ++f; +} + +static void branch_stat_show(struct seq_file *m, + struct ftrace_branch_data *p, const char *f) +{ + long percent; /* * The miss is overlayed on correct, and hit on incorrect. */ percent = get_incorrect_percent(p); - seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); if (percent < 0) seq_puts(m, " X "); else seq_printf(m, "%3ld ", percent); + seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); +} + +static int branch_stat_show_normal(struct seq_file *m, + struct ftrace_branch_data *p, const char *f) +{ + seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); + branch_stat_show(m, p, f); + return 0; +} + +static int annotate_branch_stat_show(struct seq_file *m, void *v) +{ + struct ftrace_likely_data *p = v; + const char *f; + int l; + + f = branch_stat_process_file(&p->data); + + if (!p->constant) + return branch_stat_show_normal(m, &p->data, f); + + l = snprintf(NULL, 0, "/%lu", p->constant); + l = l > 8 ? 0 : 8 - l; + + seq_printf(m, "%8lu/%lu %*lu ", + p->data.correct, p->constant, l, p->data.incorrect); + branch_stat_show(m, &p->data, f); return 0; } @@ -283,7 +316,7 @@ static void *annotated_branch_stat_start(struct tracer_stat *trace) static void * annotated_branch_stat_next(void *v, int idx) { - struct ftrace_branch_data *p = v; + struct ftrace_likely_data *p = v; ++p; @@ -332,7 +365,7 @@ static struct tracer_stat annotated_branch_stats = { .stat_next = annotated_branch_stat_next, .stat_cmp = annotated_branch_stat_cmp, .stat_headers = annotated_branch_stat_headers, - .stat_show = branch_stat_show + .stat_show = annotate_branch_stat_show }; __init static int init_annotated_branch_stats(void) @@ -383,12 +416,21 @@ all_branch_stat_next(void *v, int idx) return p; } +static int all_branch_stat_show(struct seq_file *m, void *v) +{ + struct ftrace_branch_data *p = v; + const char *f; + + f = branch_stat_process_file(p); + return branch_stat_show_normal(m, p, f); +} + static struct tracer_stat all_branch_stats = { .name = "branch_all", .stat_start = all_branch_stat_start, .stat_next = all_branch_stat_next, .stat_headers = all_branch_stat_headers, - .stat_show = branch_stat_show + .stat_show = all_branch_stat_show }; __init static int all_annotated_branch_stats(void) -- cgit v1.2.3 From 068f530b3f274d313395663bf8d674798d4858c6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 19 Jan 2017 08:57:41 -0500 Subject: tracing: Add the constant count for branch tracer The unlikely/likely branch profiler now gets called even if the if statement is a constant (always goes in one direction without a compare). Add a value to denote this in the likely/unlikely tracer as well. 
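Tying the two patches together: with the extra field in place, the F_printk() format in the diff below renders a constant-folded branch with a trailing marker. An entry would then look roughly like this (line number, function, and file are invented for illustration):

	1684:link_path_walk:namei.c (1) CONSTANT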
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_branch.c | 17 +++++++++-------- kernel/trace/trace_entries.h | 6 ++++-- 2 files changed, 13 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index fd483d74f8e1..4d8fdf3184dc 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -27,7 +27,7 @@ static DEFINE_MUTEX(branch_tracing_mutex); static struct trace_array *branch_tracer; static void -probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) +probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) { struct trace_event_call *call = &event_branch; struct trace_array *tr = branch_tracer; @@ -68,16 +68,17 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry = ring_buffer_event_data(event); /* Strip off the path, only save the file */ - p = f->file + strlen(f->file); - while (p >= f->file && *p != '/') + p = f->data.file + strlen(f->data.file); + while (p >= f->data.file && *p != '/') p--; p++; - strncpy(entry->func, f->func, TRACE_FUNC_SIZE); + strncpy(entry->func, f->data.func, TRACE_FUNC_SIZE); strncpy(entry->file, p, TRACE_FILE_SIZE); entry->func[TRACE_FUNC_SIZE] = 0; entry->file[TRACE_FILE_SIZE] = 0; - entry->line = f->line; + entry->constant = f->constant; + entry->line = f->data.line; entry->correct = val == expect; if (!call_filter_check_discard(call, entry, buffer, event)) @@ -89,7 +90,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) } static inline -void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) +void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) { if (!branch_tracing_enabled) return; @@ -195,7 +196,7 @@ core_initcall(init_branch_tracer); #else static inline -void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) +void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) { } #endif /* CONFIG_BRANCH_TRACER */ @@ -214,7 +215,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, * conditions that the recursive nightmare that exists is too * much to try to get working. At least for now. */ - trace_likely_condition(&f->data, val, expect); + trace_likely_condition(f, val, expect); /* FIXME: Make this atomic! */ if (val == expect) diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index eb7396b7e7c3..c203ac4df791 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -328,11 +328,13 @@ FTRACE_ENTRY(branch, trace_branch, __array( char, func, TRACE_FUNC_SIZE+1 ) __array( char, file, TRACE_FILE_SIZE+1 ) __field( char, correct ) + __field( char, constant ) ), - F_printk("%u:%s:%s (%u)", + F_printk("%u:%s:%s (%u)%s", __entry->line, - __entry->func, __entry->file, __entry->correct), + __entry->func, __entry->file, __entry->correct, + __entry->constant ? " CONSTANT" : ""), FILTER_OTHER ); -- cgit v1.2.3 From acb04058de49458010c44bb35b849d45113fd668 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jan 2017 14:36:33 +0100 Subject: sched/clock: Fix hotplug crash Mike reported that he could trigger the WARN_ON_ONCE() in set_sched_clock_stable() using hotplug. This exposed a fundamental problem with the interface: we should never mark the TSC stable if we ever find it to be unstable. Therefore set_sched_clock_stable() is a broken interface.
The reason it existed is that not having it is a pain: it means all relevant architecture code needs to call clear_sched_clock_stable() where appropriate. Of the three architectures that select HAVE_UNSTABLE_SCHED_CLOCK, ia64 and parisc are trivial in that they never called set_sched_clock_stable(), so add an unconditional call to clear_sched_clock_stable() to them. For x86 the story is a lot more involved, and what this patch tries to do is ensure we preserve the status quo. So even if Cyrix or Transmeta have a usable TSC, they never called set_sched_clock_stable(), so they now get explicitly marked unstable. Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 9881b024b7d7 ("sched/clock: Delay switching sched_clock to stable") Link: http://lkml.kernel.org/r/20170119133633.GB6536@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/ia64/kernel/setup.c | 2 ++ arch/parisc/kernel/setup.c | 2 ++ arch/x86/kernel/cpu/amd.c | 6 ++++-- arch/x86/kernel/cpu/centaur.c | 6 ++++-- arch/x86/kernel/cpu/common.c | 3 +++ arch/x86/kernel/cpu/cyrix.c | 2 ++ arch/x86/kernel/cpu/intel.c | 6 ++++-- arch/x86/kernel/cpu/transmeta.c | 3 +++ arch/x86/kernel/kvmclock.c | 2 +- include/linux/sched.h | 1 - kernel/sched/clock.c | 29 ++++++++--------------------- 11 files changed, 33 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 7ec7acc844c2..c483ece3eb84 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -619,6 +619,8 @@ setup_arch (char **cmdline_p) check_sal_cache_flush(); #endif paging_init(); + + clear_sched_clock_stable(); } /* diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index 2e66a887788e..068ed3607bac 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c @@ -36,6 +36,7 @@ #undef PCI_DEBUG #include #include +#include #include #include @@ -176,6 +177,7 @@ void __init setup_arch(char **cmdline_p) conswitchp = &dummy_con; /* we use do_take_over_console() later ! */ #endif + clear_sched_clock_stable(); } /* diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 71cae73a5076..1bb253a6ee4d 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -548,8 +548,10 @@ static void early_init_amd(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - if (!check_tsc_unstable()) - set_sched_clock_stable(); + if (check_tsc_unstable()) + clear_sched_clock_stable(); + } else { + clear_sched_clock_stable(); } /* Bit 12 of 8000_0007 edx is accumulated power mechanism.
*/ diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 1661d8ec9280..2c234a6d94c4 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -1,5 +1,5 @@ -#include -#include + +#include #include #include @@ -104,6 +104,8 @@ static void early_init_centaur(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSENTER32); #endif + + clear_sched_clock_stable(); } static void init_centaur(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index dc1697ca5191..3457186275a0 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -83,6 +83,7 @@ static void default_init(struct cpuinfo_x86 *c) strcpy(c->x86_model_id, "386"); } #endif + clear_sched_clock_stable(); } static const struct cpu_dev default_cpu = { @@ -1055,6 +1056,8 @@ static void identify_cpu(struct cpuinfo_x86 *c) */ if (this_cpu->c_init) this_cpu->c_init(c); + else + clear_sched_clock_stable(); /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index bd9dcd6b712d..47416f959a48 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "cpu.h" @@ -183,6 +184,7 @@ static void early_init_cyrix(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); break; } + clear_sched_clock_stable(); } static void init_cyrix(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fcd484d2bb03..26eaff4907b2 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -124,8 +124,10 @@ static void early_init_intel(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - if (!check_tsc_unstable()) - set_sched_clock_stable(); + if (check_tsc_unstable()) + clear_sched_clock_stable(); + } else { + clear_sched_clock_stable(); } /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 34178564be2a..c1ea5b999839 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -14,6 +15,8 @@ static void early_init_transmeta(struct cpuinfo_x86 *c) if (xlvl >= 0x80860001) c->x86_capability[CPUID_8086_0001_EDX] = cpuid_edx(0x80860001); } + + clear_sched_clock_stable(); } static void init_transmeta(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 2a5cafdf8808..542710b99f52 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -107,12 +107,12 @@ static inline void kvm_sched_clock_init(bool stable) { if (!stable) { pv_time_ops.sched_clock = kvm_clock_read; + clear_sched_clock_stable(); return; } kvm_sched_clock_offset = kvm_clock_read(); pv_time_ops.sched_clock = kvm_sched_clock_read; - set_sched_clock_stable(); printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n", kvm_sched_clock_offset); diff --git a/include/linux/sched.h b/include/linux/sched.h index a8daed914eef..69e6852fede1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2547,7 +2547,6 @@ extern void sched_clock_init_late(void); * is reliable after all: */ extern int sched_clock_stable(void); -extern void set_sched_clock_stable(void); extern void clear_sched_clock_stable(void); extern 
void sched_clock_tick(void); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 7713b2b53f61..ad64efe41722 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -83,8 +83,15 @@ void sched_clock_init(void) } #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +/* + * We must start with !__sched_clock_stable because the unstable -> stable + * transition is accurate, while the stable -> unstable transition is not. + * + * Similarly we start with __sched_clock_stable_early, thereby assuming we + * will become stable, such that there's only a single 1 -> 0 transition. + */ static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable); -static int __sched_clock_stable_early; +static int __sched_clock_stable_early = 1; /* * We want: ktime_get_ns() + gtod_offset == sched_clock() + raw_offset @@ -132,24 +139,6 @@ static void __set_sched_clock_stable(void) tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); } -void set_sched_clock_stable(void) -{ - __sched_clock_stable_early = 1; - - smp_mb(); /* matches sched_clock_init_late() */ - - /* - * This really should only be called early (before - * sched_clock_init_late()) when guestimating our sched_clock() is - * solid. - * - * After that we test stability and we can negate our guess using - * clear_sched_clock_stable, possibly from a watchdog. - */ - if (WARN_ON_ONCE(sched_clock_running == 2)) - __set_sched_clock_stable(); -} - static void __clear_sched_clock_stable(struct work_struct *work) { struct sched_clock_data *scd = this_scd(); @@ -199,8 +188,6 @@ void sched_clock_init_late(void) if (__sched_clock_stable_early) __set_sched_clock_stable(); - else - __clear_sched_clock_stable(NULL); } /* -- cgit v1.2.3 From 3e278c0dc1cf5070d9462ececde1c07369b469b2 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 20 Jan 2017 11:44:45 +0900 Subject: ftrace: Factor out __ftrace_hash_move() __ftrace_hash_move() allocates a properly-sized hash and moves the entries from the src ftrace_hash. It will be used to set function graph filters, which have nothing to do with the dyn_ftrace records. Link: http://lkml.kernel.org/r/20170120024447.26097-1-namhyung@kernel.org Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eb230f06ba41..37b0e948d924 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1383,9 +1383,8 @@ ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, struct ftrace_hash *new_hash); -static int -ftrace_hash_move(struct ftrace_ops *ops, int enable, - struct ftrace_hash **dst, struct ftrace_hash *src) +static struct ftrace_hash * +__ftrace_hash_move(struct ftrace_hash *src) { struct ftrace_func_entry *entry; struct hlist_node *tn; @@ -1393,21 +1392,13 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, struct ftrace_hash *new_hash; int size = src->count; int bits = 0; - int ret; int i; - /* Reject setting notrace hash on IPMODIFY ftrace_ops */ - if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable) - return -EINVAL; - /* - * If the new source is empty, just free dst and assign it - * the empty_hash. + * If the new source is empty, just return the empty_hash.
*/ - if (!src->count) { - new_hash = EMPTY_HASH; - goto update; - } + if (!src->count) + return EMPTY_HASH; /* * Make the hash size about 1/2 the # found @@ -1421,7 +1412,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, new_hash = alloc_ftrace_hash(bits); if (!new_hash) - return -ENOMEM; + return NULL; size = 1 << src->size_bits; for (i = 0; i < size; i++) { @@ -1432,7 +1423,24 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, } } -update: + return new_hash; +} + +static int +ftrace_hash_move(struct ftrace_ops *ops, int enable, + struct ftrace_hash **dst, struct ftrace_hash *src) +{ + struct ftrace_hash *new_hash; + int ret; + + /* Reject setting notrace hash on IPMODIFY ftrace_ops */ + if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable) + return -EINVAL; + + new_hash = __ftrace_hash_move(src); + if (!new_hash) + return -ENOMEM; + /* Make sure this can be applied if it is IPMODIFY ftrace_ops */ if (enable) { /* IPMODIFY should be updated only when filter_hash updating */ -- cgit v1.2.3 From a5e8c07059d0f0b31737408711d44794928ac218 Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Wed, 18 Jan 2017 17:55:49 +0000 Subject: bpf: add bpf_probe_read_str helper Provide a simple helper with the same semantics of strncpy_from_unsafe(): int bpf_probe_read_str(void *dst, int size, const void *unsafe_addr) This gives more flexibility to a bpf program. A typical use case is intercepting a file name during sys_open(). The current approach is: SEC("kprobe/sys_open") void bpf_sys_open(struct pt_regs *ctx) { char buf[PATHLEN]; // PATHLEN is defined to 256 bpf_probe_read(buf, sizeof(buf), ctx->di); /* consume buf */ } This is suboptimal because the size of the string needs to be estimated at compile time, causing more memory to be copied than often necessary, and can become more problematic if further processing on buf is done, for example by pushing it to userspace via bpf_perf_event_output(), since the real length of the string is unknown and the entire buffer must be copied (and defining an unrolled strnlen() inside the bpf program is a very inefficient and unfeasible approach). With the new helper, the code can easily operate on the actual string length rather than the buffer size: SEC("kprobe/sys_open") void bpf_sys_open(struct pt_regs *ctx) { char buf[PATHLEN]; // PATHLEN is defined to 256 int res = bpf_probe_read_str(buf, sizeof(buf), ctx->di); /* consume buf, for example push it to userspace via * bpf_perf_event_output(), but this time we can use * res (the string length) as event size, after checking * its boundaries. */ } Another useful use case is when parsing individual process arguments or individual environment variables navigating current->mm->arg_start and current->mm->env_start: using this helper and the return value, one can quickly iterate at the right offset of the memory area. The code changes simply leverage the already existent strncpy_from_unsafe() kernel function, which is safe to be called from a bpf program as it is used in bpf_trace_printk(). Signed-off-by: Gianluca Borello Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/uapi/linux/bpf.h | 15 ++++++++++++++- kernel/trace/bpf_trace.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0eb0e87dbe9f..54a5894bb4ea 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -430,6 +430,18 @@ union bpf_attr { * @xdp_md: pointer to xdp_md * @delta: An positive/negative integer to be added to xdp_md.data * Return: 0 on success or negative on error + * + * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr) + * Copy a NUL terminated string from unsafe address. In case the string + * length is smaller than size, the target is not padded with further NUL + * bytes. In case the string length is larger than size, just count-1 + * bytes are copied and the last byte is set to NUL. + * @dst: destination address + * @size: maximum number of bytes to copy, including the trailing NUL + * @unsafe_ptr: unsafe address + * Return: + * > 0 length of the string including the trailing NUL on success + * < 0 error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -476,7 +488,8 @@ union bpf_attr { FN(set_hash_invalid), \ FN(get_numa_node_id), \ FN(skb_change_head), \ - FN(xdp_adjust_head), + FN(xdp_adjust_head), \ + FN(probe_read_str), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index c22a961d1a42..424daa4586d1 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -395,6 +395,36 @@ static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + int ret; + + /* + * The strncpy_from_unsafe() call will likely not fill the entire + * buffer, but that's okay in this circumstance as we're probing + * arbitrary memory anyway similar to bpf_probe_read() and might + * as well probe the stack. Thus, memory is explicitly cleared + * only in error case, so that improper users ignoring return + * code altogether don't copy garbage; otherwise length of string + * is returned that can be used for bpf_perf_event_output() et al. + */ + ret = strncpy_from_unsafe(dst, unsafe_ptr, size); + if (unlikely(ret < 0)) + memset(dst, 0, size); + + return ret; +} + +static const struct bpf_func_proto bpf_probe_read_str_proto = { + .func = bpf_probe_read_str, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -432,6 +462,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) return &bpf_current_task_under_cgroup_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; + case BPF_FUNC_probe_read_str: + return &bpf_probe_read_str_proto; default: return NULL; } -- cgit v1.2.3 From 4046bf023b0647d09704a32d9fe8aecbcee3e4c3 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 20 Jan 2017 11:44:46 +0900 Subject: ftrace: Expose ftrace_hash_empty and ftrace_lookup_ip It will be used when checking graph filter hashes later. 
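The intended lookup pattern, sketched here and used in essentially this shape by the follow-up patch that converts the graph filters, is a cheap empty check followed by a hash lookup:

	/* Sketch: does @addr pass a filter hash? For the filter hash,
	 * an empty hash means "match everything".
	 */
	if (ftrace_hash_empty(hash))
		return 1;
	if (ftrace_lookup_ip(hash, addr))
		return 1;
	return 0;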
Link: http://lkml.kernel.org/r/20170120024447.26097-2-namhyung@kernel.org Signed-off-by: Namhyung Kim [ Moved ftrace_hash declaration and functions outside of FUNCTION_GRAPH define ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 14 +------------- kernel/trace/trace.h | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 37b0e948d924..0470e373b9b4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1110,13 +1110,6 @@ struct ftrace_func_entry { unsigned long ip; }; -struct ftrace_hash { - unsigned long size_bits; - struct hlist_head *buckets; - unsigned long count; - struct rcu_head rcu; -}; - /* * We make these constant because no one should touch them, * but they are used as the default "empty hash", to avoid allocating @@ -1192,12 +1185,7 @@ struct ftrace_page { static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; -static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash) -{ - return !hash || !hash->count; -} - -static struct ftrace_func_entry * +struct ftrace_func_entry * ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) { unsigned long key; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1ea51ab53edf..9001460291bb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -753,6 +753,21 @@ enum print_line_t print_trace_line(struct trace_iterator *iter); extern char trace_find_mark(unsigned long long duration); +struct ftrace_hash { + unsigned long size_bits; + struct hlist_head *buckets; + unsigned long count; + struct rcu_head rcu; +}; + +struct ftrace_func_entry * +ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip); + +static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash) +{ + return !hash || !hash->count; +} + /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -787,7 +802,6 @@ extern void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned long flags, int pc); - #ifdef CONFIG_DYNAMIC_FTRACE /* TODO: make this variable */ #define FTRACE_GRAPH_MAX_FUNCS 32 -- cgit v1.2.3 From b9b0c831bed2682c2e3e9f5420fb6985549ef020 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 20 Jan 2017 11:44:47 +0900 Subject: ftrace: Convert graph filter to use hash tables Use ftrace_hash instead of a static array of a fixed size. This is useful when a graph filter pattern matches a large number of functions. Now hash lookup is done with preemption disabled to protect against the hash being changed or freed.
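Condensed from the diff below, the scheme pairs a preemption-disabled reader with an update side that waits for all readers to finish before freeing the old hash:

	/* Reader side (e.g. ftrace_graph_addr()): */
	preempt_disable_notrace();
	if (ftrace_lookup_ip(ftrace_graph_hash, addr))
		ret = 1;
	preempt_enable_notrace();

	/* Update side (ftrace_graph_write()): */
	rcu_assign_pointer(ftrace_graph_hash, new_hash);
	synchronize_sched();	/* no reader can still hold old_hash */
	free_ftrace_hash(old_hash);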
Link: http://lkml.kernel.org/r/20170120024447.26097-3-namhyung@kernel.org Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 190 +++++++++++++++++++++++++++++++++----------------- kernel/trace/trace.h | 64 ++++++++--------- 2 files changed, 158 insertions(+), 96 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0470e373b9b4..2d554a02241d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4378,7 +4378,7 @@ __setup("ftrace_filter=", set_ftrace_filter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; -static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); +static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer); static unsigned long save_global_trampoline; static unsigned long save_global_flags; @@ -4401,18 +4401,17 @@ static void __init set_ftrace_early_graph(char *buf, int enable) { int ret; char *func; - unsigned long *table = ftrace_graph_funcs; - int *count = &ftrace_graph_count; + struct ftrace_hash *hash; - if (!enable) { - table = ftrace_graph_notrace_funcs; - count = &ftrace_graph_notrace_count; - } + if (enable) + hash = ftrace_graph_hash; + else + hash = ftrace_graph_notrace_hash; while (buf) { func = strsep(&buf, ","); /* we allow only one expression at a time */ - ret = ftrace_set_func(table, count, FTRACE_GRAPH_MAX_FUNCS, func); + ret = ftrace_graph_set_hash(hash, func); if (ret) printk(KERN_DEBUG "ftrace: function %s not " "traceable\n", func); @@ -4536,15 +4535,20 @@ static const struct file_operations ftrace_notrace_fops = { static DEFINE_MUTEX(graph_lock); -int ftrace_graph_count; -int ftrace_graph_notrace_count; -unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; -unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; +struct ftrace_hash *ftrace_graph_hash = EMPTY_HASH; +struct ftrace_hash *ftrace_graph_notrace_hash = EMPTY_HASH; + +enum graph_filter_type { + GRAPH_FILTER_NOTRACE = 0, + GRAPH_FILTER_FUNCTION, +}; struct ftrace_graph_data { - unsigned long *table; - size_t size; - int *count; + struct ftrace_hash *hash; + struct ftrace_func_entry *entry; + int idx; /* for hash table iteration */ + enum graph_filter_type type; + struct ftrace_hash *new_hash; const struct seq_operations *seq_ops; }; @@ -4552,10 +4556,31 @@ static void * __g_next(struct seq_file *m, loff_t *pos) { struct ftrace_graph_data *fgd = m->private; + struct ftrace_func_entry *entry = fgd->entry; + struct hlist_head *head; + int i, idx = fgd->idx; - if (*pos >= *fgd->count) + if (*pos >= fgd->hash->count) return NULL; - return &fgd->table[*pos]; + + if (entry) { + hlist_for_each_entry_continue(entry, hlist) { + fgd->entry = entry; + return entry; + } + + idx++; + } + + for (i = idx; i < 1 << fgd->hash->size_bits; i++) { + head = &fgd->hash->buckets[i]; + hlist_for_each_entry(entry, head, hlist) { + fgd->entry = entry; + fgd->idx = i; + return entry; + } + } + return NULL; } static void * @@ -4572,9 +4597,11 @@ static void *g_start(struct seq_file *m, loff_t *pos) mutex_lock(&graph_lock); /* Nothing, tell g_show to print all functions are enabled */ - if (!*fgd->count && !*pos) + if (ftrace_hash_empty(fgd->hash) && !*pos) return (void *)1; + fgd->idx = 0; + fgd->entry = NULL; return __g_next(m, pos); } @@ -4585,22 +4612,22 @@ static void g_stop(struct seq_file *m, void *p) static int 
g_show(struct seq_file *m, void *v) { - unsigned long *ptr = v; + struct ftrace_func_entry *entry = v; - if (!ptr) + if (!entry) return 0; - if (ptr == (unsigned long *)1) { + if (entry == (void *)1) { struct ftrace_graph_data *fgd = m->private; - if (fgd->table == ftrace_graph_funcs) + if (fgd->type == GRAPH_FILTER_FUNCTION) seq_puts(m, "#### all functions enabled ####\n"); else seq_puts(m, "#### no functions disabled ####\n"); return 0; } - seq_printf(m, "%ps\n", (void *)*ptr); + seq_printf(m, "%ps\n", (void *)entry->ip); return 0; } @@ -4617,24 +4644,37 @@ __ftrace_graph_open(struct inode *inode, struct file *file, struct ftrace_graph_data *fgd) { int ret = 0; + struct ftrace_hash *new_hash = NULL; - mutex_lock(&graph_lock); - if ((file->f_mode & FMODE_WRITE) && - (file->f_flags & O_TRUNC)) { - *fgd->count = 0; - memset(fgd->table, 0, fgd->size * sizeof(*fgd->table)); + if (file->f_mode & FMODE_WRITE) { + const int size_bits = FTRACE_HASH_DEFAULT_BITS; + + if (file->f_flags & O_TRUNC) + new_hash = alloc_ftrace_hash(size_bits); + else + new_hash = alloc_and_copy_ftrace_hash(size_bits, + fgd->hash); + if (!new_hash) { + ret = -ENOMEM; + goto out; + } } - mutex_unlock(&graph_lock); if (file->f_mode & FMODE_READ) { - ret = seq_open(file, fgd->seq_ops); + ret = seq_open(file, &ftrace_graph_seq_ops); if (!ret) { struct seq_file *m = file->private_data; m->private = fgd; + } else { + /* Failed */ + free_ftrace_hash(new_hash); + new_hash = NULL; } } else file->private_data = fgd; +out: + fgd->new_hash = new_hash; return ret; } @@ -4642,6 +4682,7 @@ static int ftrace_graph_open(struct inode *inode, struct file *file) { struct ftrace_graph_data *fgd; + int ret; if (unlikely(ftrace_disabled)) return -ENODEV; @@ -4650,18 +4691,25 @@ ftrace_graph_open(struct inode *inode, struct file *file) if (fgd == NULL) return -ENOMEM; - fgd->table = ftrace_graph_funcs; - fgd->size = FTRACE_GRAPH_MAX_FUNCS; - fgd->count = &ftrace_graph_count; + mutex_lock(&graph_lock); + + fgd->hash = ftrace_graph_hash; + fgd->type = GRAPH_FILTER_FUNCTION; fgd->seq_ops = &ftrace_graph_seq_ops; - return __ftrace_graph_open(inode, file, fgd); + ret = __ftrace_graph_open(inode, file, fgd); + if (ret < 0) + kfree(fgd); + + mutex_unlock(&graph_lock); + return ret; } static int ftrace_graph_notrace_open(struct inode *inode, struct file *file) { struct ftrace_graph_data *fgd; + int ret; if (unlikely(ftrace_disabled)) return -ENODEV; @@ -4670,45 +4718,53 @@ ftrace_graph_notrace_open(struct inode *inode, struct file *file) if (fgd == NULL) return -ENOMEM; - fgd->table = ftrace_graph_notrace_funcs; - fgd->size = FTRACE_GRAPH_MAX_FUNCS; - fgd->count = &ftrace_graph_notrace_count; + mutex_lock(&graph_lock); + + fgd->hash = ftrace_graph_notrace_hash; + fgd->type = GRAPH_FILTER_NOTRACE; fgd->seq_ops = &ftrace_graph_seq_ops; - return __ftrace_graph_open(inode, file, fgd); + ret = __ftrace_graph_open(inode, file, fgd); + if (ret < 0) + kfree(fgd); + + mutex_unlock(&graph_lock); + return ret; } static int ftrace_graph_release(struct inode *inode, struct file *file) { + struct ftrace_graph_data *fgd; + if (file->f_mode & FMODE_READ) { struct seq_file *m = file->private_data; - kfree(m->private); + fgd = m->private; seq_release(inode, file); } else { - kfree(file->private_data); + fgd = file->private_data; } + kfree(fgd->new_hash); + kfree(fgd); + return 0; } static int -ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) +ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer) { struct ftrace_glob func_g; struct 
dyn_ftrace *rec; struct ftrace_page *pg; + struct ftrace_func_entry *entry; int fail = 1; int not; - bool exists; - int i; /* decode regex */ func_g.type = filter_parse_regex(buffer, strlen(buffer), &func_g.search, ¬); - if (!not && *idx >= size) - return -EBUSY; func_g.len = strlen(func_g.search); @@ -4725,26 +4781,18 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) continue; if (ftrace_match_record(rec, &func_g, NULL, 0)) { - /* if it is in the array */ - exists = false; - for (i = 0; i < *idx; i++) { - if (array[i] == rec->ip) { - exists = true; - break; - } - } + entry = ftrace_lookup_ip(hash, rec->ip); if (!not) { fail = 0; - if (!exists) { - array[(*idx)++] = rec->ip; - if (*idx >= size) - goto out; - } + + if (entry) + continue; + if (add_hash_entry(hash, rec->ip) < 0) + goto out; } else { - if (exists) { - array[i] = array[--(*idx)]; - array[*idx] = 0; + if (entry) { + free_hash_entry(hash, entry); fail = 0; } } @@ -4766,6 +4814,7 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, struct trace_parser parser; ssize_t read, ret = 0; struct ftrace_graph_data *fgd = file->private_data; + struct ftrace_hash *old_hash, *new_hash; if (!cnt) return 0; @@ -4781,10 +4830,25 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, mutex_lock(&graph_lock); /* we allow only one expression at a time */ - ret = ftrace_set_func(fgd->table, fgd->count, fgd->size, - parser.buffer); + ret = ftrace_graph_set_hash(fgd->new_hash, + parser.buffer); + + old_hash = fgd->hash; + new_hash = __ftrace_hash_move(fgd->new_hash); + if (!new_hash) + ret = -ENOMEM; + + if (fgd->type == GRAPH_FILTER_FUNCTION) + rcu_assign_pointer(ftrace_graph_hash, new_hash); + else + rcu_assign_pointer(ftrace_graph_notrace_hash, new_hash); mutex_unlock(&graph_lock); + + /* Wait till all users are no longer using the old hash */ + synchronize_sched(); + + free_ftrace_hash(old_hash); } if (!ret) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9001460291bb..afbec961eab1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -803,51 +803,49 @@ extern void __trace_graph_return(struct trace_array *tr, unsigned long flags, int pc); #ifdef CONFIG_DYNAMIC_FTRACE -/* TODO: make this variable */ -#define FTRACE_GRAPH_MAX_FUNCS 32 -extern int ftrace_graph_count; -extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; -extern int ftrace_graph_notrace_count; -extern unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS]; +extern struct ftrace_hash *ftrace_graph_hash; +extern struct ftrace_hash *ftrace_graph_notrace_hash; static inline int ftrace_graph_addr(unsigned long addr) { - int i; - - if (!ftrace_graph_count) - return 1; - - for (i = 0; i < ftrace_graph_count; i++) { - if (addr == ftrace_graph_funcs[i]) { - /* - * If no irqs are to be traced, but a set_graph_function - * is set, and called by an interrupt handler, we still - * want to trace it. - */ - if (in_irq()) - trace_recursion_set(TRACE_IRQ_BIT); - else - trace_recursion_clear(TRACE_IRQ_BIT); - return 1; - } + int ret = 0; + + preempt_disable_notrace(); + + if (ftrace_hash_empty(ftrace_graph_hash)) { + ret = 1; + goto out; } - return 0; + if (ftrace_lookup_ip(ftrace_graph_hash, addr)) { + /* + * If no irqs are to be traced, but a set_graph_function + * is set, and called by an interrupt handler, we still + * want to trace it. 
+ */ + if (in_irq()) + trace_recursion_set(TRACE_IRQ_BIT); + else + trace_recursion_clear(TRACE_IRQ_BIT); + ret = 1; + } + +out: + preempt_enable_notrace(); + return ret; } static inline int ftrace_graph_notrace_addr(unsigned long addr) { - int i; + int ret = 0; - if (!ftrace_graph_notrace_count) - return 0; + preempt_disable_notrace(); - for (i = 0; i < ftrace_graph_notrace_count; i++) { - if (addr == ftrace_graph_notrace_funcs[i]) - return 1; - } + if (ftrace_lookup_ip(ftrace_graph_notrace_hash, addr)) + ret = 1; - return 0; + preempt_enable_notrace(); + return ret; } #else static inline int ftrace_graph_addr(unsigned long addr) -- cgit v1.2.3 From bcc9a76d5ac426bc45c9e863b1830347827ca77a Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sat, 21 Jan 2017 21:33:35 -0500 Subject: locking/rwsem: Reinit wake_q after use In __rwsem_down_write_failed_common(), the same wake_q variable name is defined twice, with the inner wake_q hiding the one in outer scope. We could use different names for the two wake_q's; even better, we can reuse the same wake_q twice, if necessary. To enable the latter change, we need to define a new helper function wake_q_init() to enable reinitialization of wake_q after use. Signed-off-by: Waiman Long Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1485052415-9611-1-git-send-email-longman@redhat.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 ++++++ kernel/locking/rwsem-xadd.c | 7 +++++-- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index f4f9d32f12b3..4e62b378bd65 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1021,6 +1021,12 @@ struct wake_q_head { #define DEFINE_WAKE_Q(name) \ struct wake_q_head name = { WAKE_Q_TAIL, &name.first } +static inline void wake_q_init(struct wake_q_head *head) +{ + head->first = WAKE_Q_TAIL; + head->lastp = &head->first; +} + extern void wake_q_add(struct wake_q_head *head, struct task_struct *task); extern void wake_up_q(struct wake_q_head *head); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index a3a381aee24b..2ad8d8dc3bb1 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -502,8 +502,6 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) * wake any read locks that were queued ahead of us. */ if (count > RWSEM_WAITING_BIAS) { - DEFINE_WAKE_Q(wake_q); - __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); /* * The wakeup is normally called _after_ the wait_lock * is released, but given that we are proactively waking * readers we can deal with the wait_lock held as long * as we do not reorder the wakeup with the lock grabbing * for attempting rwsem_try_write_lock(). */ wake_up_q(&wake_q); + + /* + * Reinitialize wake_q after use. + */ + wake_q_init(&wake_q); } } else -- cgit v1.2.3 From b25e67161c295c98acda92123b2dd1e7d8642901 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Thu, 19 Jan 2017 22:28:57 -0600 Subject: seccomp: dump core when using SECCOMP_RET_KILL The SECCOMP_RET_KILL mode is documented as immediately killing the process as if a SIGSYS had been sent and not caught (similar to a SIGKILL). However, a SIGSYS is documented as triggering a coredump, which does not happen today. Dumping core has the advantage of making it easier to debug a process that fails a seccomp filter.
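For instance, a process installing a trivial filter like the sketch below (illustrative only; error handling omitted) used to just die with SIGSYS on the getpid() call, and with this patch it also leaves a core dump behind for crash-reporting tooling:

	#include <stddef.h>
	#include <sys/prctl.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/filter.h>
	#include <linux/seccomp.h>

	int main(void)
	{
		struct sock_filter filter[] = {
			/* Load the syscall number... */
			BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
				 offsetof(struct seccomp_data, nr)),
			/* ...kill on getpid(), allow everything else. */
			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
		};
		struct sock_fprog prog = {
			.len = sizeof(filter) / sizeof(filter[0]),
			.filter = filter,
		};

		prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
		prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
		syscall(__NR_getpid);	/* SECCOMP_RET_KILL fires here */
		return 0;
	}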
Today, most apps need to recompile and change their filter in order to get detailed info out, or manually run things through strace, or enable detailed kernel auditing. Now we get coredumps that fit into existing system-wide crash reporting setups. From a security point of view, this shouldn't be a problem. Unhandled signals can already be sent externally, triggering a coredump independent of the status of the seccomp filter. The act of dumping core itself does not change the execution of the program. URL: https://crbug.com/676357 Signed-off-by: Mike Frysinger Acked-by: Jorge Lucangeli Obes Acked-by: Kees Cook Signed-off-by: James Morris --- kernel/seccomp.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index f7ce79a46050..f8f88ebcb3ba 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -486,6 +487,17 @@ void put_seccomp_filter(struct task_struct *tsk) } } +static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason) +{ + memset(info, 0, sizeof(*info)); + info->si_signo = SIGSYS; + info->si_code = SYS_SECCOMP; + info->si_call_addr = (void __user *)KSTK_EIP(current); + info->si_errno = reason; + info->si_arch = syscall_get_arch(); + info->si_syscall = syscall; +} + /** * seccomp_send_sigsys - signals the task to allow in-process syscall emulation * @syscall: syscall number to send to userland @@ -496,13 +508,7 @@ void put_seccomp_filter(struct task_struct *tsk) static void seccomp_send_sigsys(int syscall, int reason) { struct siginfo info; - memset(&info, 0, sizeof(info)); - info.si_signo = SIGSYS; - info.si_code = SYS_SECCOMP; - info.si_call_addr = (void __user *)KSTK_EIP(current); - info.si_errno = reason; - info.si_arch = syscall_get_arch(); - info.si_syscall = syscall; + seccomp_init_siginfo(&info, syscall, reason); force_sig_info(SIGSYS, &info, current); } #endif /* CONFIG_SECCOMP_FILTER */ @@ -634,10 +640,17 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, return 0; case SECCOMP_RET_KILL: - default: + default: { + siginfo_t info; audit_seccomp(this_syscall, SIGSYS, action); + /* Show the original registers in the dump. */ + syscall_rollback(current, task_pt_regs(current)); + /* Trigger a manual coredump since do_exit skips it. */ + seccomp_init_siginfo(&info, this_syscall, data); + do_coredump(&info); do_exit(SIGSYS); } + } unreachable(); -- cgit v1.2.3 From 631a9639ac413da6242cb15558ebd661cf633622 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 19 Jan 2017 20:57:57 +0000 Subject: irqdomain: Add irq domain MSI and MSI_REMAP flags We introduce two new enum values for the irq domain flag: - IRQ_DOMAIN_FLAG_MSI indicates the irq domain corresponds to an MSI domain - IRQ_DOMAIN_FLAG_MSI_REMAP indicates the irq domain has MSI remapping capabilities. Those values will be useful for checking that all MSI irq domains have MSI remapping support when assessing the safety of IRQ assignment to a guest. irq_domain_hierarchical_is_msi_remap() allows checking whether an irq domain or any parent implements MSI remapping.
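For instance (a sketch, not from this patch: the ops structure and variables are hypothetical), an MSI controller driver that also remaps MSIs would advertise both flags when creating its domain, and the new helper then reports the capability from anywhere in the hierarchy:

	domain = irq_domain_create_hierarchy(parent,
					     IRQ_DOMAIN_FLAG_MSI |
					     IRQ_DOMAIN_FLAG_MSI_REMAP,
					     0, fwnode, &my_msi_domain_ops, info);

	if (irq_domain_hierarchical_is_msi_remap(domain)) {
		/* MSIs from assigned devices are remapped. */
	}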
Signed-off-by: Eric Auger Reviewed-by: Marc Zyngier Reviewed-by: Tomasz Nowicki Tested-by: Tomasz Nowicki Tested-by: Bharat Bhushan Signed-off-by: Will Deacon --- include/linux/irqdomain.h | 35 +++++++++++++++++++++++++++++++++++ kernel/irq/irqdomain.c | 14 ++++++++++++++ 2 files changed, 49 insertions(+) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index ffb84604c1de..bc2f5719dace 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -183,6 +183,12 @@ enum { /* Irq domain is an IPI domain with single virq */ IRQ_DOMAIN_FLAG_IPI_SINGLE = (1 << 3), + /* Irq domain implements MSIs */ + IRQ_DOMAIN_FLAG_MSI = (1 << 4), + + /* Irq domain implements MSI remapping */ + IRQ_DOMAIN_FLAG_MSI_REMAP = (1 << 5), + /* * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved * for implementation specific purposes and ignored by the @@ -446,6 +452,19 @@ static inline bool irq_domain_is_ipi_single(struct irq_domain *domain) { return domain->flags & IRQ_DOMAIN_FLAG_IPI_SINGLE; } + +static inline bool irq_domain_is_msi(struct irq_domain *domain) +{ + return domain->flags & IRQ_DOMAIN_FLAG_MSI; +} + +static inline bool irq_domain_is_msi_remap(struct irq_domain *domain) +{ + return domain->flags & IRQ_DOMAIN_FLAG_MSI_REMAP; +} + +extern bool irq_domain_hierarchical_is_msi_remap(struct irq_domain *domain); + #else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ static inline void irq_domain_activate_irq(struct irq_data *data) { } static inline void irq_domain_deactivate_irq(struct irq_data *data) { } @@ -477,6 +496,22 @@ static inline bool irq_domain_is_ipi_single(struct irq_domain *domain) { return false; } + +static inline bool irq_domain_is_msi(struct irq_domain *domain) +{ + return false; +} + +static inline bool irq_domain_is_msi_remap(struct irq_domain *domain) +{ + return false; +} + +static inline bool +irq_domain_hierarchical_is_msi_remap(struct irq_domain *domain) +{ + return false; +} #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ #else /* CONFIG_IRQ_DOMAIN */ diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8c0a0ae43521..876e13172dc8 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1392,6 +1392,20 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain) if (domain->ops->alloc) domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY; } + +/** + * irq_domain_hierarchical_is_msi_remap - Check if the domain or any + * parent has MSI remapping support + * @domain: domain pointer + */ +bool irq_domain_hierarchical_is_msi_remap(struct irq_domain *domain) +{ + for (; domain; domain = domain->parent) { + if (irq_domain_is_msi_remap(domain)) + return true; + } + return false; +} #else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ /** * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain -- cgit v1.2.3 From 88156f00904183d99e19269fbdb5cb56dc1522c3 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 19 Jan 2017 20:57:58 +0000 Subject: genirq/msi: Set IRQ_DOMAIN_FLAG_MSI on MSI domain creation Now that we have a flag value indicating that an IRQ domain implements MSI, let's set it in msi_create_irq_domain().
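The effect, sketched below with made-up variable names, is that any domain built through this path can now be identified generically:

	d = msi_create_irq_domain(fwnode, info, parent);
	if (d && irq_domain_is_msi(d)) {
		/* true for every domain created above */
	}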
Signed-off-by: Eric Auger Reviewed-by: Marc Zyngier Tested-by: Tomasz Nowicki Tested-by: Bharat Bhushan Signed-off-by: Will Deacon --- kernel/irq/msi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index ee230063f033..ddc2f5427f75 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -270,8 +270,8 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) msi_domain_update_chip_ops(info); - return irq_domain_create_hierarchy(parent, 0, 0, fwnode, - &msi_domain_ops, info); + return irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0, + fwnode, &msi_domain_ops, info); } int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, -- cgit v1.2.3 From c7b41f0af38f53e46050b56a5b0e96710097b83c Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 19 Jan 2017 20:57:59 +0000 Subject: irqdomain: irq_domain_check_msi_remap This new function checks whether all MSI irq domains implement IRQ remapping. This is useful to understand whether VFIO passthrough is safe with respect to interrupts. On ARM, an MSI controller typically can sit downstream of the IOMMU without preventing VFIO passthrough. As such, any assigned device can write into the MSI doorbell. If the MSI controller implements IRQ remapping, assigned devices will not be able to trigger interrupts towards the host. If it does not, the assignment must be flagged as unsafe with respect to interrupts. Signed-off-by: Eric Auger Reviewed-by: Marc Zyngier Reviewed-by: Tomasz Nowicki Tested-by: Tomasz Nowicki Tested-by: Bharat Bhushan Signed-off-by: Will Deacon --- include/linux/irqdomain.h | 1 + kernel/irq/irqdomain.c | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index bc2f5719dace..188eced6813e 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -222,6 +222,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, void *host_data); extern struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token); +extern bool irq_domain_check_msi_remap(void); extern void irq_set_default_host(struct irq_domain *host); extern int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, irq_hw_number_t hwirq, int node, diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 876e13172dc8..80c4f9312187 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -277,6 +277,31 @@ struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, } EXPORT_SYMBOL_GPL(irq_find_matching_fwspec); +/** + * irq_domain_check_msi_remap - Check whether all MSI irq domains implement + * IRQ remapping + * + * Return: false if any MSI irq domain does not support IRQ remapping, + * true otherwise (including if there is no MSI irq domain) + */ +bool irq_domain_check_msi_remap(void) +{ + struct irq_domain *h; + bool ret = true; + + mutex_lock(&irq_domain_mutex); + list_for_each_entry(h, &irq_domain_list, link) { + if (irq_domain_is_msi(h) && + !irq_domain_hierarchical_is_msi_remap(h)) { + ret = false; + break; + } + } + mutex_unlock(&irq_domain_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(irq_domain_check_msi_remap); + /** * irq_set_default_host() - Set a "default" irq domain * @domain: default domain pointer -- cgit v1.2.3 From 4d4f88fa235f7f9ef8213564dc1804144332238b Mon Sep 17 00:00:00 2001 From: "Paul
From 4d4f88fa235f7f9ef8213564dc1804144332238b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 5 Nov 2016 04:17:15 -0700 Subject: lockdep: Make RCU suspicious-access splats use pr_err This commit switches RCU suspicious-access splats to use pr_err() instead of the current INFO printk()s. This change makes it easier to automatically classify splats. Reported-by: Dmitry Vyukov Signed-off-by: Paul E. McKenney --- kernel/locking/lockdep.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 7c38f8f3d97b..d9a698e8458f 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4412,13 +4412,13 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) #endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ /* Note: the following can be executed concurrently, so be careful. */ printk("\n"); - printk("===============================\n"); - printk("[ INFO: suspicious RCU usage. ]\n"); + pr_err("===============================\n"); + pr_err("[ ERR: suspicious RCU usage. ]\n"); print_kernel_ident(); - printk("-------------------------------\n"); - printk("%s:%d %s!\n", file, line, s); - printk("\nother info that might help us debug this:\n\n"); - printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", + pr_err("-------------------------------\n"); + pr_err("%s:%d %s!\n", file, line, s); + pr_err("\nother info that might help us debug this:\n\n"); + pr_err("\n%srcu_scheduler_active = %d, debug_locks = %d\n", !rcu_lockdep_current_cpu_online() ? "RCU used illegally from offline CPU!\n" : !rcu_is_watching() -- cgit v1.2.3 From 907565337ebf998a68cb5c5b2174ce5e5da065eb Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 3 Nov 2016 10:29:28 -0600 Subject: Fix: Disable sys_membarrier when nohz_full is enabled Userspace applications should be allowed to expect the membarrier system call with the MEMBARRIER_CMD_SHARED command to issue memory barriers on nohz_full CPUs, but synchronize_sched() does not take those into account. Given that we do not want unrelated processes to be able to affect real-time sensitive nohz_full CPUs, simply return ENOSYS when membarrier is invoked on a kernel with enabled nohz_full CPUs. Signed-off-by: Mathieu Desnoyers CC: Josh Triplett CC: Steven Rostedt CC: [3.10+] Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Cc: Chris Metcalf Cc: Rik van Riel Acked-by: Lai Jiangshan Reviewed-by: Josh Triplett --- kernel/membarrier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/membarrier.c b/kernel/membarrier.c index 536c727a56e9..9f9284f37f8d 100644 --- a/kernel/membarrier.c +++ b/kernel/membarrier.c @@ -16,6 +16,7 @@ #include #include +#include /* * Bitmask made from a "or" of all commands within enum membarrier_cmd, @@ -51,6 +52,9 @@ */ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) { + /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */ + if (tick_nohz_full_enabled()) + return -ENOSYS; if (unlikely(flags)) return -EINVAL; switch (cmd) { -- cgit v1.2.3
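For callers, the visible effect of this change is that membarrier() can now fail with ENOSYS on a nohz_full kernel even though the syscall is compiled in, so portable code should probe before depending on it. A minimal userspace sketch using the mainline UAPI constants (the fallback policy is application-specific and only hinted at here):

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

static int membarrier(int cmd, int flags)
{
        return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
        /* QUERY returns a bitmask of supported commands; on kernels with
         * this patch and nohz_full enabled it fails with ENOSYS instead. */
        int mask = membarrier(MEMBARRIER_CMD_QUERY, 0);

        if (mask < 0) {
                perror("membarrier");   /* e.g. ENOSYS: use a fallback */
                return 1;
        }
        if (!(mask & MEMBARRIER_CMD_SHARED)) {
                fprintf(stderr, "MEMBARRIER_CMD_SHARED unsupported\n");
                return 1;
        }
        if (membarrier(MEMBARRIER_CMD_SHARED, 0) < 0) {
                perror("MEMBARRIER_CMD_SHARED");
                return 1;
        }
        puts("system-wide barrier issued");
        return 0;
}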
From c4402b27f1778fad0dbc27b2d88bb06ca22a06f0 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 9 Nov 2016 17:57:13 +0900 Subject: rcu: Only dump stalled-tasks stacks if there was a real stall The print_other_cpu_stall() function currently unconditionally invokes rcu_print_detail_task_stall(). This is OK because if there was a stall sufficient to cause print_other_cpu_stall() to be invoked, that stall is very likely to persist through the entire print_other_cpu_stall() execution. However, if the stall did not persist, the variable ndetected will be zero, and that variable is already tested in an "if" statement. Therefore, this commit moves the call to rcu_print_detail_task_stall() under that pre-existing "if" to improve readability, with a very rare reduction in overhead. Signed-off-by: Byungchul Park [ paulmck: Reworked commit log. ] Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cb4e2056ccf3..6232d2f9a84e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1379,6 +1379,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) (long)rsp->gpnum, (long)rsp->completed, totqlen); if (ndetected) { rcu_dump_cpu_stacks(rsp); + + /* Complain about tasks blocking the grace period. */ + rcu_print_detail_task_stall(rsp); } else { if (READ_ONCE(rsp->gpnum) != gpnum || READ_ONCE(rsp->completed) == gpnum) { @@ -1395,9 +1398,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) } } - /* Complain about tasks blocking the grace period. */ - rcu_print_detail_task_stall(rsp); - rcu_check_gp_kthread_starvation(rsp); panic_on_rcu_stall(); -- cgit v1.2.3 From 2535db485cad8f302196af05888721ce82528978 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 Nov 2016 11:00:44 -0800 Subject: rcu: Remove unneeded rcu_process_callbacks() declarations The declarations of __rcu_process_callbacks() and rcu_process_callbacks() are not needed, as the definitions of both of these functions appear before any uses. This commit therefore removes both declarations. Reported-by: "Ahmed, Iftekhar" Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tiny.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index b23a4d076f3d..fa6a48d3917b 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -41,8 +41,6 @@ /* Forward declarations for tiny_plugin.h. */ struct rcu_ctrlblk; -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); -static void rcu_process_callbacks(struct softirq_action *unused); static void __call_rcu(struct rcu_head *head, rcu_callback_t func, struct rcu_ctrlblk *rcp); -- cgit v1.2.3 From 94060d2235cf419f2d46dbce95727e6f826b233c Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 17 Oct 2016 11:13:07 +0200 Subject: rcu: Remove unused but set variable Since commit 7ec99de36f40 ("rcu: Provide exact CPU-online tracking for RCU"), the variable mask in rcu_init_percpu_data is set but no longer used. Remove it to fix the following warning when building with 'W=1': kernel/rcu/tree.c: In function ‘rcu_init_percpu_data’: kernel/rcu/tree.c:3765:16: warning: variable ‘mask’ set but not used [-Wunused-but-set-variable] Signed-off-by: Tobias Klauser Signed-off-by: Paul E.
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6232d2f9a84e..83bf054e194e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3765,7 +3765,6 @@ static void rcu_init_percpu_data(int cpu, struct rcu_state *rsp) { unsigned long flags; - unsigned long mask; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); @@ -3788,7 +3787,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) * of the next grace period. */ rnp = rdp->mynode; - mask = rdp->grpmask; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ if (!rdp->beenonline) WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); -- cgit v1.2.3 From 28053bc72c0e588c577557ce9e95a6cb65078471 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Dec 2016 11:31:31 -0800 Subject: rcu: Add long-term CPU kicking This commit prepares for the removal of short-term CPU kicking (in a subsequent commit). It does so by starting to invoke resched_cpu() for each holdout at each force-quiescent-state interval that is more than halfway through the stall-warning interval. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 83bf054e194e..0e61b62e3f4a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1225,6 +1225,12 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, rdp->rsp->gp_start + 2 * jiffies_till_sched_qs) || ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + jiffies_till_sched_qs)) resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ + /* + * If more than halfway to RCU CPU stall-warning time, do + * a resched_cpu() to try to loosen things up a bit. + */ + if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2) + resched_cpu(rdp->cpu); return 0; } -- cgit v1.2.3 From b201fa67371862229f27a1f022196423aa5c7381 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 Nov 2016 00:07:23 -0800 Subject: rcu: Remove short-term CPU kicking Commit 4914950aaa12d ("rcu: Stop treating in-kernel CPU-bound workloads as errors") added a (relatively) short-timeout call to resched_cpu(). This was inspired by an issue that was fixed by b7e7ade34e61 ("sched/core: Fix remote wakeups"). But given that this issue was fixed, it is time for the current commit to remove this call to resched_cpu(). Reported-by: Byungchul Park Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0e61b62e3f4a..5a4aaad75e76 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1220,11 +1220,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ } - /* And if it has been a really long time, kick the CPU as well. */ - if (ULONG_CMP_GE(jiffies, - rdp->rsp->gp_start + 2 * jiffies_till_sched_qs) || - ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + jiffies_till_sched_qs)) - resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ /* * If more than halfway to RCU CPU stall-warning time, do * a resched_cpu() to try to loosen things up a bit. -- cgit v1.2.3
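The kick that survives these two commits is just a wrap-safe comparison of the grace period's age against half the stall-warning timeout. A standalone sketch of that policy, with invented toy_* names and a plain unsigned counter standing in for jiffies:

#include <stdio.h>

static unsigned long jiffies;                /* toy clock */
static unsigned long gp_start;               /* stands in for rsp->gp_start */
static unsigned long stall_timeout = 2100;   /* rcu_jiffies_till_stall_check() stand-in */
static int kicks;

static void toy_resched_cpu(int cpu) { (void)cpu; kicks++; }

/* Once the grace period is more than halfway to the stall warning,
 * kick the holdout CPU on every force-quiescent-state scan. Unsigned
 * subtraction keeps the test correct across counter wrap. */
static void toy_fqs_scan(int holdout_cpu)
{
        if (jiffies - gp_start > stall_timeout / 2)
                toy_resched_cpu(holdout_cpu);
}

int main(void)
{
        for (jiffies = 0; jiffies < stall_timeout; jiffies += 100)
                toy_fqs_scan(3);
        printf("kicked %d times\n", kicks); /* 10: only scans past the halfway mark */
        return 0;
}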
From 7aa92230c9e86b2150f718185f70e0af592e290b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Nov 2016 05:49:06 -0800 Subject: rcu: Once again use NMI-based stack traces in stall warnings This commit is for all intents and purposes a revert of bc1dce514e9b ("rcu: Don't use NMIs to dump other CPUs' stacks"). The reason to suppose that this can now safely be reverted is the presence of 42a0bb3f7138 ("printk/nmi: generic solution for safe printk in NMI"), which is said to have made NMI-based stack dumps safe. However, this reversion keeps one nice property of bc1dce514e9b ("rcu: Don't use NMIs to dump other CPUs' stacks"), namely that only those CPUs blocking the grace period are dumped. The new trigger_single_cpu_backtrace() is used to make this happen, as suggested by Josh Poimboeuf. Reported-by: Vince Weaver Signed-off-by: Paul E. McKenney Cc: Petr Mladek Cc: Peter Zijlstra Reviewed-by: Josh Poimboeuf Reviewed-by: Petr Mladek Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5a4aaad75e76..d7b63b88434b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1278,7 +1278,10 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) } /* - * Dump stacks of all tasks running on stalled CPUs. + * Dump stacks of all tasks running on stalled CPUs. First try using + * NMIs, but fall back to manual remote stack tracing on architectures + * that don't support NMI-based stack dumps. The NMI-triggered stack + * traces are more accurate because they are printed by the target CPU. */ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) { @@ -1288,11 +1291,10 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (rnp->qsmask != 0) { - for_each_leaf_node_possible_cpu(rnp, cpu) - if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) + if (!trigger_single_cpu_backtrace(cpu)) dump_cpu_task(cpu); - } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } -- cgit v1.2.3 From 630c7ed9ca0608912fa7c8591d05dfc8742dc9e6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 Dec 2016 15:37:47 -0800 Subject: rcu: Don't wake rcuc/X kthreads on NOCB CPUs Chris Friesen noticed that rcuc/X kthreads were consuming CPU even on NOCB CPUs. This makes no sense because the only purpose of these kthreads is to invoke normal (non-offloaded) callbacks, of which there will never be any on NOCB CPUs. This problem was due to a bug in cpu_has_callbacks_ready_to_invoke(), which should have been checking ->nxttail[RCU_NEXT_TAIL] for NULL, but which was instead (incorrectly) checking ->nxttail[RCU_DONE_TAIL]. Because ->nxttail[RCU_DONE_TAIL] is never NULL, the only effect is to cause the rcuc/X kthread to execute when it should not do so. This commit therefore checks ->nxttail[RCU_NEXT_TAIL], which is NULL for NOCB CPUs. Reported-by: Chris Friesen Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d7b63b88434b..be2301238a23 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -611,7 +611,7 @@ static int cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) { return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] && - rdp->nxttail[RCU_DONE_TAIL] != NULL; + rdp->nxttail[RCU_NEXT_TAIL] != NULL; } /* -- cgit v1.2.3
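The bug is easiest to see against a miniature version of the old segmented callback list, in which ->nxttail[] holds pointers into a singly linked list and callback offloading is marked by a NULL RCU_NEXT_TAIL. A hedged userspace sketch follows (the toy_* types are invented for illustration and are not the kernel's actual structures):

#include <stdbool.h>
#include <stdio.h>

enum { RCU_DONE_TAIL, RCU_WAIT_TAIL, RCU_NEXT_READY_TAIL, RCU_NEXT_TAIL, RCU_NEXT_SIZE };

struct toy_cb { struct toy_cb *next; };

struct toy_rdp {
        struct toy_cb *nxtlist;                 /* head of the callback list */
        struct toy_cb **nxttail[RCU_NEXT_SIZE]; /* per-segment tail pointers */
};

/* NOCB (offloaded) CPUs are marked by a NULL RCU_NEXT_TAIL... */
static void toy_mark_offloaded(struct toy_rdp *rdp)
{
        rdp->nxttail[RCU_NEXT_TAIL] = NULL;
}

/* ...so readiness must test RCU_NEXT_TAIL: RCU_DONE_TAIL always points
 * somewhere into the list and is never NULL, which is exactly why the
 * old check fired spuriously on NOCB CPUs. */
static bool toy_has_ready_cbs(struct toy_rdp *rdp)
{
        return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] &&
               rdp->nxttail[RCU_NEXT_TAIL] != NULL;
}

int main(void)
{
        struct toy_rdp rdp = { .nxtlist = NULL };
        struct toy_cb cb = { .next = NULL };
        int i;

        for (i = 0; i < RCU_NEXT_SIZE; i++)
                rdp.nxttail[i] = &rdp.nxtlist;  /* empty list */

        rdp.nxtlist = &cb;                      /* enqueue one callback... */
        for (i = 0; i < RCU_NEXT_SIZE; i++)
                rdp.nxttail[i] = &cb.next;      /* ...whose grace period ended */

        printf("ready: %d\n", toy_has_ready_cbs(&rdp));              /* 1 */
        toy_mark_offloaded(&rdp);
        printf("ready if offloaded: %d\n", toy_has_ready_cbs(&rdp)); /* 0 */
        return 0;
}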
From 09e2db37ec6d22bc27e2ba94ab4889541567f52e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 18 Dec 2016 13:31:02 -0800 Subject: rcu: Add comment headers to expedited-grace-period counter functions These functions (rcu_exp_gp_seq_start(), rcu_exp_gp_seq_end(), rcu_exp_gp_seq_snap(), and rcu_exp_gp_seq_done()) seemed too obvious to comment when written, but not so much when being documented. This commit therefore adds header comments to each of them. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree_exp.h | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index e59e1849b89a..303df97bbfc5 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -20,16 +20,26 @@ * Authors: Paul E. McKenney */ -/* Wrapper functions for expedited grace periods. */ +/* + * Record the start of an expedited grace period. + */ static void rcu_exp_gp_seq_start(struct rcu_state *rsp) { rcu_seq_start(&rsp->expedited_sequence); } + +/* + * Record the end of an expedited grace period. + */ static void rcu_exp_gp_seq_end(struct rcu_state *rsp) { rcu_seq_end(&rsp->expedited_sequence); smp_mb(); /* Ensure that consecutive grace periods serialize. */ } + +/* + * Take a snapshot of the expedited-grace-period counter. + */ static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) { unsigned long s; @@ -39,6 +49,12 @@ static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); return s; } + +/* + * Given a counter snapshot from rcu_exp_gp_seq_snap(), return true + * if a full expedited grace period has elapsed since that snapshot + * was taken. + */ static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) { return rcu_seq_done(&rsp->expedited_sequence, s); -- cgit v1.2.3
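The protocol these new comments describe fits in a few lines: the counter's low bit means "expedited grace period in flight", a snapshot rounds up to the counter value that only a full post-snapshot grace period can reach, and "done" is a comparison. A standalone sketch of the same arithmetic (a single-threaded toy; the kernel uses the wrap-safe ULONG_CMP_GE() rather than a plain >=):

#include <stdbool.h>
#include <stdio.h>

static unsigned long exp_seq;   /* even: idle, odd: GP in flight */

static void toy_gp_seq_start(void) { exp_seq++; /* now odd */ }
static void toy_gp_seq_end(void)   { exp_seq++; /* now even */ }

/* Counter value at which a full grace period beginning after this
 * snapshot has completed; the "+ 3" covers a GP already in flight. */
static unsigned long toy_gp_seq_snap(void)
{
        return (exp_seq + 3) & ~0x1UL;
}

static bool toy_gp_seq_done(unsigned long s)
{
        return exp_seq >= s;
}

int main(void)
{
        unsigned long s = toy_gp_seq_snap();

        printf("done before GP: %d\n", toy_gp_seq_done(s)); /* 0 */
        toy_gp_seq_start();
        printf("done mid-GP: %d\n", toy_gp_seq_done(s));    /* 0 */
        toy_gp_seq_end();
        printf("done after GP: %d\n", toy_gp_seq_done(s));  /* 1 */
        return 0;
}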
From fdbb9b315ce40922f3a8d2b8352776d7bc963d84 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 20 Dec 2016 07:17:58 -0800 Subject: rcu: Make rcu_cpu_starting() use its "cpu" argument The rcu_cpu_starting() function uses this_cpu_ptr() to locate the incoming CPU's rcu_data structure. This works for the boot CPU and for all CPUs onlined after rcu_init() executes (during very early boot). Currently, this is the full set of CPUs, so all is well. But if anyone ever parallelizes boot before rcu_init() time, it will fail. This commit therefore substitutes per_cpu_ptr() for the rcu_cpu_starting() function's this_cpu_ptr(), future-proofing the code and (arguably) improving readability. This commit inadvertently fixes a latent bug: If there ever had been more than just the boot CPU online at rcu_init() time, the old code would not initialize the non-boot CPUs, but rather would repeatedly initialize the boot CPU. Reported-by: Boqun Feng Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index be2301238a23..a4b4762442bb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3873,7 +3873,7 @@ void rcu_cpu_starting(unsigned int cpu) struct rcu_state *rsp; for_each_rcu_flavor(rsp) { - rdp = this_cpu_ptr(rsp->rda); + rdp = per_cpu_ptr(rsp->rda, cpu); rnp = rdp->mynode; mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); -- cgit v1.2.3 From 9831ce3bb4f8b8f18f6e9719b64ba4e727d70fb3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Jan 2017 14:24:24 -0800 Subject: rcu: Fix comment in rcu_organize_nocb_kthreads() It used to be that the rcuo callback-offload kthreads were spawned in rcu_organize_nocb_kthreads(), and the comment before the "for" loop says as much. However, this spawning has long since moved to the CPU-hotplug code, so this commit fixes this comment. Reported-by: Michalis Kokologiannakis Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree_plugin.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 56583e764ebf..2f5541f6f031 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2366,8 +2366,9 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp) } /* - * Each pass through this loop sets up one rcu_data structure and - * spawns one rcu_nocb_kthread(). + * Each pass through this loop sets up one rcu_data structure. + * Should the corresponding CPU come online in the future, then + * we will spawn the needed set of rcu_nocb_kthread() kthreads. */ for_each_cpu(cpu, rcu_nocb_mask) { rdp = per_cpu_ptr(rsp->rda, cpu); -- cgit v1.2.3 From bb4e2c08bbfa8eb032db7814f6100086aac102d3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 6 Jan 2017 22:04:22 -0800 Subject: rcu: Eliminate unused expedited_normal counter Expedited grace periods no longer fall back to normal grace periods in response to lock contention, given that expedited grace periods now use the rcu_node tree so as to avoid contention. This commit therefore removes the expedited_normal counter. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- Documentation/RCU/trace.txt | 5 +---- kernel/rcu/tree.h | 1 - kernel/rcu/tree_trace.c | 3 +-- 3 files changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index 00a3a38b375a..6549012033f9 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -237,7 +237,7 @@ o "ktl" is the low-order 16 bits (in hexadecimal) of the count of The output of "cat rcu/rcu_preempt/rcuexp" looks as follows: -s=21872 wd1=0 wd2=0 wd3=5 n=0 enq=0 sc=21872 +s=21872 wd1=0 wd2=0 wd3=5 enq=0 sc=21872 These fields are as follows: @@ -249,9 +249,6 @@ o "wd1", "wd2", and "wd3" are the number of times that an attempt completed an expedited grace period that satisfies the attempted request. "Our work is done." -o "n" is number of times that a concurrent CPU-hotplug operation - forced a fallback to a normal grace period. - o "enq" is the number of quiescent states still outstanding.
o "sc" is the number of times that the attempt to start a diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index fe98dd24adf8..8f750dffb0dd 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -521,7 +521,6 @@ struct rcu_state { struct mutex exp_mutex; /* Serialize expedited GP. */ struct mutex exp_wake_mutex; /* Serialize wakeup. */ unsigned long expedited_sequence; /* Take a ticket. */ - atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ struct swait_queue_head expedited_wq; /* Wait for check-ins. */ int ncpus_snap; /* # CPUs seen last time. */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index b1f28972872c..2e932cd1da31 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -194,9 +194,8 @@ static int show_rcuexp(struct seq_file *m, void *v) s2 += atomic_long_read(&rdp->exp_workdone2); s3 += atomic_long_read(&rdp->exp_workdone3); } - seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", + seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu enq=%d sc=%lu\n", rsp->expedited_sequence, s0, s1, s2, s3, - atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); return 0; -- cgit v1.2.3 From 8dc79888a792f6c365c2a26903e49ff919e72488 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Jan 2017 15:25:55 -0800 Subject: rcu: Add lockdep checks to synchronous expedited primitives The non-expedited synchronize_*rcu() primitives have lockdep checks, but their expedited counterparts lack these checks. This commit therefore adds these checks to the expedited synchronize_*rcu() primitives. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree_exp.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 303df97bbfc5..f3e214898e3a 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -639,6 +639,11 @@ void synchronize_sched_expedited(void) { struct rcu_state *rsp = &rcu_sched_state; + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_sched_expedited() in RCU read-side critical section"); + /* If only one CPU, this is automatically a grace period. */ if (rcu_blocking_is_gp()) return; @@ -708,6 +713,11 @@ void synchronize_rcu_expedited(void) { struct rcu_state *rsp = rcu_state_p; + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); + if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return; _synchronize_rcu_expedited(rsp, sync_rcu_exp_handler); -- cgit v1.2.3 From 2625d469baeef3aabdfe122572e00c517e2d9451 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Nov 2016 14:23:30 -0700 Subject: rcu: Abstract dynticks extended quiescent state enter/exit operations This commit is the third step towards full abstraction of all accesses to the ->dynticks counter, implementing the previously open-coded atomic add of 1 and entry checks in a new rcu_dynticks_eqs_enter() function, and the same but with exit checks in a new rcu_dynticks_eqs_exit() function. This abstraction will ease changes to the ->dynticks counter operation. Note that this commit gets rid of the smp_mb__before_atomic() and the smp_mb__after_atomic() calls that were previously present. 
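The parity protocol behind these helpers can be modeled outside the kernel in a few lines. In the sketch below (standalone C11; the toy_* names are invented, and a seq_cst fetch-add stands in for atomic_inc_return()), an odd counter means the CPU is watching and an even counter means it is in an extended quiescent state; the commit's justification for dropping the explicit barrier calls follows right after the sketch.

#include <stdatomic.h>
#include <stdbool.h>
#include <assert.h>

static _Atomic int dynticks = 1;   /* boot: watching, so odd */

/* Enter an EQS; the value-returning RMW orders prior accesses against
 * the transition, as atomic_inc_return() does in the kernel. */
static void toy_eqs_enter(void)
{
        int special = atomic_fetch_add(&dynticks, 1) + 1;
        assert(!(special & 0x1));      /* must now be even */
}

static void toy_eqs_exit(void)
{
        int special = atomic_fetch_add(&dynticks, 1) + 1;
        assert(special & 0x1);         /* must now be odd */
}

/* Sampling side; the equivalent kernel checks are abstracted by the
 * follow-up patches below. */
static int  toy_snap(void)             { return atomic_load(&dynticks); }
static bool toy_in_eqs(int snap)       { return !(snap & 0x1); }
static bool toy_in_eqs_since(int snap) { return toy_snap() != snap; }

int main(void)
{
        int s = toy_snap();

        assert(!toy_in_eqs(s));        /* watching */
        toy_eqs_enter();               /* e.g. going idle */
        assert(toy_in_eqs(toy_snap()));
        toy_eqs_exit();
        assert(toy_in_eqs_since(s));   /* an EQS happened in between */
        return 0;
}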
The reason that this is OK from a memory-ordering perspective is that the atomic operation is now atomic_add_return(), which, as a value-returning atomic, guarantees full ordering. Signed-off-by: Paul E. McKenney [ paulmck: Fixed RCU_TRACE() statements added by this commit. ] Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 88 +++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 62 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 805d55ee0b2a..3169d5a21b55 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -281,6 +281,61 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; +/* + * Record entry into an extended quiescent state. This is only to be + * called when not already in an extended quiescent state. + */ +static void rcu_dynticks_eqs_enter(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + int special; + + /* + * CPUs seeing atomic_inc_return() must see prior RCU read-side + * critical sections, and we also must force ordering with the + * next idle sojourn. + */ + special = atomic_inc_return(&rdtp->dynticks); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && special & 0x1); +} + +/* + * Record exit from an extended quiescent state. This is only to be + * called from an extended quiescent state. + */ +static void rcu_dynticks_eqs_exit(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + int special; + + /* + * CPUs seeing atomic_inc_return() must see prior idle sojourns, + * and we also must force ordering with the next RCU read-side + * critical section. + */ + special = atomic_inc_return(&rdtp->dynticks); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(special & 0x1)); +} + +/* + * Reset the current CPU's ->dynticks counter to indicate that the + * newly onlined CPU is no longer in an extended quiescent state. + * This will either leave the counter unchanged, or increment it + * to the next non-quiescent value. + * + * The non-atomic test/increment sequence works because the upper bits + * of the ->dynticks counter are manipulated only by the corresponding CPU, + * or when the corresponding CPU is offline. + */ +static void rcu_dynticks_eqs_online(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + + if (atomic_read(&rdtp->dynticks) & 0x1) + return; + atomic_add(0x1, &rdtp->dynticks); +} + /* * Snapshot the ->dynticks counter with full ordering so as to allow * stable comparison of this counter with past and future snapshots. @@ -693,7 +748,7 @@ static void rcu_eqs_enter_common(long long oldval, bool user) { struct rcu_state *rsp; struct rcu_data *rdp; - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);) trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && @@ -712,12 +767,7 @@ static void rcu_eqs_enter_common(long long oldval, bool user) do_nocb_deferred_wakeup(rdp); } rcu_prepare_for_idle(); - /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ - smp_mb__before_atomic(); /* See above. */ - atomic_inc(&rdtp->dynticks); - smp_mb__after_atomic(); /* Force ordering with next sojourn. 
*/ - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - atomic_read(&rdtp->dynticks) & 0x1); + rcu_dynticks_eqs_enter(); rcu_dynticks_task_enter(); /* @@ -846,15 +896,10 @@ void rcu_irq_exit_irqson(void) */ static void rcu_eqs_exit_common(long long oldval, int user) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);) rcu_dynticks_task_exit(); - smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ - atomic_inc(&rdtp->dynticks); - /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ - smp_mb__after_atomic(); /* See above. */ - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - !(atomic_read(&rdtp->dynticks) & 0x1)); + rcu_dynticks_eqs_exit(); rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && @@ -1001,11 +1046,7 @@ void rcu_nmi_enter(void) * period (observation due to Andy Lutomirski). */ if (!(atomic_read(&rdtp->dynticks) & 0x1)) { - smp_mb__before_atomic(); /* Force delay from prior write. */ - atomic_inc(&rdtp->dynticks); - /* atomic_inc() before later RCU read-side crit sects */ - smp_mb__after_atomic(); /* See above. */ - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + rcu_dynticks_eqs_exit(); incby = 1; } rdtp->dynticks_nmi_nesting += incby; @@ -1043,11 +1084,7 @@ void rcu_nmi_exit(void) /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ rdtp->dynticks_nmi_nesting = 0; - /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ - smp_mb__before_atomic(); /* See above. */ - atomic_inc(&rdtp->dynticks); - smp_mb__after_atomic(); /* Force delay to next write. */ - WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); + rcu_dynticks_eqs_enter(); } /** @@ -3800,8 +3837,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; rcu_sysidle_init_percpu_data(rdp->dynticks); - atomic_set(&rdp->dynticks->dynticks, - (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); + rcu_dynticks_eqs_online(); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ /* -- cgit v1.2.3 From 02a5c550b2738f2bfea8e1e00aa75944d71c9e18 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Nov 2016 17:25:06 -0700 Subject: rcu: Abstract extended quiescent state determination This commit is the fourth step towards full abstraction of all accesses to the ->dynticks counter, implementing previously open-coded checks and comparisons in new rcu_dynticks_in_eqs() and rcu_dynticks_in_eqs_since() functions. This abstraction will ease changes to the ->dynticks counter operation. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- include/linux/rcutiny.h | 6 ++++++ kernel/rcu/tree.c | 52 +++++++++++++++++++++++++++++++++++------------- kernel/rcu/tree.h | 2 ++ kernel/rcu/tree_exp.h | 6 +++--- kernel/rcu/tree_plugin.h | 2 +- kernel/rcu/tree_trace.c | 2 +- 6 files changed, 51 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index ac81e4063b40..4f9b2fa2173d 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -27,6 +27,12 @@ #include +struct rcu_dynticks; +static inline int rcu_dynticks_snap(struct rcu_dynticks *rdtp) +{ + return 0; +} + static inline unsigned long get_state_synchronize_rcu(void) { return 0; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3169d5a21b55..8b970319c75b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -336,17 +336,48 @@ static void rcu_dynticks_eqs_online(void) atomic_add(0x1, &rdtp->dynticks); } +/* + * Is the current CPU in an extended quiescent state? + * + * No ordering, as we are sampling CPU-local information. + */ +bool rcu_dynticks_curr_cpu_in_eqs(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + + return !(atomic_read(&rdtp->dynticks) & 0x1); +} + /* * Snapshot the ->dynticks counter with full ordering so as to allow * stable comparison of this counter with past and future snapshots. */ -static int rcu_dynticks_snap(struct rcu_dynticks *rdtp) +int rcu_dynticks_snap(struct rcu_dynticks *rdtp) { int snap = atomic_add_return(0, &rdtp->dynticks); return snap; } +/* + * Return true if the snapshot returned from rcu_dynticks_snap() + * indicates that RCU is in an extended quiescent state. + */ +static bool rcu_dynticks_in_eqs(int snap) +{ + return !(snap & 0x1); +} + +/* + * Return true if the CPU corresponding to the specified rcu_dynticks + * structure has spent some time in an extended quiescent state since + * rcu_dynticks_snap() returned the specified snapshot. + */ +static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap) +{ + return snap != rcu_dynticks_snap(rdtp); +} + /* * Do a double-increment of the ->dynticks counter to emulate a * momentary idle-CPU quiescent state. @@ -1045,7 +1076,7 @@ void rcu_nmi_enter(void) * to be in the outermost NMI handler that interrupted an RCU-idle * period (observation due to Andy Lutomirski). */ - if (!(atomic_read(&rdtp->dynticks) & 0x1)) { + if (rcu_dynticks_curr_cpu_in_eqs()) { rcu_dynticks_eqs_exit(); incby = 1; } @@ -1071,7 +1102,7 @@ void rcu_nmi_exit(void) * to us!) 
*/ WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0); - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); /* * If the nesting level is not 1, the CPU wasn't RCU-idle, so @@ -1097,9 +1128,7 @@ void rcu_nmi_exit(void) */ bool notrace __rcu_is_watching(void) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - - return atomic_read(&rdtp->dynticks) & 0x1; + return !rcu_dynticks_curr_cpu_in_eqs(); } /** @@ -1184,7 +1213,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, { rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); rcu_sysidle_check_cpu(rdp, isidle, maxj); - if ((rdp->dynticks_snap & 0x1) == 0) { + if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rdp->mynode->gpnum)) @@ -1203,12 +1232,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { - unsigned int curr; int *rcrmp; - unsigned int snap; - - curr = (unsigned int)rcu_dynticks_snap(rdp->dynticks); - snap = (unsigned int)rdp->dynticks_snap; /* * If the CPU passed through or entered a dynticks idle phase with @@ -1218,7 +1242,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * read-side critical section that started before the beginning * of the current RCU grace period. */ - if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { + if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); rdp->dynticks_fqs++; return 1; @@ -3807,7 +3831,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); - WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); + WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp->dynticks))); rdp->cpu = cpu; rdp->rsp = rsp; rcu_boot_init_nocb_percpu_data(rdp); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index fe98dd24adf8..3b953dcf6afc 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -595,6 +595,8 @@ extern struct rcu_state rcu_bh_state; extern struct rcu_state rcu_preempt_state; #endif /* #ifdef CONFIG_PREEMPT_RCU */ +int rcu_dynticks_snap(struct rcu_dynticks *rdtp); + #ifdef CONFIG_RCU_BOOST DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 011f626b2fd8..e155a465cf84 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -360,7 +360,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, rdp->exp_dynticks_snap = rcu_dynticks_snap(rdp->dynticks); if (raw_smp_processor_id() == cpu || - !(rdp->exp_dynticks_snap & 0x1) || + rcu_dynticks_in_eqs(rdp->exp_dynticks_snap) || !(rnp->qsmaskinitnext & rdp->grpmask)) mask_ofl_test |= rdp->grpmask; } @@ -383,8 +383,8 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, if (!(mask_ofl_ipi & mask)) continue; retry_ipi: - if (rcu_dynticks_snap(rdp->dynticks) != - rdp->exp_dynticks_snap) { + if (rcu_dynticks_in_eqs_since(rdp->dynticks, + rdp->exp_dynticks_snap)) { mask_ofl_test |= mask; continue; } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 56583e764ebf..652209589adf 100644 --- a/kernel/rcu/tree_plugin.h +++ 
b/kernel/rcu/tree_plugin.h @@ -1643,7 +1643,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], ticks_value, ticks_title, - atomic_read(&rdtp->dynticks) & 0xfff, + rcu_dynticks_snap(rdtp) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), READ_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart, diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index b1f28972872c..b833cd0a29e8 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -124,7 +124,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), rdp->core_needs_qs); seq_printf(m, " dt=%d/%llx/%d df=%lu", - atomic_read(&rdp->dynticks->dynticks), + rcu_dynticks_snap(rdp->dynticks), rdp->dynticks->dynticks_nesting, rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); -- cgit v1.2.3 From 3a19b46a5c17b12ef0691df19c676ba3da330a57 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Nov 2016 11:21:21 -0800 Subject: rcu: Check cond_resched_rcu_qs() state less often to reduce GP overhead Commit 4a81e8328d37 ("rcu: Reduce overhead of cond_resched() checks for RCU") moved quiescent-state generation out of cond_resched() and commit bde6c3aa9930 ("rcu: Provide cond_resched_rcu_qs() to force quiescent states in long loops") introduced cond_resched_rcu_qs(), and commit 5cd37193ce85 ("rcu: Make cond_resched_rcu_qs() apply to normal RCU flavors") introduced the per-CPU rcu_qs_ctr variable, which is frequently polled by the RCU core state machine. This frequent polling can increase grace-period rate, which in turn increases grace-period overhead, which is visible in some benchmarks (for example, the "open1" benchmark in Anton Blanchard's "will it scale" suite). This commit therefore reduces the rate at which rcu_qs_ctr is polled by moving that polling into the force-quiescent-state (FQS) machinery, and by further polling it only after the grace period has been in effect for at least jiffies_till_sched_qs jiffies. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/trace/events/rcu.h | 10 +++++----- kernel/rcu/tree.c | 46 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 39 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 9d4f9b3a2b7b..e3facb356838 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -385,11 +385,11 @@ TRACE_EVENT(rcu_quiescent_state_report, /* * Tracepoint for quiescent states detected by force_quiescent_state(). - * These trace events include the type of RCU, the grace-period number - * that was blocked by the CPU, the CPU itself, and the type of quiescent - * state, which can be "dti" for dyntick-idle mode, "ofl" for CPU offline, - * or "kick" when kicking a CPU that has been in dyntick-idle mode for - * too long. + * These trace events include the type of RCU, the grace-period number that + * was blocked by the CPU, the CPU itself, and the type of quiescent state, + * which can be "dti" for dyntick-idle mode, "ofl" for CPU offline, "kick" + * when kicking a CPU that has been in dyntick-idle mode for too long, or + * "rqc" if the CPU got a quiescent state via its rcu_qs_ctr. 
*/ TRACE_EVENT(rcu_fqs, diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8b970319c75b..d8245cbd08f9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1232,7 +1232,10 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { + unsigned long jtsq; int *rcrmp; + unsigned long rjtsc; + struct rcu_node *rnp; /* * If the CPU passed through or entered a dynticks idle phase with @@ -1248,6 +1251,31 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, return 1; } + /* Compute and saturate jiffies_till_sched_qs. */ + jtsq = jiffies_till_sched_qs; + rjtsc = rcu_jiffies_till_stall_check(); + if (jtsq > rjtsc / 2) { + WRITE_ONCE(jiffies_till_sched_qs, rjtsc); + jtsq = rjtsc / 2; + } else if (jtsq < 1) { + WRITE_ONCE(jiffies_till_sched_qs, 1); + jtsq = 1; + } + + /* + * Has this CPU encountered a cond_resched_rcu_qs() since the + * beginning of the grace period? For this to be the case, + * the CPU has to have noticed the current grace period. This + * might not be the case for nohz_full CPUs looping in the kernel. + */ + rnp = rdp->mynode; + if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && + READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_qs_ctr, rdp->cpu) && + READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); + return 1; + } + /* * Check for the CPU being offline, but only if the grace period * is old enough. We don't need to worry about the CPU changing @@ -1290,9 +1318,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * warning delay. */ rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu); - if (ULONG_CMP_GE(jiffies, - rdp->rsp->gp_start + jiffies_till_sched_qs) || - ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { + if (time_after(jiffies, rdp->rsp->gp_start + jtsq) || + time_after(jiffies, rdp->rsp->jiffies_resched)) { if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { WRITE_ONCE(rdp->cond_resched_completed, READ_ONCE(rdp->mynode->completed)); @@ -2550,10 +2577,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); - if ((rdp->cpu_no_qs.b.norm && - rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || - rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || - rdp->gpwrap) { + if (rdp->cpu_no_qs.b.norm || rdp->gpnum != rnp->gpnum || + rnp->completed == rnp->gpnum || rdp->gpwrap) { /* * The grace period in which this quiescent state was @@ -2608,8 +2633,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Was there a quiescent state since the beginning of the grace * period? If no, then exit and wait for the next call. */ - if (rdp->cpu_no_qs.b.norm && - rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) + if (rdp->cpu_no_qs.b.norm) return; /* @@ -3563,9 +3587,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { rdp->n_rp_core_needs_qs++; - } else if (rdp->core_needs_qs && - (!rdp->cpu_no_qs.b.norm || - rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { + } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { rdp->n_rp_report_qs++; return 1; } -- cgit v1.2.3 From 38d30b336ccf8ee98e0e494a13738a0fade5a5e6 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 1 Dec 2016 09:55:27 -0800 Subject: rcu: Adjust FQS offline checks for exact online-CPU detection Commit 7ec99de36f40 ("rcu: Provide exact CPU-online tracking for RCU"), as its title suggests, got rid of RCU's remaining CPU-hotplug timing guesswork. This commit therefore removes the one-jiffy kludge that was used to paper over this guesswork. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d8245cbd08f9..6e4b1dcebeb2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1276,21 +1276,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, return 1; } - /* - * Check for the CPU being offline, but only if the grace period - * is old enough. We don't need to worry about the CPU changing - * state: If we see it offline even once, it has been through a - * quiescent state. - * - * The reason for insisting that the grace period be at least - * one jiffy old is that CPUs that are not quite online and that - * have just gone offline can still execute RCU read-side critical - * sections. - */ - if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies)) - return 0; /* Grace period is not old enough. */ - barrier(); - if (cpu_is_offline(rdp->cpu)) { + /* Check for the CPU being offline. */ + if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp))) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl")); rdp->offline_fqs++; return 1; -- cgit v1.2.3
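With exact online tracking, "is this CPU offline?" reduces to a single bitmask test against the leaf node's online mask. A standalone sketch of that idea (the toy_* names are invented, and one unsigned long stands in for the per-leaf ->qsmaskinitnext):

#include <stdbool.h>
#include <stdio.h>

static unsigned long online_mask;   /* one bit per CPU, exact at all times */

static unsigned long cpu_bit(int cpu) { return 1UL << cpu; }

static void toy_cpu_online(int cpu)  { online_mask |=  cpu_bit(cpu); }
static void toy_cpu_offline(int cpu) { online_mask &= ~cpu_bit(cpu); }

/* No "grace period old enough" guard is needed: the mask is kept in
 * lockstep with hotplug, so a clear bit really means offline. */
static bool toy_fqs_cpu_offline(int cpu)
{
        return !(cpu_bit(cpu) & online_mask);
}

int main(void)
{
        toy_cpu_online(3);
        printf("cpu3 offline? %d\n", toy_fqs_cpu_offline(3)); /* 0 */
        toy_cpu_offline(3);
        printf("cpu3 offline? %d\n", toy_fqs_cpu_offline(3)); /* 1 */
        return 0;
}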
From b95a5c4db09bc7c253636cb84dc9b12c577fd5a0 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Sat, 21 Jan 2017 17:26:11 +0100 Subject: bpf: add a longest prefix match trie map implementation This trie implements a longest prefix match algorithm that can be used to match IP addresses to a stored set of ranges. Internally, data is stored in an unbalanced trie of nodes that has a maximum height of n, where n is the prefixlen the trie was created with. Tries may be created with prefix lengths that are multiples of 8, in the range from 8 to 2048. The key used for lookup and update operations is a struct bpf_lpm_trie_key, and the value is a uint64_t. The code carries more information about the internal implementation. Signed-off-by: Daniel Mack Reviewed-by: David Herrmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 7 + kernel/bpf/Makefile | 2 +- kernel/bpf/lpm_trie.c | 503 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 511 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/lpm_trie.c (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 54a5894bb4ea..bd3068485410 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -63,6 +63,12 @@ struct bpf_insn { __s32 imm; /* signed immediate constant */ }; +/* Key of a BPF_MAP_TYPE_LPM_TRIE entry */ +struct bpf_lpm_trie_key { + __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ + __u8 data[0]; /* Arbitrary size */ +}; + /* BPF syscall commands, see bpf(2) man-page for details. */ enum bpf_cmd { BPF_MAP_CREATE, @@ -89,6 +95,7 @@ enum bpf_map_type { BPF_MAP_TYPE_CGROUP_ARRAY, BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, + BPF_MAP_TYPE_LPM_TRIE, }; enum bpf_prog_type { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 1276474ac3cd..e1ce4f4fd7fd 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,7 +1,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c new file mode 100644 index 000000000000..ba19241d1979 --- /dev/null +++ b/kernel/bpf/lpm_trie.c @@ -0,0 +1,503 @@ +/* + * Longest prefix match list implementation + * + * Copyright (c) 2016,2017 Daniel Mack + * Copyright (c) 2016 David Herrmann + * + * This file is subject to the terms and conditions of version 2 of the GNU + * General Public License. See the file COPYING in the main directory of the + * Linux distribution for more details. + */ + +#include +#include +#include +#include +#include +#include + +/* Intermediate node */ +#define LPM_TREE_NODE_FLAG_IM BIT(0) + +struct lpm_trie_node; + +struct lpm_trie_node { + struct rcu_head rcu; + struct lpm_trie_node __rcu *child[2]; + u32 prefixlen; + u32 flags; + u8 data[0]; +}; + +struct lpm_trie { + struct bpf_map map; + struct lpm_trie_node __rcu *root; + size_t n_entries; + size_t max_prefixlen; + size_t data_size; + raw_spinlock_t lock; +}; + +/* This trie implements a longest prefix match algorithm that can be used to + * match IP addresses to a stored set of ranges. + * + * Data stored in @data of struct bpf_lpm_trie_key and struct lpm_trie_node is + * interpreted as big endian, so data[0] stores the most significant byte. + * + * Match ranges are internally stored in instances of struct lpm_trie_node + * which each contain their prefix length as well as two pointers that may + * lead to more nodes containing more specific matches. Each node also stores + * a value that is defined by and returned to userspace via the update_elem + * and lookup functions. + * + * For instance, let's start with a trie that was created with a prefix length + * of 32, so it can be used for IPv4 addresses, and one single element that + * matches 192.168.0.0/16. The data array would hence contain + * [0xc0, 0xa8, 0x00, 0x00] in big-endian notation. This documentation will + * stick to IP-address notation for readability though. + * + * As the trie is empty initially, the new node (1) will be placed as the root + * node, denoted as (R) in the example below. As there are no other nodes, both + * child pointers are %NULL. + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * + * Next, let's add a new node (2) matching 192.168.0.0/24. As there is already + * a node with the same data and a smaller prefix (ie, a less specific one), + * node (2) will become a child of (1).
Its child index depends on the next bit + * that is outside of what (1) matches, and that bit is 0, so (2) will be + * child[0] of (1): + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * | + * +----------------+ + * | (2) | + * | 192.168.0.0/24 | + * | value: 2 | + * | [0] [1] | + * +----------------+ + * + * The child[1] slot of (1) could be filled with another node which has bit #17 + * (the next bit after the ones that (1) matches on) set to 1. For instance, + * 192.168.128.0/24: + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * | | + * +----------------+ +------------------+ + * | (2) | | (3) | + * | 192.168.0.0/24 | | 192.168.128.0/24 | + * | value: 2 | | value: 3 | + * | [0] [1] | | [0] [1] | + * +----------------+ +------------------+ + * + * Let's add another node (4) to the game for 192.168.1.0/24. In order to place + * it, node (1) is looked at first, and because of the semantics laid out + * above (bit #17 is 0), (4) would normally be attached to (1) as child[0]. + * However, that slot is already allocated, so a new node is needed in between. + * That node does not have a value attached to it and it will never be + * returned to users as result of a lookup. It is only there to differentiate + * the traversal further. It will get a prefix as wide as necessary to + * distinguish its two children: + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * | | + * +----------------+ +------------------+ + * | (4) (I) | | (3) | + * | 192.168.0.0/23 | | 192.168.128.0/24 | + * | value: --- | | value: 3 | + * | [0] [1] | | [0] [1] | + * +----------------+ +------------------+ + * | | + * +----------------+ +----------------+ + * | (2) | | (5) | + * | 192.168.0.0/24 | | 192.168.1.0/24 | + * | value: 2 | | value: 5 | + * | [0] [1] | | [0] [1] | + * +----------------+ +----------------+ + * + * 192.168.1.1/32 would be a child of (5) etc. + * + * An intermediate node will be turned into a 'real' node on demand. In the + * example above, (4) would be re-used if 192.168.0.0/23 is added to the trie. + * + * A fully populated trie would have a height of 32 nodes, as the trie was + * created with a prefix length of 32. + * + * The lookup starts at the root node. If the current node matches and if there + * is a child that can be used to become more specific, the trie is traversed + * downwards. The last node in the traversal that is a non-intermediate one is + * returned. + */ + +static inline int extract_bit(const u8 *data, size_t index) +{ + return !!(data[index / 8] & (1 << (7 - (index % 8)))); +} + +/** + * longest_prefix_match() - determine the longest prefix + * @trie: The trie to get internal sizes from + * @node: The node to operate on + * @key: The key to compare to @node + * + * Determine the longest prefix of @node that matches the bits in @key.
+ */ +static size_t longest_prefix_match(const struct lpm_trie *trie, + const struct lpm_trie_node *node, + const struct bpf_lpm_trie_key *key) +{ + size_t prefixlen = 0; + size_t i; + + for (i = 0; i < trie->data_size; i++) { + size_t b; + + b = 8 - fls(node->data[i] ^ key->data[i]); + prefixlen += b; + + if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen) + return min(node->prefixlen, key->prefixlen); + + if (b < 8) + break; + } + + return prefixlen; +} + +/* Called from syscall or from eBPF program */ +static void *trie_lookup_elem(struct bpf_map *map, void *_key) +{ + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node *node, *found = NULL; + struct bpf_lpm_trie_key *key = _key; + + /* Start walking the trie from the root node ... */ + + for (node = rcu_dereference(trie->root); node;) { + unsigned int next_bit; + size_t matchlen; + + /* Determine the longest prefix of @node that matches @key. + * If it's the maximum possible prefix for this trie, we have + * an exact match and can return it directly. + */ + matchlen = longest_prefix_match(trie, node, key); + if (matchlen == trie->max_prefixlen) { + found = node; + break; + } + + /* If the number of bits that match is smaller than the prefix + * length of @node, bail out and return the node we have seen + * last in the traversal (ie, the parent). + */ + if (matchlen < node->prefixlen) + break; + + /* Consider this node as return candidate unless it is an + * artificially added intermediate one. + */ + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + found = node; + + /* If the node match is fully satisfied, let's see if we can + * become more specific. Determine the next bit in the key and + * traverse down. + */ + next_bit = extract_bit(key->data, node->prefixlen); + node = rcu_dereference(node->child[next_bit]); + } + + if (!found) + return NULL; + + return found->data + trie->data_size; +} + +static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, + const void *value) +{ + struct lpm_trie_node *node; + size_t size = sizeof(struct lpm_trie_node) + trie->data_size; + + if (value) + size += trie->map.value_size; + + node = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN); + if (!node) + return NULL; + + node->flags = 0; + + if (value) + memcpy(node->data + trie->data_size, value, + trie->map.value_size); + + return node; +} + +/* Called from syscall or from eBPF program */ +static int trie_update_elem(struct bpf_map *map, + void *_key, void *value, u64 flags) +{ + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node *node, *im_node, *new_node = NULL; + struct lpm_trie_node __rcu **slot; + struct bpf_lpm_trie_key *key = _key; + unsigned long irq_flags; + unsigned int next_bit; + size_t matchlen = 0; + int ret = 0; + + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + if (key->prefixlen > trie->max_prefixlen) + return -EINVAL; + + raw_spin_lock_irqsave(&trie->lock, irq_flags); + + /* Allocate and fill a new node */ + + if (trie->n_entries == trie->map.max_entries) { + ret = -ENOSPC; + goto out; + } + + new_node = lpm_trie_node_alloc(trie, value); + if (!new_node) { + ret = -ENOMEM; + goto out; + } + + trie->n_entries++; + + new_node->prefixlen = key->prefixlen; + RCU_INIT_POINTER(new_node->child[0], NULL); + RCU_INIT_POINTER(new_node->child[1], NULL); + memcpy(new_node->data, key->data, trie->data_size); + + /* Now find a slot to attach the new node. 
To do that, walk the tree + * from the root and match as many bits as possible for each node until + * we either find an empty slot or a slot that needs to be replaced by + * an intermediate node. + */ + slot = &trie->root; + + while ((node = rcu_dereference_protected(*slot, + lockdep_is_held(&trie->lock)))) { + matchlen = longest_prefix_match(trie, node, key); + + if (node->prefixlen != matchlen || + node->prefixlen == key->prefixlen || + node->prefixlen == trie->max_prefixlen) + break; + + next_bit = extract_bit(key->data, node->prefixlen); + slot = &node->child[next_bit]; + } + + /* If the slot is empty (a free child pointer or an empty root), + * simply assign the @new_node to that slot and be done. + */ + if (!node) { + rcu_assign_pointer(*slot, new_node); + goto out; + } + + /* If the slot we picked already exists, replace it with @new_node + * which already has the correct data array set. + */ + if (node->prefixlen == matchlen) { + new_node->child[0] = node->child[0]; + new_node->child[1] = node->child[1]; + + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + trie->n_entries--; + + rcu_assign_pointer(*slot, new_node); + kfree_rcu(node, rcu); + + goto out; + } + + /* If the new node matches the prefix completely, it must be inserted + * as an ancestor. Simply insert it between @node and *@slot. + */ + if (matchlen == key->prefixlen) { + next_bit = extract_bit(node->data, matchlen); + rcu_assign_pointer(new_node->child[next_bit], node); + rcu_assign_pointer(*slot, new_node); + goto out; + } + + im_node = lpm_trie_node_alloc(trie, NULL); + if (!im_node) { + ret = -ENOMEM; + goto out; + } + + im_node->prefixlen = matchlen; + im_node->flags |= LPM_TREE_NODE_FLAG_IM; + memcpy(im_node->data, node->data, trie->data_size); + + /* Now determine which child to install in which slot */ + if (extract_bit(key->data, matchlen)) { + rcu_assign_pointer(im_node->child[0], node); + rcu_assign_pointer(im_node->child[1], new_node); + } else { + rcu_assign_pointer(im_node->child[0], new_node); + rcu_assign_pointer(im_node->child[1], node); + } + + /* Finally, assign the intermediate node to the determined spot */ + rcu_assign_pointer(*slot, im_node); + +out: + if (ret) { + if (new_node) + trie->n_entries--; + + kfree(new_node); + kfree(im_node); + } + + raw_spin_unlock_irqrestore(&trie->lock, irq_flags); + + return ret; +} + +static int trie_delete_elem(struct bpf_map *map, void *key) +{ + /* TODO */ + return -ENOSYS; +} + +static struct bpf_map *trie_alloc(union bpf_attr *attr) +{ + size_t cost, cost_per_node; + struct lpm_trie *trie; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || + attr->map_flags != BPF_F_NO_PREALLOC || + attr->key_size < sizeof(struct bpf_lpm_trie_key) + 1 || + attr->key_size > sizeof(struct bpf_lpm_trie_key) + 256 || + attr->value_size == 0) + return ERR_PTR(-EINVAL); + + trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN); + if (!trie) + return ERR_PTR(-ENOMEM); + + /* copy mandatory map attributes */ + trie->map.map_type = attr->map_type; + trie->map.key_size = attr->key_size; + trie->map.value_size = attr->value_size; + trie->map.max_entries = attr->max_entries; + trie->data_size = attr->key_size - + offsetof(struct bpf_lpm_trie_key, data); + trie->max_prefixlen = trie->data_size * 8; + + cost_per_node = sizeof(struct lpm_trie_node) + + attr->value_size + trie->data_size; + cost = sizeof(*trie) + attr->max_entries * cost_per_node; + trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; 
+ + ret = bpf_map_precharge_memlock(trie->map.pages); + if (ret) { + kfree(trie); + return ERR_PTR(ret); + } + + raw_spin_lock_init(&trie->lock); + + return &trie->map; +} + +static void trie_free(struct bpf_map *map) +{ + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node __rcu **slot; + struct lpm_trie_node *node; + + raw_spin_lock(&trie->lock); + + /* Always start at the root and walk down to a node that has no + * children. Then free that node, nullify its reference in the parent + * and start over. + */ + + for (;;) { + slot = &trie->root; + + for (;;) { + node = rcu_dereference_protected(*slot, + lockdep_is_held(&trie->lock)); + if (!node) + goto unlock; + + if (rcu_access_pointer(node->child[0])) { + slot = &node->child[0]; + continue; + } + + if (rcu_access_pointer(node->child[1])) { + slot = &node->child[1]; + continue; + } + + kfree(node); + RCU_INIT_POINTER(*slot, NULL); + break; + } + } + +unlock: + raw_spin_unlock(&trie->lock); +} + +static const struct bpf_map_ops trie_ops = { + .map_alloc = trie_alloc, + .map_free = trie_free, + .map_lookup_elem = trie_lookup_elem, + .map_update_elem = trie_update_elem, + .map_delete_elem = trie_delete_elem, +}; + +static struct bpf_map_type_list trie_type __read_mostly = { + .ops = &trie_ops, + .type = BPF_MAP_TYPE_LPM_TRIE, +}; + +static int __init register_trie_map(void) +{ + bpf_register_map_type(&trie_type); + return 0; +} +late_initcall(register_trie_map); -- cgit v1.2.3 From 1cce1eea0aff51201753fcaca421df825b0813b6 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 14 Dec 2016 15:56:33 +0200 Subject: inotify: Convert to using per-namespace limits This patchset converts inotify to using the newly introduced per-userns sysctl infrastructure. Currently the inotify instances/watches are being accounted in the user_struct structure. This means that in setups where multiple users in unprivileged containers map to the same underlying real user (i.e. pointing to the same user_struct) the inotify limits are going to be shared as well, allowing one user (or application) to exhaust all others' limits. Fix this by switching the inotify sysctls to using the per-namespace/per-user limits. This will allow the server admin to set sensible global limits, which can further be tuned inside every individual user namespace. Additionally, in order to preserve the sysctl ABI, make the existing inotify instances/watches sysctls modify the values of the initial user namespace. Signed-off-by: Nikolay Borisov Acked-by: Jan Kara Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman
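The resulting interface can then be exercised roughly as follows (a sketch, not part of the patch; the /proc/sys/user/ paths are assumed from the ucount sysctl registration in kernel/ucount.c below, and the limit values are arbitrary):

  # Writing the legacy sysctl now tunes the initial user namespace:
  echo 256 > /proc/sys/fs/inotify/max_user_instances
  # Inside a user namespace, the per-namespace copy can be lowered
  # independently without affecting other namespaces:
  echo 64 > /proc/sys/user/max_inotify_instances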
--- fs/notify/inotify/inotify.h | 17 +++++++++++++++++ fs/notify/inotify/inotify_fsnotify.c | 6 ++---- fs/notify/inotify/inotify_user.c | 34 +++++++++++++++++----------------- include/linux/fsnotify_backend.h | 3 ++- include/linux/sched.h | 4 ---- include/linux/user_namespace.h | 4 ++++ kernel/ucount.c | 6 +++++- 7 files changed, 47 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index a6f5907a3fee..7c461fd49c4c 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -30,3 +30,20 @@ extern int inotify_handle_event(struct fsnotify_group *group, const unsigned char *file_name, u32 cookie); extern const struct fsnotify_ops inotify_fsnotify_ops; + +#ifdef CONFIG_INOTIFY_USER +static inline void dec_inotify_instances(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_INOTIFY_INSTANCES); +} + +static inline struct ucounts *inc_inotify_watches(struct ucounts *ucounts) +{ + return inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_INOTIFY_WATCHES); +} + +static inline void dec_inotify_watches(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_INOTIFY_WATCHES); +} +#endif diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 19e7ec109a75..f36c29398de3 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -165,10 +165,8 @@ static void inotify_free_group_priv(struct fsnotify_group *group) /* ideally the idr is empty and we won't hit the BUG in the callback */ idr_for_each(&group->inotify_data.idr, idr_callback, group); idr_destroy(&group->inotify_data.idr); - if (group->inotify_data.user) { - atomic_dec(&group->inotify_data.user->inotify_devs); - free_uid(group->inotify_data.user); - } + if (group->inotify_data.ucounts) + dec_inotify_instances(group->inotify_data.ucounts); } static void inotify_free_event(struct fsnotify_event *fsn_event) diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 69d1ea3d292a..1cf41c623be1 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -44,10 +44,8 @@ #include -/* these are configurable via /proc/sys/fs/inotify/ */ -static int inotify_max_user_instances __read_mostly; +/* configurable via /proc/sys/fs/inotify/ */ static int inotify_max_queued_events __read_mostly; -static int inotify_max_user_watches __read_mostly; static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; @@ -60,7 +58,7 @@ static int zero; struct ctl_table inotify_table[] = { { .procname = "max_user_instances", - .data = &inotify_max_user_instances, + .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES], .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -68,7 +66,7 @@ struct ctl_table inotify_table[] = { }, { .procname = "max_user_watches", - .data = &inotify_max_user_watches, + .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES], .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -500,7 +498,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, /* remove this mark from the idr */ inotify_remove_from_idr(group, i_mark); - atomic_dec(&group->inotify_data.user->inotify_watches); + dec_inotify_watches(group->inotify_data.ucounts); } /* ding dong the mark is dead */ @@ -584,14 +582,17 @@ static int inotify_new_watch(struct fsnotify_group *group, tmp_i_mark->fsn_mark.mask = mask; tmp_i_mark->wd = -1; - ret = -ENOSPC; - if
(atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) - goto out_err; - ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark); if (ret) goto out_err; + /* increment the number of watches the user has */ + if (!inc_inotify_watches(group->inotify_data.ucounts)) { + inotify_remove_from_idr(group, tmp_i_mark); + ret = -ENOSPC; + goto out_err; + } + /* we are on the idr, now get on the inode */ ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode, NULL, 0); @@ -601,8 +602,6 @@ static int inotify_new_watch(struct fsnotify_group *group, goto out_err; } - /* increment the number of watches the user has */ - atomic_inc(&group->inotify_data.user->inotify_watches); /* return the watch descriptor for this new mark */ ret = tmp_i_mark->wd; @@ -653,10 +652,11 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) spin_lock_init(&group->inotify_data.idr_lock); idr_init(&group->inotify_data.idr); - group->inotify_data.user = get_current_user(); + group->inotify_data.ucounts = inc_ucount(current_user_ns(), + current_euid(), + UCOUNT_INOTIFY_INSTANCES); - if (atomic_inc_return(&group->inotify_data.user->inotify_devs) > - inotify_max_user_instances) { + if (!group->inotify_data.ucounts) { fsnotify_destroy_group(group); return ERR_PTR(-EMFILE); } @@ -819,8 +819,8 @@ static int __init inotify_user_setup(void) inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); inotify_max_queued_events = 16384; - inotify_max_user_instances = 128; - inotify_max_user_watches = 8192; + init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128; + init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = 8192; return 0; } diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 0cf34d6cc253..c8f2738113f4 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -16,6 +16,7 @@ #include #include #include +#include /* * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily @@ -170,7 +171,7 @@ struct fsnotify_group { struct inotify_group_private_data { spinlock_t idr_lock; struct idr idr; - struct user_struct *user; + struct ucounts *ucounts; } inotify_data; #endif #ifdef CONFIG_FANOTIFY diff --git a/include/linux/sched.h b/include/linux/sched.h index 4d1905245c7a..d2334229167f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -868,10 +868,6 @@ struct user_struct { atomic_t __count; /* reference count */ atomic_t processes; /* How many processes does this user have? */ atomic_t sigpending; /* How many pending signals does this user have? */ -#ifdef CONFIG_INOTIFY_USER - atomic_t inotify_watches; /* How many inotify watches does this user have? */ - atomic_t inotify_devs; /* How many inotify devs does this user have opened? 
*/ -#endif #ifdef CONFIG_FANOTIFY atomic_t fanotify_listeners; #endif diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index eb209d4523f5..363e0e8082a9 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -32,6 +32,10 @@ enum ucount_type { UCOUNT_NET_NAMESPACES, UCOUNT_MNT_NAMESPACES, UCOUNT_CGROUP_NAMESPACES, +#ifdef CONFIG_INOTIFY_USER + UCOUNT_INOTIFY_INSTANCES, + UCOUNT_INOTIFY_WATCHES, +#endif UCOUNT_COUNTS, }; diff --git a/kernel/ucount.c b/kernel/ucount.c index 4bbd38ec3788..68716403b261 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -57,7 +57,7 @@ static struct ctl_table_root set_root = { static int zero = 0; static int int_max = INT_MAX; -#define UCOUNT_ENTRY(name) \ +#define UCOUNT_ENTRY(name) \ { \ .procname = name, \ .maxlen = sizeof(int), \ @@ -74,6 +74,10 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_net_namespaces"), UCOUNT_ENTRY("max_mnt_namespaces"), UCOUNT_ENTRY("max_cgroup_namespaces"), +#ifdef CONFIG_INOTIFY_USER + UCOUNT_ENTRY("max_inotify_instances"), + UCOUNT_ENTRY("max_inotify_watches"), +#endif { } }; #endif /* CONFIG_SYSCTL */ -- cgit v1.2.3 From d140199af510ad4749dc5e38b7922135258ba5fd Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 24 Jan 2017 01:26:46 +0100 Subject: bpf, lpm: fix kfree of im_node in trie_update_elem We need to initialize im_node to NULL, otherwise in the error path it gets passed to kfree() as an uninitialized pointer. Fixes: b95a5c4db09b ("bpf: add a longest prefix match trie map implementation") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/lpm_trie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index ba19241d1979..144e9763102f 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -262,7 +262,7 @@ static int trie_update_elem(struct bpf_map *map, void *_key, void *value, u64 flags) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); - struct lpm_trie_node *node, *im_node, *new_node = NULL; + struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL; struct lpm_trie_node __rcu **slot; struct bpf_lpm_trie_key *key = _key; unsigned long irq_flags; -- cgit v1.2.3 From 3fadc80115837b86f989d17c4aa92bb5cb7bc1b6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 24 Jan 2017 01:06:30 +0100 Subject: bpf: enable verifier to better track const alu ops William reported a couple of issues in relation to direct packet access. The typical scheme is to check for data + [off] <= data_end, where [off] can either be an immediate or come from a tracked register that contains an immediate; depending on the branch, we can then access the data. However, when [off] is calculated in a more "complex" way, either for the mentioned test itself or for access after the test, the verifier will stop tracking the CONST_IMM marked register and will mark it as an UNKNOWN_VALUE one. When that UNKNOWN_VALUE typed register is added to a pkt() marked register, the verifier bails out in check_packet_ptr_add() as it finds the register's imm value below 48. In the first example below, that is due to evaluate_reg_imm_alu() not handling right shifts and thus marking the register as UNKNOWN_VALUE via helper __mark_reg_unknown_value() that resets imm to 0. In the second case the same happens at the time when r4 is set to r4 &= r5, where it transitions to UNKNOWN_VALUE from evaluate_reg_imm_alu().
Later on, we shift r4 right by 3 inside evaluate_reg_alu(), where the register's imm turns into 3. That is, for registers with type UNKNOWN_VALUE, an imm of 0 means that we don't know what value the register has, and for imm > 0 it means that the value has [imm] upper zero bits. F.e. when shifting an UNKNOWN_VALUE register by 3 to the right, no matter what value it had, we know that the 3 uppermost bits must be zero now. This is to make sure that ALU operations with unknown registers don't overflow. Meaning, once we know that we have at least 48 upper zero bits, or, in other words, cannot go beyond a 0xffff offset with ALU ops, such an addition will track the target register as a new pkt() register with a new id, but 0 offset and 0 range, so for that a new data/data_end test will be required. If the source register is a CONST_IMM one that is to be added to the pkt() register, or the source instruction is an add instruction with an immediate value, then it will get added if it stays within the max 0xffff bounds. From there, the pkt() type can be accessed should reg->off + imm be within the access range of pkt(). [...] from 28 to 30: R0=imm1,min_value=1,max_value=1 R1=pkt(id=0,off=0,r=22) R2=pkt_end R3=imm144,min_value=144,max_value=144 R4=imm0,min_value=0,max_value=0 R5=inv48,min_value=2054,max_value=2054 R10=fp 30: (bf) r5 = r3 31: (07) r5 += 23 32: (77) r5 >>= 3 33: (bf) r6 = r1 34: (0f) r6 += r5 cannot add integer value with 0 upper zero bits to ptr_to_packet [...] from 52 to 80: R0=imm1,min_value=1,max_value=1 R1=pkt(id=0,off=0,r=34) R2=pkt_end R3=inv R4=imm272 R5=inv56,min_value=17,max_value=17 R6=pkt(id=0,off=26,r=34) R10=fp 80: (07) r4 += 71 81: (18) r5 = 0xfffffff8 83: (5f) r4 &= r5 84: (77) r4 >>= 3 85: (0f) r1 += r4 cannot add integer value with 3 upper zero bits to ptr_to_packet Thus, to get the above use cases working, evaluate_reg_imm_alu() has been extended for further ALU ops. This is fine, because we only operate strictly within the realm of CONST_IMM types, so here we don't care about overflows as they will happen in the simulated as well as the real execution, and the interaction with pkt() in check_packet_ptr_add() will check the actual imm value once added to pkt(), but it's irrelevant before. With regards to 06c1c049721a ("bpf: allow helpers access to variable memory") that works on UNKNOWN_VALUE registers, the verifier now becomes a bit smarter as it can better resolve ALU ops, so we need to adapt two test cases there, as min/max bound tracking only becomes necessary when registers were spilled to stack. So while the mask was set before to track the upper bound for the UNKNOWN_VALUE case, it's now resolved directly as CONST_IMM, and such constructs are only necessary when f.e. registers are spilled. For commit 6b17387307ba ("bpf: recognize 64bit immediate loads as consts") that initially enabled dw load tracking only for the nfp jit/analyzer, I did a couple of tests on large, complex programs and we don't increase complexity badly (my tests were in the ~3% range on avg). I've added a couple of tests similar to the affected code above, and it works fine with the verifier now. Reported-by: William Tu Signed-off-by: Daniel Borkmann Cc: Gianluca Borello Cc: William Tu Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller
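As a rough illustration of the tracking described above (a made-up fragment in the style of the verifier annotations, not taken from the patch):

  r5 = <unknown>   ; UNKNOWN_VALUE, imm=0: no upper bits known to be zero
  r5 >>= 48        ; UNKNOWN_VALUE, imm=48: value is at most 0xffff now
  r1 += r5         ; r1 was pkt(): accepted, r1 becomes a pkt() register
                   ; with a new id, off=0 and r=0, so a fresh
                   ; data/data_end test is required before any access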
--- kernel/bpf/verifier.c | 64 +++++++++++++++------- tools/testing/selftests/bpf/test_verifier.c | 82 +++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8f69df7e8167..fb3513b35c0b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1566,22 +1566,54 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, struct bpf_reg_state *dst_reg = &regs[insn->dst_reg]; struct bpf_reg_state *src_reg = &regs[insn->src_reg]; u8 opcode = BPF_OP(insn->code); + u64 dst_imm = dst_reg->imm; - /* dst_reg->type == CONST_IMM here, simulate execution of 'add'/'or' - * insn. Don't care about overflow or negative values, just add them + /* dst_reg->type == CONST_IMM here. Simulate execution of insns + * containing ALU ops. Don't care about overflow or negative + * values, just add/sub/... them; registers are in u64. */ - if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) - dst_reg->imm += insn->imm; - else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X && - src_reg->type == CONST_IMM) - dst_reg->imm += src_reg->imm; - else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) - dst_reg->imm |= insn->imm; - else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X && - src_reg->type == CONST_IMM) - dst_reg->imm |= src_reg->imm; - else + if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) { + dst_imm += insn->imm; + } else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm += src_reg->imm; + } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_K) { + dst_imm -= insn->imm; + } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm -= src_reg->imm; + } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_K) { + dst_imm *= insn->imm; + } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm *= src_reg->imm; + } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) { + dst_imm |= insn->imm; + } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm |= src_reg->imm; + } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_K) { + dst_imm &= insn->imm; + } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm &= src_reg->imm; + } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_K) { + dst_imm >>= insn->imm; + } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm >>= src_reg->imm; + } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_K) { + dst_imm <<= insn->imm; + } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm <<= src_reg->imm; + } else { mark_reg_unknown_value(regs, insn->dst_reg); + goto out; + } + + dst_reg->imm = dst_imm; +out: return 0; } @@ -2225,14 +2257,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; if (insn->src_reg == 0) { - /* generic move 64-bit immediate into a register, - * only analyzer needs to collect the ld_imm value.
- */ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - if (!env->analyzer_ops) - return 0; - regs[insn->dst_reg].type = CONST_IMM; regs[insn->dst_reg].imm = imm; return 0; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 1aa73241c999..0d0912c7f03c 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -2325,6 +2325,84 @@ static struct bpf_test tests[] = { .result = REJECT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, + { + "direct packet access: test11 (shift, good access)", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 22), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 8), + BPF_MOV64_IMM(BPF_REG_3, 144), + BPF_MOV64_REG(BPF_REG_5, BPF_REG_3), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 23), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_5, 3), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_5), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "direct packet access: test12 (and, good access)", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 22), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 8), + BPF_MOV64_IMM(BPF_REG_3, 144), + BPF_MOV64_REG(BPF_REG_5, BPF_REG_3), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 23), + BPF_ALU64_IMM(BPF_AND, BPF_REG_5, 15), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_5), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "direct packet access: test13 (branches, good access)", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 22), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 13), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, mark)), + BPF_MOV64_IMM(BPF_REG_4, 1), + BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_4, 2), + BPF_MOV64_IMM(BPF_REG_3, 14), + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + BPF_MOV64_IMM(BPF_REG_3, 24), + BPF_MOV64_REG(BPF_REG_5, BPF_REG_3), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 23), + BPF_ALU64_IMM(BPF_AND, BPF_REG_5, 15), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_5), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, { "helper access to packet: test1, valid packet_ptr range", .insns = { @@ -4208,6 +4286,8 @@ static struct bpf_test tests[] = { .insns = { BPF_MOV64_IMM(BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128), BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 64), BPF_MOV64_IMM(BPF_REG_3, 0), BPF_MOV64_IMM(BPF_REG_4, 0), @@ -4251,6 +4331,8 @@ static struct bpf_test tests[] = { 
BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128), BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 63), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1), BPF_MOV64_IMM(BPF_REG_3, 0), -- cgit v1.2.3 From 2acae0d5b0f73a8fb4b180bd13491feb96e55fc6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 25 Jan 2017 02:28:16 +0100 Subject: trace: add variant without spacing in trace_print_hex_seq For upcoming tracepoint support for BPF, we want to dump the program's tag. Format should be similar to __print_hex(), but without spacing. Add a __print_hex_str() variant for exactly that purpose that reuses trace_print_hex_seq(). Signed-off-by: Daniel Borkmann Cc: Steven Rostedt Cc: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/linux/trace_events.h | 3 ++- include/trace/trace_events.h | 8 +++++++- kernel/trace/trace_output.c | 7 ++++--- 3 files changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index be007610ceb0..cfa475a0e9ca 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -33,7 +33,8 @@ const char *trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, unsigned int bitmask_size); const char *trace_print_hex_seq(struct trace_seq *p, - const unsigned char *buf, int len); + const unsigned char *buf, int len, + bool spacing); const char *trace_print_array_seq(struct trace_seq *p, const void *buf, int count, diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 467e12f780d8..9f684629c3cf 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -297,7 +297,12 @@ TRACE_MAKE_SYSTEM_STR(); #endif #undef __print_hex -#define __print_hex(buf, buf_len) trace_print_hex_seq(p, buf, buf_len) +#define __print_hex(buf, buf_len) \ + trace_print_hex_seq(p, buf, buf_len, true) + +#undef __print_hex_str +#define __print_hex_str(buf, buf_len) \ + trace_print_hex_seq(p, buf, buf_len, false) #undef __print_array #define __print_array(array, count, el_size) \ @@ -711,6 +716,7 @@ static inline void ftrace_test_probe_##call(void) \ #undef __print_flags #undef __print_symbolic #undef __print_hex +#undef __print_hex_str #undef __get_dynamic_array #undef __get_dynamic_array_len #undef __get_str diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 5d33a7352919..30a144b1b9ee 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -163,14 +163,15 @@ trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, EXPORT_SYMBOL_GPL(trace_print_bitmask_seq); const char * -trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) +trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len, + bool spacing) { int i; const char *ret = trace_seq_buffer_ptr(p); for (i = 0; i < buf_len; i++) - trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]); - + trace_seq_printf(p, "%s%2.2x", !spacing || i == 0 ? "" : " ", + buf[i]); trace_seq_putc(p, 0); return ret; -- cgit v1.2.3 From a67edbf4fb6deadcfe57a04a134abed4a5ba3bb5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 25 Jan 2017 02:28:18 +0100 Subject: bpf: add initial bpf tracepoints This work adds a number of tracepoints to paths that are either considered slow-path or exception-like states, where monitoring or inspecting them would be desirable. 
For the bpf(2) syscall, tracepoints have been placed for the main commands when they succeed. In the XDP case, the tracepoint is for exceptions, that is, f.e. on abnormal BPF program exit such as an unknown or XDP_ABORTED return code, or when an error occurs during the XDP_TX action and the packet could not be forwarded. Both have been split into separate event headers, and can be further extended. Worst case, if they unexpectedly should get in our way in the future, they can also be removed [1]. Of course, these tracepoints (like any other) can be analyzed by eBPF itself, etc. Example output: # ./perf record -a -e bpf:* sleep 10 # ./perf script sock_example 6197 [005] 283.980322: bpf:bpf_map_create: map type=ARRAY ufd=4 key=4 val=8 max=256 flags=0 sock_example 6197 [005] 283.980721: bpf:bpf_prog_load: prog=a5ea8fa30ea6849c type=SOCKET_FILTER ufd=5 sock_example 6197 [005] 283.988423: bpf:bpf_prog_get_type: prog=a5ea8fa30ea6849c type=SOCKET_FILTER sock_example 6197 [005] 283.988443: bpf:bpf_map_lookup_elem: map type=ARRAY ufd=4 key=[06 00 00 00] val=[00 00 00 00 00 00 00 00] [...] sock_example 6197 [005] 288.990868: bpf:bpf_map_lookup_elem: map type=ARRAY ufd=4 key=[01 00 00 00] val=[14 00 00 00 00 00 00 00] swapper 0 [005] 289.338243: bpf:bpf_prog_put_rcu: prog=a5ea8fa30ea6849c type=SOCKET_FILTER [1] https://lwn.net/Articles/705270/ Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 3 + drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 12 +- .../net/ethernet/netronome/nfp/nfp_net_common.c | 15 +- drivers/net/ethernet/qlogic/qede/qede_fp.c | 4 + drivers/net/virtio_net.c | 12 +- include/linux/bpf_trace.h | 7 + include/trace/events/bpf.h | 347 +++++++++++++++++++++ include/trace/events/xdp.h | 53 ++++ kernel/bpf/core.c | 9 + kernel/bpf/inode.c | 17 +- kernel/bpf/syscall.c | 19 +- 11 files changed, 483 insertions(+), 15 deletions(-) create mode 100644 include/linux/bpf_trace.h create mode 100644 include/trace/events/bpf.h create mode 100644 include/trace/events/xdp.h (limited to 'kernel') diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index e362f99334d0..f15ddba3659a 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -33,6 +33,7 @@ #include #include +#include #include #include #include @@ -926,10 +927,12 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud length, cq->ring, &doorbell_pending))) goto consumed; + trace_xdp_exception(dev, xdp_prog, act); goto xdp_drop_no_cnt; /* Drop on xmit failure */ default: bpf_warn_invalid_xdp_action(act); case XDP_ABORTED: + trace_xdp_exception(dev, xdp_prog, act); case XDP_DROP: ring->xdp_drop++; xdp_drop_no_cnt: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index ba50583ea3ed..3d2e1a1886a5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include "en.h" #include "en_tc.h" @@ -640,7 +641,7 @@ static inline void mlx5e_xmit_xdp_doorbell(struct mlx5e_sq *sq) mlx5e_tx_notify_hw(sq, &wqe->ctrl, 0); } -static inline void mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq, +static inline bool mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq, struct mlx5e_dma_info *di, const struct xdp_buff *xdp) { @@ -662,7 +663,7 @@ static inline void mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq,
MLX5E_SW2HW_MTU(rq->netdev->mtu) < dma_len)) { rq->stats.xdp_drop++; mlx5e_page_release(rq, di, true); - return; + return false; } if (unlikely(!mlx5e_sq_has_room_for(sq, MLX5E_XDP_TX_WQEBBS))) { @@ -673,7 +674,7 @@ static inline void mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq, } rq->stats.xdp_tx_full++; mlx5e_page_release(rq, di, true); - return; + return false; } dma_len -= MLX5E_XDP_MIN_INLINE; @@ -703,6 +704,7 @@ static inline void mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq, sq->db.xdp.doorbell = true; rq->stats.xdp_tx++; + return true; } /* returns true if packet was consumed by xdp */ @@ -728,11 +730,13 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq, *len = xdp.data_end - xdp.data; return false; case XDP_TX: - mlx5e_xmit_xdp_frame(rq, di, &xdp); + if (unlikely(!mlx5e_xmit_xdp_frame(rq, di, &xdp))) + trace_xdp_exception(rq->netdev, prog, act); return true; default: bpf_warn_invalid_xdp_action(act); case XDP_ABORTED: + trace_xdp_exception(rq->netdev, prog, act); case XDP_DROP: rq->stats.xdp_drop++; mlx5e_page_release(rq, di, true); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 67afd95ffb93..6ac43abf561b 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -42,6 +42,7 @@ */ #include +#include #include #include #include @@ -1459,7 +1460,7 @@ nfp_net_rx_drop(struct nfp_net_r_vector *r_vec, struct nfp_net_rx_ring *rx_ring, dev_kfree_skb_any(skb); } -static void +static bool nfp_net_tx_xdp_buf(struct nfp_net *nn, struct nfp_net_rx_ring *rx_ring, struct nfp_net_tx_ring *tx_ring, struct nfp_net_rx_buf *rxbuf, unsigned int pkt_off, @@ -1473,13 +1474,13 @@ nfp_net_tx_xdp_buf(struct nfp_net *nn, struct nfp_net_rx_ring *rx_ring, if (unlikely(nfp_net_tx_full(tx_ring, 1))) { nfp_net_rx_drop(rx_ring->r_vec, rx_ring, rxbuf, NULL); - return; + return false; } new_frag = nfp_net_napi_alloc_one(nn, DMA_BIDIRECTIONAL, &new_dma_addr); if (unlikely(!new_frag)) { nfp_net_rx_drop(rx_ring->r_vec, rx_ring, rxbuf, NULL); - return; + return false; } nfp_net_rx_give_one(rx_ring, new_frag, new_dma_addr); @@ -1509,6 +1510,7 @@ nfp_net_tx_xdp_buf(struct nfp_net *nn, struct nfp_net_rx_ring *rx_ring, tx_ring->wr_p++; tx_ring->wr_ptr_add++; + return true; } static int nfp_net_run_xdp(struct bpf_prog *prog, void *data, unsigned int len) @@ -1613,12 +1615,15 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget) case XDP_PASS: break; case XDP_TX: - nfp_net_tx_xdp_buf(nn, rx_ring, tx_ring, rxbuf, - pkt_off, pkt_len); + if (unlikely(!nfp_net_tx_xdp_buf(nn, rx_ring, + tx_ring, rxbuf, + pkt_off, pkt_len))) + trace_xdp_exception(nn->netdev, xdp_prog, act); continue; default: bpf_warn_invalid_xdp_action(act); case XDP_ABORTED: + trace_xdp_exception(nn->netdev, xdp_prog, act); case XDP_DROP: nfp_net_rx_give_one(rx_ring, rxbuf->frag, rxbuf->dma_addr); diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index 1a6ca4884fad..445d4d2492c3 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -1016,6 +1017,7 @@ static bool qede_rx_xdp(struct qede_dev *edev, /* We need the replacement buffer before transmit. 
*/ if (qede_alloc_rx_buffer(rxq, true)) { qede_recycle_rx_bd_ring(rxq, 1); + trace_xdp_exception(edev->ndev, prog, act); return false; } @@ -1026,6 +1028,7 @@ static bool qede_rx_xdp(struct qede_dev *edev, dma_unmap_page(rxq->dev, bd->mapping, PAGE_SIZE, DMA_BIDIRECTIONAL); __free_page(bd->data); + trace_xdp_exception(edev->ndev, prog, act); } /* Regardless, we've consumed an Rx BD */ @@ -1035,6 +1038,7 @@ static bool qede_rx_xdp(struct qede_dev *edev, default: bpf_warn_invalid_xdp_action(act); case XDP_ABORTED: + trace_xdp_exception(edev->ndev, prog, act); case XDP_DROP: qede_recycle_rx_bd_ring(rxq, cqe->bd_num); } diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 37db91d1a0a3..f9bf94887ff1 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -330,7 +331,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, return skb; } -static void virtnet_xdp_xmit(struct virtnet_info *vi, +static bool virtnet_xdp_xmit(struct virtnet_info *vi, struct receive_queue *rq, struct send_queue *sq, struct xdp_buff *xdp, @@ -382,10 +383,12 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi, put_page(page); } else /* small buffer */ kfree_skb(data); - return; // On error abort to avoid unnecessary kick + /* On error abort to avoid unnecessary kick */ + return false; } virtqueue_kick(sq->vq); + return true; } static u32 do_xdp_prog(struct virtnet_info *vi, @@ -421,11 +424,14 @@ static u32 do_xdp_prog(struct virtnet_info *vi, vi->xdp_queue_pairs + smp_processor_id(); xdp.data = buf; - virtnet_xdp_xmit(vi, rq, &vi->sq[qp], &xdp, data); + if (unlikely(!virtnet_xdp_xmit(vi, rq, &vi->sq[qp], &xdp, + data))) + trace_xdp_exception(vi->dev, xdp_prog, act); return XDP_TX; default: bpf_warn_invalid_xdp_action(act); case XDP_ABORTED: + trace_xdp_exception(vi->dev, xdp_prog, act); case XDP_DROP: return XDP_DROP; } diff --git a/include/linux/bpf_trace.h b/include/linux/bpf_trace.h new file mode 100644 index 000000000000..b22efbdd2eb4 --- /dev/null +++ b/include/linux/bpf_trace.h @@ -0,0 +1,7 @@ +#ifndef __LINUX_BPF_TRACE_H__ +#define __LINUX_BPF_TRACE_H__ + +#include +#include + +#endif /* __LINUX_BPF_TRACE_H__ */ diff --git a/include/trace/events/bpf.h b/include/trace/events/bpf.h new file mode 100644 index 000000000000..c3a53fd47ff1 --- /dev/null +++ b/include/trace/events/bpf.h @@ -0,0 +1,347 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bpf + +#if !defined(_TRACE_BPF_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BPF_H + +#include +#include +#include +#include + +#define __PROG_TYPE_MAP(FN) \ + FN(SOCKET_FILTER) \ + FN(KPROBE) \ + FN(SCHED_CLS) \ + FN(SCHED_ACT) \ + FN(TRACEPOINT) \ + FN(XDP) \ + FN(PERF_EVENT) \ + FN(CGROUP_SKB) \ + FN(CGROUP_SOCK) \ + FN(LWT_IN) \ + FN(LWT_OUT) \ + FN(LWT_XMIT) + +#define __MAP_TYPE_MAP(FN) \ + FN(HASH) \ + FN(ARRAY) \ + FN(PROG_ARRAY) \ + FN(PERF_EVENT_ARRAY) \ + FN(PERCPU_HASH) \ + FN(PERCPU_ARRAY) \ + FN(STACK_TRACE) \ + FN(CGROUP_ARRAY) \ + FN(LRU_HASH) \ + FN(LRU_PERCPU_HASH) \ + FN(LPM_TRIE) + +#define __PROG_TYPE_TP_FN(x) \ + TRACE_DEFINE_ENUM(BPF_PROG_TYPE_##x); +#define __PROG_TYPE_SYM_FN(x) \ + { BPF_PROG_TYPE_##x, #x }, +#define __PROG_TYPE_SYM_TAB \ + __PROG_TYPE_MAP(__PROG_TYPE_SYM_FN) { -1, 0 } +__PROG_TYPE_MAP(__PROG_TYPE_TP_FN) + +#define __MAP_TYPE_TP_FN(x) \ + TRACE_DEFINE_ENUM(BPF_MAP_TYPE_##x); +#define __MAP_TYPE_SYM_FN(x) \ + { BPF_MAP_TYPE_##x, #x }, +#define __MAP_TYPE_SYM_TAB \ + __MAP_TYPE_MAP(__MAP_TYPE_SYM_FN) { -1, 
0 } +__MAP_TYPE_MAP(__MAP_TYPE_TP_FN) + +DECLARE_EVENT_CLASS(bpf_prog_event, + + TP_PROTO(const struct bpf_prog *prg), + + TP_ARGS(prg), + + TP_STRUCT__entry( + __array(u8, prog_tag, 8) + __field(u32, type) + ), + + TP_fast_assign( + BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag)); + memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag)); + __entry->type = prg->type; + ), + + TP_printk("prog=%s type=%s", + __print_hex_str(__entry->prog_tag, 8), + __print_symbolic(__entry->type, __PROG_TYPE_SYM_TAB)) +); + +DEFINE_EVENT(bpf_prog_event, bpf_prog_get_type, + + TP_PROTO(const struct bpf_prog *prg), + + TP_ARGS(prg) +); + +DEFINE_EVENT(bpf_prog_event, bpf_prog_put_rcu, + + TP_PROTO(const struct bpf_prog *prg), + + TP_ARGS(prg) +); + +TRACE_EVENT(bpf_prog_load, + + TP_PROTO(const struct bpf_prog *prg, int ufd), + + TP_ARGS(prg, ufd), + + TP_STRUCT__entry( + __array(u8, prog_tag, 8) + __field(u32, type) + __field(int, ufd) + ), + + TP_fast_assign( + BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag)); + memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag)); + __entry->type = prg->type; + __entry->ufd = ufd; + ), + + TP_printk("prog=%s type=%s ufd=%d", + __print_hex_str(__entry->prog_tag, 8), + __print_symbolic(__entry->type, __PROG_TYPE_SYM_TAB), + __entry->ufd) +); + +TRACE_EVENT(bpf_map_create, + + TP_PROTO(const struct bpf_map *map, int ufd), + + TP_ARGS(map, ufd), + + TP_STRUCT__entry( + __field(u32, type) + __field(u32, size_key) + __field(u32, size_value) + __field(u32, max_entries) + __field(u32, flags) + __field(int, ufd) + ), + + TP_fast_assign( + __entry->type = map->map_type; + __entry->size_key = map->key_size; + __entry->size_value = map->value_size; + __entry->max_entries = map->max_entries; + __entry->flags = map->map_flags; + __entry->ufd = ufd; + ), + + TP_printk("map type=%s ufd=%d key=%u val=%u max=%u flags=%x", + __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), + __entry->ufd, __entry->size_key, __entry->size_value, + __entry->max_entries, __entry->flags) +); + +DECLARE_EVENT_CLASS(bpf_obj_prog, + + TP_PROTO(const struct bpf_prog *prg, int ufd, + const struct filename *pname), + + TP_ARGS(prg, ufd, pname), + + TP_STRUCT__entry( + __array(u8, prog_tag, 8) + __field(int, ufd) + __string(path, pname->name) + ), + + TP_fast_assign( + BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag)); + memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag)); + __assign_str(path, pname->name); + __entry->ufd = ufd; + ), + + TP_printk("prog=%s path=%s ufd=%d", + __print_hex_str(__entry->prog_tag, 8), + __get_str(path), __entry->ufd) +); + +DEFINE_EVENT(bpf_obj_prog, bpf_obj_pin_prog, + + TP_PROTO(const struct bpf_prog *prg, int ufd, + const struct filename *pname), + + TP_ARGS(prg, ufd, pname) +); + +DEFINE_EVENT(bpf_obj_prog, bpf_obj_get_prog, + + TP_PROTO(const struct bpf_prog *prg, int ufd, + const struct filename *pname), + + TP_ARGS(prg, ufd, pname) +); + +DECLARE_EVENT_CLASS(bpf_obj_map, + + TP_PROTO(const struct bpf_map *map, int ufd, + const struct filename *pname), + + TP_ARGS(map, ufd, pname), + + TP_STRUCT__entry( + __field(u32, type) + __field(int, ufd) + __string(path, pname->name) + ), + + TP_fast_assign( + __assign_str(path, pname->name); + __entry->type = map->map_type; + __entry->ufd = ufd; + ), + + TP_printk("map type=%s ufd=%d path=%s", + __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), + __entry->ufd, __get_str(path)) +); + +DEFINE_EVENT(bpf_obj_map, bpf_obj_pin_map, + + TP_PROTO(const struct bpf_map *map, int ufd, + const struct filename 
*pname), + + TP_ARGS(map, ufd, pname) +); + +DEFINE_EVENT(bpf_obj_map, bpf_obj_get_map, + + TP_PROTO(const struct bpf_map *map, int ufd, + const struct filename *pname), + + TP_ARGS(map, ufd, pname) +); + +DECLARE_EVENT_CLASS(bpf_map_keyval, + + TP_PROTO(const struct bpf_map *map, int ufd, + const void *key, const void *val), + + TP_ARGS(map, ufd, key, val), + + TP_STRUCT__entry( + __field(u32, type) + __field(u32, key_len) + __dynamic_array(u8, key, map->key_size) + __field(bool, key_trunc) + __field(u32, val_len) + __dynamic_array(u8, val, map->value_size) + __field(bool, val_trunc) + __field(int, ufd) + ), + + TP_fast_assign( + memcpy(__get_dynamic_array(key), key, map->key_size); + memcpy(__get_dynamic_array(val), val, map->value_size); + __entry->type = map->map_type; + __entry->key_len = min(map->key_size, 16U); + __entry->key_trunc = map->key_size != __entry->key_len; + __entry->val_len = min(map->value_size, 16U); + __entry->val_trunc = map->value_size != __entry->val_len; + __entry->ufd = ufd; + ), + + TP_printk("map type=%s ufd=%d key=[%s%s] val=[%s%s]", + __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), + __entry->ufd, + __print_hex(__get_dynamic_array(key), __entry->key_len), + __entry->key_trunc ? " ..." : "", + __print_hex(__get_dynamic_array(val), __entry->val_len), + __entry->val_trunc ? " ..." : "") +); + +DEFINE_EVENT(bpf_map_keyval, bpf_map_lookup_elem, + + TP_PROTO(const struct bpf_map *map, int ufd, + const void *key, const void *val), + + TP_ARGS(map, ufd, key, val) +); + +DEFINE_EVENT(bpf_map_keyval, bpf_map_update_elem, + + TP_PROTO(const struct bpf_map *map, int ufd, + const void *key, const void *val), + + TP_ARGS(map, ufd, key, val) +); + +TRACE_EVENT(bpf_map_delete_elem, + + TP_PROTO(const struct bpf_map *map, int ufd, + const void *key), + + TP_ARGS(map, ufd, key), + + TP_STRUCT__entry( + __field(u32, type) + __field(u32, key_len) + __dynamic_array(u8, key, map->key_size) + __field(bool, key_trunc) + __field(int, ufd) + ), + + TP_fast_assign( + memcpy(__get_dynamic_array(key), key, map->key_size); + __entry->type = map->map_type; + __entry->key_len = min(map->key_size, 16U); + __entry->key_trunc = map->key_size != __entry->key_len; + __entry->ufd = ufd; + ), + + TP_printk("map type=%s ufd=%d key=[%s%s]", + __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), + __entry->ufd, + __print_hex(__get_dynamic_array(key), __entry->key_len), + __entry->key_trunc ? " ..." : "") +); + +TRACE_EVENT(bpf_map_next_key, + + TP_PROTO(const struct bpf_map *map, int ufd, + const void *key, const void *key_next), + + TP_ARGS(map, ufd, key, key_next), + + TP_STRUCT__entry( + __field(u32, type) + __field(u32, key_len) + __dynamic_array(u8, key, map->key_size) + __dynamic_array(u8, nxt, map->key_size) + __field(bool, key_trunc) + __field(int, ufd) + ), + + TP_fast_assign( + memcpy(__get_dynamic_array(key), key, map->key_size); + memcpy(__get_dynamic_array(nxt), key_next, map->key_size); + __entry->type = map->map_type; + __entry->key_len = min(map->key_size, 16U); + __entry->key_trunc = map->key_size != __entry->key_len; + __entry->ufd = ufd; + ), + + TP_printk("map type=%s ufd=%d key=[%s%s] next=[%s%s]", + __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), + __entry->ufd, + __print_hex(__get_dynamic_array(key), __entry->key_len), + __entry->key_trunc ? " ..." : "", + __print_hex(__get_dynamic_array(nxt), __entry->key_len), + __entry->key_trunc ? " ..." 
: "") +); + +#endif /* _TRACE_BPF_H */ + +#include diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h new file mode 100644 index 000000000000..1b61357d3f57 --- /dev/null +++ b/include/trace/events/xdp.h @@ -0,0 +1,53 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM xdp + +#if !defined(_TRACE_XDP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_XDP_H + +#include +#include +#include + +#define __XDP_ACT_MAP(FN) \ + FN(ABORTED) \ + FN(DROP) \ + FN(PASS) \ + FN(TX) + +#define __XDP_ACT_TP_FN(x) \ + TRACE_DEFINE_ENUM(XDP_##x); +#define __XDP_ACT_SYM_FN(x) \ + { XDP_##x, #x }, +#define __XDP_ACT_SYM_TAB \ + __XDP_ACT_MAP(__XDP_ACT_SYM_FN) { -1, 0 } +__XDP_ACT_MAP(__XDP_ACT_TP_FN) + +TRACE_EVENT(xdp_exception, + + TP_PROTO(const struct net_device *dev, + const struct bpf_prog *xdp, u32 act), + + TP_ARGS(dev, xdp, act), + + TP_STRUCT__entry( + __string(name, dev->name) + __array(u8, prog_tag, 8) + __field(u32, act) + ), + + TP_fast_assign( + BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(xdp->tag)); + memcpy(__entry->prog_tag, xdp->tag, sizeof(xdp->tag)); + __assign_str(name, dev->name); + __entry->act = act; + ), + + TP_printk("prog=%s device=%s action=%s", + __print_hex_str(__entry->prog_tag, 8), + __get_str(name), + __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB)) +); + +#endif /* _TRACE_XDP_H */ + +#include diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 503d4211988a..fddd76b1b627 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1173,3 +1173,12 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, { return -EFAULT; } + +/* All definitions of tracepoints related to BPF. */ +#define CREATE_TRACE_POINTS +#include + +EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type); +EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu); diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 0b030c9126d3..fddcae801724 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -21,6 +21,7 @@ #include #include #include +#include enum bpf_type { BPF_TYPE_UNSPEC = 0, @@ -281,6 +282,13 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname) ret = bpf_obj_do_pin(pname, raw, type); if (ret != 0) bpf_any_put(raw, type); + if ((trace_bpf_obj_pin_prog_enabled() || + trace_bpf_obj_pin_map_enabled()) && !ret) { + if (type == BPF_TYPE_PROG) + trace_bpf_obj_pin_prog(raw, ufd, pname); + if (type == BPF_TYPE_MAP) + trace_bpf_obj_pin_map(raw, ufd, pname); + } out: putname(pname); return ret; @@ -342,8 +350,15 @@ int bpf_obj_get_user(const char __user *pathname) else goto out; - if (ret < 0) + if (ret < 0) { bpf_any_put(raw, type); + } else if (trace_bpf_obj_get_prog_enabled() || + trace_bpf_obj_get_map_enabled()) { + if (type == BPF_TYPE_PROG) + trace_bpf_obj_get_prog(raw, ret, pname); + if (type == BPF_TYPE_MAP) + trace_bpf_obj_get_map(raw, ret, pname); + } out: putname(pname); return ret; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1d6b29e4e2c3..05ad086ab71d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -10,6 +10,7 @@ * General Public License for more details. 
*/ #include +#include #include #include #include @@ -215,6 +216,7 @@ static int map_create(union bpf_attr *attr) /* failed to allocate fd */ goto free_map; + trace_bpf_map_create(map, err); return err; free_map: @@ -339,6 +341,7 @@ static int map_lookup_elem(union bpf_attr *attr) if (copy_to_user(uvalue, value, value_size) != 0) goto free_value; + trace_bpf_map_lookup_elem(map, ufd, key, value); err = 0; free_value: @@ -421,6 +424,8 @@ static int map_update_elem(union bpf_attr *attr) __this_cpu_dec(bpf_prog_active); preempt_enable(); + if (!err) + trace_bpf_map_update_elem(map, ufd, key, value); free_value: kfree(value); free_key: @@ -466,6 +471,8 @@ static int map_delete_elem(union bpf_attr *attr) __this_cpu_dec(bpf_prog_active); preempt_enable(); + if (!err) + trace_bpf_map_delete_elem(map, ufd, key); free_key: kfree(key); err_put: @@ -518,6 +525,7 @@ static int map_get_next_key(union bpf_attr *attr) if (copy_to_user(unext_key, next_key, map->key_size) != 0) goto free_next_key; + trace_bpf_map_next_key(map, ufd, key, next_key); err = 0; free_next_key: @@ -671,8 +679,10 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) void bpf_prog_put(struct bpf_prog *prog) { - if (atomic_dec_and_test(&prog->aux->refcnt)) + if (atomic_dec_and_test(&prog->aux->refcnt)) { + trace_bpf_prog_put_rcu(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); + } } EXPORT_SYMBOL_GPL(bpf_prog_put); @@ -781,7 +791,11 @@ struct bpf_prog *bpf_prog_get(u32 ufd) struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) { - return __bpf_prog_get(ufd, &type); + struct bpf_prog *prog = __bpf_prog_get(ufd, &type); + + if (!IS_ERR(prog)) + trace_bpf_prog_get_type(prog); + return prog; } EXPORT_SYMBOL_GPL(bpf_prog_get_type); @@ -863,6 +877,7 @@ static int bpf_prog_load(union bpf_attr *attr) /* failed to allocate fd */ goto free_used_maps; + trace_bpf_prog_load(prog, err); return err; free_used_maps: -- cgit v1.2.3 From f2c4689640e9a34bc45c013032185ed4ce47e7ff Mon Sep 17 00:00:00 2001 From: Lance Roy Date: Mon, 23 Jan 2017 13:35:18 -0800 Subject: srcu: Implement more-efficient reader counts SRCU uses two per-cpu counters: a nesting counter to count the number of active critical sections, and a sequence counter to ensure that the nesting counters don't change while they are being added together in srcu_readers_active_idx_check(). This patch instead uses per-cpu lock and unlock counters. Because both counters only increase and srcu_readers_active_idx_check() reads the unlock counter before the lock counter, this achieves the same end without having to increment two different counters in srcu_read_lock(). This also saves a smp_mb() in srcu_readers_active_idx_check(). Possible bug: There is no guarantee that the lock counter won't overflow during srcu_readers_active_idx_check(), as there are no memory barriers around srcu_flip() (see comment in srcu_readers_active_idx_check() for details). However, this problem was already present before this patch. Suggested-by: Mathieu Desnoyers Signed-off-by: Lance Roy Cc: Paul E. McKenney Cc: Lai Jiangshan Cc: Peter Zijlstra Signed-off-by: Paul E. 
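McKenney

A condensed sketch of the resulting counting scheme (simplified from the patch below; memory barriers and overflow considerations elided):

  /* Readers only ever increment; the rank is picked by ->completed & 1. */
  int __srcu_read_lock(struct srcu_struct *sp)
  {
          int idx = READ_ONCE(sp->completed) & 0x1;

          __this_cpu_inc(sp->per_cpu_ref->lock_count[idx]);
          return idx;
  }

  void __srcu_read_unlock(struct srcu_struct *sp, int idx)
  {
          this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]);
  }

  /* A grace period may end once, summed over all CPUs,
   * lock_count[idx] == unlock_count[idx], i.e. no pre-existing readers
   * remain on that rank.
   */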
--- include/linux/srcu.h | 10 ++-- kernel/rcu/rcutorture.c | 19 +++++++- kernel/rcu/srcu.c | 122 +++++++++++++++++------------------------------- 3 files changed, 66 insertions(+), 85 deletions(-) (limited to 'kernel') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index dc8eb63c6568..a598cf3ac70c 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -33,9 +33,9 @@ #include #include -struct srcu_struct_array { - unsigned long c[2]; - unsigned long seq[2]; +struct srcu_array { + unsigned long lock_count[2]; + unsigned long unlock_count[2]; }; struct rcu_batch { @@ -46,7 +46,7 @@ struct rcu_batch { struct srcu_struct { unsigned long completed; - struct srcu_struct_array __percpu *per_cpu_ref; + struct srcu_array __percpu *per_cpu_ref; spinlock_t queue_lock; /* protect ->batch_queue, ->running */ bool running; /* callbacks just queued */ @@ -118,7 +118,7 @@ void process_srcu(struct work_struct *work); * See include/linux/percpu-defs.h for the rules on per-CPU variables. */ #define __DEFINE_SRCU(name, is_static) \ - static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\ + static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 87c51225ceec..d81345be730e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -564,10 +564,25 @@ static void srcu_torture_stats(void) pr_alert("%s%s per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { + unsigned long l0, l1; + unsigned long u0, u1; long c0, c1; + struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); - c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx]; - c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx]; + u0 = counts->unlock_count[!idx]; + u1 = counts->unlock_count[idx]; + + /* + * Make sure that a lock is always counted if the corresponding + * unlock is counted. + */ + smp_rmb(); + + l0 = counts->lock_count[!idx]; + l1 = counts->lock_count[idx]; + + c0 = l0 - u0; + c1 = l1 - u1; pr_cont(" %d(%ld,%ld)", cpu, c0, c1); } pr_cont("\n"); diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 9b9cdd549caa..c9a0015e1c2e 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -106,7 +106,7 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) rcu_batch_init(&sp->batch_check1); rcu_batch_init(&sp->batch_done); INIT_DELAYED_WORK(&sp->work, process_srcu); - sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); + sp->per_cpu_ref = alloc_percpu(struct srcu_array); return sp->per_cpu_ref ? 0 : -ENOMEM; } @@ -141,114 +141,77 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* - * Returns approximate total of the readers' ->seq[] values for the + * Returns approximate total of the readers' ->lock_count[] values for the * rank of per-CPU counters specified by idx.
*/ -static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx) +static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) { int cpu; unsigned long sum = 0; - unsigned long t; for_each_possible_cpu(cpu) { - t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]); - sum += t; + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + + sum += READ_ONCE(cpuc->lock_count[idx]); } return sum; } /* - * Returns approximate number of readers active on the specified rank - * of the per-CPU ->c[] counters. + * Returns approximate total of the readers' ->unlock_count[] values for the + * rank of per-CPU counters specified by idx. */ -static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx) +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) { int cpu; unsigned long sum = 0; - unsigned long t; for_each_possible_cpu(cpu) { - t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); - sum += t; + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + + sum += READ_ONCE(cpuc->unlock_count[idx]); } return sum; } /* * Return true if the number of pre-existing readers is determined to - * be stably zero. An example unstable zero can occur if the call - * to srcu_readers_active_idx() misses an __srcu_read_lock() increment, - * but due to task migration, sees the corresponding __srcu_read_unlock() - * decrement. This can happen because srcu_readers_active_idx() takes - * time to sum the array, and might in fact be interrupted or preempted - * partway through the summation. + * be zero. */ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) { - unsigned long seq; + unsigned long unlocks; - seq = srcu_readers_seq_idx(sp, idx); + unlocks = srcu_readers_unlock_idx(sp, idx); /* - * The following smp_mb() A pairs with the smp_mb() B located in - * __srcu_read_lock(). This pairing ensures that if an - * __srcu_read_lock() increments its counter after the summation - * in srcu_readers_active_idx(), then the corresponding SRCU read-side - * critical section will see any changes made prior to the start - * of the current SRCU grace period. + * Make sure that a lock is always counted if the corresponding unlock + * is counted. Needs to be a smp_mb() as the read side may contain a + * read from a variable that is written to before the synchronize_srcu() + * in the write side. In this case smp_mb()s A and B act like the store + * buffering pattern. * - * Also, if the above call to srcu_readers_seq_idx() saw the - * increment of ->seq[], then the call to srcu_readers_active_idx() - * must see the increment of ->c[]. + * This smp_mb() also pairs with smp_mb() C to prevent accesses after the + * synchronize_srcu() from being executed before the grace period ends. */ smp_mb(); /* A */ /* - * Note that srcu_readers_active_idx() can incorrectly return - * zero even though there is a pre-existing reader throughout. - * To see this, suppose that task A is in a very long SRCU - * read-side critical section that started on CPU 0, and that - * no other reader exists, so that the sum of the counters - * is equal to one. Then suppose that task B starts executing - * srcu_readers_active_idx(), summing up to CPU 1, and then that - * task C starts reading on CPU 0, so that its increment is not - * summed, but finishes reading on CPU 2, so that its decrement - * -is- summed. 
Then when task B completes its sum, it will - * incorrectly get zero, despite the fact that task A has been - * in its SRCU read-side critical section the whole time. + * If the locks are the same as the unlocks, then there must have + * been no readers on this index at some time in between. This does not + * mean that there are no more readers, as one could have read the + * current index but not have incremented the lock counter yet. * - * We therefore do a validation step should srcu_readers_active_idx() - * return zero. + * Possible bug: There is no guarantee that there haven't been ULONG_MAX + * increments of ->lock_count[] since the unlocks were counted, meaning + * that this could return true even if there are still active readers. + * Since there are no memory barriers around srcu_flip(), the CPU is not + * required to increment ->completed before running + * srcu_readers_unlock_idx(), which means that there could be an + * arbitrarily large number of critical sections that execute after + * srcu_readers_unlock_idx() but use the old value of ->completed. */ - if (srcu_readers_active_idx(sp, idx) != 0) - return false; - - /* - * The remainder of this function is the validation step. - * The following smp_mb() D pairs with the smp_mb() C in - * __srcu_read_unlock(). If the __srcu_read_unlock() was seen - * by srcu_readers_active_idx() above, then any destructive - * operation performed after the grace period will happen after - * the corresponding SRCU read-side critical section. - * - * Note that there can be at most NR_CPUS worth of readers using - * the old index, which is not enough to overflow even a 32-bit - * integer. (Yes, this does mean that systems having more than - * a billion or so CPUs need to be 64-bit systems.) Therefore, - * the sum of the ->seq[] counters cannot possibly overflow. - * Therefore, the only way that the return values of the two - * calls to srcu_readers_seq_idx() can be equal is if there were - * no increments of the corresponding rank of ->seq[] counts - * in the interim. But the missed-increment scenario laid out - * above includes an increment of the ->seq[] counter by - * the corresponding __srcu_read_lock(). Therefore, if this - * scenario occurs, the return values from the two calls to - * srcu_readers_seq_idx() will differ, and thus the validation - * step below suffices. - */ - smp_mb(); /* D */ - - return srcu_readers_seq_idx(sp, idx) == seq; + return srcu_readers_lock_idx(sp, idx) == unlocks; } /** @@ -266,8 +229,12 @@ static bool srcu_readers_active(struct srcu_struct *sp) unsigned long sum = 0; for_each_possible_cpu(cpu) { - sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]); - sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]); + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + + sum += READ_ONCE(cpuc->lock_count[0]); + sum += READ_ONCE(cpuc->lock_count[1]); + sum -= READ_ONCE(cpuc->unlock_count[0]); + sum -= READ_ONCE(cpuc->unlock_count[1]); } return sum; } @@ -298,9 +265,8 @@ int __srcu_read_lock(struct srcu_struct *sp) int idx; idx = READ_ONCE(sp->completed) & 0x1; - __this_cpu_inc(sp->per_cpu_ref->c[idx]); + __this_cpu_inc(sp->per_cpu_ref->lock_count[idx]); smp_mb(); /* B */ /* Avoid leaking the critical section. */ - __this_cpu_inc(sp->per_cpu_ref->seq[idx]); return idx; } EXPORT_SYMBOL_GPL(__srcu_read_lock); @@ -314,7 +280,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); void __srcu_read_unlock(struct srcu_struct *sp, int idx) { smp_mb(); /* C */ /* Avoid leaking the critical section. 
*/ - this_cpu_dec(sp->per_cpu_ref->c[idx]); + this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -349,7 +315,7 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) /* * Increment the ->completed counter so that future SRCU readers will - * use the other rank of the ->c[] and ->seq[] arrays. This allows + * use the other rank of the ->(un)lock_count[] arrays. This allows * us to wait for pre-existing readers in a starvation-free manner. */ static void srcu_flip(struct srcu_struct *sp) -- cgit v1.2.3 From d85b62f18d543c663cbdd6061054efeb9e66cee7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Nov 2016 12:08:49 -0800 Subject: srcu: Force full grace-period ordering If a process invokes synchronize_srcu(), is delayed just the right amount of time, and thus does not sleep when waiting for the grace period to complete, there is no ordering between the end of the grace period and the code following the synchronize_srcu(). Similarly, there can be a lack of ordering between the end of the SRCU grace period and callback invocation. This commit adds the necessary ordering. Reported-by: Lance Roy Signed-off-by: Paul E. McKenney [ paulmck: Further smp_mb() adjustment per email with Lance Roy. ] --- include/linux/rcupdate.h | 12 ++++++++++++ kernel/rcu/srcu.c | 10 ++++++++-- kernel/rcu/tree.h | 12 ------------ 3 files changed, 20 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 01f71e1d2e94..6ade6a52d9d4 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1161,5 +1161,17 @@ do { \ ftrace_dump(oops_dump_mode); \ } while (0) +/* + * Place this after a lock-acquisition primitive to guarantee that + * an UNLOCK+LOCK pair acts as a full barrier. This guarantee applies + * if the UNLOCK and LOCK are executed by the same CPU or if the + * UNLOCK and LOCK operate on the same lock variable. + */ +#ifdef CONFIG_PPC +#define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ +#else /* #ifdef CONFIG_PPC */ +#define smp_mb__after_unlock_lock() do { } while (0) +#endif /* #else #ifdef CONFIG_PPC */ + #endif /* __LINUX_RCUPDATE_H */ diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index c9a0015e1c2e..665bc9951523 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -358,6 +358,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, head->next = NULL; head->func = func; spin_lock_irqsave(&sp->queue_lock, flags); + smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ rcu_batch_queue(&sp->batch_queue, head); if (!sp->running) { sp->running = true; @@ -391,6 +392,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) head->next = NULL; head->func = wakeme_after_rcu; spin_lock_irq(&sp->queue_lock); + smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ if (!sp->running) { /* steal the processing owner */ sp->running = true; @@ -410,8 +412,11 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) spin_unlock_irq(&sp->queue_lock); } - if (!done) + if (!done) { wait_for_completion(&rcu.completion); + smp_mb(); /* Caller's later accesses after GP. */ + } + } /** @@ -579,7 +584,8 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount) /* * Invoke a limited number of SRCU callbacks that have passed through * their grace period. If there are more to do, SRCU will reschedule - * the workqueue. + * the workqueue. 
Note that needed memory barriers have been executed + * in this task's context by srcu_readers_active_idx_check(). */ static void srcu_invoke_callbacks(struct srcu_struct *sp) { diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index fe98dd24adf8..abcc25bdcb29 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -687,18 +687,6 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) } #endif /* #ifdef CONFIG_RCU_TRACE */ -/* - * Place this after a lock-acquisition primitive to guarantee that - * an UNLOCK+LOCK pair act as a full barrier. This guarantee applies - * if the UNLOCK and LOCK are executed by the same CPU or if the - * UNLOCK and LOCK operate on the same lock variable. - */ -#ifdef CONFIG_PPC -#define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ -#else /* #ifdef CONFIG_PPC */ -#define smp_mb__after_unlock_lock() do { } while (0) -#endif /* #else #ifdef CONFIG_PPC */ - /* * Wrappers for the rcu_node::lock acquire and release. * -- cgit v1.2.3 From 7f554a3d05bea9f6b7bf8e0b041d09447f82d74a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Jan 2017 08:51:34 -0800 Subject: srcu: Reduce probability of SRCU ->unlock_count[] counter overflow Because there are no memory barriers between the srcu_flip() ->completed increment and the summation of the read-side ->unlock_count[] counters, both the compiler and the CPU can reorder the summation with the ->completed increment. If the updater is preempted long enough during this process, the read-side counters could overflow, resulting in a too-short grace period. This commit therefore adds a memory barrier just after the ->completed increment, ensuring that if the summation misses an increment of ->unlock_count[] from __srcu_read_unlock(), the next __srcu_read_lock() will see the new value of ->completed, thus bounding the number of ->unlock_count[] increments that can be missed to NR_CPUS. The actual overflow computation is more complex due to the possibility of nesting of __srcu_read_lock(). Reported-by: Lance Roy Signed-off-by: Paul E. McKenney --- kernel/rcu/srcu.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 665bc9951523..e773129c8b08 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -320,7 +320,16 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) */ static void srcu_flip(struct srcu_struct *sp) { - sp->completed++; + WRITE_ONCE(sp->completed, sp->completed + 1); + + /* + * Ensure that if the updater misses an __srcu_read_unlock() + * increment, that task's next __srcu_read_lock() will see the + * above counter update. Note that both this memory barrier + * and the one in srcu_readers_active_idx_check() provide the + * guarantee for __srcu_read_lock(). + */ + smp_mb(); /* D */ /* Pairs with C. */ } /* -- cgit v1.2.3 From 47087eeb744c83482774e8f6dc20cf2b11fff53f Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 19 Dec 2016 23:03:11 +0800 Subject: PM / Hibernate: Use rb_entry() instead of container_of() To make the code clearer, use rb_entry() instead of container_of() to deal with rbtree. Signed-off-by: Geliang Tang Acked-by: Pavel Machek Signed-off-by: Rafael J. 
Wysocki --- kernel/power/swap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 32e0c232efba..f80fd33639e0 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -201,7 +201,7 @@ void free_all_swap_pages(int swap) struct swsusp_extent *ext; unsigned long offset; - ext = container_of(node, struct swsusp_extent, node); + ext = rb_entry(node, struct swsusp_extent, node); rb_erase(node, &swsusp_extents); for (offset = ext->start; offset <= ext->end; offset++) swap_free(swp_entry(swap, offset)); -- cgit v1.2.3 From b18b6a9cef7f30e9a8b7738d5fc8d568cf660855 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Sat, 21 Jan 2017 00:09:08 -0500 Subject: timers: Omit POSIX timer stuff from task_struct when disabled When CONFIG_POSIX_TIMERS is disabled, it is preferable to remove related structures from struct task_struct and struct signal_struct as they won't contain anything useful and shouldn't be relied upon by mistake. Code still referencing those structures is also disabled here. Signed-off-by: Nicolas Pitre Signed-off-by: John Stultz --- fs/proc/base.c | 4 ++-- include/linux/init_task.h | 40 +++++++++++++++++++++++++--------------- include/linux/sched.h | 13 ++++++++++--- kernel/fork.c | 10 +++++++++- kernel/sched/rt.c | 4 ++++ kernel/sched/stats.h | 32 ++++++++++++++++++++------------ 6 files changed, 70 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/fs/proc/base.c b/fs/proc/base.c index 8e7e61b28f31..03deeac1e835 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2179,7 +2179,7 @@ static const struct file_operations proc_map_files_operations = { .llseek = generic_file_llseek, }; -#ifdef CONFIG_CHECKPOINT_RESTORE +#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS) struct timers_private { struct pid *pid; struct task_struct *task; @@ -2936,7 +2936,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), #endif -#ifdef CONFIG_CHECKPOINT_RESTORE +#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS) REG("timers", S_IRUGO, proc_timers_operations), #endif REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations), diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 325f649d77ff..3a85d61f7614 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -42,6 +42,27 @@ extern struct fs_struct init_fs; #define INIT_PREV_CPUTIME(x) #endif +#ifdef CONFIG_POSIX_TIMERS +#define INIT_POSIX_TIMERS(s) \ + .posix_timers = LIST_HEAD_INIT(s.posix_timers), +#define INIT_CPU_TIMERS(s) \ + .cpu_timers = { \ + LIST_HEAD_INIT(s.cpu_timers[0]), \ + LIST_HEAD_INIT(s.cpu_timers[1]), \ + LIST_HEAD_INIT(s.cpu_timers[2]), \ + }, +#define INIT_CPUTIMER(s) \ + .cputimer = { \ + .cputime_atomic = INIT_CPUTIME_ATOMIC, \ + .running = false, \ + .checking_timer = false, \ + }, +#else +#define INIT_POSIX_TIMERS(s) +#define INIT_CPU_TIMERS(s) +#define INIT_CPUTIMER(s) +#endif + #define INIT_SIGNALS(sig) { \ .nr_threads = 1, \ .thread_head = LIST_HEAD_INIT(init_task.thread_node), \ @@ -49,14 +70,10 @@ extern struct fs_struct init_fs; .shared_pending = { \ .list = LIST_HEAD_INIT(sig.shared_pending.list), \ .signal = {{0}}}, \ - .posix_timers = LIST_HEAD_INIT(sig.posix_timers), \ - .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \ + INIT_POSIX_TIMERS(sig) \ + INIT_CPU_TIMERS(sig) \ .rlim = INIT_RLIMITS, \ - 
.cputimer = { \ - .cputime_atomic = INIT_CPUTIME_ATOMIC, \ - .running = false, \ - .checking_timer = false, \ - }, \ + INIT_CPUTIMER(sig) \ INIT_PREV_CPUTIME(sig) \ .cred_guard_mutex = \ __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ @@ -247,7 +264,7 @@ extern struct task_group root_task_group; .blocked = {{0}}, \ .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \ .journal_info = NULL, \ - .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ + INIT_CPU_TIMERS(tsk) \ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ .timer_slack_ns = 50000, /* 50 usec default slack */ \ .pids = { \ @@ -274,13 +291,6 @@ extern struct task_group root_task_group; } -#define INIT_CPU_TIMERS(cpu_timers) \ -{ \ - LIST_HEAD_INIT(cpu_timers[0]), \ - LIST_HEAD_INIT(cpu_timers[1]), \ - LIST_HEAD_INIT(cpu_timers[2]), \ -} - /* Attach to the init_task data structure for proper alignment */ #define __init_task_data __attribute__((__section__(".data..init_task"))) diff --git a/include/linux/sched.h b/include/linux/sched.h index 4d1905245c7a..e8f6af59726d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -734,13 +734,14 @@ struct signal_struct { unsigned int is_child_subreaper:1; unsigned int has_child_subreaper:1; +#ifdef CONFIG_POSIX_TIMERS + /* POSIX.1b Interval Timers */ int posix_timer_id; struct list_head posix_timers; /* ITIMER_REAL timer for the process */ struct hrtimer real_timer; - struct pid *leader_pid; ktime_t it_real_incr; /* @@ -759,12 +760,16 @@ struct signal_struct { /* Earliest-expiration cache. */ struct task_cputime cputime_expires; + struct list_head cpu_timers[3]; + +#endif + + struct pid *leader_pid; + #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; #endif - struct list_head cpu_timers[3]; - struct pid *tty_old_pgrp; /* boolean value for session group leader */ @@ -1681,8 +1686,10 @@ struct task_struct { /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ unsigned long min_flt, maj_flt; +#ifdef CONFIG_POSIX_TIMERS struct task_cputime cputime_expires; struct list_head cpu_timers[3]; +#endif /* process credentials */ const struct cred __rcu *ptracer_cred; /* Tracer's credentials at attach */ diff --git a/kernel/fork.c b/kernel/fork.c index 11c5c8ab827c..105c6676d93b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1304,6 +1304,7 @@ void __cleanup_sighand(struct sighand_struct *sighand) } } +#ifdef CONFIG_POSIX_TIMERS /* * Initialize POSIX timer handling for a thread group. */ @@ -1322,6 +1323,9 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) INIT_LIST_HEAD(&sig->cpu_timers[1]); INIT_LIST_HEAD(&sig->cpu_timers[2]); } +#else +static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { } +#endif static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) { @@ -1346,11 +1350,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) init_waitqueue_head(&sig->wait_chldexit); sig->curr_target = tsk; init_sigpending(&sig->shared_pending); - INIT_LIST_HEAD(&sig->posix_timers); seqlock_init(&sig->stats_lock); prev_cputime_init(&sig->prev_cputime); #ifdef CONFIG_POSIX_TIMERS + INIT_LIST_HEAD(&sig->posix_timers); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; #endif @@ -1425,6 +1429,7 @@ static void rt_mutex_init_task(struct task_struct *p) #endif } +#ifdef CONFIG_POSIX_TIMERS /* * Initialize POSIX timer handling for a single task. 
*/ @@ -1437,6 +1442,9 @@ static void posix_cpu_timers_init(struct task_struct *tsk) INIT_LIST_HEAD(&tsk->cpu_timers[1]); INIT_LIST_HEAD(&tsk->cpu_timers[2]); } +#else +static inline void posix_cpu_timers_init(struct task_struct *tsk) { } +#endif static inline void init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2516b8df6dbb..a688a8206727 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2246,6 +2246,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) } } +#ifdef CONFIG_POSIX_TIMERS static void watchdog(struct rq *rq, struct task_struct *p) { unsigned long soft, hard; @@ -2267,6 +2268,9 @@ static void watchdog(struct rq *rq, struct task_struct *p) p->cputime_expires.sched_exp = p->se.sum_exec_runtime; } } +#else +static inline void watchdog(struct rq *rq, struct task_struct *p) { } +#endif static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) { diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 34659a853505..c69a9870ab79 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -172,18 +172,19 @@ sched_info_switch(struct rq *rq, */ /** - * cputimer_running - return true if cputimer is running + * get_running_cputimer - return &tsk->signal->cputimer if cputimer is running * * @tsk: Pointer to target task. */ -static inline bool cputimer_running(struct task_struct *tsk) - +#ifdef CONFIG_POSIX_TIMERS +static inline +struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; /* Check if cputimer isn't running. This is accessed without locking. */ if (!READ_ONCE(cputimer->running)) - return false; + return NULL; /* * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime @@ -200,10 +201,17 @@ static inline bool cputimer_running(struct task_struct *tsk) * clock delta is behind the expiring timer value. */ if (unlikely(!tsk->sighand)) - return false; + return NULL; - return true; + return cputimer; +} +#else +static inline +struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk) +{ + return NULL; } +#endif /** * account_group_user_time - Maintain utime for a thread group. 
@@ -218,9 +226,9 @@ static inline bool cputimer_running(struct task_struct *tsk) static inline void account_group_user_time(struct task_struct *tsk, cputime_t cputime) { - struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct thread_group_cputimer *cputimer = get_running_cputimer(tsk); - if (!cputimer_running(tsk)) + if (!cputimer) return; atomic64_add(cputime, &cputimer->cputime_atomic.utime); @@ -239,9 +247,9 @@ static inline void account_group_user_time(struct task_struct *tsk, static inline void account_group_system_time(struct task_struct *tsk, cputime_t cputime) { - struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct thread_group_cputimer *cputimer = get_running_cputimer(tsk); - if (!cputimer_running(tsk)) + if (!cputimer) return; atomic64_add(cputime, &cputimer->cputime_atomic.stime); @@ -260,9 +268,9 @@ static inline void account_group_system_time(struct task_struct *tsk, static inline void account_group_exec_runtime(struct task_struct *tsk, unsigned long long ns) { - struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct thread_group_cputimer *cputimer = get_running_cputimer(tsk); - if (!cputimer_running(tsk)) + if (!cputimer) return; atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime); -- cgit v1.2.3 From 48b77ad6084481ef9330a5d2bee289966da0975b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Jan 2017 09:35:54 +0100 Subject: block: cleanup tracing A couple of tweaks to the tracing code: - trace the request size for all requests - trace request sector and nr_sectors only for fs requests, enforced by helpers - drop SCSI CDB tracing - we have SCSI tracing for this and are going to move the CDB out of the generic struct request soon. With this the tracing code no longer knows about BLOCK_PC requests at all; it's just FS vs passthrough requests now, where the latter includes any driver-private requests. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blktrace_api.h | 14 +++++++------- include/trace/events/block.h | 27 +++++++++++---------------- kernel/trace/blktrace.c | 43 ++++++------------------------------------- 3 files changed, 24 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index e417f080219a..a143a67a6f33 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -110,16 +110,16 @@ struct compat_blk_user_trace_setup { #endif -#if defined(CONFIG_EVENT_TRACING) && defined(CONFIG_BLOCK) +extern void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes); -static inline int blk_cmd_buf_len(struct request *rq) +static inline sector_t blk_rq_trace_sector(struct request *rq) { - return (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? rq->cmd_len * 3 : 1; + return (rq->cmd_type != REQ_TYPE_FS) ? 0 : blk_rq_pos(rq); } -extern void blk_dump_cmd(char *buf, struct request *rq); -extern void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes); - -#endif /* CONFIG_EVENT_TRACING && CONFIG_BLOCK */ +static inline unsigned int blk_rq_trace_nr_sectors(struct request *rq) +{ + return (rq->cmd_type != REQ_TYPE_FS) ?
0 : blk_rq_sectors(rq); +} #endif diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 3e02e3a25413..a88ed13446ff 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -73,19 +73,17 @@ DECLARE_EVENT_CLASS(block_rq_with_error, __field( unsigned int, nr_sector ) __field( int, errors ) __array( char, rwbs, RWBS_LEN ) - __dynamic_array( char, cmd, blk_cmd_buf_len(rq) ) + __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; - __entry->sector = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? - 0 : blk_rq_pos(rq); - __entry->nr_sector = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? - 0 : blk_rq_sectors(rq); + __entry->sector = blk_rq_trace_sector(rq); + __entry->nr_sector = blk_rq_trace_nr_sectors(rq); __entry->errors = rq->errors; blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); - blk_dump_cmd(__get_str(cmd), rq); + __get_str(cmd)[0] = '\0'; ), TP_printk("%d,%d %s (%s) %llu + %u [%d]", @@ -153,7 +151,7 @@ TRACE_EVENT(block_rq_complete, __field( unsigned int, nr_sector ) __field( int, errors ) __array( char, rwbs, RWBS_LEN ) - __dynamic_array( char, cmd, blk_cmd_buf_len(rq) ) + __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( @@ -163,7 +161,7 @@ TRACE_EVENT(block_rq_complete, __entry->errors = rq->errors; blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, nr_bytes); - blk_dump_cmd(__get_str(cmd), rq); + __get_str(cmd)[0] = '\0'; ), TP_printk("%d,%d %s (%s) %llu + %u [%d]", @@ -186,20 +184,17 @@ DECLARE_EVENT_CLASS(block_rq, __field( unsigned int, bytes ) __array( char, rwbs, RWBS_LEN ) __array( char, comm, TASK_COMM_LEN ) - __dynamic_array( char, cmd, blk_cmd_buf_len(rq) ) + __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; - __entry->sector = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? - 0 : blk_rq_pos(rq); - __entry->nr_sector = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? - 0 : blk_rq_sectors(rq); - __entry->bytes = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? 
- blk_rq_bytes(rq) : 0; + __entry->sector = blk_rq_trace_sector(rq); + __entry->nr_sector = blk_rq_trace_nr_sectors(rq); + __entry->bytes = blk_rq_bytes(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); - blk_dump_cmd(__get_str(cmd), rq); + __get_str(cmd)[0] = '\0'; memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 95cecbf67f5c..512b34703ac3 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -712,15 +712,13 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, if (likely(!bt)) return; - if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { + if (rq->cmd_type != REQ_TYPE_FS) what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, nr_bytes, req_op(rq), rq->cmd_flags, - what, rq->errors, rq->cmd_len, rq->cmd); - } else { + else what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, req_op(rq), - rq->cmd_flags, what, rq->errors, 0, NULL); - } + + __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), + rq->cmd_flags, what, rq->errors, 0, NULL); } static void blk_add_trace_rq_abort(void *ignore, @@ -972,11 +970,7 @@ void blk_add_driver_data(struct request_queue *q, if (likely(!bt)) return; - if (rq->cmd_type == REQ_TYPE_BLOCK_PC) - __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 0, - BLK_TA_DRV_DATA, rq->errors, len, data); - else - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, 0, + __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, BLK_TA_DRV_DATA, rq->errors, len, data); } EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -1752,31 +1746,6 @@ void blk_trace_remove_sysfs(struct device *dev) #ifdef CONFIG_EVENT_TRACING -void blk_dump_cmd(char *buf, struct request *rq) -{ - int i, end; - int len = rq->cmd_len; - unsigned char *cmd = rq->cmd; - - if (rq->cmd_type != REQ_TYPE_BLOCK_PC) { - buf[0] = '\0'; - return; - } - - for (end = len - 1; end >= 0; end--) - if (cmd[end]) - break; - end++; - - for (i = 0; i < len; i++) { - buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]); - if (i == end && end != len - 1) { - sprintf(buf, " .."); - break; - } - } -} - void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes) { int i = 0; -- cgit v1.2.3 From 4009f4b3a9d8b74547269f293e6a920adf278996 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 19 Jan 2017 11:32:34 -0500 Subject: locking/rtmutex: Flip unlikely() branch to likely() in __rt_mutex_slowlock() Running my likely/unlikely profiler for 3 weeks on two production machines, I discovered that the unlikely() test in __rt_mutex_slowlock() checking if state is TASK_INTERRUPTIBLE is hit 100% of the time, making it a very likely case. The reason is, on a vanilla kernel, the majority case of calling rt_mutex() is from the futex code. This code is always called as TASK_INTERRUPTIBLE. In the -rt patch, this code is commonly called when PREEMPT_RT is enabled with TASK_UNINTERRUPTIBLE. But that's not the likely scenario. The rt_mutex() code should be optimized for the common vanilla case, and that is from a futex, with TASK_INTERRUPTIBLE as the state. 
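For readers less familiar with these annotations: likely()/unlikely() are thin wrappers around a compiler branch-prediction hint, so a hint that is wrong nearly 100% of the time pessimizes exactly the path taken most often. A minimal sketch of how the kernel defines them (see include/linux/compiler.h):

	#define likely(x)	__builtin_expect(!!(x), 1)	/* condition expected true */
	#define unlikely(x)	__builtin_expect(!!(x), 0)	/* condition expected false */

The patch below therefore changes only the hint; the check itself is unchanged.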
Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170119113234.1efeedd1@gandalf.local.home Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 2f443ed2320a..d340be3a488f 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1179,7 +1179,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, * TASK_INTERRUPTIBLE checks for signals and * timeout. Ignored otherwise. */ - if (unlikely(state == TASK_INTERRUPTIBLE)) { + if (likely(state == TASK_INTERRUPTIBLE)) { /* Signal pending? */ if (signal_pending(current)) ret = -EINTR; -- cgit v1.2.3 From b9c16a0e1f733c97e48798b2a9362c485bb3b731 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Jan 2017 16:06:09 +0100 Subject: locking/mutex: Fix lockdep_assert_held() fail In commit: 659cf9f5824a ("locking/ww_mutex: Optimize ww-mutexes by waking at most one waiter for backoff when acquiring the lock") I replaced a comment with a lockdep_assert_held(). However it turns out we hide that lock from lockdep for hysterical raisins, which results in the assertion always firing. Remove the old debug code as lockdep will easily spot the abuse it was meant to catch, which will make the lock visible to lockdep and make the assertion work as intended. Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Nicolai Haehnle Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 659cf9f5824a ("locking/ww_mutex: Optimize ww-mutexes by waking at most one waiter for backoff when acquiring the lock") Link: http://lkml.kernel.org/r/20170117150609.GB32474@worktop Signed-off-by: Ingo Molnar --- kernel/locking/mutex-debug.h | 17 ----------------- kernel/locking/mutex.c | 25 +++++++++++-------------- kernel/locking/mutex.h | 4 ---- 3 files changed, 11 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index a459faa48987..4174417d5309 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -26,20 +26,3 @@ extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); - -#define spin_lock_mutex(lock, flags) \ - do { \ - struct mutex *l = container_of(lock, struct mutex, wait_lock); \ - \ - DEBUG_LOCKS_WARN_ON(in_interrupt()); \ - local_irq_save(flags); \ - arch_spin_lock(&(lock)->rlock.raw_lock);\ - DEBUG_LOCKS_WARN_ON(l->magic != l); \ - } while (0) - -#define spin_unlock_mutex(lock, flags) \ - do { \ - arch_spin_unlock(&(lock)->rlock.raw_lock); \ - local_irq_restore(flags); \ - preempt_check_resched(); \ - } while (0) diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 935116723a3d..705e06fe5e6c 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -325,8 +325,6 @@ __ww_mutex_wakeup_for_backoff(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) static __always_inline void ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - unsigned long flags; - ww_mutex_lock_acquired(lock, ctx); lock->ctx = ctx; @@ -350,9 +348,9 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) * Uh oh, we raced in 
fastpath, wake up everyone in this case, * so they can see the new lock->ctx. */ - spin_lock_mutex(&lock->base.wait_lock, flags); + spin_lock(&lock->base.wait_lock); __ww_mutex_wakeup_for_backoff(&lock->base, ctx); - spin_unlock_mutex(&lock->base.wait_lock, flags); + spin_unlock(&lock->base.wait_lock); } /* @@ -740,7 +738,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) { struct mutex_waiter waiter; - unsigned long flags; bool first = false; struct ww_mutex *ww; int ret; @@ -766,7 +763,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, return 0; } - spin_lock_mutex(&lock->wait_lock, flags); + spin_lock(&lock->wait_lock); /* * After waiting to acquire the wait_lock, try again. */ @@ -830,7 +827,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto err; } - spin_unlock_mutex(&lock->wait_lock, flags); + spin_unlock(&lock->wait_lock); schedule_preempt_disabled(); /* @@ -853,9 +850,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, (first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, &waiter))) break; - spin_lock_mutex(&lock->wait_lock, flags); + spin_lock(&lock->wait_lock); } - spin_lock_mutex(&lock->wait_lock, flags); + spin_lock(&lock->wait_lock); acquired: __set_current_state(TASK_RUNNING); @@ -872,7 +869,7 @@ skip_wait: if (use_ww_ctx && ww_ctx) ww_mutex_set_context_slowpath(ww, ww_ctx); - spin_unlock_mutex(&lock->wait_lock, flags); + spin_unlock(&lock->wait_lock); preempt_enable(); return 0; @@ -880,7 +877,7 @@ err: __set_current_state(TASK_RUNNING); mutex_remove_waiter(lock, &waiter, current); err_early_backoff: - spin_unlock_mutex(&lock->wait_lock, flags); + spin_unlock(&lock->wait_lock); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, 1, ip); preempt_enable(); @@ -999,8 +996,8 @@ EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible); static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip) { struct task_struct *next = NULL; - unsigned long owner, flags; DEFINE_WAKE_Q(wake_q); + unsigned long owner; mutex_release(&lock->dep_map, 1, ip); @@ -1035,7 +1032,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne owner = old; } - spin_lock_mutex(&lock->wait_lock, flags); + spin_lock(&lock->wait_lock); debug_mutex_unlock(lock); if (!list_empty(&lock->wait_list)) { /* get the first entry from the wait-list: */ @@ -1052,7 +1049,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne if (owner & MUTEX_FLAG_HANDOFF) __mutex_handoff(lock, next); - spin_unlock_mutex(&lock->wait_lock, flags); + spin_unlock(&lock->wait_lock); wake_up_q(&wake_q); } diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 4410a4af42a3..6ebc1902f779 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -9,10 +9,6 @@ * !CONFIG_DEBUG_MUTEXES case. 
Most of them are NOPs: */ -#define spin_lock_mutex(lock, flags) \ - do { spin_lock(lock); (void)(flags); } while (0) -#define spin_unlock_mutex(lock, flags) \ - do { spin_unlock(lock); (void)(flags); } while (0) #define mutex_remove_waiter(lock, waiter, task) \ __list_del((waiter)->list.prev, (waiter)->list.next) -- cgit v1.2.3 From 49ee576809d837442624ac18804b07943267cd57 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jan 2017 18:44:08 +0100 Subject: sched/core: Optimize pick_next_task() for idle_sched_class Steve noticed that when we switch from IDLE to SCHED_OTHER we fail to take the shortcut, even though all runnable tasks are of the fair class, because prev->sched_class != &fair_sched_class. Since I reworked the put_prev_task() stuff, we don't really care about prev->class here, so removing that condition will allow this case. This increases the likely case from 78% to 98% correct for Steve's workload. Reported-by: Steven Rostedt (VMware) Tested-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170119174408.GN6485@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 49ce1cb3d320..51ca21ef7ae5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3321,15 +3321,14 @@ static inline void schedule_debug(struct task_struct *prev) static inline struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - const struct sched_class *class = &fair_sched_class; + const struct sched_class *class; struct task_struct *p; /* * Optimization: we know that if all tasks are in * the fair class we can call that function directly: */ - if (likely(prev->sched_class == class && - rq->nr_running == rq->cfs.h_nr_running)) { + if (likely(rq->nr_running == rq->cfs.h_nr_running)) { p = fair_sched_class.pick_next_task(rq, prev, rf); if (unlikely(p == RETRY_TASK)) goto again; -- cgit v1.2.3 From 1b1d62254df0fe42a711eb71948f915918987790 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Jan 2017 16:05:55 +0100 Subject: sched/core: Add missing update_rq_clock() call in sched_move_task() Bug was noticed via this warning: WARNING: CPU: 6 PID: 1 at kernel/sched/sched.h:804 detach_task_cfs_rq+0x8e8/0xb80 rq->clock_update_flags < RQCF_ACT_SKIP Modules linked in: CPU: 6 PID: 1 Comm: systemd Not tainted 4.10.0-rc5-00140-g0874170baf55-dirty #1 Hardware name: Supermicro SYS-4048B-TRFT/X10QBi, BIOS 1.0 04/11/2014 Call Trace: dump_stack+0x4d/0x65 __warn+0xcb/0xf0 warn_slowpath_fmt+0x5f/0x80 detach_task_cfs_rq+0x8e8/0xb80 ? 
allocate_cgrp_cset_links+0x59/0x80 task_change_group_fair+0x27/0x150 sched_change_group+0x48/0xf0 sched_move_task+0x53/0x150 cpu_cgroup_attach+0x36/0x70 cgroup_taskset_migrate+0x175/0x300 cgroup_migrate+0xab/0xd0 cgroup_attach_task+0xf0/0x190 __cgroup_procs_write+0x1ed/0x2f0 cgroup_procs_write+0x14/0x20 cgroup_file_write+0x3f/0x100 kernfs_fop_write+0x104/0x180 __vfs_write+0x37/0x140 vfs_write+0xb8/0x1b0 SyS_write+0x55/0xc0 do_syscall_64+0x61/0x170 entry_SYSCALL64_slow_path+0x25/0x25 Reported-by: Ingo Molnar Reported-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 51ca21ef7ae5..7f983e83a353 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8074,6 +8074,7 @@ void sched_move_task(struct task_struct *tsk) struct rq *rq; rq = task_rq_lock(tsk, &rf); + update_rq_clock(rq); running = task_current(rq, tsk); queued = task_on_rq_queued(tsk); -- cgit v1.2.3 From 4d25b35ea3729affd37d69c78191ce6f92766e1a Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Wed, 26 Oct 2016 16:15:44 +0100 Subject: sched/fair: Restore previous rq_flags when migrating tasks in hotplug __migrate_task() can return with a different runqueue locked than the one we passed as an argument. So that we can repin the lock in migrate_tasks() (and keep the update_rq_clock() bit) we need to restore the old rq_flags before repinning. Note that it wouldn't be correct to change move_queued_task() to repin because of the change of runqueue and the fact that having an up-to-date clock on the initial rq doesn't mean the new rq has one too. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f983e83a353..3b248b03ad8f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5608,7 +5608,7 @@ static void migrate_tasks(struct rq *dead_rq) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; - struct rq_flags rf; + struct rq_flags rf, old_rf; int dest_cpu; /* @@ -5669,6 +5669,13 @@ static void migrate_tasks(struct rq *dead_rq) continue; } + /* + * __migrate_task() may return with a different + * rq->lock held and a new cookie in 'rf', but we need + * to preserve rf::clock_update_flags for 'dead_rq'. + */ + old_rf = rf; + /* Find suitable destination for @next, with force if needed. */ dest_cpu = select_fallback_rq(dead_rq->cpu, next); @@ -5677,6 +5684,7 @@ static void migrate_tasks(struct rq *dead_rq) raw_spin_unlock(&rq->lock); rq = dead_rq; raw_spin_lock(&rq->lock); + rf = old_rf; } raw_spin_unlock(&next->pi_lock); } -- cgit v1.2.3 From 92c99ac829931abba33107e09358447c8ad6bd32 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Tue, 24 Jan 2017 14:11:34 -0700 Subject: sched/core: Fix &rd->rto_mask memory leak If function cpudl_init() fails the memory allocated for &rd->rto_mask needs to be freed, something this patch is addressing. 
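The fix follows the usual kernel unwind convention for multi-step init functions: every failure jumps to the label that releases everything allocated before the failing step. A hypothetical sketch of the pattern (struct thing and its fields are invented for illustration):

	static int init_thing(struct thing *t)
	{
		t->a = kmalloc(sizeof(*t->a), GFP_KERNEL);
		if (!t->a)
			return -ENOMEM;
		t->b = kmalloc(sizeof(*t->b), GFP_KERNEL);
		if (!t->b)
			goto free_a;	/* a exists, undo it */
		t->c = kmalloc(sizeof(*t->c), GFP_KERNEL);
		if (!t->c)
			goto free_b;	/* b and a exist, undo both */
		return 0;
	free_b:
		kfree(t->b);
	free_a:
		kfree(t->a);
		return -ENOMEM;
	}

The bug fixed below is a failure path jumping one label too far down the unwind chain, skipping the free of the most recently allocated object (rto_mask).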
Signed-off-by: Mathieu Poirier Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1485292295-21298-1-git-send-email-mathieu.poirier@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3b248b03ad8f..17d1df6e2dbb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5976,7 +5976,7 @@ static int init_rootdomain(struct root_domain *rd) init_dl_bw(&rd->dl_bw); if (cpudl_init(&rd->cpudl) != 0) - goto free_dlo_mask; + goto free_rto_mask; if (cpupri_init(&rd->cpupri) != 0) goto free_rto_mask; -- cgit v1.2.3 From 4b12db939166042eb78e592bdf94ef113c14e379 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Tue, 24 Jan 2017 14:11:35 -0700 Subject: sched/core: Fix &rd->cpudl memory leak While in the process of initialising a root domain, if function cpupri_init() fails the memory allocated in cpudl_init() is not reclaimed. Adding a new goto target to clean up the previous initialisation of the root_domain's dl_bw structure reclaims said memory. Signed-off-by: Mathieu Poirier Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1485292295-21298-2-git-send-email-mathieu.poirier@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 17d1df6e2dbb..d01f9d047397 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5979,9 +5979,11 @@ static int init_rootdomain(struct root_domain *rd) goto free_rto_mask; if (cpupri_init(&rd->cpupri) != 0) - goto free_rto_mask; + goto free_cpudl; return 0; +free_cpudl: + cpudl_cleanup(&rd->cpudl); free_rto_mask: free_cpumask_var(rd->rto_mask); free_dlo_mask: -- cgit v1.2.3 From 619bd4a71874a8fd78eb6ccf9f272c5e98bcc7b7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 24 Jan 2017 15:40:06 +0100 Subject: sched/rt: Add a missing rescheduling point Since the change in commit: fd7a4bed1835 ("sched, rt: Convert switched_{from, to}_rt() / prio_changed_rt() to balance callbacks") ... we don't reschedule a task under certain circumstances: Let's say task-A, SCHED_OTHER, is running on CPU0 (and it may run only on CPU0) and holds a PI lock. This task is removed from the CPU because it used up its time slice and another SCHED_OTHER task is running. Task-B on CPU1 runs at RT priority and asks for the lock owned by task-A. This results in a priority boost for task-A. Task-B goes to sleep until the lock has been made available. Task-A is already runnable (but not active), so it receives no wake up. The reality now is that task-A gets on the CPU once the scheduler decides to remove the current task despite the fact that a high priority task is enqueued and waiting. This may take a long time. The desired behaviour is that CPU0 immediately reschedules after the priority boost which made task-A the task with the highest priority.
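Sketched, the hunks below simply move the preemption check out of the UP-only #else branch so it also runs on SMP kernels; switched_to_rt() ends up roughly as (quoted from the diff context, for orientation only):

	static void switched_to_rt(struct rq *rq, struct task_struct *p)
	{
		if (task_on_rq_queued(p) && rq->curr != p) {
	#ifdef CONFIG_SMP
			if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded)
				queue_push_tasks(rq);
	#endif /* CONFIG_SMP */
			/* now reached on SMP too: preempt a lower-priority curr */
			if (p->prio < rq->curr->prio)
				resched_curr(rq);
		}
	}

so the boosted task triggers an immediate reschedule instead of waiting for the current task to be switched out.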
Suggested-by: Peter Zijlstra Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Thomas Gleixner Fixes: fd7a4bed1835 ("sched, rt: Convert switched_{from, to}_rt() / prio_changed_rt() to balance callbacks") Link: http://lkml.kernel.org/r/20170124144006.29821-1-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 3 +-- kernel/sched/rt.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 491ff663e1b6..27737f34757d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1729,12 +1729,11 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) #ifdef CONFIG_SMP if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) queue_push_tasks(rq); -#else +#endif if (dl_task(rq->curr)) check_preempt_curr_dl(rq, p, 0); else resched_curr(rq); -#endif } } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 88254be118b0..704f2b89abf1 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2198,10 +2198,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) #ifdef CONFIG_SMP if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded) queue_push_tasks(rq); -#else +#endif /* CONFIG_SMP */ if (p->prio < rq->curr->prio) resched_curr(rq); -#endif /* CONFIG_SMP */ } } -- cgit v1.2.3 From 058fe1c0440e68a1ba3c2270ae43e9f0298b27d8 Mon Sep 17 00:00:00 2001 From: David Carrillo-Cisneros Date: Wed, 18 Jan 2017 11:24:53 -0800 Subject: perf/core: Make cgroup switch visit only cpuctxs with cgroup events This patch follows from a conversation in CQM/CMT's last series about speeding up the context switch for cgroup events: https://patchwork.kernel.org/patch/9478617/ This is a low-hanging fruit optimization. It replaces the iteration over the "pmus" list in cgroup switch by an iteration over a new list that contains only cpuctxs with at least one cgroup event. This is necessary because the number of PMUs has increased over the years, e.g. modern x86 server systems have well above 50 PMUs. The iteration over the full PMU list is unnecessary and can be costly in heavy cache contention scenarios. Below are some instrumentation measurements with 10, 50 and 90 percentiles of the total cost of context switch before and after this optimization for a simple array read/write microbenchmark.

  Contention Level   Nr events    Before (us)           After (us)            Median
  L2    L3           types        (10%, 50%, 90%)       (10%, 50%, 90%)       Speedup
  --------------------------------------------------------------------------
  Low   Low       1     (1.72, 2.42, 5.85)    (1.35, 1.64, 5.46)     29%
  High  Low       1     (2.08, 4.56, 19.8)    (1.76, 2.20, 13.7)     51%
  High  High      1     (2.86, 10.4, 12.7)    (2.54, 4.32, 12.1)     58%
  Low   Low       2     (1.98, 3.20, 6.89)    (1.68, 2.41, 8.89)     24%
  High  Low       2     (2.48, 5.28, 22.4)    (2.15, 3.69, 14.6)     30%
  High  High      2     (3.32, 8.09, 13.9)    (2.80, 5.15, 13.7)     36%

where:
  1 event type  = cycles
  2 event types = cycles,intel_cqm/llc_occupancy/
  Contention L2 Low: workset < L2 cache size. High: " >> L2 " ".
  Contention L3 Low: workset of task on all sockets < L3 cache size. High: " " " " " " >> L3 " ".
  Median Speedup is (50%ile Before - 50%ile After) / 50%ile Before

Unsurprisingly, the benefits of this optimization decrease with the number of cpuctxs with cgroup events, yet it is never detrimental.
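Structurally, the patch below replaces a filtered walk over every registered PMU with a dedicated per-CPU list that only ever contains contexts owning cgroup events; the switch path then reduces to roughly this shape (sketched from the diff that follows):

	static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);

	/* cgroup switch: visit only contexts that have cgroup events */
	list = this_cpu_ptr(&cgrp_cpuctx_list);
	list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
		/* sched out / sched in the cgroup events of this cpuctx */
	}

so the cost scales with the number of active cgroup contexts rather than with the number of PMUs on the machine.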
Tested-by: Mark Rutland Signed-off-by: David Carrillo-Cisneros Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Dave Hansen Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Paul Turner Cc: Peter Zijlstra Cc: Srinivas Pandruvada Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vikas Shivappa Cc: Vince Weaver Cc: Vince Weaver Link: http://lkml.kernel.org/r/20170118192454.58008-2-davidcc@google.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + kernel/events/core.c | 98 +++++++++++++++++++++------------------------- 2 files changed, 46 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 78ed8105e64d..dfa725723f28 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -788,6 +788,7 @@ struct perf_cpu_context { struct pmu *unique_pmu; #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; + struct list_head cgrp_cpuctx_entry; #endif struct list_head sched_cb_entry; diff --git a/kernel/events/core.c b/kernel/events/core.c index e5aaa806702d..928a818d912e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -678,6 +678,8 @@ perf_cgroup_set_timestamp(struct task_struct *task, info->timestamp = ctx->timestamp; } +static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); + #define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ #define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ @@ -690,61 +692,46 @@ perf_cgroup_set_timestamp(struct task_struct *task, static void perf_cgroup_switch(struct task_struct *task, int mode) { struct perf_cpu_context *cpuctx; - struct pmu *pmu; + struct list_head *list; unsigned long flags; /* - * disable interrupts to avoid geting nr_cgroup - * changes via __perf_event_disable(). Also - * avoids preemption. + * Disable interrupts and preemption to avoid this CPU's + * cgrp_cpuctx_entry to change under us. */ local_irq_save(flags); - /* - * we reschedule only in the presence of cgroup - * constrained events. - */ + list = this_cpu_ptr(&cgrp_cpuctx_list); + list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) { + WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->unique_pmu != pmu) - continue; /* ensure we process each cpuctx once */ + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + perf_pmu_disable(cpuctx->ctx.pmu); - /* - * perf_cgroup_events says at least one - * context on this CPU has cgroup events. - * - * ctx->nr_cgroups reports the number of cgroup - * events for a context. 
- */ - if (cpuctx->ctx.nr_cgroups > 0) { - perf_ctx_lock(cpuctx, cpuctx->task_ctx); - perf_pmu_disable(cpuctx->ctx.pmu); - - if (mode & PERF_CGROUP_SWOUT) { - cpu_ctx_sched_out(cpuctx, EVENT_ALL); - /* - * must not be done before ctxswout due - * to event_filter_match() in event_sched_out() - */ - cpuctx->cgrp = NULL; - } + if (mode & PERF_CGROUP_SWOUT) { + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + /* + * must not be done before ctxswout due + * to event_filter_match() in event_sched_out() + */ + cpuctx->cgrp = NULL; + } - if (mode & PERF_CGROUP_SWIN) { - WARN_ON_ONCE(cpuctx->cgrp); - /* - * set cgrp before ctxsw in to allow - * event_filter_match() to not have to pass - * task around - * we pass the cpuctx->ctx to perf_cgroup_from_task() - * because cgorup events are only per-cpu - */ - cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx); - cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); - } - perf_pmu_enable(cpuctx->ctx.pmu); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); + if (mode & PERF_CGROUP_SWIN) { + WARN_ON_ONCE(cpuctx->cgrp); + /* + * set cgrp before ctxsw in to allow + * event_filter_match() to not have to pass + * task around + * we pass the cpuctx->ctx to perf_cgroup_from_task() + * because cgorup events are only per-cpu + */ + cpuctx->cgrp = perf_cgroup_from_task(task, + &cpuctx->ctx); + cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); } + perf_pmu_enable(cpuctx->ctx.pmu); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } local_irq_restore(flags); @@ -889,6 +876,7 @@ list_update_cgroup_event(struct perf_event *event, struct perf_event_context *ctx, bool add) { struct perf_cpu_context *cpuctx; + struct list_head *cpuctx_entry; if (!is_cgroup_event(event)) return; @@ -902,15 +890,16 @@ list_update_cgroup_event(struct perf_event *event, * this will always be called from the right CPU. */ cpuctx = __get_cpu_context(ctx); - - /* - * cpuctx->cgrp is NULL until a cgroup event is sched in or - * ctx->nr_cgroup == 0 . - */ - if (add && perf_cgroup_from_task(current, ctx) == event->cgrp) - cpuctx->cgrp = event->cgrp; - else if (!add) + cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; + /* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/ + if (add) { + list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list)); + if (perf_cgroup_from_task(current, ctx) == event->cgrp) + cpuctx->cgrp = event->cgrp; + } else { + list_del(cpuctx_entry); cpuctx->cgrp = NULL; + } } #else /* !CONFIG_CGROUP_PERF */ @@ -10709,6 +10698,9 @@ static void __init perf_event_init_all_cpus(void) INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); +#ifdef CONFIG_CGROUP_PERF + INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu)); +#endif INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); } } -- cgit v1.2.3 From 1fd7e416995401ec082fc0fe6090a223969beda5 Mon Sep 17 00:00:00 2001 From: David Carrillo-Cisneros Date: Wed, 18 Jan 2017 11:24:54 -0800 Subject: perf/core: Remove perf_cpu_context::unique_pmu cpuctx->unique_pmu was originally introduced as a way to identify cpuctxs with shared pmus in order to avoid visiting the same cpuctx more than once in a for_each_pmu loop. cpuctx->unique_pmu == cpuctx->pmu in non-software task contexts since they have only one pmu per cpuctx. Since perf_pmu_sched_task() is only called in hw contexts, this patch replaces cpuctx->unique_pmu by cpuctx->pmu in it. The change above, together with the previous patch in this series, removed the remaining uses of cpuctx->unique_pmu, so we remove it altogether. 
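For context, the deduplication idiom being retired looked like this (sketch of the removed code): several PMUs may share one perf_cpu_context, so walks over the PMU list had to nominate a single "unique" owner and skip the rest:

	list_for_each_entry_rcu(pmu, &pmus, entry) {
		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
		if (cpuctx->unique_pmu != pmu)
			continue;	/* visit each shared cpuctx only once */
		/* ... per-cpuctx work ... */
	}

With no such walk left after the previous patch, cpuctx->ctx.pmu is sufficient to identify a context's PMU, and the bookkeeping can go.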
Signed-off-by: David Carrillo-Cisneros Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Dave Hansen Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Paul Turner Cc: Peter Zijlstra Cc: Srinivas Pandruvada Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vikas Shivappa Cc: Vince Weaver Cc: Vince Weaver Link: http://lkml.kernel.org/r/20170118192454.58008-3-davidcc@google.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 - kernel/events/core.c | 31 +------------------------------ 2 files changed, 1 insertion(+), 31 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index dfa725723f28..5c58e93c130c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -785,7 +785,6 @@ struct perf_cpu_context { ktime_t hrtimer_interval; unsigned int hrtimer_active; - struct pmu *unique_pmu; #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; struct list_head cgrp_cpuctx_entry; diff --git a/kernel/events/core.c b/kernel/events/core.c index 928a818d912e..d92c7ad7715d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2932,7 +2932,7 @@ static void perf_pmu_sched_task(struct task_struct *prev, return; list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { - pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */ + pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */ if (WARN_ON_ONCE(!pmu->sched_task)) continue; @@ -8636,37 +8636,10 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn) return NULL; } -static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct perf_cpu_context *cpuctx; - - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - - if (cpuctx->unique_pmu == old_pmu) - cpuctx->unique_pmu = pmu; - } -} - static void free_pmu_context(struct pmu *pmu) { - struct pmu *i; - mutex_lock(&pmus_lock); - /* - * Like a real lame refcount. - */ - list_for_each_entry(i, &pmus, entry) { - if (i->pmu_cpu_context == pmu->pmu_cpu_context) { - update_pmu_context(i, pmu); - goto out; - } - } - free_percpu(pmu->pmu_cpu_context); -out: mutex_unlock(&pmus_lock); } @@ -8870,8 +8843,6 @@ skip_type: cpuctx->ctx.pmu = pmu; __perf_mux_hrtimer_init(cpuctx, cpu); - - cpuctx->unique_pmu = pmu; } got_cpu_context: -- cgit v1.2.3 From fe45bafbd0e1b5e828aa9d44d07e569df85869a2 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 19 Jan 2017 18:43:29 +0200 Subject: perf/core: Don't re-schedule CPU flexible events needlessly In the sched-in path, we first remove a CPU's flexible events in order to give priority to the task's pinned events. However, this step can be safely skipped if the task doesn't have its own pinned events. This patch implements this skipping. 
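The resulting guard is small; sketched from the hunk below, the sched-in path becomes:

	if (!list_empty(&ctx->pinned_groups))
		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
	perf_event_sched_in(cpuctx, ctx, task);

i.e. the CPU's flexible events are only unscheduled when the incoming task context actually carries pinned groups that could displace them.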
Reported-by: Adrian Hunter Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: vince@deater.net Link: http://lkml.kernel.org/r/20170119164330.22887-2-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d92c7ad7715d..8c0b7334230b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3122,8 +3122,12 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, * cpu flexible, task flexible. + * + * However, if task's ctx is not carrying any pinned + * events, no need to flip the cpuctx's events around. */ - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + if (!list_empty(&ctx->pinned_groups)) + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, ctx, task); perf_pmu_enable(ctx->pmu); perf_ctx_unlock(cpuctx, ctx); -- cgit v1.2.3 From 487f05e18aa4efacee6357480f293a5afe6593b5 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 19 Jan 2017 18:43:30 +0200 Subject: perf/core: Optimize event rescheduling on active contexts When new events are added to an active context, we go and reschedule all cpu groups and all task groups in order to preserve the priority (cpu pinned, task pinned, cpu flexible, task flexible), but in reality we only need to reschedule groups of the same priority as that of the events being added, and below. This patch changes the behavior so that only groups that need to be rescheduled are rescheduled. Reported-by: Adrian Hunter Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: vince@deater.net Link: http://lkml.kernel.org/r/20170119164330.22887-3-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 80 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 69 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 8c0b7334230b..cbcee23d05f0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -355,6 +355,8 @@ enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, EVENT_TIME = 0x4, + /* see ctx_resched() for details */ + EVENT_CPU = 0x8, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; @@ -1442,6 +1444,20 @@ static void update_group_times(struct perf_event *leader) update_event_times(event); } +static enum event_type_t get_event_type(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + enum event_type_t event_type; + + lockdep_assert_held(&ctx->lock); + + event_type = event->attr.pinned ? 
EVENT_PINNED : EVENT_FLEXIBLE; + if (!ctx->task) + event_type |= EVENT_CPU; + + return event_type; +} + static struct list_head * ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) { @@ -2215,7 +2231,8 @@ ctx_sched_in(struct perf_event_context *ctx, struct task_struct *task); static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) + struct perf_event_context *ctx, + enum event_type_t event_type) { if (!cpuctx->task_ctx) return; @@ -2223,7 +2240,7 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; - ctx_sched_out(ctx, cpuctx, EVENT_ALL); + ctx_sched_out(ctx, cpuctx, event_type); } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, @@ -2238,13 +2255,51 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx, ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); } +/* + * We want to maintain the following priority of scheduling: + * - CPU pinned (EVENT_CPU | EVENT_PINNED) + * - task pinned (EVENT_PINNED) + * - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE) + * - task flexible (EVENT_FLEXIBLE). + * + * In order to avoid unscheduling and scheduling back in everything every + * time an event is added, only do it for the groups of equal priority and + * below. + * + * This can be called after a batch operation on task events, in which case + * event_type is a bit mask of the types of events involved. For CPU events, + * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE. + */ static void ctx_resched(struct perf_cpu_context *cpuctx, - struct perf_event_context *task_ctx) + struct perf_event_context *task_ctx, + enum event_type_t event_type) { + enum event_type_t ctx_event_type = event_type & EVENT_ALL; + bool cpu_event = !!(event_type & EVENT_CPU); + + /* + * If pinned groups are involved, flexible groups also need to be + * scheduled out. + */ + if (event_type & EVENT_PINNED) + event_type |= EVENT_FLEXIBLE; + perf_pmu_disable(cpuctx->ctx.pmu); if (task_ctx) - task_ctx_sched_out(cpuctx, task_ctx); - cpu_ctx_sched_out(cpuctx, EVENT_ALL); + task_ctx_sched_out(cpuctx, task_ctx, event_type); + + /* + * Decide which cpu ctx groups to schedule out based on the types + * of events that caused rescheduling: + * - EVENT_CPU: schedule out corresponding groups; + * - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups; + * - otherwise, do nothing more. 
+ */ + if (cpu_event) + cpu_ctx_sched_out(cpuctx, ctx_event_type); + else if (ctx_event_type & EVENT_PINNED) + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + perf_event_sched_in(cpuctx, task_ctx, current); perf_pmu_enable(cpuctx->ctx.pmu); } @@ -2291,7 +2346,7 @@ static int __perf_install_in_context(void *info) if (reprogram) { ctx_sched_out(ctx, cpuctx, EVENT_TIME); add_event_to_ctx(event, ctx); - ctx_resched(cpuctx, task_ctx); + ctx_resched(cpuctx, task_ctx, get_event_type(event)); } else { add_event_to_ctx(event, ctx); } @@ -2458,7 +2513,7 @@ static void __perf_event_enable(struct perf_event *event, if (ctx->task) WARN_ON_ONCE(task_ctx != ctx); - ctx_resched(cpuctx, task_ctx); + ctx_resched(cpuctx, task_ctx, get_event_type(event)); } /* @@ -2885,7 +2940,7 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); - task_ctx_sched_out(cpuctx, ctx); + task_ctx_sched_out(cpuctx, ctx, EVENT_ALL); raw_spin_unlock(&ctx->lock); } } @@ -3442,6 +3497,7 @@ static int event_enable_on_exec(struct perf_event *event, static void perf_event_enable_on_exec(int ctxn) { struct perf_event_context *ctx, *clone_ctx = NULL; + enum event_type_t event_type = 0; struct perf_cpu_context *cpuctx; struct perf_event *event; unsigned long flags; @@ -3455,15 +3511,17 @@ static void perf_event_enable_on_exec(int ctxn) cpuctx = __get_cpu_context(ctx); perf_ctx_lock(cpuctx, ctx); ctx_sched_out(ctx, cpuctx, EVENT_TIME); - list_for_each_entry(event, &ctx->event_list, event_entry) + list_for_each_entry(event, &ctx->event_list, event_entry) { enabled |= event_enable_on_exec(event, ctx); + event_type |= get_event_type(event); + } /* * Unclone and reschedule this context if we enabled any event. */ if (enabled) { clone_ctx = unclone_ctx(ctx); - ctx_resched(cpuctx, ctx); + ctx_resched(cpuctx, ctx, event_type); } perf_ctx_unlock(cpuctx, ctx); @@ -10224,7 +10282,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * in. */ raw_spin_lock_irq(&child_ctx->lock); - task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx); + task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL); /* * Now that the context is inactive, destroy the task <-> ctx relation -- cgit v1.2.3 From 40999312c703f80e8d31bc77cf00e6e84d36e091 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 18 Jan 2017 08:21:01 -0500 Subject: perf/core: Try parent PMU first when initializing a child event perf has additional overhead when monitoring a task that frequently generates child tasks. perf_init_event() is one of the hotspots for that additional overhead: currently, to get the PMU, it first tries to look the type up in pmu_idr. But this is not always successful, especially for the widely used PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE events, so it has to fall back to the slow path, which walks the whole PMU list. This becomes a big performance issue if the PMU list is long (e.g. a server with many uncore boxes) and the task frequently generates child tasks. A child event is inherited from its parent event, so the child event should try its parent's PMU first.
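Conceptually, the PMU lookup order in perf_init_event() after this change is as follows (a simplified sketch: SRCU locking, module reference handling and the exact error paths are omitted, and the function name is made up for illustration):

	/* Sketch of the lookup order; not the literal kernel code. */
	static struct pmu *pmu_lookup_sketch(struct perf_event *event)
	{
		struct pmu *pmu;

		/* 1. Inherited events: reuse the parent's PMU directly. */
		if (event->parent && event->parent->pmu) {
			pmu = event->parent->pmu;
			if (!perf_try_init_event(pmu, event))
				return pmu;
		}

		/* 2. Fast path: look the type up in pmu_idr. */
		pmu = idr_find(&pmu_idr, event->attr.type);
		if (pmu && !perf_try_init_event(pmu, event))
			return pmu;

		/* 3. Slow path: walk every registered PMU. */
		list_for_each_entry_rcu(pmu, &pmus, entry)
			if (!perf_try_init_event(pmu, event))
				return pmu;

		return ERR_PTR(-ENOENT);
	}

For inherited events, step 1 almost always succeeds, so the long list walk in step 3 is avoided entirely.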
Here is some data from the overhead test on Broadwell server: perf record -e $TEST_EVENTS -- ./loop.sh 50000

loop.sh:

	start=$(date +%s%N)
	i=0
	while [ "$i" -le "$1" ]
	do
		date > /dev/null
		i=`expr $i + 1`
	done
	end=$(date +%s%N)
	elapsed=`expr $end - $start`

	Event#  Original elapsed time    Elapsed time with patch    delta
	1       196,573,192,397          189,162,029,998            -3.77%
	2       257,567,753,013          241,620,788,683            -6.19%
	4       398,730,726,971          370,518,938,714            -7.08%
	8       824,983,761,120          740,702,489,329            -10.22%
	16      1,883,411,923,498        1,672,027,508,355          -11.22%

... which shows a nice performance improvement. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1484745662-15928-2-git-send-email-kan.liang@intel.com [ Tidied up the changelog and the code comment. ] Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index cbcee23d05f0..88676ff98c0f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9022,6 +9022,14 @@ static struct pmu *perf_init_event(struct perf_event *event) idx = srcu_read_lock(&pmus_srcu); + /* Try parent's PMU first: */ + if (event->parent && event->parent->pmu) { + pmu = event->parent->pmu; + ret = perf_try_init_event(pmu, event); + if (!ret) + goto unlock; + } + rcu_read_lock(); pmu = idr_find(&pmu_idr, event->attr.type); rcu_read_unlock(); -- cgit v1.2.3 From b807421a720ff00bc31774d254f51313f4a7656b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Jan 2017 12:06:08 -0500 Subject: cgroup: misc cleanups * cgrp_dfl_implicit_ss_mask is ulong instead of u16 unlike other ss_masks. Make it a u16. * Move have_canfork_callback together with other callback ss_masks. Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 0dcc4c7e935e..a99b15f9b577 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -159,7 +159,7 @@ static bool cgrp_dfl_visible; static u16 cgrp_dfl_inhibit_ss_mask; /* some controllers are implicitly enabled on the default hierarchy */ -static unsigned long cgrp_dfl_implicit_ss_mask; +static u16 cgrp_dfl_implicit_ss_mask; /* The list of hierarchy roots */ LIST_HEAD(cgroup_roots); @@ -178,13 +178,13 @@ static DEFINE_IDR(cgroup_hierarchy_idr); static u64 css_serial_nr_next = 1; /* - * These bitmask flags indicate whether tasks in the fork and exit paths have - * fork/exit handlers to call. This avoids us having to do extra work in the - * fork/exit path to check which subsystems have fork/exit callbacks. + * These bitmasks identify subsystems with specific features to avoid + * having to do iterative checks repeatedly. */ static u16 have_fork_callback __read_mostly; static u16 have_exit_callback __read_mostly; static u16 have_free_callback __read_mostly; +static u16 have_canfork_callback __read_mostly; /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { @@ -195,9 +195,6 @@ struct cgroup_namespace init_cgroup_ns = { .root_cset = &init_css_set, }; -/* Ditto for the can_fork callback.
*/ -static u16 have_canfork_callback __read_mostly; - static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_base_files[]; -- cgit v1.2.3 From 7598d167df99ddadb7f7bd803d1413af740f3617 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Thu, 12 Jan 2017 11:57:44 -0500 Subject: livepatch/module: print notice of TAINT_LIVEPATCH Add back the "tainting kernel with TAINT_LIVEPATCH" kernel log message that commit 2992ef29ae01 ("livepatch/module: make TAINT_LIVEPATCH module-specific") dropped. Now that it's a module-specific taint flag, include the module name. Signed-off-by: Joe Lawrence Acked-by: Josh Poimboeuf Reviewed-by: Kamalesh Babulal Signed-off-by: Jessica Yu --- kernel/module.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 5088784c0cf9..330f64e7e193 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2812,6 +2812,8 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info) if (get_modinfo(info, "livepatch")) { mod->klp = true; add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK); + pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n", + mod->name); } return 0; -- cgit v1.2.3 From 57292b58ddb58689e8c3b4c6eadbef10d9ca44dd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 31 Jan 2017 16:57:29 +0100 Subject: block: introduce blk_rq_is_passthrough This can be used to check for fs vs non-fs requests and basically removes all knowledge of BLOCK_PC specific from the block layer, as well as preparing for removing the cmd_type field in struct request. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 8 ++++---- block/blk-exec.c | 2 +- block/blk.h | 2 +- block/elevator.c | 4 ++-- block/mq-deadline.c | 2 +- drivers/ata/libata-scsi.c | 2 +- drivers/block/cciss.c | 36 +++++++++++++++++------------------ drivers/nvme/host/fc.c | 2 +- drivers/nvme/host/pci.c | 4 ++-- drivers/nvme/host/rdma.c | 4 ++-- drivers/nvme/target/loop.c | 2 +- drivers/scsi/hpsa.c | 4 ++-- drivers/scsi/scsi.c | 2 +- drivers/scsi/scsi_error.c | 4 ++-- drivers/scsi/scsi_lib.c | 6 +++--- drivers/scsi/smartpqi/smartpqi_init.c | 2 +- drivers/scsi/sun3_scsi.c | 2 +- include/linux/blkdev.h | 16 +++++++++++----- include/linux/blktrace_api.h | 4 ++-- include/scsi/scsi_cmnd.h | 2 +- kernel/trace/blktrace.c | 2 +- 21 files changed, 59 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/block/blk-core.c b/block/blk-core.c index 95829523cded..44431086e4e7 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2506,10 +2506,10 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) * TODO: tj: This is too subtle. It would be better to let * low level drivers do what they see fit. 
*/ - if (req->cmd_type == REQ_TYPE_FS) + if (!blk_rq_is_passthrough(req)) req->errors = 0; - if (error && req->cmd_type == REQ_TYPE_FS && + if (error && !blk_rq_is_passthrough(req) && !(req->rq_flags & RQF_QUIET)) { char *error_type; @@ -2581,7 +2581,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) req->__data_len -= total_bytes; /* update sector only for requests with clear definition of sector */ - if (req->cmd_type == REQ_TYPE_FS) + if (!blk_rq_is_passthrough(req)) req->__sector += total_bytes >> 9; /* mixed attributes always follow the first bio */ @@ -2659,7 +2659,7 @@ void blk_finish_request(struct request *req, int error) BUG_ON(blk_queued_rq(req)); - if (unlikely(laptop_mode) && req->cmd_type == REQ_TYPE_FS) + if (unlikely(laptop_mode) && !blk_rq_is_passthrough(req)) laptop_io_completion(&req->q->backing_dev_info); blk_delete_timer(req); diff --git a/block/blk-exec.c b/block/blk-exec.c index ed51800f4b44..8cd0e9bc8dc8 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -51,7 +51,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; WARN_ON(irqs_disabled()); - WARN_ON(rq->cmd_type == REQ_TYPE_FS); + WARN_ON(!blk_rq_is_passthrough(rq)); rq->rq_disk = bd_disk; rq->end_io = done; diff --git a/block/blk.h b/block/blk.h index 9a716b5925a4..c1bd4bf9e645 100644 --- a/block/blk.h +++ b/block/blk.h @@ -249,7 +249,7 @@ static inline int blk_do_io_stat(struct request *rq) { return rq->rq_disk && (rq->rq_flags & RQF_IO_STAT) && - (rq->cmd_type == REQ_TYPE_FS); + !blk_rq_is_passthrough(rq); } /* diff --git a/block/elevator.c b/block/elevator.c index ef7f59469acc..dba9be891a6b 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -634,7 +634,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) if (rq->rq_flags & RQF_SOFTBARRIER) { /* barriers are scheduling boundary, update end_sector */ - if (rq->cmd_type == REQ_TYPE_FS) { + if (!blk_rq_is_passthrough(rq)) { q->end_sector = rq_end_sector(rq); q->boundary_rq = rq; } @@ -676,7 +676,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) if (elv_attempt_insert_merge(q, rq)) break; case ELEVATOR_INSERT_SORT: - BUG_ON(rq->cmd_type != REQ_TYPE_FS); + BUG_ON(blk_rq_is_passthrough(rq)); rq->rq_flags |= RQF_SORTED; q->nr_sorted++; if (rq_mergeable(rq)) { diff --git a/block/mq-deadline.c b/block/mq-deadline.c index d93ec713fa62..49583536698c 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -398,7 +398,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, if (blk_mq_sched_bypass_insert(hctx, rq)) return; - if (at_head || rq->cmd_type != REQ_TYPE_FS) { + if (at_head || blk_rq_is_passthrough(rq)) { if (at_head) list_add(&rq->queuelist, &dd->dispatch); else diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 6abd73975f87..c771d4c341ea 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -1265,7 +1265,7 @@ static void ata_scsi_sdev_config(struct scsi_device *sdev) */ static int atapi_drain_needed(struct request *rq) { - if (likely(rq->cmd_type != REQ_TYPE_BLOCK_PC)) + if (likely(!blk_rq_is_passthrough(rq))) return 0; if (!blk_rq_bytes(rq) || op_is_write(req_op(rq))) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index b93bb73b49d2..53dc2971a3ba 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1854,7 +1854,7 @@ static void cciss_softirq_done(struct request 
*rq) dev_dbg(&h->pdev->dev, "Done with %p\n", rq); /* set the residual count for pc requests */ - if (rq->cmd_type == REQ_TYPE_BLOCK_PC) + if (blk_rq_is_passthrough(rq)) scsi_req(rq)->resid_len = c->err_info->ResidualCnt; blk_end_request_all(rq, (rq->errors == 0) ? 0 : -EIO); @@ -3083,7 +3083,7 @@ static inline int evaluate_target_status(ctlr_info_t *h, driver_byte = DRIVER_OK; msg_byte = cmd->err_info->CommandStatus; /* correct? seems too device specific */ - if (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) + if (blk_rq_is_passthrough(cmd->rq)) host_byte = DID_PASSTHROUGH; else host_byte = DID_OK; @@ -3092,7 +3092,7 @@ static inline int evaluate_target_status(ctlr_info_t *h, host_byte, driver_byte); if (cmd->err_info->ScsiStatus != SAM_STAT_CHECK_CONDITION) { - if (cmd->rq->cmd_type != REQ_TYPE_BLOCK_PC) + if (!blk_rq_is_passthrough(cmd->rq)) dev_warn(&h->pdev->dev, "cmd %p " "has SCSI Status 0x%x\n", cmd, cmd->err_info->ScsiStatus); @@ -3103,16 +3103,16 @@ static inline int evaluate_target_status(ctlr_info_t *h, sense_key = 0xf & cmd->err_info->SenseInfo[2]; /* no status or recovered error */ if (((sense_key == 0x0) || (sense_key == 0x1)) && - (cmd->rq->cmd_type != REQ_TYPE_BLOCK_PC)) + !blk_rq_is_passthrough(cmd->rq)) error_value = 0; if (check_for_unit_attention(h, cmd)) { - *retry_cmd = !(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC); + *retry_cmd = !blk_rq_is_passthrough(cmd->rq); return 0; } /* Not SG_IO or similar? */ - if (cmd->rq->cmd_type != REQ_TYPE_BLOCK_PC) { + if (!blk_rq_is_passthrough(cmd->rq)) { if (error_value != 0) dev_warn(&h->pdev->dev, "cmd %p has CHECK CONDITION" " sense key = 0x%x\n", cmd, sense_key); @@ -3146,14 +3146,14 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd, rq->errors = evaluate_target_status(h, cmd, &retry_cmd); break; case CMD_DATA_UNDERRUN: - if (cmd->rq->cmd_type == REQ_TYPE_FS) { + if (!blk_rq_is_passthrough(cmd->rq)) { dev_warn(&h->pdev->dev, "cmd %p has" " completed with data underrun " "reported\n", cmd); } break; case CMD_DATA_OVERRUN: - if (cmd->rq->cmd_type == REQ_TYPE_FS) + if (!blk_rq_is_passthrough(cmd->rq)) dev_warn(&h->pdev->dev, "cciss: cmd %p has" " completed with data overrun " "reported\n", cmd); @@ -3163,7 +3163,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd, "reported invalid\n", cmd); rq->errors = make_status_bytes(SAM_STAT_GOOD, cmd->err_info->CommandStatus, DRIVER_OK, - (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ? + blk_rq_is_passthrough(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR); break; case CMD_PROTOCOL_ERR: @@ -3171,7 +3171,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd, "protocol error\n", cmd); rq->errors = make_status_bytes(SAM_STAT_GOOD, cmd->err_info->CommandStatus, DRIVER_OK, - (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ? + blk_rq_is_passthrough(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR); break; case CMD_HARDWARE_ERR: @@ -3179,7 +3179,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd, " hardware error\n", cmd); rq->errors = make_status_bytes(SAM_STAT_GOOD, cmd->err_info->CommandStatus, DRIVER_OK, - (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ? + blk_rq_is_passthrough(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR); break; case CMD_CONNECTION_LOST: @@ -3187,7 +3187,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd, "connection lost\n", cmd); rq->errors = make_status_bytes(SAM_STAT_GOOD, cmd->err_info->CommandStatus, DRIVER_OK, - (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ? 
+ blk_rq_is_passthrough(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR); break; case CMD_ABORTED: @@ -3195,7 +3195,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd, "aborted\n", cmd); rq->errors = make_status_bytes(SAM_STAT_GOOD, cmd->err_info->CommandStatus, DRIVER_OK, - (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ? + blk_rq_is_passthrough(cmd->rq) ? DID_PASSTHROUGH : DID_ABORT); break; case CMD_ABORT_FAILED: @@ -3203,7 +3203,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd, "abort failed\n", cmd); rq->errors = make_status_bytes(SAM_STAT_GOOD, cmd->err_info->CommandStatus, DRIVER_OK, - (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ? + blk_rq_is_passthrough(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR); break; case CMD_UNSOLICITED_ABORT: @@ -3218,21 +3218,21 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd, "%p retried too many times\n", cmd); rq->errors = make_status_bytes(SAM_STAT_GOOD, cmd->err_info->CommandStatus, DRIVER_OK, - (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ? + blk_rq_is_passthrough(cmd->rq) ? DID_PASSTHROUGH : DID_ABORT); break; case CMD_TIMEOUT: dev_warn(&h->pdev->dev, "cmd %p timedout\n", cmd); rq->errors = make_status_bytes(SAM_STAT_GOOD, cmd->err_info->CommandStatus, DRIVER_OK, - (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ? + blk_rq_is_passthrough(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR); break; case CMD_UNABORTABLE: dev_warn(&h->pdev->dev, "cmd %p unabortable\n", cmd); rq->errors = make_status_bytes(SAM_STAT_GOOD, cmd->err_info->CommandStatus, DRIVER_OK, - cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC ? + blk_rq_is_passthrough(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR); break; default: @@ -3241,7 +3241,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd, cmd->err_info->CommandStatus); rq->errors = make_status_bytes(SAM_STAT_GOOD, cmd->err_info->CommandStatus, DRIVER_OK, - (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ? + blk_rq_is_passthrough(cmd->rq) ? 
DID_PASSTHROUGH : DID_ERROR); } diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index fcc9dcfdf675..40c979b777b8 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1937,7 +1937,7 @@ nvme_fc_complete_rq(struct request *rq) return; } - if (rq->cmd_type == REQ_TYPE_DRV_PRIV) + if (blk_rq_is_passthrough(rq)) error = rq->errors; else error = nvme_error_status(rq->errors); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 7103bce4ba4f..f29365b8c9be 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -588,7 +588,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, */ if (ns && ns->ms && !blk_integrity_rq(req)) { if (!(ns->pi_type && ns->ms == 8) && - req->cmd_type != REQ_TYPE_DRV_PRIV) { + !blk_rq_is_passthrough(req)) { blk_mq_end_request(req, -EFAULT); return BLK_MQ_RQ_QUEUE_OK; } @@ -645,7 +645,7 @@ static void nvme_complete_rq(struct request *req) return; } - if (req->cmd_type == REQ_TYPE_DRV_PRIV) + if (blk_rq_is_passthrough(req)) error = req->errors; else error = nvme_error_status(req->errors); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 557f29b1f1bb..ecc79b2fcfaf 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1423,7 +1423,7 @@ static inline bool nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue, if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags))) { struct nvme_command *cmd = nvme_req(rq)->cmd; - if (rq->cmd_type != REQ_TYPE_DRV_PRIV || + if (!blk_rq_is_passthrough(rq) || cmd->common.opcode != nvme_fabrics_command || cmd->fabrics.fctype != nvme_fabrics_type_connect) return false; @@ -1522,7 +1522,7 @@ static void nvme_rdma_complete_rq(struct request *rq) return; } - if (rq->cmd_type == REQ_TYPE_DRV_PRIV) + if (blk_rq_is_passthrough(rq)) error = rq->errors; else error = nvme_error_status(rq->errors); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 9aaa70071ae5..f3862e38f574 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -104,7 +104,7 @@ static void nvme_loop_complete_rq(struct request *req) return; } - if (req->cmd_type == REQ_TYPE_DRV_PRIV) + if (blk_rq_is_passthrough(req)) error = req->errors; else error = nvme_error_status(req->errors); diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c index cbc0c5fe5a60..c611412a8de9 100644 --- a/drivers/scsi/hpsa.c +++ b/drivers/scsi/hpsa.c @@ -5539,8 +5539,8 @@ static int hpsa_scsi_queue_command(struct Scsi_Host *sh, struct scsi_cmnd *cmd) * Retries always go down the normal I/O path. 
*/ if (likely(cmd->retries == 0 && - cmd->request->cmd_type == REQ_TYPE_FS && - h->acciopath_status)) { + !blk_rq_is_passthrough(cmd->request) && + h->acciopath_status)) { rc = hpsa_ioaccel_submit(h, c, cmd, scsi3addr); if (rc == 0) return 0; diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 3d8d2153b448..7bfbcfa7af40 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -238,7 +238,7 @@ void scsi_finish_command(struct scsi_cmnd *cmd) "(result %x)\n", cmd->result)); good_bytes = scsi_bufflen(cmd); - if (cmd->request->cmd_type != REQ_TYPE_BLOCK_PC) { + if (!blk_rq_is_passthrough(cmd->request)) { int old_good_bytes = good_bytes; drv = scsi_cmd_to_driver(cmd); if (drv->done) diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 4b40f746d534..b4ce7bb5d2a9 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -1106,7 +1106,7 @@ static int scsi_request_sense(struct scsi_cmnd *scmd) static int scsi_eh_action(struct scsi_cmnd *scmd, int rtn) { - if (scmd->request->cmd_type != REQ_TYPE_BLOCK_PC) { + if (!blk_rq_is_passthrough(scmd->request)) { struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd); if (sdrv->eh_action) rtn = sdrv->eh_action(scmd, rtn); @@ -1746,7 +1746,7 @@ check_type: * the check condition was retryable. */ if (scmd->request->cmd_flags & REQ_FAILFAST_DEV || - scmd->request->cmd_type == REQ_TYPE_BLOCK_PC) + blk_rq_is_passthrough(scmd->request)) return 1; else return 0; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 8188e5c71f75..31629a7b728d 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -582,7 +582,7 @@ void scsi_run_host_queues(struct Scsi_Host *shost) static void scsi_uninit_cmd(struct scsi_cmnd *cmd) { - if (cmd->request->cmd_type == REQ_TYPE_FS) { + if (!blk_rq_is_passthrough(cmd->request)) { struct scsi_driver *drv = scsi_cmd_to_driver(cmd); if (drv->uninit_command) @@ -806,7 +806,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) sense_deferred = scsi_sense_is_deferred(&sshdr); } - if (req->cmd_type == REQ_TYPE_BLOCK_PC) { /* SG_IO ioctl from block level */ + if (blk_rq_is_passthrough(req)) { if (result) { if (sense_valid) { /* @@ -847,7 +847,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) error = __scsi_error_from_host_byte(cmd, result); } - /* no bidi support for !REQ_TYPE_BLOCK_PC yet */ + /* no bidi support for !blk_rq_is_passthrough yet */ BUG_ON(blk_bidi_rq(req)); /* diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c index 8702d9cf8040..11c0dfb3dfa3 100644 --- a/drivers/scsi/smartpqi/smartpqi_init.c +++ b/drivers/scsi/smartpqi/smartpqi_init.c @@ -4499,7 +4499,7 @@ static int pqi_scsi_queue_command(struct Scsi_Host *shost, if (pqi_is_logical_device(device)) { raid_bypassed = false; if (device->offload_enabled && - scmd->request->cmd_type == REQ_TYPE_FS) { + !blk_rq_is_passthrough(scmd->request)) { rc = pqi_raid_bypass_submit_scsi_cmd(ctrl_info, device, scmd, queue_group); if (rc == 0 || diff --git a/drivers/scsi/sun3_scsi.c b/drivers/scsi/sun3_scsi.c index 88db6992420e..bcf7d05d1aab 100644 --- a/drivers/scsi/sun3_scsi.c +++ b/drivers/scsi/sun3_scsi.c @@ -260,7 +260,7 @@ static int sun3scsi_dma_xfer_len(struct NCR5380_hostdata *hostdata, { int wanted_len = cmd->SCp.this_residual; - if (wanted_len < DMA_MIN_SIZE || cmd->request->cmd_type != REQ_TYPE_FS) + if (wanted_len < DMA_MIN_SIZE || blk_rq_is_passthrough(cmd->request)) return 0; return wanted_len; diff --git 
a/include/linux/blkdev.h b/include/linux/blkdev.h index e4c5f284fe2d..7121be081517 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -242,6 +242,11 @@ struct request { struct request *next_rq; }; +static inline bool blk_rq_is_passthrough(struct request *rq) +{ + return rq->cmd_type != REQ_TYPE_FS; +} + static inline unsigned short req_get_ioprio(struct request *req) { return req->ioprio; @@ -698,9 +703,10 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ REQ_FAILFAST_DRIVER)) -#define blk_account_rq(rq) \ - (((rq)->rq_flags & RQF_STARTED) && \ - ((rq)->cmd_type == REQ_TYPE_FS)) +static inline bool blk_account_rq(struct request *rq) +{ + return (rq->rq_flags & RQF_STARTED) && !blk_rq_is_passthrough(rq); +} #define blk_rq_cpu_valid(rq) ((rq)->cpu != -1) #define blk_bidi_rq(rq) ((rq)->next_rq != NULL) @@ -775,7 +781,7 @@ static inline void blk_clear_rl_full(struct request_list *rl, bool sync) static inline bool rq_mergeable(struct request *rq) { - if (rq->cmd_type != REQ_TYPE_FS) + if (blk_rq_is_passthrough(rq)) return false; if (req_op(rq) == REQ_OP_FLUSH) @@ -1049,7 +1055,7 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq, { struct request_queue *q = rq->q; - if (unlikely(rq->cmd_type != REQ_TYPE_FS)) + if (blk_rq_is_passthrough(rq)) return q->limits.max_hw_sectors; if (!q->limits.chunk_sectors || diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index a143a67a6f33..341f9418bd68 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -114,12 +114,12 @@ extern void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes); static inline sector_t blk_rq_trace_sector(struct request *rq) { - return (rq->cmd_type != REQ_TYPE_FS) ? 0 : blk_rq_pos(rq); + return blk_rq_is_passthrough(rq) ? 0 : blk_rq_pos(rq); } static inline unsigned int blk_rq_trace_nr_sectors(struct request *rq) { - return (rq->cmd_type != REQ_TYPE_FS) ? 0 : blk_rq_sectors(rq); + return blk_rq_is_passthrough(rq) ? 0 : blk_rq_sectors(rq); } #endif diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index f708f1acfc50..b379f93a2c48 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -151,7 +151,7 @@ static inline void *scsi_cmd_priv(struct scsi_cmnd *cmd) return cmd + 1; } -/* make sure not to use it with REQ_TYPE_BLOCK_PC commands */ +/* make sure not to use it with passthrough commands */ static inline struct scsi_driver *scsi_cmd_to_driver(struct scsi_cmnd *cmd) { return *(struct scsi_driver **)cmd->request->rq_disk->private_data; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 512b34703ac3..e33050e3ea1c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -712,7 +712,7 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, if (likely(!bt)) return; - if (rq->cmd_type != REQ_TYPE_FS) + if (blk_rq_is_passthrough(rq)) what |= BLK_TC_ACT(BLK_TC_PC); else what |= BLK_TC_ACT(BLK_TC_FS); -- cgit v1.2.3 From f447c196fe7a3a92c6396f7628020cb8d564be15 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 31 Jan 2017 16:48:23 -0500 Subject: tracing: Clean up the hwlat binding code Instead of initializing the affinity of the hwlat kthread in the thread itself, simply set up the initial affinity at thread creation. This simplifies the code. 
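The resulting setup pattern is roughly the following (a condensed sketch of start_kthread() after this change; the complete version is in the diff below):

	/* Sketch: pin the sampling kthread to its first CPU before it runs. */
	static int start_kthread_sketch(void)
	{
		struct cpumask *current_mask = &save_cpumask;
		struct task_struct *kthread;
		int next_cpu;

		/* Pick the first usable CPU up front... */
		get_online_cpus();
		cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
		put_online_cpus();
		next_cpu = cpumask_first(current_mask);

		kthread = kthread_create(kthread_fn, NULL, "hwlatd");
		if (IS_ERR(kthread))
			return -ENOMEM;

		/* ...and bind the thread to it before waking it up. */
		cpumask_clear(current_mask);
		cpumask_set_cpu(next_cpu, current_mask);
		sched_setaffinity(kthread->pid, current_mask);
		wake_up_process(kthread);
		return 0;
	}

This keeps kthread_fn() itself free of any first-iteration special casing.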
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index af344a1bf0d0..75fb54a3acb2 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -266,24 +266,13 @@ out: static struct cpumask save_cpumask; static bool disable_migrate; -static void move_to_next_cpu(bool initmask) +static void move_to_next_cpu(void) { - static struct cpumask *current_mask; + struct cpumask *current_mask = &save_cpumask; int next_cpu; if (disable_migrate) return; - - /* Just pick the first CPU on first iteration */ - if (initmask) { - current_mask = &save_cpumask; - get_online_cpus(); - cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); - put_online_cpus(); - next_cpu = cpumask_first(current_mask); - goto set_affinity; - } - /* * If for some reason the user modifies the CPU affinity * of this thread, than stop migrating for the duration @@ -300,7 +289,6 @@ static void move_to_next_cpu(bool initmask) if (next_cpu >= nr_cpu_ids) next_cpu = cpumask_first(current_mask); - set_affinity: if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */ goto disable; @@ -330,12 +318,10 @@ static void move_to_next_cpu(bool initmask) static int kthread_fn(void *data) { u64 interval; - bool initmask = true; while (!kthread_should_stop()) { - move_to_next_cpu(initmask); - initmask = false; + move_to_next_cpu(); local_irq_disable(); get_sample(); @@ -366,13 +352,27 @@ static int kthread_fn(void *data) */ static int start_kthread(struct trace_array *tr) { + struct cpumask *current_mask = &save_cpumask; struct task_struct *kthread; + int next_cpu; + + /* Just pick the first CPU on first iteration */ + current_mask = &save_cpumask; + get_online_cpus(); + cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); + put_online_cpus(); + next_cpu = cpumask_first(current_mask); kthread = kthread_create(kthread_fn, NULL, "hwlatd"); if (IS_ERR(kthread)) { pr_err(BANNER "could not start sampling thread\n"); return -ENOMEM; } + + cpumask_clear(current_mask); + cpumask_set_cpu(next_cpu, current_mask); + sched_setaffinity(kthread->pid, current_mask); + hwlat_kthread = kthread; wake_up_process(kthread); -- cgit v1.2.3 From c6c70f4455d1eda91065e93cc4f7eddf4499b105 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 30 Jan 2017 19:17:35 +0100 Subject: exit: fix the setns() && PR_SET_CHILD_SUBREAPER interaction find_new_reaper() checks same_thread_group(reaper, child_reaper) to prevent cross-namespace reparenting but this is not enough if the exiting parent was injected by setns() + fork(). Suppose we have a process P in the root namespace and some namespace X. P does setns() to enter the X namespace, and forks the child C. C forks a grandchild G and exits. The grandchild G should be reparented to X->child_reaper, but in this case the ->real_parent chain does not lead to ->child_reaper, so it will be wrongly reparented to P's sub-reaper or a global init. Signed-off-by: Oleg Nesterov Signed-off-by: Eric W.
Biederman --- kernel/exit.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 8f14b866f9f6..5cfbd595f918 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -578,15 +578,18 @@ static struct task_struct *find_new_reaper(struct task_struct *father, return thread; if (father->signal->has_child_subreaper) { + unsigned int ns_level = task_pid(father)->level; /* * Find the first ->is_child_subreaper ancestor in our pid_ns. - * We start from father to ensure we can not look into another - * namespace, this is safe because all its threads are dead. + * We can't check reaper != child_reaper to ensure we do not + * cross the namespaces, the exiting parent could be injected + * by setns() + fork(). + * We check pid->level, this is slightly more efficient than + * task_active_pid_ns(reaper) != task_active_pid_ns(father). */ - for (reaper = father; - !same_thread_group(reaper, child_reaper); + for (reaper = father->real_parent; + task_pid(reaper)->level == ns_level; reaper = reaper->real_parent) { - /* call_usermodehelper() descendants need this check */ if (reaper == &init_task) break; if (!reaper->signal->is_child_subreaper) -- cgit v1.2.3 From 93825f2ec736f30e034ab7c9d56b42849c5b00da Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 31 Jan 2017 04:09:16 +0100 Subject: jiffies: Reuse TICK_NSEC instead of NSEC_PER_JIFFY NSEC_PER_JIFFY is an ad-hoc redefinition of TICK_NSEC. Let's rather use a unique and well maintained version. Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Tony Luck Cc: Fenghua Yu Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1485832191-26889-1-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/time/jiffies.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a4a0e478e44d..7906b3f0c41a 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -27,19 +27,8 @@ #include "timekeeping.h" -/* The Jiffies based clocksource is the lowest common - * denominator clock source which should function on - * all systems. It has the same coarse resolution as - * the timer interrupt frequency HZ and it suffers - * inaccuracies caused by missed or lost timer - * interrupts and the inability for the timer - * interrupt hardware to accuratly tick at the - * requested HZ value. It is also not recommended - * for "tick-less" systems. - */ -#define NSEC_PER_JIFFY ((NSEC_PER_SEC+HZ/2)/HZ) -/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier +/* Since jiffies uses a simple TICK_NSEC multiplier * conversion, the .shift value could be zero. However * this would make NTP adjustments impossible as they are * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to @@ -47,8 +36,8 @@ * amount, and give ntp adjustments in units of 1/2^8 * * The value 8 is somewhat carefully chosen, as anything - * larger can result in overflows. NSEC_PER_JIFFY grows as - * HZ shrinks, so values greater than 8 overflow 32bits when + * larger can result in overflows. TICK_NSEC grows as HZ + * shrinks, so values greater than 8 overflow 32bits when * HZ=100. 
*/ #if HZ < 34 @@ -64,12 +53,23 @@ static u64 jiffies_read(struct clocksource *cs) return (u64) jiffies; } +/* + * The Jiffies based clocksource is the lowest common + * denominator clock source which should function on + * all systems. It has the same coarse resolution as + * the timer interrupt frequency HZ and it suffers + * inaccuracies caused by missed or lost timer + * interrupts and the inability for the timer + * interrupt hardware to accuratly tick at the + * requested HZ value. It is also not recommended + * for "tick-less" systems. + */ static struct clocksource clocksource_jiffies = { .name = "jiffies", .rating = 1, /* lowest valid rating*/ .read = jiffies_read, .mask = CLOCKSOURCE_MASK(32), - .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ + .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */ .shift = JIFFIES_SHIFT, .max_cycles = 10, }; @@ -125,7 +125,7 @@ int register_refined_jiffies(long cycles_per_second) shift_hz += cycles_per_tick/2; do_div(shift_hz, cycles_per_tick); /* Calculate nsec_per_tick using shift_hz */ - nsec_per_tick = (u64)NSEC_PER_SEC << 8; + nsec_per_tick = (u64)TICK_NSEC << 8; nsec_per_tick += (u32)shift_hz/2; do_div(nsec_per_tick, (u32)shift_hz); -- cgit v1.2.3 From 07e5f5e353aaa61696c8353d87050994a0c4648a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 31 Jan 2017 04:09:17 +0100 Subject: time: Introduce jiffies64_to_nsecs() This will be needed for the cputime_t to nsec conversion. Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Tony Luck Cc: Fenghua Yu Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1485832191-26889-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/jiffies.h | 2 ++ kernel/time/time.c | 10 ++++++++++ kernel/time/timeconst.bc | 6 ++++++ 3 files changed, 18 insertions(+) (limited to 'kernel') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 589d14e970ad..624215cebee5 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -293,6 +293,8 @@ static inline u64 jiffies_to_nsecs(const unsigned long j) return (u64)jiffies_to_usecs(j) * NSEC_PER_USEC; } +extern u64 jiffies64_to_nsecs(u64 j); + extern unsigned long __msecs_to_jiffies(const unsigned int m); #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) /* diff --git a/kernel/time/time.c b/kernel/time/time.c index a3a9a8a029dc..25bdd2504571 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -702,6 +702,16 @@ u64 nsec_to_clock_t(u64 x) #endif } +u64 jiffies64_to_nsecs(u64 j) +{ +#if !(NSEC_PER_SEC % HZ) + return (NSEC_PER_SEC / HZ) * j; +# else + return div_u64(j * HZ_TO_NSEC_NUM, HZ_TO_NSEC_DEN); +#endif +} +EXPORT_SYMBOL(jiffies64_to_nsecs); + /** * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 * diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc index c48688904f9f..f83bbb81600b 100644 --- a/kernel/time/timeconst.bc +++ b/kernel/time/timeconst.bc @@ -98,6 +98,12 @@ define timeconst(hz) { print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n" print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n" print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n" + + cd=gcd(hz,1000000000) + print "#define HZ_TO_NSEC_NUM\t\t", 1000000000/cd, "\n" + print "#define HZ_TO_NSEC_DEN\t\t", hz/cd, "\n" + print "#define NSEC_TO_HZ_NUM\t\t", hz/cd, "\n" + print "#define NSEC_TO_HZ_DEN\t\t", 1000000000/cd, "\n" print "\n" print "#endif /* KERNEL_TIMECONST_H 
*/\n" -- cgit v1.2.3 From 7fb1327ee9b92fca27662f9b9d60c7c3376d6c69 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 31 Jan 2017 04:09:19 +0100 Subject: sched/cputime: Convert kcpustat to nsecs Kernel CPU stats are stored in cputime_t which is an architecture defined type, and hence a bit opaque and requiring accessors and mutators for any operation. Converting them to nsecs simplifies the code and is one step toward the removal of cputime_t in the core code. Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Tony Luck Cc: Fenghua Yu Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1485832191-26889-4-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- arch/s390/appldata/appldata_os.c | 16 ++++----- drivers/cpufreq/cpufreq.c | 6 ++-- drivers/cpufreq/cpufreq_governor.c | 2 +- drivers/cpufreq/cpufreq_stats.c | 1 - drivers/macintosh/rack-meter.c | 2 +- fs/proc/stat.c | 68 +++++++++++++++++++------------------- fs/proc/uptime.c | 7 ++-- kernel/sched/cpuacct.c | 2 +- kernel/sched/cputime.c | 22 ++++++------ 9 files changed, 61 insertions(+), 65 deletions(-) (limited to 'kernel') diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c index 69b23b25ac34..08b9e942a262 100644 --- a/arch/s390/appldata/appldata_os.c +++ b/arch/s390/appldata/appldata_os.c @@ -113,21 +113,21 @@ static void appldata_get_os_data(void *data) j = 0; for_each_online_cpu(i) { os_data->os_cpu[j].per_cpu_user = - cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_USER]); + nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_USER]); os_data->os_cpu[j].per_cpu_nice = - cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_NICE]); + nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_NICE]); os_data->os_cpu[j].per_cpu_system = - cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]); + nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]); os_data->os_cpu[j].per_cpu_idle = - cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IDLE]); + nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IDLE]); os_data->os_cpu[j].per_cpu_irq = - cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IRQ]); + nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IRQ]); os_data->os_cpu[j].per_cpu_softirq = - cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]); + nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]); os_data->os_cpu[j].per_cpu_iowait = - cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IOWAIT]); + nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IOWAIT]); os_data->os_cpu[j].per_cpu_steal = - cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_STEAL]); + nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_STEAL]); os_data->os_cpu[j].cpu_id = i; j++; } diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index cc475eff90b3..3e9b319a2e79 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -132,7 +132,7 @@ static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall) u64 cur_wall_time; u64 busy_time; - cur_wall_time = jiffies64_to_cputime64(get_jiffies_64()); + cur_wall_time = jiffies64_to_nsecs(get_jiffies_64()); busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER]; busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM]; @@ -143,9 +143,9 @@ static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall) idle_time = cur_wall_time - busy_time; if (wall) - *wall = 
cputime_to_usecs(cur_wall_time); + *wall = div_u64(cur_wall_time, NSEC_PER_USEC); - return cputime_to_usecs(idle_time); + return div_u64(idle_time, NSEC_PER_USEC); } u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy) diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 0196467280bd..631bd2c86c5e 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -152,7 +152,7 @@ unsigned int dbs_update(struct cpufreq_policy *policy) if (ignore_nice) { u64 cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; - idle_time += cputime_to_usecs(cur_nice - j_cdbs->prev_cpu_nice); + idle_time += div_u64(cur_nice - j_cdbs->prev_cpu_nice, NSEC_PER_USEC); j_cdbs->prev_cpu_nice = cur_nice; } diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index ac284e66839c..17048bbec287 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -13,7 +13,6 @@ #include #include #include -#include static DEFINE_SPINLOCK(cpufreq_stats_lock); diff --git a/drivers/macintosh/rack-meter.c b/drivers/macintosh/rack-meter.c index 775527135b93..c114594136d4 100644 --- a/drivers/macintosh/rack-meter.c +++ b/drivers/macintosh/rack-meter.c @@ -91,7 +91,7 @@ static inline cputime64_t get_cpu_idle_time(unsigned int cpu) if (rackmeter_ignore_nice) retval += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE]; - return retval; + return nsecs_to_cputime64(retval); } static void rackmeter_setup_i2s(struct rackmeter *rm) diff --git a/fs/proc/stat.c b/fs/proc/stat.c index d700c42b3572..44475a44cbf1 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -21,23 +21,23 @@ #ifdef arch_idle_time -static cputime64_t get_idle_time(int cpu) +static u64 get_idle_time(int cpu) { - cputime64_t idle; + u64 idle; idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; if (cpu_online(cpu) && !nr_iowait_cpu(cpu)) - idle += arch_idle_time(cpu); + idle += cputime_to_nsecs(arch_idle_time(cpu)); return idle; } -static cputime64_t get_iowait_time(int cpu) +static u64 get_iowait_time(int cpu) { - cputime64_t iowait; + u64 iowait; iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; if (cpu_online(cpu) && nr_iowait_cpu(cpu)) - iowait += arch_idle_time(cpu); + iowait += cputime_to_nsecs(arch_idle_time(cpu)); return iowait; } @@ -45,32 +45,32 @@ static cputime64_t get_iowait_time(int cpu) static u64 get_idle_time(int cpu) { - u64 idle, idle_time = -1ULL; + u64 idle, idle_usecs = -1ULL; if (cpu_online(cpu)) - idle_time = get_cpu_idle_time_us(cpu, NULL); + idle_usecs = get_cpu_idle_time_us(cpu, NULL); - if (idle_time == -1ULL) + if (idle_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; else - idle = usecs_to_cputime64(idle_time); + idle = idle_usecs * NSEC_PER_USEC; return idle; } static u64 get_iowait_time(int cpu) { - u64 iowait, iowait_time = -1ULL; + u64 iowait, iowait_usecs = -1ULL; if (cpu_online(cpu)) - iowait_time = get_cpu_iowait_time_us(cpu, NULL); + iowait_usecs = get_cpu_iowait_time_us(cpu, NULL); - if (iowait_time == -1ULL) + if (iowait_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */ iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; else - iowait = usecs_to_cputime64(iowait_time); + iowait = iowait_usecs * NSEC_PER_USEC; return iowait; } @@ -115,16 +115,16 @@ static int show_stat(struct seq_file *p, void *v) } sum += arch_irq_stat(); - seq_put_decimal_ull(p, "cpu ", cputime64_to_clock_t(user)); - seq_put_decimal_ull(p, " ", 
cputime64_to_clock_t(nice)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(system)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(idle)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(iowait)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(irq)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(softirq)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(steal)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest_nice)); + seq_put_decimal_ull(p, "cpu ", nsec_to_clock_t(user)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(system)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice)); seq_putc(p, '\n'); for_each_online_cpu(i) { @@ -140,16 +140,16 @@ static int show_stat(struct seq_file *p, void *v) guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; seq_printf(p, "cpu%d", i); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(user)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(nice)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(system)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(idle)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(iowait)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(irq)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(softirq)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(steal)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest)); - seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest_nice)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(user)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(system)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice)); seq_putc(p, '\n'); } seq_put_decimal_ull(p, "intr ", (unsigned long long)sum); diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 33de567c25af..7981c4ffe787 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -5,23 +5,20 @@ #include #include #include -#include static int uptime_proc_show(struct seq_file *m, void *v) { struct timespec uptime; struct timespec idle; - u64 idletime; u64 nsec; u32 rem; int i; - idletime = 0; + nsec = 0; for_each_possible_cpu(i) - idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; + nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; get_monotonic_boottime(&uptime); - nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC; idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); idle.tv_nsec = rem; seq_printf(m, "%lu.%02lu %lu.%02lu\n", diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 9add206b5608..f95ab29a45d0 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -297,7 +297,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v) for (stat = 0; stat < 
CPUACCT_STAT_NSTATS; stat++) { seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[stat], - (long long)cputime64_to_clock_t(val[stat])); + (long long)nsec_to_clock_t(val[stat])); } return 0; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index f7c14cc71d06..61e270926e94 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -75,9 +75,9 @@ static cputime_t irqtime_account_update(u64 irqtime, int idx, cputime_t maxtime) u64 *cpustat = kcpustat_this_cpu->cpustat; cputime_t irq_cputime; - irq_cputime = nsecs_to_cputime64(irqtime) - cpustat[idx]; + irq_cputime = nsecs_to_cputime64(irqtime - cpustat[idx]); irq_cputime = min(irq_cputime, maxtime); - cpustat[idx] += irq_cputime; + cpustat[idx] += cputime_to_nsecs(irq_cputime); return irq_cputime; } @@ -140,7 +140,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime) index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; /* Add user time to cpustat. */ - task_group_account_field(p, index, (__force u64) cputime); + task_group_account_field(p, index, cputime_to_nsecs(cputime)); /* Account for user time used */ acct_account_cputime(p); @@ -162,11 +162,11 @@ void account_guest_time(struct task_struct *p, cputime_t cputime) /* Add guest time to cpustat. */ if (task_nice(p) > 0) { - cpustat[CPUTIME_NICE] += (__force u64) cputime; - cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; + cpustat[CPUTIME_NICE] += cputime_to_nsecs(cputime); + cpustat[CPUTIME_GUEST_NICE] += cputime_to_nsecs(cputime); } else { - cpustat[CPUTIME_USER] += (__force u64) cputime; - cpustat[CPUTIME_GUEST] += (__force u64) cputime; + cpustat[CPUTIME_USER] += cputime_to_nsecs(cputime); + cpustat[CPUTIME_GUEST] += cputime_to_nsecs(cputime); } } @@ -184,7 +184,7 @@ void account_system_index_time(struct task_struct *p, account_group_system_time(p, cputime); /* Add system time to cpustat. */ - task_group_account_field(p, index, (__force u64) cputime); + task_group_account_field(p, index, cputime_to_nsecs(cputime)); /* Account for system time used */ acct_account_cputime(p); @@ -224,7 +224,7 @@ void account_steal_time(cputime_t cputime) { u64 *cpustat = kcpustat_this_cpu->cpustat; - cpustat[CPUTIME_STEAL] += (__force u64) cputime; + cpustat[CPUTIME_STEAL] += cputime_to_nsecs(cputime); } /* @@ -237,9 +237,9 @@ void account_idle_time(cputime_t cputime) struct rq *rq = this_rq(); if (atomic_read(&rq->nr_iowait) > 0) - cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; + cpustat[CPUTIME_IOWAIT] += cputime_to_nsecs(cputime); else - cpustat[CPUTIME_IDLE] += (__force u64) cputime; + cpustat[CPUTIME_IDLE] += cputime_to_nsecs(cputime); } /* -- cgit v1.2.3 From 16a6d9be90373fb0b521850cd0185a4d460dd152 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 31 Jan 2017 04:09:21 +0100 Subject: sched/cputime: Convert guest time accounting to nsecs (u64) cputime_t is being obsoleted and replaced by nsecs units in order to make internal timestamps less opaque and more granular.
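For consumers, the nsec-based fields remove the cputime_t round trips; reporting such a counter in USER_HZ clock ticks becomes a plain conversion (a sketch relying on the nsec_to_clock_t() helper and the u64-returning task_gtime() introduced by this series):

	/* Sketch: report a task's guest time in clock ticks, as /proc does.
	 * Assumes p is a struct task_struct * and m is a struct seq_file *. */
	u64 gtime_ns = task_gtime(p);	/* plain u64 nanoseconds */
	seq_put_decimal_ull(m, " ", nsec_to_clock_t(gtime_ns));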
Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Tony Luck Cc: Fenghua Yu Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1485832191-26889-6-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- fs/proc/array.c | 6 +++--- include/linux/sched.h | 10 +++++----- kernel/sched/cputime.c | 8 ++++---- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/fs/proc/array.c b/fs/proc/array.c index 51a4213afa2e..25b54cf0c042 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -402,7 +402,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, unsigned long cmin_flt = 0, cmaj_flt = 0; unsigned long min_flt = 0, maj_flt = 0; cputime_t cutime, cstime, utime, stime; - cputime_t cgtime, gtime; + u64 cgtime, gtime; unsigned long rsslim = 0; char tcomm[sizeof(task->comm)]; unsigned long flags; @@ -542,8 +542,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, seq_put_decimal_ull(m, " ", task->rt_priority); seq_put_decimal_ull(m, " ", task->policy); seq_put_decimal_ull(m, " ", delayacct_blkio_ticks(task)); - seq_put_decimal_ull(m, " ", cputime_to_clock_t(gtime)); - seq_put_decimal_ll(m, " ", cputime_to_clock_t(cgtime)); + seq_put_decimal_ull(m, " ", nsec_to_clock_t(gtime)); + seq_put_decimal_ll(m, " ", nsec_to_clock_t(cgtime)); if (mm && permitted) { seq_put_decimal_ull(m, " ", mm->start_data); diff --git a/include/linux/sched.h b/include/linux/sched.h index 5f60aed37701..252ff25983c8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -774,8 +774,8 @@ struct signal_struct { */ seqlock_t stats_lock; cputime_t utime, stime, cutime, cstime; - cputime_t gtime; - cputime_t cgtime; + u64 gtime; + u64 cgtime; struct prev_cputime prev_cputime; unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; @@ -1658,7 +1658,7 @@ struct task_struct { #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME cputime_t utimescaled, stimescaled; #endif - cputime_t gtime; + u64 gtime; struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN seqcount_t vtime_seqcount; @@ -2254,7 +2254,7 @@ struct task_struct *try_get_task_struct(struct task_struct **ptask); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN extern void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime); -extern cputime_t task_gtime(struct task_struct *t); +extern u64 task_gtime(struct task_struct *t); #else static inline void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) @@ -2263,7 +2263,7 @@ static inline void task_cputime(struct task_struct *t, *stime = t->stime; } -static inline cputime_t task_gtime(struct task_struct *t) +static inline u64 task_gtime(struct task_struct *t) { return t->gtime; } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 61e270926e94..8bcd98e2b821 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -158,7 +158,7 @@ void account_guest_time(struct task_struct *p, cputime_t cputime) /* Add guest time to process. */ p->utime += cputime; account_group_user_time(p, cputime); - p->gtime += cputime; + p->gtime += cputime_to_nsecs(cputime); /* Add guest time to cpustat. 
*/ if (task_nice(p) > 0) { @@ -824,10 +824,10 @@ void vtime_init_idle(struct task_struct *t, int cpu) local_irq_restore(flags); } -cputime_t task_gtime(struct task_struct *t) +u64 task_gtime(struct task_struct *t) { unsigned int seq; - cputime_t gtime; + u64 gtime; if (!vtime_accounting_enabled()) return t->gtime; @@ -837,7 +837,7 @@ cputime_t task_gtime(struct task_struct *t) gtime = t->gtime; if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU) - gtime += vtime_delta(t); + gtime += cputime_to_nsecs(vtime_delta(t)); } while (read_seqcount_retry(&t->vtime_seqcount, seq)); -- cgit v1.2.3 From a1cecf2ba78e0a6de00ff99df34b662728535aa5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 31 Jan 2017 04:09:22 +0100 Subject: sched/cputime: Introduce special task_cputime_t() API to return old-typed cputime This API returns a task's cputime in cputime_t in order to ease the conversion of cputime internals to use nsecs units instead. Blindly converting all cputime readers to use this API now will later let us convert more smoothly and step by step all these places to use the new nsec based cputime. Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Fenghua Yu Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Tony Luck Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1485832191-26889-7-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- arch/alpha/kernel/osf_sys.c | 2 +- arch/x86/kernel/apm_32.c | 2 +- drivers/isdn/mISDN/stack.c | 2 +- fs/binfmt_elf.c | 6 +++--- fs/binfmt_elf_fdpic.c | 6 +++--- include/linux/sched.h | 32 ++++++++++++++++++++++++++--- kernel/acct.c | 2 +- kernel/delayacct.c | 4 ++-- kernel/signal.c | 4 ++-- kernel/time/itimer.c | 2 +- kernel/time/posix-cpu-timers.c | 46 +++++++++++++++++++++--------------------- kernel/tsacct.c | 6 +++--- 12 files changed, 70 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 54d8616644e2..0f92438d736b 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -1154,7 +1154,7 @@ SYSCALL_DEFINE2(osf_getrusage, int, who, struct rusage32 __user *, ru) memset(&r, 0, sizeof(r)); switch (who) { case RUSAGE_SELF: - task_cputime(current, &utime, &stime); + task_cputime_t(current, &utime, &stime); utime_jiffies = cputime_to_jiffies(utime); stime_jiffies = cputime_to_jiffies(stime); jiffies_to_timeval32(utime_jiffies, &r.ru_utime); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 45d44c173cf9..89c84fcdd3c0 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -913,7 +913,7 @@ static int apm_cpu_idle(struct cpuidle_device *dev, unsigned int bucket; recalc: - task_cputime(current, &utime, &stime); + task_cputime_t(current, &utime, &stime); if (jiffies_since_last_check > IDLE_CALC_LIMIT) { use_apm_idle = 0; } else if (jiffies_since_last_check > idle_period) { diff --git a/drivers/isdn/mISDN/stack.c b/drivers/isdn/mISDN/stack.c index 9cb4b621fbc3..0a3661767531 100644 --- a/drivers/isdn/mISDN/stack.c +++ b/drivers/isdn/mISDN/stack.c @@ -306,7 +306,7 @@ mISDNStackd(void *data) "msg %d sleep %d stopped\n", dev_name(&st->dev->dev), st->msg_cnt, st->sleep_cnt, st->stopped_cnt); - task_cputime(st->thread, &utime, &stime); + task_cputime_t(st->thread, &utime, &stime); printk(KERN_DEBUG "mISDNStackd daemon for %s utime(%ld) stime(%ld)\n", dev_name(&st->dev->dev), 
utime, stime); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 422370293cfd..68b915650cae 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1421,19 +1421,19 @@ static void fill_prstatus(struct elf_prstatus *prstatus, prstatus->pr_pgrp = task_pgrp_vnr(p); prstatus->pr_sid = task_session_vnr(p); if (thread_group_leader(p)) { - struct task_cputime cputime; + struct task_cputime_t cputime; /* * This is the record for the group leader. It shows the * group-wide total, not its individual thread total. */ - thread_group_cputime(p, &cputime); + thread_group_cputime_t(p, &cputime); cputime_to_timeval(cputime.utime, &prstatus->pr_utime); cputime_to_timeval(cputime.stime, &prstatus->pr_stime); } else { cputime_t utime, stime; - task_cputime(p, &utime, &stime); + task_cputime_t(p, &utime, &stime); cputime_to_timeval(utime, &prstatus->pr_utime); cputime_to_timeval(stime, &prstatus->pr_stime); } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index d2e36f82c35d..6ccd9df7247a 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1342,19 +1342,19 @@ static void fill_prstatus(struct elf_prstatus *prstatus, prstatus->pr_pgrp = task_pgrp_vnr(p); prstatus->pr_sid = task_session_vnr(p); if (thread_group_leader(p)) { - struct task_cputime cputime; + struct task_cputime_t cputime; /* * This is the record for the group leader. It shows the * group-wide total, not its individual thread total. */ - thread_group_cputime(p, &cputime); + thread_group_cputime_t(p, &cputime); cputime_to_timeval(cputime.utime, &prstatus->pr_utime); cputime_to_timeval(cputime.stime, &prstatus->pr_stime); } else { cputime_t utime, stime; - task_cputime(p, &utime, &stime); + task_cputime_t(p, &utime, &stime); cputime_to_timeval(utime, &prstatus->pr_utime); cputime_to_timeval(stime, &prstatus->pr_stime); } diff --git a/include/linux/sched.h b/include/linux/sched.h index 252ff25983c8..9cc722f77799 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -615,6 +615,13 @@ struct task_cputime { unsigned long long sum_exec_runtime; }; +/* Temporary type to ease cputime_t to nsecs conversion */ +struct task_cputime_t { + cputime_t utime; + cputime_t stime; + unsigned long long sum_exec_runtime; +}; + /* Alternate field names when used to cache expirations. */ #define virt_exp utime #define prof_exp stime @@ -748,7 +755,7 @@ struct signal_struct { struct thread_group_cputimer cputimer; /* Earliest-expiration cache. 
*/ - struct task_cputime cputime_expires; + struct task_cputime_t cputime_expires; #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; @@ -1682,7 +1689,7 @@ struct task_struct { /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ unsigned long min_flt, maj_flt; - struct task_cputime cputime_expires; + struct task_cputime_t cputime_expires; struct list_head cpu_timers[3]; /* process credentials */ @@ -2286,6 +2293,19 @@ static inline void task_cputime_scaled(struct task_struct *t, } #endif +static inline void task_cputime_t(struct task_struct *t, + cputime_t *utime, cputime_t *stime) +{ + task_cputime(t, utime, stime); +} + +static inline void task_cputime_t_scaled(struct task_struct *t, + cputime_t *utimescaled, + cputime_t *stimescaled) +{ + task_cputime_scaled(t, utimescaled, stimescaled); +} + extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); @@ -3499,7 +3519,13 @@ static __always_inline bool need_resched(void) * Thread group CPU time accounting. */ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); -void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime_t *times); + +static inline void thread_group_cputime_t(struct task_struct *tsk, + struct task_cputime_t *times) +{ + thread_group_cputime(tsk, (struct task_cputime *)times); +} /* * Reevaluate whether the task has signals pending delivery. diff --git a/kernel/acct.c b/kernel/acct.c index 74963d192c5d..b9b190a8eecf 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -559,7 +559,7 @@ void acct_collect(long exitcode, int group_dead) pacct->ac_flag |= ACORE; if (current->flags & PF_SIGNALED) pacct->ac_flag |= AXSIG; - task_cputime(current, &utime, &stime); + task_cputime_t(current, &utime, &stime); pacct->ac_utime += utime; pacct->ac_stime += stime; pacct->ac_minflt += current->min_flt; diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 435c14a45118..228640f2b3d2 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -87,12 +87,12 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) unsigned long flags, t1; s64 tmp; - task_cputime(tsk, &utime, &stime); + task_cputime_t(tsk, &utime, &stime); tmp = (s64)d->cpu_run_real_total; tmp += cputime_to_nsecs(utime + stime); d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 
0 : tmp; - task_cputime_scaled(tsk, &utimescaled, &stimescaled); + task_cputime_t_scaled(tsk, &utimescaled, &stimescaled); tmp = (s64)d->cpu_scaled_run_real_total; tmp += cputime_to_nsecs(utimescaled + stimescaled); d->cpu_scaled_run_real_total = diff --git a/kernel/signal.c b/kernel/signal.c index 3603d93a1968..218048a837ea 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1619,7 +1619,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig) task_uid(tsk)); rcu_read_unlock(); - task_cputime(tsk, &utime, &stime); + task_cputime_t(tsk, &utime, &stime); info.si_utime = cputime_to_clock_t(utime + tsk->signal->utime); info.si_stime = cputime_to_clock_t(stime + tsk->signal->stime); @@ -1704,7 +1704,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); rcu_read_unlock(); - task_cputime(tsk, &utime, &stime); + task_cputime_t(tsk, &utime, &stime); info.si_utime = cputime_to_clock_t(utime); info.si_stime = cputime_to_clock_t(stime); diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 8c89143f9ebf..f2d5097bcb6d 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -53,7 +53,7 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, cval = it->expires; cinterval = it->incr; if (cval) { - struct task_cputime cputime; + struct task_cputime_t cputime; cputime_t t; thread_group_cputimer(tsk, &cputime); diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index e9e8c10f0d9a..d53ff711a2a8 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -115,7 +115,7 @@ static void bump_cpu_timer(struct k_itimer *timer, * Checks @cputime to see if all fields are zero. Returns true if all fields * are zero, false if any field is nonzero. */ -static inline int task_cputime_zero(const struct task_cputime *cputime) +static inline int task_cputime_zero(const struct task_cputime_t *cputime) { if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) return 1; @@ -126,7 +126,7 @@ static inline unsigned long long prof_ticks(struct task_struct *p) { cputime_t utime, stime; - task_cputime(p, &utime, &stime); + task_cputime_t(p, &utime, &stime); return cputime_to_expires(utime + stime); } @@ -134,7 +134,7 @@ static inline unsigned long long virt_ticks(struct task_struct *p) { cputime_t utime, stime; - task_cputime(p, &utime, &stime); + task_cputime_t(p, &utime, &stime); return cputime_to_expires(utime); } @@ -210,7 +210,7 @@ retry: } } -static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum) +static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime_t *sum) { __update_gt_cputime(&cputime_atomic->utime, sum->utime); __update_gt_cputime(&cputime_atomic->stime, sum->stime); @@ -218,7 +218,7 @@ static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct } /* Sample task_cputime_atomic values in "atomic_timers", store results in "times". 
*/ -static inline void sample_cputime_atomic(struct task_cputime *times, +static inline void sample_cputime_atomic(struct task_cputime_t *times, struct task_cputime_atomic *atomic_times) { times->utime = atomic64_read(&atomic_times->utime); @@ -226,10 +226,10 @@ static inline void sample_cputime_atomic(struct task_cputime *times, times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime); } -void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime_t *times) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - struct task_cputime sum; + struct task_cputime_t sum; /* Check if cputimer isn't running. This is accessed without locking. */ if (!READ_ONCE(cputimer->running)) { @@ -238,7 +238,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) * values through the TIMER_ABSTIME flag, therefore we have * to synchronize the timer to the clock every time we start it. */ - thread_group_cputime(tsk, &sum); + thread_group_cputime_t(tsk, &sum); update_gt_cputime(&cputimer->cputime_atomic, &sum); /* @@ -262,21 +262,21 @@ static int cpu_clock_sample_group(const clockid_t which_clock, struct task_struct *p, unsigned long long *sample) { - struct task_cputime cputime; + struct task_cputime_t cputime; switch (CPUCLOCK_WHICH(which_clock)) { default: return -EINVAL; case CPUCLOCK_PROF: - thread_group_cputime(p, &cputime); + thread_group_cputime_t(p, &cputime); *sample = cputime_to_expires(cputime.utime + cputime.stime); break; case CPUCLOCK_VIRT: - thread_group_cputime(p, &cputime); + thread_group_cputime_t(p, &cputime); *sample = cputime_to_expires(cputime.utime); break; case CPUCLOCK_SCHED: - thread_group_cputime(p, &cputime); + thread_group_cputime_t(p, &cputime); *sample = cputime.sum_exec_runtime; break; } @@ -466,7 +466,7 @@ static void arm_timer(struct k_itimer *timer) { struct task_struct *p = timer->it.cpu.task; struct list_head *head, *listpos; - struct task_cputime *cputime_expires; + struct task_cputime_t *cputime_expires; struct cpu_timer_list *const nt = &timer->it.cpu; struct cpu_timer_list *next; @@ -562,7 +562,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock, struct task_struct *p, unsigned long long *sample) { - struct task_cputime cputime; + struct task_cputime_t cputime; thread_group_cputimer(p, &cputime); switch (CPUCLOCK_WHICH(which_clock)) { @@ -761,7 +761,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) /* * Protect against sighand release/switch in exit/exec and * also make timer sampling safe if it ends up calling - * thread_group_cputime(). + * thread_group_cputime_t(). 
*/ sighand = lock_task_sighand(p, &flags); if (unlikely(sighand == NULL)) { @@ -826,7 +826,7 @@ static void check_thread_timers(struct task_struct *tsk, { struct list_head *timers = tsk->cpu_timers; struct signal_struct *const sig = tsk->signal; - struct task_cputime *tsk_expires = &tsk->cputime_expires; + struct task_cputime_t *tsk_expires = &tsk->cputime_expires; unsigned long long expires; unsigned long soft; @@ -934,7 +934,7 @@ static void check_process_timers(struct task_struct *tsk, unsigned long long utime, ptime, virt_expires, prof_expires; unsigned long long sum_sched_runtime, sched_expires; struct list_head *timers = sig->cpu_timers; - struct task_cputime cputime; + struct task_cputime_t cputime; unsigned long soft; /* @@ -1037,7 +1037,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) } else { /* * Protect arm_timer() and timer sampling in case of call to - * thread_group_cputime(). + * thread_group_cputime_t(). */ sighand = lock_task_sighand(p, &flags); if (unlikely(sighand == NULL)) { @@ -1080,8 +1080,8 @@ out: * Returns true if any field of the former is greater than the corresponding * field of the latter if the latter field is set. Otherwise returns false. */ -static inline int task_cputime_expired(const struct task_cputime *sample, - const struct task_cputime *expires) +static inline int task_cputime_expired(const struct task_cputime_t *sample, - const struct task_cputime_t *expires) { if (expires->utime && sample->utime >= expires->utime) return 1; @@ -1108,9 +1108,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk) struct signal_struct *sig; if (!task_cputime_zero(&tsk->cputime_expires)) { - struct task_cputime task_sample; + struct task_cputime_t task_sample; - task_cputime(tsk, &task_sample.utime, &task_sample.stime); + task_cputime_t(tsk, &task_sample.utime, &task_sample.stime); task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime; if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) return 1; @@ -1133,7 +1133,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk) */ if (READ_ONCE(sig->cputimer.running) && !READ_ONCE(sig->cputimer.checking_timer)) { - struct task_cputime group_sample; + struct task_cputime_t group_sample; sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic); diff --git a/kernel/tsacct.c b/kernel/tsacct.c index f8e26ab963ed..040d0a64d0d1 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -66,11 +66,11 @@ void bacct_add_tsk(struct user_namespace *user_ns, task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0; rcu_read_unlock(); - task_cputime(tsk, &utime, &stime); + task_cputime_t(tsk, &utime, &stime); stats->ac_utime = cputime_to_usecs(utime); stats->ac_stime = cputime_to_usecs(stime); - task_cputime_scaled(tsk, &utimescaled, &stimescaled); + task_cputime_t_scaled(tsk, &utimescaled, &stimescaled); stats->ac_utimescaled = cputime_to_usecs(utimescaled); stats->ac_stimescaled = cputime_to_usecs(stimescaled); @@ -159,7 +159,7 @@ void acct_update_integrals(struct task_struct *tsk) unsigned long flags; local_irq_save(flags); - task_cputime(tsk, &utime, &stime); + task_cputime_t(tsk, &utime, &stime); __acct_update_integrals(tsk, utime, stime); local_irq_restore(flags); } -- cgit v1.2.3 From 5613fda9a503cd6137b120298902a34a1386b2c1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 31 Jan 2017 04:09:23 +0100 Subject: sched/cputime: Convert task/group cputime to nsecs Now that most cputime readers use the transition API which returns the task cputime in old-style cputime_t,
we can safely store the cputime in nsecs. This will eventually make cputime statistics less opaque and more granular. Back-and-forth conversions between cputime_t and nsecs in order to deal with cputime_t's random granularity won't be needed anymore. Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Fenghua Yu Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Tony Luck Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1485832191-26889-8-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- arch/alpha/kernel/osf_sys.c | 4 ++-- arch/powerpc/kernel/time.c | 4 ++-- arch/s390/kernel/vtime.c | 6 ++--- arch/x86/kvm/hyperv.c | 5 +++-- fs/binfmt_elf.c | 11 +++++++-- fs/binfmt_elf_fdpic.c | 4 ++-- fs/proc/array.c | 10 ++++----- include/linux/sched.h | 55 ++++++++++++++++++++++++++++----------------- kernel/exit.c | 4 ++-- kernel/sched/cputime.c | 35 ++++++++++++++--------------- kernel/signal.c | 4 ++-- kernel/sys.c | 16 ++++++------- 12 files changed, 89 insertions(+), 69 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 0f92438d736b..82ccb43b795b 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -1163,8 +1163,8 @@ SYSCALL_DEFINE2(osf_getrusage, int, who, struct rusage32 __user *, ru) r.ru_majflt = current->maj_flt; break; case RUSAGE_CHILDREN: - utime_jiffies = cputime_to_jiffies(current->signal->cutime); - stime_jiffies = cputime_to_jiffies(current->signal->cstime); + utime_jiffies = nsecs_to_jiffies(current->signal->cutime); + stime_jiffies = nsecs_to_jiffies(current->signal->cstime); jiffies_to_timeval32(utime_jiffies, &r.ru_utime); jiffies_to_timeval32(stime_jiffies, &r.ru_stime); r.ru_minflt = current->signal->cmin_flt; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 02e97305d22b..3cca82e065c9 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -396,7 +396,7 @@ void vtime_flush(struct task_struct *tsk) account_user_time(tsk, acct->utime); if (acct->utime_scaled) - tsk->utimescaled += acct->utime_scaled; + tsk->utimescaled += cputime_to_nsecs(acct->utime_scaled); if (acct->gtime) account_guest_time(tsk, acct->gtime); @@ -411,7 +411,7 @@ void vtime_flush(struct task_struct *tsk) account_system_index_time(tsk, acct->stime, CPUTIME_SYSTEM); if (acct->stime_scaled) - tsk->stimescaled += acct->stime_scaled; + tsk->stimescaled += cputime_to_nsecs(acct->stime_scaled); if (acct->hardirq_time) account_system_index_time(tsk, acct->hardirq_time, CPUTIME_IRQ); diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 0a9e5d67547d..f2fc27491604 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -114,7 +114,7 @@ static void account_system_index_scaled(struct task_struct *p, cputime_t cputime, cputime_t scaled, enum cpu_usage_stat index) { - p->stimescaled += scaled; + p->stimescaled += cputime_to_nsecs(scaled); account_system_index_time(p, cputime, index); } @@ -167,12 +167,12 @@ static int do_account_vtime(struct task_struct *tsk) /* Push account value */ if (user) { account_user_time(tsk, user); - tsk->utimescaled += scale_vtime(user); + tsk->utimescaled += cputime_to_nsecs(scale_vtime(user)); } if (guest) { account_guest_time(tsk, guest); - tsk->utimescaled += scale_vtime(guest); + tsk->utimescaled += cputime_to_nsecs(scale_vtime(guest)); } if (system) diff --git
a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 1572c35b4f1a..2ecd7dab4631 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -964,10 +964,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, /* Calculate cpu time spent by current task in 100ns units */ static u64 current_task_runtime_100ns(void) { - cputime_t utime, stime; + u64 utime, stime; task_cputime_adjusted(current, &utime, &stime); - return div_u64(cputime_to_nsecs(utime + stime), 100); + + return div_u64(utime + stime, 100); } static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 68b915650cae..6d451936a858 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1411,6 +1411,8 @@ static void fill_note(struct memelfnote *note, const char *name, int type, static void fill_prstatus(struct elf_prstatus *prstatus, struct task_struct *p, long signr) { + struct timeval tv; + prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; prstatus->pr_sigpend = p->pending.signal.sig[0]; prstatus->pr_sighold = p->blocked.sig[0]; @@ -1437,8 +1439,13 @@ static void fill_prstatus(struct elf_prstatus *prstatus, cputime_to_timeval(utime, &prstatus->pr_utime); cputime_to_timeval(stime, &prstatus->pr_stime); } - cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime); - cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); + tv = ns_to_timeval(p->signal->cutime); + prstatus->pr_cutime.tv_sec = tv.tv_sec; + prstatus->pr_cutime.tv_usec = tv.tv_usec; + + tv = ns_to_timeval(p->signal->cstime); + prstatus->pr_cstime.tv_sec = tv.tv_sec; + prstatus->pr_cstime.tv_usec = tv.tv_usec; } static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 6ccd9df7247a..e1f373460257 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1358,8 +1358,8 @@ static void fill_prstatus(struct elf_prstatus *prstatus, cputime_to_timeval(utime, &prstatus->pr_utime); cputime_to_timeval(stime, &prstatus->pr_stime); } - cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime); - cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); + prstatus->pr_cutime = ns_to_timeval(p->signal->cutime); + prstatus->pr_cstime = ns_to_timeval(p->signal->cstime); prstatus->pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap; prstatus->pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap; diff --git a/fs/proc/array.c b/fs/proc/array.c index 25b54cf0c042..fe12b519d09b 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -401,7 +401,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, unsigned long long start_time; unsigned long cmin_flt = 0, cmaj_flt = 0; unsigned long min_flt = 0, maj_flt = 0; - cputime_t cutime, cstime, utime, stime; + u64 cutime, cstime, utime, stime; u64 cgtime, gtime; unsigned long rsslim = 0; char tcomm[sizeof(task->comm)]; @@ -497,10 +497,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, seq_put_decimal_ull(m, " ", cmin_flt); seq_put_decimal_ull(m, " ", maj_flt); seq_put_decimal_ull(m, " ", cmaj_flt); - seq_put_decimal_ull(m, " ", cputime_to_clock_t(utime)); - seq_put_decimal_ull(m, " ", cputime_to_clock_t(stime)); - seq_put_decimal_ll(m, " ", cputime_to_clock_t(cutime)); - seq_put_decimal_ll(m, " ", cputime_to_clock_t(cstime)); + seq_put_decimal_ull(m, " ", nsec_to_clock_t(utime)); + seq_put_decimal_ull(m, " ", nsec_to_clock_t(stime)); + seq_put_decimal_ll(m, " ", 
nsec_to_clock_t(cutime)); + seq_put_decimal_ll(m, " ", nsec_to_clock_t(cstime)); seq_put_decimal_ll(m, " ", priority); seq_put_decimal_ll(m, " ", nice); seq_put_decimal_ll(m, " ", num_threads); diff --git a/include/linux/sched.h b/include/linux/sched.h index 9cc722f77799..b7ccc54b35cc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -585,8 +585,8 @@ struct cpu_itimer { */ struct prev_cputime { #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - cputime_t utime; - cputime_t stime; + u64 utime; + u64 stime; raw_spinlock_t lock; #endif }; @@ -601,8 +601,8 @@ static inline void prev_cputime_init(struct prev_cputime *prev) /** * struct task_cputime - collected CPU time counts - * @utime: time spent in user mode, in &cputime_t units - * @stime: time spent in kernel mode, in &cputime_t units + * @utime: time spent in user mode, in nanoseconds + * @stime: time spent in kernel mode, in nanoseconds * @sum_exec_runtime: total time spent on the CPU, in nanoseconds * * This structure groups together three kinds of CPU time that are tracked for @@ -610,8 +610,8 @@ static inline void prev_cputime_init(struct prev_cputime *prev) * these counts together and treat all three of them in parallel. */ struct task_cputime { - cputime_t utime; - cputime_t stime; + u64 utime; + u64 stime; unsigned long long sum_exec_runtime; }; @@ -780,7 +780,7 @@ struct signal_struct { * in __exit_signal, except for the group leader. */ seqlock_t stats_lock; - cputime_t utime, stime, cutime, cstime; + u64 utime, stime, cutime, cstime; u64 gtime; u64 cgtime; struct prev_cputime prev_cputime; @@ -1661,9 +1661,9 @@ struct task_struct { int __user *set_child_tid; /* CLONE_CHILD_SETTID */ int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ - cputime_t utime, stime; + u64 utime, stime; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME - cputime_t utimescaled, stimescaled; + u64 utimescaled, stimescaled; #endif u64 gtime; struct prev_cputime prev_cputime; @@ -2260,11 +2260,11 @@ struct task_struct *try_get_task_struct(struct task_struct **ptask); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN extern void task_cputime(struct task_struct *t, - cputime_t *utime, cputime_t *stime); + u64 *utime, u64 *stime); extern u64 task_gtime(struct task_struct *t); #else static inline void task_cputime(struct task_struct *t, - cputime_t *utime, cputime_t *stime) + u64 *utime, u64 *stime) { *utime = t->utime; *stime = t->stime; @@ -2278,16 +2278,16 @@ static inline u64 task_gtime(struct task_struct *t) #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME static inline void task_cputime_scaled(struct task_struct *t, - cputime_t *utimescaled, - cputime_t *stimescaled) + u64 *utimescaled, + u64 *stimescaled) { *utimescaled = t->utimescaled; *stimescaled = t->stimescaled; } #else static inline void task_cputime_scaled(struct task_struct *t, - cputime_t *utimescaled, - cputime_t *stimescaled) + u64 *utimescaled, + u64 *stimescaled) { task_cputime(t, utimescaled, stimescaled); } @@ -2296,18 +2296,26 @@ static inline void task_cputime_scaled(struct task_struct *t, static inline void task_cputime_t(struct task_struct *t, cputime_t *utime, cputime_t *stime) { - task_cputime(t, utime, stime); + u64 ut, st; + + task_cputime(t, &ut, &st); + *utime = nsecs_to_cputime(ut); + *stime = nsecs_to_cputime(st); } static inline void task_cputime_t_scaled(struct task_struct *t, cputime_t *utimescaled, cputime_t *stimescaled) { - task_cputime_scaled(t, utimescaled, stimescaled); + u64 ut, st; + + task_cputime_scaled(t, &ut, &st); + *utimescaled = nsecs_to_cputime(ut); + *stimescaled = 
nsecs_to_cputime(st); } -extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); -extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); +extern void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st); +extern void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st); /* * Per process flags @@ -3522,9 +3530,14 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); void thread_group_cputimer(struct task_struct *tsk, struct task_cputime_t *times); static inline void thread_group_cputime_t(struct task_struct *tsk, - struct task_cputime_t *times) + struct task_cputime_t *cputime) { - thread_group_cputime(tsk, (struct task_cputime *)times); + struct task_cputime times; + + thread_group_cputime(tsk, &times); + cputime->utime = nsecs_to_cputime(times.utime); + cputime->stime = nsecs_to_cputime(times.stime); + cputime->sum_exec_runtime = times.sum_exec_runtime; } /* diff --git a/kernel/exit.c b/kernel/exit.c index 8f14b866f9f6..8e5e21338b3a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -86,7 +86,7 @@ static void __exit_signal(struct task_struct *tsk) bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; struct tty_struct *uninitialized_var(tty); - cputime_t utime, stime; + u64 utime, stime; sighand = rcu_dereference_check(tsk->sighand, lockdep_tasklist_lock_is_held()); @@ -1091,7 +1091,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) struct signal_struct *sig = p->signal; struct signal_struct *psig = current->signal; unsigned long maxrss; - cputime_t tgutime, tgstime; + u64 tgutime, tgstime; /* * The resource counters for the group leader are in its diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8bcd98e2b821..0bdef50d88bc 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -134,7 +134,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime) int index; /* Add user time to process. */ - p->utime += cputime; + p->utime += cputime_to_nsecs(cputime); account_group_user_time(p, cputime); index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; @@ -156,7 +156,7 @@ void account_guest_time(struct task_struct *p, cputime_t cputime) u64 *cpustat = kcpustat_this_cpu->cpustat; /* Add guest time to process. */ - p->utime += cputime; + p->utime += cputime_to_nsecs(cputime); account_group_user_time(p, cputime); p->gtime += cputime_to_nsecs(cputime); @@ -180,7 +180,7 @@ void account_system_index_time(struct task_struct *p, cputime_t cputime, enum cpu_usage_stat index) { /* Add system time to process. */ - p->stime += cputime; + p->stime += cputime_to_nsecs(cputime); account_group_system_time(p, cputime); /* Add system time to cpustat. 
*/ @@ -315,7 +315,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) { struct signal_struct *sig = tsk->signal; - cputime_t utime, stime; + u64 utime, stime; struct task_struct *t; unsigned int seq, nextseq; unsigned long flags; @@ -465,14 +465,14 @@ void vtime_account_irq_enter(struct task_struct *tsk) EXPORT_SYMBOL_GPL(vtime_account_irq_enter); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ -void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) { *ut = p->utime; *st = p->stime; } EXPORT_SYMBOL_GPL(task_cputime_adjusted); -void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) { struct task_cputime cputime; @@ -543,7 +543,7 @@ void account_idle_ticks(unsigned long ticks) * Perform (stime * rtime) / total, but avoid multiplication overflow by * loosing precision when the numbers are big. */ -static cputime_t scale_stime(u64 stime, u64 rtime, u64 total) +static u64 scale_stime(u64 stime, u64 rtime, u64 total) { u64 scaled; @@ -580,7 +580,7 @@ drop_precision: * followed by a 64/32->64 divide. */ scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total); - return (__force cputime_t) scaled; + return scaled; } /* @@ -605,14 +605,14 @@ drop_precision: */ static void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, - cputime_t *ut, cputime_t *st) + u64 *ut, u64 *st) { - cputime_t rtime, stime, utime; + u64 rtime, stime, utime; unsigned long flags; /* Serialize concurrent callers such that we can honour our guarantees */ raw_spin_lock_irqsave(&prev->lock, flags); - rtime = nsecs_to_cputime(curr->sum_exec_runtime); + rtime = curr->sum_exec_runtime; /* * This is possible under two circumstances: @@ -643,8 +643,7 @@ static void cputime_adjust(struct task_cputime *curr, goto update; } - stime = scale_stime((__force u64)stime, (__force u64)rtime, - (__force u64)(stime + utime)); + stime = scale_stime(stime, rtime, stime + utime); update: /* @@ -677,7 +676,7 @@ out: raw_spin_unlock_irqrestore(&prev->lock, flags); } -void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) { struct task_cputime cputime = { .sum_exec_runtime = p->se.sum_exec_runtime, @@ -688,7 +687,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) } EXPORT_SYMBOL_GPL(task_cputime_adjusted); -void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) { struct task_cputime cputime; @@ -849,9 +848,9 @@ u64 task_gtime(struct task_struct *t) * add up the pending nohz execution time since the last * cputime snapshot. 
*/ -void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) +void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) { - cputime_t delta; + u64 delta; unsigned int seq; if (!vtime_accounting_enabled()) { @@ -870,7 +869,7 @@ void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t)) continue; - delta = vtime_delta(t); + delta = cputime_to_nsecs(vtime_delta(t)); /* * Task runs either in user or kernel space, add pending nohz time to diff --git a/kernel/signal.c b/kernel/signal.c index 218048a837ea..b63522193076 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1620,8 +1620,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig) rcu_read_unlock(); task_cputime_t(tsk, &utime, &stime); - info.si_utime = cputime_to_clock_t(utime + tsk->signal->utime); - info.si_stime = cputime_to_clock_t(stime + tsk->signal->stime); + info.si_utime = cputime_to_clock_t(utime + nsecs_to_cputime(tsk->signal->utime)); + info.si_stime = cputime_to_clock_t(stime + nsecs_to_cputime(tsk->signal->stime)); info.si_status = tsk->exit_code & 0x7f; if (tsk->exit_code & 0x80) diff --git a/kernel/sys.c b/kernel/sys.c index 842914ef7de4..7d4a9a6df956 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -881,15 +881,15 @@ SYSCALL_DEFINE0(getegid) void do_sys_times(struct tms *tms) { - cputime_t tgutime, tgstime, cutime, cstime; + u64 tgutime, tgstime, cutime, cstime; thread_group_cputime_adjusted(current, &tgutime, &tgstime); cutime = current->signal->cutime; cstime = current->signal->cstime; - tms->tms_utime = cputime_to_clock_t(tgutime); - tms->tms_stime = cputime_to_clock_t(tgstime); - tms->tms_cutime = cputime_to_clock_t(cutime); - tms->tms_cstime = cputime_to_clock_t(cstime); + tms->tms_utime = nsec_to_clock_t(tgutime); + tms->tms_stime = nsec_to_clock_t(tgstime); + tms->tms_cutime = nsec_to_clock_t(cutime); + tms->tms_cstime = nsec_to_clock_t(cstime); } SYSCALL_DEFINE1(times, struct tms __user *, tbuf) @@ -1544,7 +1544,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) { struct task_struct *t; unsigned long flags; - cputime_t tgutime, tgstime, utime, stime; + u64 tgutime, tgstime, utime, stime; unsigned long maxrss = 0; memset((char *)r, 0, sizeof (*r)); @@ -1600,8 +1600,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) unlock_task_sighand(p, &flags); out: - cputime_to_timeval(utime, &r->ru_utime); - cputime_to_timeval(stime, &r->ru_stime); + r->ru_utime = ns_to_timeval(utime); + r->ru_stime = ns_to_timeval(stime); if (who != RUSAGE_CHILDREN) { struct mm_struct *mm = get_task_mm(p); -- cgit v1.2.3 From d4bc42af73bf297f7182ed978b19850553242195 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 31 Jan 2017 04:09:28 +0100 Subject: acct: Convert obsolete cputime type to nsecs Use the new nsec based cputime accessors as part of the whole cputime conversion from cputime_t to nsecs. 
Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Fenghua Yu Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Tony Luck Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1485832191-26889-13-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- kernel/acct.c | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index b7ccc54b35cc..75fc773c158d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -563,7 +563,7 @@ struct pacct_struct { int ac_flag; long ac_exitcode; unsigned long ac_mem; - cputime_t ac_utime, ac_stime; + u64 ac_utime, ac_stime; unsigned long ac_minflt, ac_majflt; }; diff --git a/kernel/acct.c b/kernel/acct.c index b9b190a8eecf..ca9cb55b5855 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -453,8 +453,8 @@ static void fill_ac(acct_t *ac) spin_lock_irq(&current->sighand->siglock); tty = current->signal->tty; /* Safe as we hold the siglock */ ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; - ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); - ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); + ac->ac_utime = encode_comp_t(nsec_to_AHZ(pacct->ac_utime)); + ac->ac_stime = encode_comp_t(nsec_to_AHZ(pacct->ac_stime)); ac->ac_flag = pacct->ac_flag; ac->ac_mem = encode_comp_t(pacct->ac_mem); ac->ac_minflt = encode_comp_t(pacct->ac_minflt); @@ -530,7 +530,7 @@ out: void acct_collect(long exitcode, int group_dead) { struct pacct_struct *pacct = &current->signal->pacct; - cputime_t utime, stime; + u64 utime, stime; unsigned long vsize = 0; if (group_dead && current->mm) { @@ -559,7 +559,8 @@ void acct_collect(long exitcode, int group_dead) pacct->ac_flag |= ACORE; if (current->flags & PF_SIGNALED) pacct->ac_flag |= AXSIG; - task_cputime_t(current, &utime, &stime); + + task_cputime(current, &utime, &stime); pacct->ac_utime += utime; pacct->ac_stime += stime; pacct->ac_minflt += current->min_flt; -- cgit v1.2.3 From dbf3da1c2d14a6b0b9bac76a3f9912aff48487e7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 31 Jan 2017 04:09:29 +0100 Subject: delaycct: Convert obsolete cputime type to nsecs Use the new nsec based cputime accessors as part of the whole cputime conversion from cputime_t to nsecs. 
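[Ed: the delayacct hunks below keep the existing wrap-around guard on the accumulated totals. A minimal standalone sketch of that reset-to-zero idiom; the unsigned-add/signed-compare detail is this sketch's choice, not the kernel's exact code:]

#include <stdio.h>
#include <stdint.h>

static uint64_t accumulate_guarded(uint64_t total, uint64_t delta_ns)
{
	uint64_t tmp = total + delta_ns;	/* unsigned add, wraps without UB */

	/* a wrapped sum compares as smaller than the old total when viewed
	 * as signed, which triggers the same reset to zero as the hunks */
	return ((int64_t)tmp < (int64_t)total) ? 0 : tmp;
}

int main(void)
{
	uint64_t total = accumulate_guarded(0, 1500);

	printf("normal: %llu\n", (unsigned long long)total);
	total = accumulate_guarded((uint64_t)INT64_MAX, 1000);
	printf("after wrap: %llu\n", (unsigned long long)total);
	return 0;
}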
Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Fenghua Yu Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Tony Luck Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1485832191-26889-14-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/delayacct.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 228640f2b3d2..660549656991 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -82,19 +82,19 @@ void __delayacct_blkio_end(void) int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) { - cputime_t utime, stime, stimescaled, utimescaled; + u64 utime, stime, stimescaled, utimescaled; unsigned long long t2, t3; unsigned long flags, t1; s64 tmp; - task_cputime_t(tsk, &utime, &stime); + task_cputime(tsk, &utime, &stime); tmp = (s64)d->cpu_run_real_total; - tmp += cputime_to_nsecs(utime + stime); + tmp += utime + stime; d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; - task_cputime_t_scaled(tsk, &utimescaled, &stimescaled); + task_cputime_scaled(tsk, &utimescaled, &stimescaled); tmp = (s64)d->cpu_scaled_run_real_total; - tmp += cputime_to_nsecs(utimescaled + stimescaled); + tmp += utimescaled + stimescaled; d->cpu_scaled_run_real_total = (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; -- cgit v1.2.3 From 605dc2b31a2ab235570107cb650036b41e741165 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 31 Jan 2017 04:09:30 +0100 Subject: tsacct: Convert obsolete cputime type to nsecs Use the new nsec based cputime accessors as part of the whole cputime conversion from cputime_t to nsecs. 
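[Ed: a sketch of the accessor change in this patch: with task_cputime() reporting nanoseconds, the microsecond taskstats fields come from a single division. div_u64_demo() is a userspace stand-in for the kernel's div_u64(), assumed here for illustration:]

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC 1000ULL

/* stand-in for div_u64() on a host with native 64-bit division */
static uint64_t div_u64_demo(uint64_t dividend, uint64_t divisor)
{
	return dividend / divisor;
}

int main(void)
{
	uint64_t utime = 1234567890ULL;	/* user time in ns, as task_cputime() now returns */

	/* mirrors: stats->ac_utime = div_u64(utime, NSEC_PER_USEC); */
	printf("ac_utime = %llu us\n",
	       (unsigned long long)div_u64_demo(utime, NSEC_PER_USEC));
	return 0;
}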
Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Fenghua Yu Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Tony Luck Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1485832191-26889-15-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- kernel/tsacct.c | 27 ++++++++++++--------------- 2 files changed, 13 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 75fc773c158d..dc44366128d8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1815,7 +1815,7 @@ struct task_struct { #if defined(CONFIG_TASK_XACCT) u64 acct_rss_mem1; /* accumulated rss usage */ u64 acct_vm_mem1; /* accumulated virtual memory usage */ - cputime_t acct_timexpd; /* stime + utime since last update */ + u64 acct_timexpd; /* stime + utime since last update */ #endif #ifdef CONFIG_CPUSETS nodemask_t mems_allowed; /* Protected by alloc_lock */ diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 040d0a64d0d1..5c21f0535056 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -31,7 +31,7 @@ void bacct_add_tsk(struct user_namespace *user_ns, struct taskstats *stats, struct task_struct *tsk) { const struct cred *tcred; - cputime_t utime, stime, utimescaled, stimescaled; + u64 utime, stime, utimescaled, stimescaled; u64 delta; BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); @@ -66,13 +66,13 @@ void bacct_add_tsk(struct user_namespace *user_ns, task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0; rcu_read_unlock(); - task_cputime_t(tsk, &utime, &stime); - stats->ac_utime = cputime_to_usecs(utime); - stats->ac_stime = cputime_to_usecs(stime); + task_cputime(tsk, &utime, &stime); + stats->ac_utime = div_u64(utime, NSEC_PER_USEC); + stats->ac_stime = div_u64(stime, NSEC_PER_USEC); - task_cputime_t_scaled(tsk, &utimescaled, &stimescaled); - stats->ac_utimescaled = cputime_to_usecs(utimescaled); - stats->ac_stimescaled = cputime_to_usecs(stimescaled); + task_cputime_scaled(tsk, &utimescaled, &stimescaled); + stats->ac_utimescaled = div_u64(utimescaled, NSEC_PER_USEC); + stats->ac_stimescaled = div_u64(stimescaled, NSEC_PER_USEC); stats->ac_minflt = tsk->min_flt; stats->ac_majflt = tsk->maj_flt; @@ -123,18 +123,15 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) #undef MB static void __acct_update_integrals(struct task_struct *tsk, - cputime_t utime, cputime_t stime) + u64 utime, u64 stime) { - cputime_t time, dtime; - u64 delta; + u64 time, delta; if (!likely(tsk->mm)) return; time = stime + utime; - dtime = time - tsk->acct_timexpd; - /* Avoid division: cputime_t is often in nanoseconds already. 
*/ - delta = cputime_to_nsecs(dtime); + delta = time - tsk->acct_timexpd; if (delta < TICK_NSEC) return; @@ -155,11 +152,11 @@ static void __acct_update_integrals(struct task_struct *tsk, */ void acct_update_integrals(struct task_struct *tsk) { - cputime_t utime, stime; + u64 utime, stime; unsigned long flags; local_irq_save(flags); - task_cputime_t(tsk, &utime, &stime); + task_cputime(tsk, &utime, &stime); __acct_update_integrals(tsk, utime, stime); local_irq_restore(flags); } -- cgit v1.2.3 From bde8285e5cf78417afda79f53bbc26ef801a27e7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 31 Jan 2017 04:09:31 +0100 Subject: signal: Convert obsolete cputime type to nsecs Use the new nsec based cputime accessors as part of the whole cputime conversion from cputime_t to nsecs. Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Fenghua Yu Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Tony Luck Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1485832191-26889-16-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/signal.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index b63522193076..13f9def8b24a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1581,7 +1581,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig) unsigned long flags; struct sighand_struct *psig; bool autoreap = false; - cputime_t utime, stime; + u64 utime, stime; BUG_ON(sig == -1); @@ -1619,9 +1619,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig) task_uid(tsk)); rcu_read_unlock(); - task_cputime_t(tsk, &utime, &stime); - info.si_utime = cputime_to_clock_t(utime + nsecs_to_cputime(tsk->signal->utime)); - info.si_stime = cputime_to_clock_t(stime + nsecs_to_cputime(tsk->signal->stime)); + task_cputime(tsk, &utime, &stime); + info.si_utime = nsec_to_clock_t(utime + tsk->signal->utime); + info.si_stime = nsec_to_clock_t(stime + tsk->signal->stime); info.si_status = tsk->exit_code & 0x7f; if (tsk->exit_code & 0x80) @@ -1685,7 +1685,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, unsigned long flags; struct task_struct *parent; struct sighand_struct *sighand; - cputime_t utime, stime; + u64 utime, stime; if (for_ptracer) { parent = tsk->parent; @@ -1704,9 +1704,9 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); rcu_read_unlock(); - task_cputime_t(tsk, &utime, &stime); - info.si_utime = cputime_to_clock_t(utime); - info.si_stime = cputime_to_clock_t(stime); + task_cputime(tsk, &utime, &stime); + info.si_utime = nsec_to_clock_t(utime); + info.si_stime = nsec_to_clock_t(stime); info.si_code = why; switch (why) { -- cgit v1.2.3 From a499a5a14dbd1d0315a96fc62a8798059325e9e6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 31 Jan 2017 04:09:32 +0100 Subject: sched/cputime: Increment kcpustat directly on irqtime account The irqtime is accounted in nsecs and stored in cpu_irqtime.hardirq_time and cpu_irqtime.softirq_time. Once the accumulated amount reaches a new jiffy, it gets accounted to the kcpustat. This was necessary when kcpustat was stored in cputime_t, which could at worst have jiffies granularity. 
But now kcpustat is stored in nsecs so this whole discretization game with temporary irqtime storage has become unnecessary. We can now directly account the irqtime to the kcpustat. Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Fenghua Yu Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Tony Luck Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1485832191-26889-17-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 50 +++++++++++++++++--------------------------------- kernel/sched/sched.h | 7 ++++--- 2 files changed, 21 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 0bdef50d88bc..bee6c97b1e83 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -44,6 +44,7 @@ void disable_sched_clock_irqtime(void) void irqtime_account_irq(struct task_struct *curr) { struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); + u64 *cpustat = kcpustat_this_cpu->cpustat; s64 delta; int cpu; @@ -61,49 +62,35 @@ void irqtime_account_irq(struct task_struct *curr) * in that case, so as not to confuse scheduler with a special task * that do not consume any time, but still wants to run. */ - if (hardirq_count()) - irqtime->hardirq_time += delta; - else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) - irqtime->softirq_time += delta; + if (hardirq_count()) { + cpustat[CPUTIME_IRQ] += delta; + irqtime->tick_delta += delta; + } else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) { + cpustat[CPUTIME_SOFTIRQ] += delta; + irqtime->tick_delta += delta; + } u64_stats_update_end(&irqtime->sync); } EXPORT_SYMBOL_GPL(irqtime_account_irq); -static cputime_t irqtime_account_update(u64 irqtime, int idx, cputime_t maxtime) +static cputime_t irqtime_tick_accounted(cputime_t maxtime) { - u64 *cpustat = kcpustat_this_cpu->cpustat; - cputime_t irq_cputime; - - irq_cputime = nsecs_to_cputime64(irqtime - cpustat[idx]); - irq_cputime = min(irq_cputime, maxtime); - cpustat[idx] += cputime_to_nsecs(irq_cputime); - - return irq_cputime; -} + struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); + cputime_t delta; -static cputime_t irqtime_account_hi_update(cputime_t maxtime) -{ - return irqtime_account_update(__this_cpu_read(cpu_irqtime.hardirq_time), - CPUTIME_IRQ, maxtime); -} + delta = nsecs_to_cputime(irqtime->tick_delta); + delta = min(delta, maxtime); + irqtime->tick_delta -= cputime_to_nsecs(delta); -static cputime_t irqtime_account_si_update(cputime_t maxtime) -{ - return irqtime_account_update(__this_cpu_read(cpu_irqtime.softirq_time), - CPUTIME_SOFTIRQ, maxtime); + return delta; } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ #define sched_clock_irqtime (0) -static cputime_t irqtime_account_hi_update(cputime_t dummy) -{ - return 0; -} - -static cputime_t irqtime_account_si_update(cputime_t dummy) +static cputime_t irqtime_tick_accounted(cputime_t dummy) { return 0; } @@ -280,10 +267,7 @@ static inline cputime_t account_other_time(cputime_t max) accounted = steal_account_process_time(max); if (accounted < max) - accounted += irqtime_account_hi_update(max - accounted); - - if (accounted < max) - accounted += irqtime_account_si_update(max - accounted); + accounted += irqtime_tick_accounted(max - accounted); return accounted; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6eeae7ebd99b..8ff5cc539e8a 100644 --- a/kernel/sched/sched.h +++ 
b/kernel/sched/sched.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -1827,8 +1828,7 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { } #ifdef CONFIG_IRQ_TIME_ACCOUNTING struct irqtime { - u64 hardirq_time; - u64 softirq_time; + u64 tick_delta; u64 irq_start_time; struct u64_stats_sync sync; }; @@ -1838,12 +1838,13 @@ DECLARE_PER_CPU(struct irqtime, cpu_irqtime); static inline u64 irq_time_read(int cpu) { struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); + u64 *cpustat = kcpustat_cpu(cpu).cpustat; unsigned int seq; u64 total; do { seq = __u64_stats_fetch_begin(&irqtime->sync); - total = irqtime->softirq_time + irqtime->hardirq_time; + total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ]; } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); return total; -- cgit v1.2.3 From 715eb7a9243a058a0722aa2f6ba703ede9113e76 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 31 Jan 2017 04:09:33 +0100 Subject: timers/posix-timers: Use TICK_NSEC instead of a dynamically ad-hoc calculated version Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Fenghua Yu Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Tony Luck Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1485832191-26889-18-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/time/posix-cpu-timers.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index d53ff711a2a8..8349e02b1c0c 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -890,8 +890,6 @@ static inline void stop_process_timers(struct signal_struct *sig) tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER); } -static u32 onecputick; - static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, unsigned long long *expires, unsigned long long cur_time, int signo) @@ -903,9 +901,9 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, if (it->incr) { it->expires += it->incr; it->error += it->incr_error; - if (it->error >= onecputick) { + if (it->error >= TICK_NSEC) { it->expires -= cputime_one_jiffy; - it->error -= onecputick; + it->error -= TICK_NSEC; } } else { it->expires = 0; @@ -1476,15 +1474,10 @@ static __init int init_posix_cpu_timers(void) .clock_get = thread_cpu_clock_get, .timer_create = thread_cpu_timer_create, }; - struct timespec ts; posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process); posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread); - cputime_to_timespec(cputime_one_jiffy, &ts); - onecputick = ts.tv_nsec; - WARN_ON(ts.tv_sec != 0); - return 0; } __initcall(init_posix_cpu_timers); -- cgit v1.2.3 From ebd7e7fc4bc63be5eaf9da903b8060b02dd711ea Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 31 Jan 2017 04:09:34 +0100 Subject: timers/posix-timers: Convert internals to use nsecs Use the new nsec based cputime accessors as part of the whole cputime conversion from cputime_t to nsecs. Also convert posix-cpu-timers to use nsec based internal counters to simplify it. 
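[Ed: a rough standalone model of what the unified u64 sample buys here: every clock (PROF, VIRT, SCHED) is now a plain nanosecond count, so handing a sample back to userspace is one second/nanosecond split instead of per-clock cputime_t cases. ns_to_timespec_demo() is a simplified, non-negative-only stand-in for the kernel's ns_to_timespec():]

#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000ULL

static struct timespec ns_to_timespec_demo(uint64_t ns)
{
	struct timespec ts;

	ts.tv_sec = (time_t)(ns / NSEC_PER_SEC);
	ts.tv_nsec = (long)(ns % NSEC_PER_SEC);
	return ts;
}

int main(void)
{
	uint64_t prof_sample = 2500000000ULL;	/* e.g. utime + stime, in ns */
	struct timespec ts = ns_to_timespec_demo(prof_sample);

	printf("sample = %lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
	return 0;
}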
Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Fenghua Yu Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Tony Luck Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1485832191-26889-19-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/posix-timers.h | 12 +-- include/linux/sched.h | 6 +- kernel/fork.c | 2 +- kernel/sched/cputime.c | 6 +- kernel/sched/stats.h | 4 +- kernel/time/itimer.c | 6 +- kernel/time/posix-cpu-timers.c | 210 +++++++++++++++++------------------------ 7 files changed, 100 insertions(+), 146 deletions(-) (limited to 'kernel') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 62d44c176071..890de52362d0 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -8,19 +8,9 @@ #include -static inline unsigned long long cputime_to_expires(cputime_t expires) -{ - return (__force unsigned long long)expires; -} - -static inline cputime_t expires_to_cputime(unsigned long long expires) -{ - return (__force cputime_t)expires; -} - struct cpu_timer_list { struct list_head entry; - unsigned long long expires, incr; + u64 expires, incr; struct task_struct *task; int firing; }; diff --git a/include/linux/sched.h b/include/linux/sched.h index dc44366128d8..baa6a2834e0f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -755,7 +755,7 @@ struct signal_struct { struct thread_group_cputimer cputimer; /* Earliest-expiration cache. */ - struct task_cputime_t cputime_expires; + struct task_cputime cputime_expires; #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; @@ -1689,7 +1689,7 @@ struct task_struct { /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ unsigned long min_flt, maj_flt; - struct task_cputime_t cputime_expires; + struct task_cputime cputime_expires; struct list_head cpu_timers[3]; /* process credentials */ @@ -3527,7 +3527,7 @@ static __always_inline bool need_resched(void) * Thread group CPU time accounting. */ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); -void thread_group_cputimer(struct task_struct *tsk, struct task_cputime_t *times); +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); static inline void thread_group_cputime_t(struct task_struct *tsk, struct task_cputime_t *cputime) diff --git a/kernel/fork.c b/kernel/fork.c index 11c5c8ab827c..09992ff2f8fa 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1313,7 +1313,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); if (cpu_limit != RLIM_INFINITY) { - sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); + sig->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC; sig->cputimer.running = true; } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index bee6c97b1e83..f7b9624c7df0 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -122,7 +122,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime) /* Add user time to process. */ p->utime += cputime_to_nsecs(cputime); - account_group_user_time(p, cputime); + account_group_user_time(p, cputime_to_nsecs(cputime)); index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; @@ -144,7 +144,7 @@ void account_guest_time(struct task_struct *p, cputime_t cputime) /* Add guest time to process. 
*/ p->utime += cputime_to_nsecs(cputime); - account_group_user_time(p, cputime); + account_group_user_time(p, cputime_to_nsecs(cputime)); p->gtime += cputime_to_nsecs(cputime); /* Add guest time to cpustat. */ @@ -168,7 +168,7 @@ void account_system_index_time(struct task_struct *p, { /* Add system time to process. */ p->stime += cputime_to_nsecs(cputime); - account_group_system_time(p, cputime); + account_group_system_time(p, cputime_to_nsecs(cputime)); /* Add system time to cpustat. */ task_group_account_field(p, index, cputime_to_nsecs(cputime)); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 34659a853505..9788478a66d4 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -216,7 +216,7 @@ static inline bool cputimer_running(struct task_struct *tsk) * running CPU and update the utime field there. */ static inline void account_group_user_time(struct task_struct *tsk, - cputime_t cputime) + u64 cputime) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; @@ -237,7 +237,7 @@ static inline void account_group_user_time(struct task_struct *tsk, * running CPU and update the stime field there. */ static inline void account_group_system_time(struct task_struct *tsk, - cputime_t cputime) + u64 cputime) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index f2d5097bcb6d..bb01ff445ce9 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -53,15 +53,15 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, cval = it->expires; cinterval = it->incr; if (cval) { - struct task_cputime_t cputime; + struct task_cputime cputime; cputime_t t; thread_group_cputimer(tsk, &cputime); if (clock_id == CPUCLOCK_PROF) - t = cputime.utime + cputime.stime; + t = nsecs_to_cputime(cputime.utime + cputime.stime); else /* CPUCLOCK_VIRT */ - t = cputime.utime; + t = nsecs_to_cputime(cputime.utime); if (cval < t) /* about to fire */ diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 8349e02b1c0c..45be3cec0dc2 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -50,39 +50,14 @@ static int check_clock(const clockid_t which_clock) return error; } -static inline unsigned long long -timespec_to_sample(const clockid_t which_clock, const struct timespec *tp) -{ - unsigned long long ret; - - ret = 0; /* high half always zero when .cpu used */ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; - } else { - ret = cputime_to_expires(timespec_to_cputime(tp)); - } - return ret; -} - -static void sample_to_timespec(const clockid_t which_clock, - unsigned long long expires, - struct timespec *tp) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) - *tp = ns_to_timespec(expires); - else - cputime_to_timespec((__force cputime_t)expires, tp); -} - /* * Update expiry time from increment, and increase overrun count, * given the current clock sample. */ -static void bump_cpu_timer(struct k_itimer *timer, - unsigned long long now) +static void bump_cpu_timer(struct k_itimer *timer, u64 now) { int i; - unsigned long long delta, incr; + u64 delta, incr; if (timer->it.cpu.incr == 0) return; @@ -115,28 +90,28 @@ static void bump_cpu_timer(struct k_itimer *timer, * Checks @cputime to see if all fields are zero. Returns true if all fields * are zero, false if any field is nonzero. 
*/ -static inline int task_cputime_zero(const struct task_cputime_t *cputime) +static inline int task_cputime_zero(const struct task_cputime *cputime) { if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) return 1; return 0; } -static inline unsigned long long prof_ticks(struct task_struct *p) +static inline u64 prof_ticks(struct task_struct *p) { - cputime_t utime, stime; + u64 utime, stime; - task_cputime_t(p, &utime, &stime); + task_cputime(p, &utime, &stime); - return cputime_to_expires(utime + stime); + return utime + stime; } -static inline unsigned long long virt_ticks(struct task_struct *p) +static inline u64 virt_ticks(struct task_struct *p) { - cputime_t utime, stime; + u64 utime, stime; - task_cputime_t(p, &utime, &stime); + task_cputime(p, &utime, &stime); - return cputime_to_expires(utime); + return utime; } static int @@ -176,8 +151,8 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) /* * Sample a per-thread clock for the given task. */ -static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, - unsigned long long *sample) +static int cpu_clock_sample(const clockid_t which_clock, + struct task_struct *p, u64 *sample) { switch (CPUCLOCK_WHICH(which_clock)) { default: @@ -210,7 +185,7 @@ retry: } } -static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime_t *sum) +static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum) { __update_gt_cputime(&cputime_atomic->utime, sum->utime); __update_gt_cputime(&cputime_atomic->stime, sum->stime); @@ -218,7 +193,7 @@ static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct } /* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */ -static inline void sample_cputime_atomic(struct task_cputime_t *times, +static inline void sample_cputime_atomic(struct task_cputime *times, struct task_cputime_atomic *atomic_times) { times->utime = atomic64_read(&atomic_times->utime); @@ -226,10 +201,10 @@ static inline void sample_cputime_atomic(struct task_cputime_t *times, times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime); } -void thread_group_cputimer(struct task_struct *tsk, struct task_cputime_t *times) +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - struct task_cputime_t sum; + struct task_cputime sum; /* Check if cputimer isn't running. This is accessed without locking. */ if (!READ_ONCE(cputimer->running)) { @@ -238,7 +213,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime_t *times * values through the TIMER_ABSTIME flag, therefore we have * to synchronize the timer to the clock every time we start it. 
*/ - thread_group_cputime_t(tsk, &sum); + thread_group_cputime(tsk, &sum); update_gt_cputime(&cputimer->cputime_atomic, &sum); /* @@ -260,23 +235,23 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime_t *times */ static int cpu_clock_sample_group(const clockid_t which_clock, struct task_struct *p, - unsigned long long *sample) + u64 *sample) { - struct task_cputime_t cputime; + struct task_cputime cputime; switch (CPUCLOCK_WHICH(which_clock)) { default: return -EINVAL; case CPUCLOCK_PROF: - thread_group_cputime_t(p, &cputime); - *sample = cputime_to_expires(cputime.utime + cputime.stime); + thread_group_cputime(p, &cputime); + *sample = cputime.utime + cputime.stime; break; case CPUCLOCK_VIRT: - thread_group_cputime_t(p, &cputime); - *sample = cputime_to_expires(cputime.utime); + thread_group_cputime(p, &cputime); + *sample = cputime.utime; break; case CPUCLOCK_SCHED: - thread_group_cputime_t(p, &cputime); + thread_group_cputime(p, &cputime); *sample = cputime.sum_exec_runtime; break; } @@ -288,7 +263,7 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk, struct timespec *tp) { int err = -EINVAL; - unsigned long long rtn; + u64 rtn; if (CPUCLOCK_PERTHREAD(which_clock)) { if (same_thread_group(tsk, current)) @@ -299,7 +274,7 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk, } if (!err) - sample_to_timespec(which_clock, rtn, tp); + *tp = ns_to_timespec(rtn); return err; } @@ -453,7 +428,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) cleanup_timers(tsk->signal->cpu_timers); } -static inline int expires_gt(cputime_t expires, cputime_t new_exp) +static inline int expires_gt(u64 expires, u64 new_exp) { return expires == 0 || expires > new_exp; } @@ -466,7 +441,7 @@ static void arm_timer(struct k_itimer *timer) { struct task_struct *p = timer->it.cpu.task; struct list_head *head, *listpos; - struct task_cputime_t *cputime_expires; + struct task_cputime *cputime_expires; struct cpu_timer_list *const nt = &timer->it.cpu; struct cpu_timer_list *next; @@ -488,7 +463,7 @@ static void arm_timer(struct k_itimer *timer) list_add(&nt->entry, listpos); if (listpos == head) { - unsigned long long exp = nt->expires; + u64 exp = nt->expires; /* * We are the new earliest-expiring POSIX 1.b timer, hence @@ -499,16 +474,15 @@ static void arm_timer(struct k_itimer *timer) switch (CPUCLOCK_WHICH(timer->it_clock)) { case CPUCLOCK_PROF: - if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp))) - cputime_expires->prof_exp = expires_to_cputime(exp); + if (expires_gt(cputime_expires->prof_exp, exp)) + cputime_expires->prof_exp = exp; break; case CPUCLOCK_VIRT: - if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp))) - cputime_expires->virt_exp = expires_to_cputime(exp); + if (expires_gt(cputime_expires->virt_exp, exp)) + cputime_expires->virt_exp = exp; break; case CPUCLOCK_SCHED: - if (cputime_expires->sched_exp == 0 || - cputime_expires->sched_exp > exp) + if (expires_gt(cputime_expires->sched_exp, exp)) cputime_expires->sched_exp = exp; break; } @@ -559,20 +533,19 @@ static void cpu_timer_fire(struct k_itimer *timer) * traversal. 
*/ static int cpu_timer_sample_group(const clockid_t which_clock, - struct task_struct *p, - unsigned long long *sample) + struct task_struct *p, u64 *sample) { - struct task_cputime_t cputime; + struct task_cputime cputime; thread_group_cputimer(p, &cputime); switch (CPUCLOCK_WHICH(which_clock)) { default: return -EINVAL; case CPUCLOCK_PROF: - *sample = cputime_to_expires(cputime.utime + cputime.stime); + *sample = cputime.utime + cputime.stime; break; case CPUCLOCK_VIRT: - *sample = cputime_to_expires(cputime.utime); + *sample = cputime.utime; break; case CPUCLOCK_SCHED: *sample = cputime.sum_exec_runtime; @@ -593,12 +566,12 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, unsigned long flags; struct sighand_struct *sighand; struct task_struct *p = timer->it.cpu.task; - unsigned long long old_expires, new_expires, old_incr, val; + u64 old_expires, new_expires, old_incr, val; int ret; WARN_ON_ONCE(p == NULL); - new_expires = timespec_to_sample(timer->it_clock, &new->it_value); + new_expires = timespec_to_ns(&new->it_value); /* * Protect against sighand release/switch in exit/exec and p->cpu_timers @@ -659,9 +632,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, bump_cpu_timer(timer, val); if (val < timer->it.cpu.expires) { old_expires = timer->it.cpu.expires - val; - sample_to_timespec(timer->it_clock, - old_expires, - &old->it_value); + old->it_value = ns_to_timespec(old_expires); } else { old->it_value.tv_nsec = 1; old->it_value.tv_sec = 0; @@ -699,8 +670,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, * Install the new reload setting, and * set up the signal and overrun bookkeeping. */ - timer->it.cpu.incr = timespec_to_sample(timer->it_clock, - &new->it_interval); + timer->it.cpu.incr = timespec_to_ns(&new->it_interval); /* * This acts as a modification timestamp for the timer, @@ -723,17 +693,15 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, ret = 0; out: - if (old) { - sample_to_timespec(timer->it_clock, - old_incr, &old->it_interval); - } + if (old) + old->it_interval = ns_to_timespec(old_incr); return ret; } static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) { - unsigned long long now; + u64 now; struct task_struct *p = timer->it.cpu.task; WARN_ON_ONCE(p == NULL); @@ -741,8 +709,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) /* * Easy part: convert the reload time. */ - sample_to_timespec(timer->it_clock, - timer->it.cpu.incr, &itp->it_interval); + itp->it_interval = ns_to_timespec(timer->it.cpu.incr); if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */ itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; @@ -761,7 +728,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) /* * Protect against sighand release/switch in exit/exec and * also make timer sampling safe if it ends up calling - * thread_group_cputime_t(). + * thread_group_cputime(). */ sighand = lock_task_sighand(p, &flags); if (unlikely(sighand == NULL)) { @@ -771,8 +738,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) * Call the timer disarmed, nothing else to do. 
*/ timer->it.cpu.expires = 0; - sample_to_timespec(timer->it_clock, timer->it.cpu.expires, - &itp->it_value); + itp->it_value = ns_to_timespec(timer->it.cpu.expires); return; } else { cpu_timer_sample_group(timer->it_clock, p, &now); @@ -781,9 +747,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) } if (now < timer->it.cpu.expires) { - sample_to_timespec(timer->it_clock, - timer->it.cpu.expires - now, - &itp->it_value); + itp->it_value = ns_to_timespec(timer->it.cpu.expires - now); } else { /* * The timer should have expired already, but the firing @@ -826,8 +790,8 @@ static void check_thread_timers(struct task_struct *tsk, { struct list_head *timers = tsk->cpu_timers; struct signal_struct *const sig = tsk->signal; - struct task_cputime_t *tsk_expires = &tsk->cputime_expires; - unsigned long long expires; + struct task_cputime *tsk_expires = &tsk->cputime_expires; + u64 expires; unsigned long soft; /* @@ -838,10 +802,10 @@ static void check_thread_timers(struct task_struct *tsk, return; expires = check_timers_list(timers, firing, prof_ticks(tsk)); - tsk_expires->prof_exp = expires_to_cputime(expires); + tsk_expires->prof_exp = expires; expires = check_timers_list(++timers, firing, virt_ticks(tsk)); - tsk_expires->virt_exp = expires_to_cputime(expires); + tsk_expires->virt_exp = expires; tsk_expires->sched_exp = check_timers_list(++timers, firing, tsk->se.sum_exec_runtime); @@ -891,13 +855,12 @@ static inline void stop_process_timers(struct signal_struct *sig) } static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, - unsigned long long *expires, - unsigned long long cur_time, int signo) + u64 *expires, u64 cur_time, int signo) { if (!it->expires) return; - if (cur_time >= it->expires) { + if (cur_time >= cputime_to_nsecs(it->expires)) { if (it->incr) { it->expires += it->incr; it->error += it->incr_error; @@ -915,8 +878,8 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); } - if (it->expires && (!*expires || it->expires < *expires)) { - *expires = it->expires; + if (it->expires && (!*expires || cputime_to_nsecs(it->expires) < *expires)) { + *expires = cputime_to_nsecs(it->expires); } } @@ -929,10 +892,10 @@ static void check_process_timers(struct task_struct *tsk, struct list_head *firing) { struct signal_struct *const sig = tsk->signal; - unsigned long long utime, ptime, virt_expires, prof_expires; - unsigned long long sum_sched_runtime, sched_expires; + u64 utime, ptime, virt_expires, prof_expires; + u64 sum_sched_runtime, sched_expires; struct list_head *timers = sig->cpu_timers; - struct task_cputime_t cputime; + struct task_cputime cputime; unsigned long soft; /* @@ -952,8 +915,8 @@ static void check_process_timers(struct task_struct *tsk, * Collect the current process totals. 
*/ thread_group_cputimer(tsk, &cputime); - utime = cputime_to_expires(cputime.utime); - ptime = utime + cputime_to_expires(cputime.stime); + utime = cputime.utime; + ptime = utime + cputime.stime; sum_sched_runtime = cputime.sum_exec_runtime; prof_expires = check_timers_list(timers, firing, ptime); @@ -969,10 +932,10 @@ static void check_process_timers(struct task_struct *tsk, SIGVTALRM); soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); if (soft != RLIM_INFINITY) { - unsigned long psecs = cputime_to_secs(ptime); + unsigned long psecs = div_u64(ptime, NSEC_PER_SEC); unsigned long hard = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); - cputime_t x; + u64 x; if (psecs >= hard) { /* * At the hard limit, we just die. @@ -991,14 +954,13 @@ static void check_process_timers(struct task_struct *tsk, sig->rlim[RLIMIT_CPU].rlim_cur = soft; } } - x = secs_to_cputime(soft); - if (!prof_expires || x < prof_expires) { + x = soft * NSEC_PER_SEC; + if (!prof_expires || x < prof_expires) prof_expires = x; - } } - sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires); - sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires); + sig->cputime_expires.prof_exp = prof_expires; + sig->cputime_expires.virt_exp = virt_expires; sig->cputime_expires.sched_exp = sched_expires; if (task_cputime_zero(&sig->cputime_expires)) stop_process_timers(sig); @@ -1015,7 +977,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) struct sighand_struct *sighand; unsigned long flags; struct task_struct *p = timer->it.cpu.task; - unsigned long long now; + u64 now; WARN_ON_ONCE(p == NULL); @@ -1035,7 +997,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) } else { /* * Protect arm_timer() and timer sampling in case of call to - * thread_group_cputime_t(). + * thread_group_cputime(). */ sighand = lock_task_sighand(p, &flags); if (unlikely(sighand == NULL)) { @@ -1078,8 +1040,8 @@ out: * Returns true if any field of the former is greater than the corresponding * field of the latter if the latter field is set. Otherwise returns false. 
*/ -static inline int task_cputime_expired(const struct task_cputime_t *sample, - const struct task_cputime_t *expires) +static inline int task_cputime_expired(const struct task_cputime *sample, + const struct task_cputime *expires) { if (expires->utime && sample->utime >= expires->utime) return 1; @@ -1106,9 +1068,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk) struct signal_struct *sig; if (!task_cputime_zero(&tsk->cputime_expires)) { - struct task_cputime_t task_sample; + struct task_cputime task_sample; - task_cputime_t(tsk, &task_sample.utime, &task_sample.stime); + task_cputime(tsk, &task_sample.utime, &task_sample.stime); task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime; if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) return 1; @@ -1131,7 +1093,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk) */ if (READ_ONCE(sig->cputimer.running) && !READ_ONCE(sig->cputimer.checking_timer)) { - struct task_cputime_t group_sample; + struct task_cputime group_sample; sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic); @@ -1214,7 +1176,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, cputime_t *newval, cputime_t *oldval) { - unsigned long long now; + u64 now, new; WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED); cpu_timer_sample_group(clock_idx, tsk, &now); @@ -1226,31 +1188,33 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, * it to be absolute. */ if (*oldval) { - if (*oldval <= now) { + if (cputime_to_nsecs(*oldval) <= now) { /* Just about to fire. */ *oldval = cputime_one_jiffy; } else { - *oldval -= now; + *oldval -= nsecs_to_cputime(now); } } if (!*newval) return; - *newval += now; + *newval += nsecs_to_cputime(now); } + new = cputime_to_nsecs(*newval); + /* * Update expiration cache if we are the earliest timer, or eventually * RLIMIT_CPU limit is earlier than prof_exp cpu timer expire. */ switch (clock_idx) { case CPUCLOCK_PROF: - if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval)) - tsk->signal->cputime_expires.prof_exp = *newval; + if (expires_gt(tsk->signal->cputime_expires.prof_exp, new)) + tsk->signal->cputime_expires.prof_exp = new; break; case CPUCLOCK_VIRT: - if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval)) - tsk->signal->cputime_expires.virt_exp = *newval; + if (expires_gt(tsk->signal->cputime_expires.virt_exp, new)) + tsk->signal->cputime_expires.virt_exp = new; break; } @@ -1308,7 +1272,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, /* * We were interrupted by a signal. */ - sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); + *rqtp = ns_to_timespec(timer.it.cpu.expires); error = posix_cpu_timer_set(&timer, 0, &zero_it, it); if (!error) { /* -- cgit v1.2.3 From 858cf3a8c59968e7c5f7c1a1192459a0d52d1ab4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 31 Jan 2017 04:09:35 +0100 Subject: timers/itimer: Convert internal cputime_t units to nsec Use the new nsec based cputime accessors as part of the whole cputime conversion from cputime_t to nsecs. Also convert itimers to use nsec based internal counters. This simplifies it and removes the whole game with error/inc_error which served to deal with cputime_t random granularity. 
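To see why the old error/inc_error bookkeeping existed at all, consider the rounding problem it papered over. A rough illustration in plain C, assuming HZ=100 (the numbers are hypothetical and not taken from this patch):

/* With cputime_t backed by jiffies at HZ=100, one tick is 10 ms, so a
 * requested 15 ms interval had to round up to 2 jiffies (20 ms); the
 * 5 ms remainder was accumulated in it->error and a jiffy subtracted
 * once the accumulated error exceeded one tick. */
u64 requested_ns = 15 * NSEC_PER_MSEC;                          /* 15,000,000 ns, stored exactly as u64 */
u64 tick_ns      = 10 * NSEC_PER_MSEC;                          /* TICK_NSEC at HZ=100 */
u64 rounded_ns   = DIV_ROUND_UP(requested_ns, tick_ns) * tick_ns; /* 20,000,000 ns */
u64 error_ns     = rounded_ns - requested_ns;                   /* 5,000,000 ns, formerly it->error */

With u64 nanosecond counters the interval is stored exactly, so the error term simply disappears.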
Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Fenghua Yu Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Tony Luck Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1485832191-26889-20-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/posix-timers.h | 2 +- include/linux/sched.h | 6 ++-- include/trace/events/timer.h | 26 ++++++++--------- kernel/time/itimer.c | 64 +++++++++++++++--------------------------- kernel/time/posix-cpu-timers.c | 43 +++++++++++----------------- 5 files changed, 55 insertions(+), 86 deletions(-) (limited to 'kernel') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 890de52362d0..64aa189efe21 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -119,7 +119,7 @@ void run_posix_cpu_timers(struct task_struct *task); void posix_cpu_timers_exit(struct task_struct *task); void posix_cpu_timers_exit_group(struct task_struct *task); void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, - cputime_t *newval, cputime_t *oldval); + u64 *newval, u64 *oldval); long clock_nanosleep_restart(struct restart_block *restart_block); diff --git a/include/linux/sched.h b/include/linux/sched.h index baa6a2834e0f..268fdd713089 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -568,10 +568,8 @@ struct pacct_struct { }; struct cpu_itimer { - cputime_t expires; - cputime_t incr; - u32 error; - u32 incr_error; + u64 expires; + u64 incr; }; /** diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 1448637616d6..1bca99dbb98f 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -269,17 +269,17 @@ DEFINE_EVENT(hrtimer_class, hrtimer_cancel, TRACE_EVENT(itimer_state, TP_PROTO(int which, const struct itimerval *const value, - cputime_t expires), + unsigned long long expires), TP_ARGS(which, value, expires), TP_STRUCT__entry( - __field( int, which ) - __field( cputime_t, expires ) - __field( long, value_sec ) - __field( long, value_usec ) - __field( long, interval_sec ) - __field( long, interval_usec ) + __field( int, which ) + __field( unsigned long long, expires ) + __field( long, value_sec ) + __field( long, value_usec ) + __field( long, interval_sec ) + __field( long, interval_usec ) ), TP_fast_assign( @@ -292,7 +292,7 @@ TRACE_EVENT(itimer_state, ), TP_printk("which=%d expires=%llu it_value=%ld.%ld it_interval=%ld.%ld", - __entry->which, (unsigned long long)__entry->expires, + __entry->which, __entry->expires, __entry->value_sec, __entry->value_usec, __entry->interval_sec, __entry->interval_usec) ); @@ -305,14 +305,14 @@ TRACE_EVENT(itimer_state, */ TRACE_EVENT(itimer_expire, - TP_PROTO(int which, struct pid *pid, cputime_t now), + TP_PROTO(int which, struct pid *pid, unsigned long long now), TP_ARGS(which, pid, now), TP_STRUCT__entry( - __field( int , which ) - __field( pid_t, pid ) - __field( cputime_t, now ) + __field( int , which ) + __field( pid_t, pid ) + __field( unsigned long long, now ) ), TP_fast_assign( @@ -322,7 +322,7 @@ TRACE_EVENT(itimer_expire, ), TP_printk("which=%d pid=%d now=%llu", __entry->which, - (int) __entry->pid, (unsigned long long)__entry->now) + (int) __entry->pid, __entry->now) ); #ifdef CONFIG_NO_HZ_COMMON diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index bb01ff445ce9..a95f13c31464 100644 --- a/kernel/time/itimer.c +++ 
b/kernel/time/itimer.c @@ -45,35 +45,35 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer) static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, struct itimerval *const value) { - cputime_t cval, cinterval; + u64 val, interval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; spin_lock_irq(&tsk->sighand->siglock); - cval = it->expires; - cinterval = it->incr; - if (cval) { + val = it->expires; + interval = it->incr; + if (val) { struct task_cputime cputime; - cputime_t t; + u64 t; thread_group_cputimer(tsk, &cputime); if (clock_id == CPUCLOCK_PROF) - t = nsecs_to_cputime(cputime.utime + cputime.stime); + t = cputime.utime + cputime.stime; else /* CPUCLOCK_VIRT */ - t = nsecs_to_cputime(cputime.utime); + t = cputime.utime; - if (cval < t) + if (val < t) /* about to fire */ - cval = cputime_one_jiffy; + val = TICK_NSEC; else - cval = cval - t; + val -= t; } spin_unlock_irq(&tsk->sighand->siglock); - cputime_to_timeval(cval, &value->it_value); - cputime_to_timeval(cinterval, &value->it_interval); + value->it_value = ns_to_timeval(val); + value->it_interval = ns_to_timeval(interval); } int do_getitimer(int which, struct itimerval *value) @@ -129,55 +129,35 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns) -{ - struct timespec ts; - s64 cpu_ns; - - cputime_to_timespec(ct, &ts); - cpu_ns = timespec_to_ns(&ts); - - return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns; -} - static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, const struct itimerval *const value, struct itimerval *const ovalue) { - cputime_t cval, nval, cinterval, ninterval; - s64 ns_ninterval, ns_nval; - u32 error, incr_error; + u64 oval, nval, ointerval, ninterval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; - nval = timeval_to_cputime(&value->it_value); - ns_nval = timeval_to_ns(&value->it_value); - ninterval = timeval_to_cputime(&value->it_interval); - ns_ninterval = timeval_to_ns(&value->it_interval); - - error = cputime_sub_ns(nval, ns_nval); - incr_error = cputime_sub_ns(ninterval, ns_ninterval); + nval = timeval_to_ns(&value->it_value); + ninterval = timeval_to_ns(&value->it_interval); spin_lock_irq(&tsk->sighand->siglock); - cval = it->expires; - cinterval = it->incr; - if (cval || nval) { + oval = it->expires; + ointerval = it->incr; + if (oval || nval) { if (nval > 0) - nval += cputime_one_jiffy; - set_process_cpu_timer(tsk, clock_id, &nval, &cval); + nval += TICK_NSEC; + set_process_cpu_timer(tsk, clock_id, &nval, &oval); } it->expires = nval; it->incr = ninterval; - it->error = error; - it->incr_error = incr_error; trace_itimer_state(clock_id == CPUCLOCK_VIRT ? 
ITIMER_VIRTUAL : ITIMER_PROF, value, nval); spin_unlock_irq(&tsk->sighand->siglock); if (ovalue) { - cputime_to_timeval(cval, &ovalue->it_value); - cputime_to_timeval(cinterval, &ovalue->it_interval); + ovalue->it_value = ns_to_timeval(oval); + ovalue->it_interval = ns_to_timeval(ointerval); } } diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 45be3cec0dc2..b4377a5e4269 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -20,10 +20,10 @@ */ void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) { - cputime_t cputime = secs_to_cputime(rlim_new); + u64 nsecs = rlim_new * NSEC_PER_SEC; spin_lock_irq(&task->sighand->siglock); - set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL); + set_process_cpu_timer(task, CPUCLOCK_PROF, &nsecs, NULL); spin_unlock_irq(&task->sighand->siglock); } @@ -860,17 +860,11 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, if (!it->expires) return; - if (cur_time >= cputime_to_nsecs(it->expires)) { - if (it->incr) { + if (cur_time >= it->expires) { + if (it->incr) it->expires += it->incr; - it->error += it->incr_error; - if (it->error >= TICK_NSEC) { - it->expires -= cputime_one_jiffy; - it->error -= TICK_NSEC; - } - } else { + else it->expires = 0; - } trace_itimer_expire(signo == SIGPROF ? ITIMER_PROF : ITIMER_VIRTUAL, @@ -878,9 +872,8 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); } - if (it->expires && (!*expires || cputime_to_nsecs(it->expires) < *expires)) { - *expires = cputime_to_nsecs(it->expires); - } + if (it->expires && (!*expires || it->expires < *expires)) + *expires = it->expires; } /* @@ -1174,9 +1167,9 @@ void run_posix_cpu_timers(struct task_struct *tsk) * The tsk->sighand->siglock must be held by the caller. */ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, - cputime_t *newval, cputime_t *oldval) + u64 *newval, u64 *oldval) { - u64 now, new; + u64 now; WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED); cpu_timer_sample_group(clock_idx, tsk, &now); @@ -1188,33 +1181,31 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, * it to be absolute. */ if (*oldval) { - if (cputime_to_nsecs(*oldval) <= now) { + if (*oldval <= now) { /* Just about to fire. */ - *oldval = cputime_one_jiffy; + *oldval = TICK_NSEC; } else { - *oldval -= nsecs_to_cputime(now); + *oldval -= now; } } if (!*newval) return; - *newval += nsecs_to_cputime(now); + *newval += now; } - new = cputime_to_nsecs(*newval); - /* * Update expiration cache if we are the earliest timer, or eventually * RLIMIT_CPU limit is earlier than prof_exp cpu timer expire. 
*/ switch (clock_idx) { case CPUCLOCK_PROF: - if (expires_gt(tsk->signal->cputime_expires.prof_exp, new)) - tsk->signal->cputime_expires.prof_exp = new; + if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval)) + tsk->signal->cputime_expires.prof_exp = *newval; break; case CPUCLOCK_VIRT: - if (expires_gt(tsk->signal->cputime_expires.virt_exp, new)) - tsk->signal->cputime_expires.virt_exp = new; + if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval)) + tsk->signal->cputime_expires.virt_exp = *newval; break; } -- cgit v1.2.3 From 23244a5c8003d4154161a8289a7d3783b0237c08 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 31 Jan 2017 04:09:37 +0100 Subject: sched/cputime: Push time to account_user_time() in nsecs This is one more step toward converting cputime accounting to pure nsecs. Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Fenghua Yu Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Tony Luck Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1485832191-26889-22-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- arch/ia64/kernel/time.c | 2 +- arch/powerpc/kernel/time.c | 2 +- arch/s390/kernel/vtime.c | 2 +- include/linux/kernel_stat.h | 2 +- kernel/sched/cputime.c | 42 ++++++++++++++++++++++++------------------ 5 files changed, 28 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index d040f12ea9f9..ed98b26047c2 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -67,7 +67,7 @@ void vtime_flush(struct task_struct *tsk) cputime_t delta; if (ti->utime) - account_user_time(tsk, cycle_to_cputime(ti->utime)); + account_user_time(tsk, cputime_to_nsecs(cycle_to_cputime(ti->utime))); if (ti->gtime) account_guest_time(tsk, cycle_to_cputime(ti->gtime)); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 3cca82e065c9..c3931d816190 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -393,7 +393,7 @@ void vtime_flush(struct task_struct *tsk) struct cpu_accounting_data *acct = get_accounting(tsk); if (acct->utime) - account_user_time(tsk, acct->utime); + account_user_time(tsk, cputime_to_nsecs(acct->utime)); if (acct->utime_scaled) tsk->utimescaled += cputime_to_nsecs(acct->utime_scaled); diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index f2fc27491604..ca206d122302 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -166,7 +166,7 @@ static int do_account_vtime(struct task_struct *tsk) /* Push account value */ if (user) { - account_user_time(tsk, user); + account_user_time(tsk, cputime_to_nsecs(user)); tsk->utimescaled += cputime_to_nsecs(scale_vtime(user)); } diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index c3e38ded2d73..b716001ac23e 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -78,7 +78,7 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu) return kstat_cpu(cpu).irqs_sum; } -extern void account_user_time(struct task_struct *, cputime_t); +extern void account_user_time(struct task_struct *, u64); extern void account_guest_time(struct task_struct *, cputime_t); extern void account_system_time(struct task_struct *, int, cputime_t); extern void account_system_index_time(struct task_struct *, cputime_t, diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 
f7b9624c7df0..55d31c35833a 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -116,18 +116,18 @@ static inline void task_group_account_field(struct task_struct *p, int index, * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in user space since the last update */ -void account_user_time(struct task_struct *p, cputime_t cputime) +void account_user_time(struct task_struct *p, u64 cputime) { int index; /* Add user time to process. */ - p->utime += cputime_to_nsecs(cputime); - account_group_user_time(p, cputime_to_nsecs(cputime)); + p->utime += cputime; + account_group_user_time(p, cputime); index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; /* Add user time to cpustat. */ - task_group_account_field(p, index, cputime_to_nsecs(cputime)); + task_group_account_field(p, index, cputime); /* Account for user time used */ acct_account_cputime(p); @@ -363,8 +363,9 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) static void irqtime_account_process_tick(struct task_struct *p, int user_tick, struct rq *rq, int ticks) { - u64 cputime = (__force u64) cputime_one_jiffy * ticks; + u64 old_cputime = (__force u64) cputime_one_jiffy * ticks; cputime_t other; + u64 cputime; /* * When returning from idle, many ticks can get accounted at @@ -374,9 +375,11 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, * other time can exceed ticks occasionally. */ other = account_other_time(ULONG_MAX); - if (other >= cputime) + if (other >= old_cputime) return; - cputime -= other; + + old_cputime -= other; + cputime = cputime_to_nsecs(old_cputime); if (this_cpu_ksoftirqd() == p) { /* @@ -384,15 +387,16 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, * So, we have to handle it separately here. * Also, p->stime needs to be updated for ksoftirqd. 
*/ - account_system_index_time(p, cputime, CPUTIME_SOFTIRQ); + account_system_index_time(p, old_cputime, CPUTIME_SOFTIRQ); } else if (user_tick) { account_user_time(p, cputime); } else if (p == rq->idle) { - account_idle_time(cputime); + account_idle_time(old_cputime); } else if (p->flags & PF_VCPU) { /* System time or guest time */ - account_guest_time(p, cputime); + + account_guest_time(p, old_cputime); } else { - account_system_index_time(p, cputime, CPUTIME_SYSTEM); + account_system_index_time(p, old_cputime, CPUTIME_SYSTEM); } } @@ -473,7 +477,8 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) */ void account_process_tick(struct task_struct *p, int user_tick) { - cputime_t cputime, steal; + cputime_t old_cputime, steal; + u64 cputime; struct rq *rq = this_rq(); if (vtime_accounting_cpu_enabled()) @@ -484,20 +489,21 @@ void account_process_tick(struct task_struct *p, int user_tick) return; } - cputime = cputime_one_jiffy; + old_cputime = cputime_one_jiffy; steal = steal_account_process_time(ULONG_MAX); - if (steal >= cputime) + if (steal >= old_cputime) return; - cputime -= steal; + old_cputime -= steal; + cputime = cputime_to_nsecs(old_cputime); if (user_tick) account_user_time(p, cputime); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, cputime); + account_system_time(p, HARDIRQ_OFFSET, old_cputime); else - account_idle_time(cputime); + account_idle_time(old_cputime); } /* @@ -736,7 +742,7 @@ void vtime_account_user(struct task_struct *tsk) tsk->vtime_snap_whence = VTIME_SYS; if (vtime_delta(tsk)) { delta_cpu = get_vtime_delta(tsk); - account_user_time(tsk, delta_cpu); + account_user_time(tsk, cputime_to_nsecs(delta_cpu)); } write_seqcount_end(&tsk->vtime_seqcount); } -- cgit v1.2.3 From be9095ed4fb3cf69e9fdf64e28ff6b5bd0ec7215 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 31 Jan 2017 04:09:38 +0100 Subject: sched/cputime: Push time to account_steal_time() in nsecs This is one more step toward converting cputime accounting to pure nsecs. 
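The call-site pattern is the same as for the other accessors: the accounting function now takes nanoseconds, and callers still holding a cputime_t convert at the boundary until they are converted themselves. A minimal sketch of the transitional boundary (illustration only; steal_ns is a hypothetical name for a caller that is already nsec-based):

extern void account_steal_time(u64 nsecs);      /* new nsec-based signature */

account_steal_time(cputime_to_nsecs(steal));    /* legacy caller: convert at the boundary */
account_steal_time(steal_ns);                   /* converted caller: the wrapper disappears */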
Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Fenghua Yu Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Tony Luck Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1485832191-26889-23-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/time.c | 2 +- arch/s390/kernel/vtime.c | 2 +- include/linux/kernel_stat.h | 2 +- kernel/sched/cputime.c | 11 ++++++----- 4 files changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index c3931d816190..53e5982edacf 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -402,7 +402,7 @@ void vtime_flush(struct task_struct *tsk) account_guest_time(tsk, acct->gtime); if (acct->steal_time) - account_steal_time(acct->steal_time); + account_steal_time(cputime_to_nsecs(acct->steal_time)); if (acct->idle_time) account_idle_time(acct->idle_time); diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index ca206d122302..1e7023c6ef23 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -188,7 +188,7 @@ static int do_account_vtime(struct task_struct *tsk) steal = S390_lowcore.steal_timer; if ((s64) steal > 0) { S390_lowcore.steal_timer = 0; - account_steal_time(steal); + account_steal_time(cputime_to_nsecs(steal)); } return virt_timer_forward(user + guest + system + hardirq + softirq); diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index b716001ac23e..1d55d10abf9d 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -83,7 +83,7 @@ extern void account_guest_time(struct task_struct *, cputime_t); extern void account_system_time(struct task_struct *, int, cputime_t); extern void account_system_index_time(struct task_struct *, cputime_t, enum cpu_usage_stat); -extern void account_steal_time(cputime_t); +extern void account_steal_time(u64); extern void account_idle_time(cputime_t); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 55d31c35833a..9a8028760930 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -207,11 +207,11 @@ void account_system_time(struct task_struct *p, int hardirq_offset, * Account for involuntary wait time. 
* @cputime: the cpu time spent in involuntary wait */ -void account_steal_time(cputime_t cputime) +void account_steal_time(u64 cputime) { u64 *cpustat = kcpustat_this_cpu->cpustat; - cpustat[CPUTIME_STEAL] += cputime_to_nsecs(cputime); + cpustat[CPUTIME_STEAL] += cputime; } /* @@ -239,14 +239,15 @@ static __always_inline cputime_t steal_account_process_time(cputime_t maxtime) #ifdef CONFIG_PARAVIRT if (static_key_false(¶virt_steal_enabled)) { cputime_t steal_cputime; - u64 steal; + u64 steal, rounded; steal = paravirt_steal_clock(smp_processor_id()); steal -= this_rq()->prev_steal_time; steal_cputime = min(nsecs_to_cputime(steal), maxtime); - account_steal_time(steal_cputime); - this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime); + rounded = cputime_to_nsecs(steal_cputime); + account_steal_time(rounded); + this_rq()->prev_steal_time += rounded; return steal_cputime; } -- cgit v1.2.3 From 18b43a9bd7ae91185e398dd983fb4fffb9e81b3a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 31 Jan 2017 04:09:39 +0100 Subject: sched/cputime: Push time to account_idle_time() in nsecs This is one more step toward converting cputime accounting to pure nsecs. Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Fenghua Yu Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Tony Luck Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1485832191-26889-24-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- arch/ia64/kernel/time.c | 2 +- arch/powerpc/kernel/time.c | 2 +- arch/s390/kernel/idle.c | 2 +- include/linux/kernel_stat.h | 2 +- kernel/sched/cputime.c | 18 +++++++++--------- 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index ed98b26047c2..5dc801d97790 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -73,7 +73,7 @@ void vtime_flush(struct task_struct *tsk) account_guest_time(tsk, cycle_to_cputime(ti->gtime)); if (ti->idle_time) - account_idle_time(cycle_to_cputime(ti->idle_time)); + account_idle_time(cputime_to_nsecs(cycle_to_cputime(ti->idle_time))); if (ti->stime) { delta = cycle_to_cputime(ti->stime); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 53e5982edacf..739897a10fd3 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -405,7 +405,7 @@ void vtime_flush(struct task_struct *tsk) account_steal_time(cputime_to_nsecs(acct->steal_time)); if (acct->idle_time) - account_idle_time(acct->idle_time); + account_idle_time(cputime_to_nsecs(acct->idle_time)); if (acct->stime) account_system_index_time(tsk, acct->stime, CPUTIME_SYSTEM); diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index 7a55c29b0b33..99f1d8185ae2 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -43,7 +43,7 @@ void enabled_wait(void) idle->clock_idle_enter = idle->clock_idle_exit = 0ULL; idle->idle_time += idle_time; idle->idle_count++; - account_idle_time(idle_time); + account_idle_time(cputime_to_nsecs(idle_time)); write_seqcount_end(&idle->seqcount); } NOKPROBE_SYMBOL(enabled_wait); diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 1d55d10abf9d..e1cd8970e096 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -84,7 +84,7 @@ extern void account_system_time(struct task_struct *, int, cputime_t); extern void 
account_system_index_time(struct task_struct *, cputime_t, enum cpu_usage_stat); extern void account_steal_time(u64); -extern void account_idle_time(cputime_t); +extern void account_idle_time(u64); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE static inline void account_process_tick(struct task_struct *tsk, int user) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 9a8028760930..fd5375f956fe 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -218,15 +218,15 @@ void account_steal_time(u64 cputime) * Account for idle time. * @cputime: the cpu time spent in idle wait */ -void account_idle_time(cputime_t cputime) +void account_idle_time(u64 cputime) { u64 *cpustat = kcpustat_this_cpu->cpustat; struct rq *rq = this_rq(); if (atomic_read(&rq->nr_iowait) > 0) - cpustat[CPUTIME_IOWAIT] += cputime_to_nsecs(cputime); + cpustat[CPUTIME_IOWAIT] += cputime; else - cpustat[CPUTIME_IDLE] += cputime_to_nsecs(cputime); + cpustat[CPUTIME_IDLE] += cputime; } /* @@ -392,7 +392,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, } else if (user_tick) { account_user_time(p, cputime); } else if (p == rq->idle) { - account_idle_time(old_cputime); + account_idle_time(cputime); } else if (p->flags & PF_VCPU) { /* System time or guest time */ account_guest_time(p, old_cputime); @@ -504,7 +504,7 @@ void account_process_tick(struct task_struct *p, int user_tick) else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) account_system_time(p, HARDIRQ_OFFSET, old_cputime); else - account_idle_time(old_cputime); + account_idle_time(cputime); } /* @@ -513,15 +513,15 @@ void account_process_tick(struct task_struct *p, int user_tick) */ void account_idle_ticks(unsigned long ticks) { - cputime_t cputime, steal; + u64 cputime, steal; if (sched_clock_irqtime) { irqtime_account_idle_ticks(ticks); return; } - cputime = jiffies_to_cputime(ticks); - steal = steal_account_process_time(ULONG_MAX); + cputime = ticks * TICK_NSEC; + steal = cputime_to_nsecs(steal_account_process_time(ULONG_MAX)); if (steal >= cputime) return; @@ -787,7 +787,7 @@ void vtime_account_idle(struct task_struct *tsk) { cputime_t delta_cpu = get_vtime_delta(tsk); - account_idle_time(delta_cpu); + account_idle_time(cputime_to_nsecs(delta_cpu)); } void arch_vtime_task_switch(struct task_struct *prev) -- cgit v1.2.3 From fb8b049c988f1ff460b063b8a41ea9a3c79921c2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 31 Jan 2017 04:09:40 +0100 Subject: sched/cputime: Push time to account_system_time() in nsecs This is one more step toward converting cputime accounting to pure nsecs. 
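For context, account_system_time() routes the sample to a per-context cpustat bucket and delegates to account_system_index_time(); the hunks below only show the signature changes. The following sketch paraphrases the mainline routing logic from memory (p, hardirq_offset and cputime are the function's parameters; treat the details as assumptions, not as quoted patch content):

int index;

if (hardirq_count() - hardirq_offset)
	index = CPUTIME_IRQ;            /* tick landed in hardirq context */
else if (in_serving_softirq())
	index = CPUTIME_SOFTIRQ;        /* tick landed in softirq context */
else
	index = CPUTIME_SYSTEM;         /* ordinary kernel time */

account_system_index_time(p, cputime, index);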
Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Fenghua Yu Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Tony Luck Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1485832191-26889-25-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- arch/ia64/kernel/time.c | 11 ++++++----- arch/powerpc/kernel/time.c | 15 ++++++++------- arch/s390/kernel/idle.c | 2 +- arch/s390/kernel/vtime.c | 6 +++--- include/linux/kernel_stat.h | 7 +++---- kernel/sched/cputime.c | 39 +++++++++++++++++++-------------------- 6 files changed, 40 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 5dc801d97790..f15bca4776a7 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -64,29 +65,29 @@ extern cputime_t cycle_to_cputime(u64 cyc); void vtime_flush(struct task_struct *tsk) { struct thread_info *ti = task_thread_info(tsk); - cputime_t delta; + u64 delta; if (ti->utime) account_user_time(tsk, cputime_to_nsecs(cycle_to_cputime(ti->utime))); if (ti->gtime) - account_guest_time(tsk, cycle_to_cputime(ti->gtime)); + account_guest_time(tsk, cputime_to_nsecs(cycle_to_cputime(ti->gtime))); if (ti->idle_time) account_idle_time(cputime_to_nsecs(cycle_to_cputime(ti->idle_time))); if (ti->stime) { - delta = cycle_to_cputime(ti->stime); + delta = cputime_to_nsecs(cycle_to_cputime(ti->stime)); account_system_index_time(tsk, delta, CPUTIME_SYSTEM); } if (ti->hardirq_time) { - delta = cycle_to_cputime(ti->hardirq_time); + delta = cputime_to_nsecs(cycle_to_cputime(ti->hardirq_time)); account_system_index_time(tsk, delta, CPUTIME_IRQ); } if (ti->softirq_time) { - delta = cycle_to_cputime(ti->softirq_time); + delta = cputime_to_nsecs(cycle_to_cputime(ti->softirq_time)); account_system_index_time(tsk, delta, CPUTIME_SOFTIRQ); } diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 739897a10fd3..01f53bfe100b 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include @@ -72,7 +73,6 @@ #include #include #include -#include #include /* powerpc clocksource/clockevent code */ @@ -399,7 +399,7 @@ void vtime_flush(struct task_struct *tsk) tsk->utimescaled += cputime_to_nsecs(acct->utime_scaled); if (acct->gtime) - account_guest_time(tsk, acct->gtime); + account_guest_time(tsk, cputime_to_nsecs(acct->gtime)); if (acct->steal_time) account_steal_time(cputime_to_nsecs(acct->steal_time)); @@ -408,16 +408,17 @@ void vtime_flush(struct task_struct *tsk) account_idle_time(cputime_to_nsecs(acct->idle_time)); if (acct->stime) - account_system_index_time(tsk, acct->stime, CPUTIME_SYSTEM); - + account_system_index_time(tsk, cputime_to_nsecs(acct->stime), + CPUTIME_SYSTEM); if (acct->stime_scaled) tsk->stimescaled += cputime_to_nsecs(acct->stime_scaled); if (acct->hardirq_time) - account_system_index_time(tsk, acct->hardirq_time, CPUTIME_IRQ); - + account_system_index_time(tsk, cputime_to_nsecs(acct->hardirq_time), + CPUTIME_IRQ); if (acct->softirq_time) - account_system_index_time(tsk, acct->softirq_time, CPUTIME_SOFTIRQ); + account_system_index_time(tsk, cputime_to_nsecs(acct->softirq_time), + CPUTIME_SOFTIRQ); acct->utime = 0; acct->utime_scaled = 0; diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index 
99f1d8185ae2..5c0e08e761c3 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include "entry.h" diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 1e7023c6ef23..b4a3e9e06ef2 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -6,13 +6,13 @@ */ #include +#include #include #include #include #include #include -#include #include #include #include @@ -115,7 +115,7 @@ static void account_system_index_scaled(struct task_struct *p, enum cpu_usage_stat index) { p->stimescaled += cputime_to_nsecs(scaled); - account_system_index_time(p, cputime, index); + account_system_index_time(p, cputime_to_nsecs(cputime), index); } /* @@ -171,7 +171,7 @@ static int do_account_vtime(struct task_struct *tsk) } if (guest) { - account_guest_time(tsk, guest); + account_guest_time(tsk, cputime_to_nsecs(guest)); tsk->utimescaled += cputime_to_nsecs(scale_vtime(guest)); } diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index e1cd8970e096..66be8b6beceb 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -9,7 +9,6 @@ #include #include #include -#include /* * 'kernel_stat.h' contains the definitions needed for doing @@ -79,9 +78,9 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu) } extern void account_user_time(struct task_struct *, u64); -extern void account_guest_time(struct task_struct *, cputime_t); -extern void account_system_time(struct task_struct *, int, cputime_t); -extern void account_system_index_time(struct task_struct *, cputime_t, +extern void account_guest_time(struct task_struct *, u64); +extern void account_system_time(struct task_struct *, int, u64); +extern void account_system_index_time(struct task_struct *, u64, enum cpu_usage_stat); extern void account_steal_time(u64); extern void account_idle_time(u64); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index fd5375f956fe..d28e9c53727c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "sched.h" #ifdef CONFIG_PARAVIRT #include @@ -138,22 +139,22 @@ void account_user_time(struct task_struct *p, u64 cputime) * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in virtual machine since the last update */ -void account_guest_time(struct task_struct *p, cputime_t cputime) +void account_guest_time(struct task_struct *p, u64 cputime) { u64 *cpustat = kcpustat_this_cpu->cpustat; /* Add guest time to process. */ - p->utime += cputime_to_nsecs(cputime); - account_group_user_time(p, cputime_to_nsecs(cputime)); - p->gtime += cputime_to_nsecs(cputime); + p->utime += cputime; + account_group_user_time(p, cputime); + p->gtime += cputime; /* Add guest time to cpustat. 
*/ if (task_nice(p) > 0) { - cpustat[CPUTIME_NICE] += cputime_to_nsecs(cputime); - cpustat[CPUTIME_GUEST_NICE] += cputime_to_nsecs(cputime); + cpustat[CPUTIME_NICE] += cputime; + cpustat[CPUTIME_GUEST_NICE] += cputime; } else { - cpustat[CPUTIME_USER] += cputime_to_nsecs(cputime); - cpustat[CPUTIME_GUEST] += cputime_to_nsecs(cputime); + cpustat[CPUTIME_USER] += cputime; + cpustat[CPUTIME_GUEST] += cputime; } } @@ -164,14 +165,14 @@ void account_guest_time(struct task_struct *p, cputime_t cputime) * @index: pointer to cpustat field that has to be updated */ void account_system_index_time(struct task_struct *p, - cputime_t cputime, enum cpu_usage_stat index) + u64 cputime, enum cpu_usage_stat index) { /* Add system time to process. */ - p->stime += cputime_to_nsecs(cputime); - account_group_system_time(p, cputime_to_nsecs(cputime)); + p->stime += cputime; + account_group_system_time(p, cputime); /* Add system time to cpustat. */ - task_group_account_field(p, index, cputime_to_nsecs(cputime)); + task_group_account_field(p, index, cputime); /* Account for system time used */ acct_account_cputime(p); @@ -183,8 +184,7 @@ void account_system_index_time(struct task_struct *p, * @hardirq_offset: the offset to subtract from hardirq_count() * @cputime: the cpu time spent in kernel space since the last update */ -void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime) +void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) { int index; @@ -388,16 +388,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, * So, we have to handle it separately here. * Also, p->stime needs to be updated for ksoftirqd. */ - account_system_index_time(p, old_cputime, CPUTIME_SOFTIRQ); + account_system_index_time(p, cputime, CPUTIME_SOFTIRQ); } else if (user_tick) { account_user_time(p, cputime); } else if (p == rq->idle) { account_idle_time(cputime); } else if (p->flags & PF_VCPU) { /* System time or guest time */ - - account_guest_time(p, old_cputime); + account_guest_time(p, cputime); } else { - account_system_index_time(p, old_cputime, CPUTIME_SYSTEM); + account_system_index_time(p, cputime, CPUTIME_SYSTEM); } } @@ -502,7 +501,7 @@ void account_process_tick(struct task_struct *p, int user_tick) if (user_tick) account_user_time(p, cputime); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, old_cputime); + account_system_time(p, HARDIRQ_OFFSET, cputime); else account_idle_time(cputime); } @@ -722,7 +721,7 @@ static void __vtime_account_system(struct task_struct *tsk) { cputime_t delta_cpu = get_vtime_delta(tsk); - account_system_time(tsk, irq_count(), delta_cpu); + account_system_time(tsk, irq_count(), cputime_to_nsecs(delta_cpu)); } void vtime_account_system(struct task_struct *tsk) -- cgit v1.2.3 From 2b1f967d80e8e5d7361f0e1654c842869570f573 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 31 Jan 2017 04:09:41 +0100 Subject: sched/cputime: Complete nsec conversion of tick based accounting This is the final step toward tick based cputime conversion. Now that the whole cputime accounting engine accounts in nsecs, we can convert the very source of the cputime to account in nsecs. 
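With the tick itself expressed in nanoseconds, the per-tick quantum is simply TICK_NSEC, roughly NSEC_PER_SEC / HZ. A worked illustration (not part of the patch):

/* Per-tick credit in nanoseconds at common HZ values:
 *   HZ=100  -> TICK_NSEC = 10,000,000 ns (10 ms)
 *   HZ=250  -> TICK_NSEC =  4,000,000 ns ( 4 ms)
 *   HZ=1000 -> TICK_NSEC =  1,000,000 ns ( 1 ms)
 */
u64 cputime = TICK_NSEC * ticks;        /* exact; no cputime_t rounding, no error term */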
Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Fenghua Yu Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Tony Luck Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1485832191-26889-26-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 52 +++++++++++++++++++++----------------------------- 1 file changed, 22 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index d28e9c53727c..f3a56bfa745f 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -75,14 +75,13 @@ void irqtime_account_irq(struct task_struct *curr) } EXPORT_SYMBOL_GPL(irqtime_account_irq); -static cputime_t irqtime_tick_accounted(cputime_t maxtime) +static u64 irqtime_tick_accounted(u64 maxtime) { struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); - cputime_t delta; + u64 delta; - delta = nsecs_to_cputime(irqtime->tick_delta); - delta = min(delta, maxtime); - irqtime->tick_delta -= cputime_to_nsecs(delta); + delta = min(irqtime->tick_delta, maxtime); + irqtime->tick_delta -= delta; return delta; } @@ -91,7 +90,7 @@ static cputime_t irqtime_tick_accounted(cputime_t maxtime) #define sched_clock_irqtime (0) -static cputime_t irqtime_tick_accounted(cputime_t dummy) +static u64 irqtime_tick_accounted(u64 dummy) { return 0; } @@ -234,22 +233,19 @@ void account_idle_time(u64 cputime) * ticks are not redelivered later. Due to that, this function may on * occasion account more time than the calling functions think elapsed. */ -static __always_inline cputime_t steal_account_process_time(cputime_t maxtime) +static __always_inline u64 steal_account_process_time(u64 maxtime) { #ifdef CONFIG_PARAVIRT if (static_key_false(¶virt_steal_enabled)) { - cputime_t steal_cputime; - u64 steal, rounded; + u64 steal; steal = paravirt_steal_clock(smp_processor_id()); steal -= this_rq()->prev_steal_time; + steal = min(steal, maxtime); + account_steal_time(steal); + this_rq()->prev_steal_time += steal; - steal_cputime = min(nsecs_to_cputime(steal), maxtime); - rounded = cputime_to_nsecs(steal_cputime); - account_steal_time(rounded); - this_rq()->prev_steal_time += rounded; - - return steal_cputime; + return steal; } #endif return 0; @@ -258,9 +254,9 @@ static __always_inline cputime_t steal_account_process_time(cputime_t maxtime) /* * Account how much elapsed time was spent in steal, irq, or softirq time. */ -static inline cputime_t account_other_time(cputime_t max) +static inline u64 account_other_time(u64 max) { - cputime_t accounted; + u64 accounted; /* Shall be converted to a lockdep-enabled lightweight check */ WARN_ON_ONCE(!irqs_disabled()); @@ -364,9 +360,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) static void irqtime_account_process_tick(struct task_struct *p, int user_tick, struct rq *rq, int ticks) { - u64 old_cputime = (__force u64) cputime_one_jiffy * ticks; - cputime_t other; - u64 cputime; + u64 other, cputime = TICK_NSEC * ticks; /* * When returning from idle, many ticks can get accounted at @@ -376,11 +370,10 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, * other time can exceed ticks occasionally. 
*/ other = account_other_time(ULONG_MAX); - if (other >= old_cputime) + if (other >= cputime) return; - old_cputime -= other; - cputime = cputime_to_nsecs(old_cputime); + cputime -= other; if (this_cpu_ksoftirqd() == p) { /* @@ -477,8 +470,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) */ void account_process_tick(struct task_struct *p, int user_tick) { - cputime_t old_cputime, steal; - u64 cputime; + u64 cputime, steal; struct rq *rq = this_rq(); if (vtime_accounting_cpu_enabled()) @@ -489,14 +481,13 @@ void account_process_tick(struct task_struct *p, int user_tick) return; } - old_cputime = cputime_one_jiffy; + cputime = TICK_NSEC; steal = steal_account_process_time(ULONG_MAX); - if (steal >= old_cputime) + if (steal >= cputime) return; - old_cputime -= steal; - cputime = cputime_to_nsecs(old_cputime); + cputime -= steal; if (user_tick) account_user_time(p, cputime); @@ -520,7 +511,7 @@ void account_idle_ticks(unsigned long ticks) } cputime = ticks * TICK_NSEC; - steal = cputime_to_nsecs(steal_account_process_time(ULONG_MAX)); + steal = steal_account_process_time(ULONG_MAX); if (steal >= cputime) return; @@ -741,6 +732,7 @@ void vtime_account_user(struct task_struct *tsk) write_seqcount_begin(&tsk->vtime_seqcount); tsk->vtime_snap_whence = VTIME_SYS; if (vtime_delta(tsk)) { + u64 nsecs; delta_cpu = get_vtime_delta(tsk); account_user_time(tsk, cputime_to_nsecs(delta_cpu)); } -- cgit v1.2.3 From bfce1d6006f383fbb55a89580af37819a77195b7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 31 Jan 2017 04:09:42 +0100 Subject: sched/cputime, vtime: Return nsecs instead of cputime_t to account Turn the full dynticks cputime clock source to return nsec while keeping its very internals jiffies based for performance reasons. Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Fenghua Yu Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Tony Luck Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1485832191-26889-27-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index f3a56bfa745f..2ecec3a4f1ee 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -678,20 +678,20 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -static cputime_t vtime_delta(struct task_struct *tsk) +static u64 vtime_delta(struct task_struct *tsk) { unsigned long now = READ_ONCE(jiffies); if (time_before(now, (unsigned long)tsk->vtime_snap)) return 0; - return jiffies_to_cputime(now - tsk->vtime_snap); + return jiffies_to_nsecs(now - tsk->vtime_snap); } -static cputime_t get_vtime_delta(struct task_struct *tsk) +static u64 get_vtime_delta(struct task_struct *tsk) { unsigned long now = READ_ONCE(jiffies); - cputime_t delta, other; + u64 delta, other; /* * Unlike tick based timing, vtime based timing never has lost @@ -700,7 +700,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk) * elapsed time. Limit account_other_time to prevent rounding * errors from causing elapsed vtime to go negative. 
*/ - delta = jiffies_to_cputime(now - tsk->vtime_snap); + delta = jiffies_to_nsecs(now - tsk->vtime_snap); other = account_other_time(delta); WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); tsk->vtime_snap = now; @@ -710,9 +710,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk) static void __vtime_account_system(struct task_struct *tsk) { - cputime_t delta_cpu = get_vtime_delta(tsk); - - account_system_time(tsk, irq_count(), cputime_to_nsecs(delta_cpu)); + account_system_time(tsk, irq_count(), get_vtime_delta(tsk)); } void vtime_account_system(struct task_struct *tsk) @@ -727,15 +725,10 @@ void vtime_account_system(struct task_struct *tsk) void vtime_account_user(struct task_struct *tsk) { - cputime_t delta_cpu; - write_seqcount_begin(&tsk->vtime_seqcount); tsk->vtime_snap_whence = VTIME_SYS; - if (vtime_delta(tsk)) { - u64 nsecs; - delta_cpu = get_vtime_delta(tsk); - account_user_time(tsk, cputime_to_nsecs(delta_cpu)); - } + if (vtime_delta(tsk)) + account_user_time(tsk, get_vtime_delta(tsk)); write_seqcount_end(&tsk->vtime_seqcount); } @@ -776,9 +769,7 @@ EXPORT_SYMBOL_GPL(vtime_guest_exit); void vtime_account_idle(struct task_struct *tsk) { - cputime_t delta_cpu = get_vtime_delta(tsk); - - account_idle_time(cputime_to_nsecs(delta_cpu)); + account_idle_time(get_vtime_delta(tsk)); } void arch_vtime_task_switch(struct task_struct *prev) @@ -818,7 +809,7 @@ u64 task_gtime(struct task_struct *t) gtime = t->gtime; if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU) - gtime += cputime_to_nsecs(vtime_delta(t)); + gtime += vtime_delta(t); } while (read_seqcount_retry(&t->vtime_seqcount, seq)); @@ -851,7 +842,7 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t)) continue; - delta = cputime_to_nsecs(vtime_delta(t)); + delta = vtime_delta(t); /* * Task runs either in user or kernel space, add pending nohz time to -- cgit v1.2.3 From 975e155ed8732cb81f55c021c441ae662dd040b5 Mon Sep 17 00:00:00 2001 From: Shile Zhang Date: Sat, 28 Jan 2017 22:00:49 +0800 Subject: sched/rt: Show the 'sched_rr_timeslice' SCHED_RR timeslice tuning knob in milliseconds We added the 'sched_rr_timeslice_ms' SCHED_RR tuning knob in this commit: ce0dbbbb30ae ("sched/rt: Add a tuning knob to allow changing SCHED_RR timeslice") ... whose name suggests to users that it's in milliseconds, while in reality it's set in milliseconds but the value read back is in jiffies. This is obviously confusing when HZ is not 1000; it makes it appear as if setting the value failed. For example, with HZ=100: root# echo 100 > /proc/sys/kernel/sched_rr_timeslice_ms root# cat /proc/sys/kernel/sched_rr_timeslice_ms 10 Fix this to be milliseconds all around.
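To see why the pre-fix behaviour looked like a failed write, here is a minimal userspace sketch of the conversion math (illustrative only; HZ is assumed to be 100, and msecs_to_jiffies()/jiffies_to_msecs() are simplified stand-ins for the kernel helpers, not the real implementations):

#include <stdio.h>

#define HZ 100				/* assumed for this demo */

static unsigned long msecs_to_jiffies(unsigned int ms) { return ms * HZ / 1000; }
static unsigned int jiffies_to_msecs(unsigned long j) { return j * 1000 / HZ; }

int main(void)
{
	unsigned int written_ms = 100;
	unsigned long stored = msecs_to_jiffies(written_ms);	/* 10 jiffies */

	/* Pre-fix: the raw jiffies value leaked back to userspace. */
	printf("written %u ms, read back %lu (pre-fix)\n", written_ms, stored);
	/* Post-fix: a separate sysctl variable preserves the ms value. */
	printf("read back %u ms (post-fix)\n", jiffies_to_msecs(stored));
	return 0;
}

With HZ=100 the pre-fix read-back prints 10, matching the shell session above.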
Signed-off-by: Shile Zhang Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1485612049-20923-1-git-send-email-shile.zhang@nokia.com Signed-off-by: Ingo Molnar --- include/linux/sched/sysctl.h | 1 + kernel/sched/core.c | 5 +++-- kernel/sched/rt.c | 1 + kernel/sysctl.c | 2 +- 4 files changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 441145351301..49308e142aae 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -59,6 +59,7 @@ extern unsigned int sysctl_sched_cfs_bandwidth_slice; extern unsigned int sysctl_sched_autogroup_enabled; #endif +extern int sysctl_sched_rr_timeslice; extern int sched_rr_timeslice; extern int sched_rr_handler(struct ctl_table *table, int write, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d01f9d047397..10e18faaa632 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8471,8 +8471,9 @@ int sched_rr_handler(struct ctl_table *table, int write, /* make sure that internally we keep jiffies */ /* also, writing zero resets timeslice to default */ if (!ret && write) { - sched_rr_timeslice = sched_rr_timeslice <= 0 ? - RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); + sched_rr_timeslice = + sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE : + msecs_to_jiffies(sysctl_sched_rr_timeslice); } mutex_unlock(&mutex); return ret; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 704f2b89abf1..4101f9d1aa40 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -9,6 +9,7 @@ #include int sched_rr_timeslice = RR_TIMESLICE; +int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1aea594a54db..bb260ceb3718 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -416,7 +416,7 @@ static struct ctl_table kern_table[] = { }, { .procname = "sched_rr_timeslice_ms", - .data = &sched_rr_timeslice, + .data = &sysctl_sched_rr_timeslice, .maxlen = sizeof(int), .mode = 0644, .proc_handler = sched_rr_handler, -- cgit v1.2.3 From 93faccbbfa958a9668d3ab4e30f38dd205cee8d8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 1 Feb 2017 06:06:16 +1300 Subject: fs: Better permission checking for submounts To support unprivileged users mounting filesystems two permission checks have to be performed: a test to see if the user is allowed to create a mount in the mount namespace, and a test to see if the user is allowed to access the specified filesystem. The automount case is special in that mounting the original filesystem grants permission to mount the sub-filesystems, to any user who happens to stumble across their mountpoint and satisfies the ordinary filesystem permission checks. Attempting to handle the automount case by using override_creds almost works. It preserves the idea that permission to mount the original filesystem is permission to mount the sub-filesystem. Unfortunately using override_creds messes up the filesystem's ordinary permission checks. Solve this by being explicit that a mount is a submount by introducing vfs_submount, and using it where appropriate. vfs_submount uses a new internal mount flag, MS_SUBMOUNT, to let sget and friends know that a mount is a submount so they can take appropriate action. sget and sget_userns are modified to not perform any permission checks on submounts.
follow_automount is modified to stop using override_creds as that has proven problematic. do_mount is modified to always remove the new MS_SUBMOUNT flag so that we know userspace will never be able to specify it. autofs4 is modified to stop using current_real_cred that was put in there to handle the previous version of submount permission checking. cifs is modified to pass the mountpoint all of the way down to vfs_submount. debugfs is modified to pass the mountpoint all of the way down to trace_automount by adding a new parameter. To make this change easier, a new typedef, debugfs_automount_t, is introduced to capture the type of the debugfs automount function. Cc: stable@vger.kernel.org Fixes: 069d5ac9ae0d ("autofs: Fix automounts by using current_real_cred()->uid") Fixes: aeaa4a79ff6a ("fs: Call d_automount with the filesystems creds") Reviewed-by: Trond Myklebust Reviewed-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/afs/mntpt.c | 2 +- fs/autofs4/waitq.c | 4 ++-- fs/cifs/cifs_dfs_ref.c | 7 ++++--- fs/debugfs/inode.c | 8 ++++---- fs/namei.c | 3 --- fs/namespace.c | 17 ++++++++++++++++- fs/nfs/namespace.c | 2 +- fs/nfs/nfs4namespace.c | 2 +- fs/super.c | 13 ++++++++++--- include/linux/debugfs.h | 3 ++- include/linux/mount.h | 3 +++ include/uapi/linux/fs.h | 1 + kernel/trace/trace.c | 4 ++-- 13 files changed, 47 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 81dd075356b9..d4fb0afc0097 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -202,7 +202,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) /* try and do the mount */ _debug("--- attempting mount %s -o %s ---", devname, options); - mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options); + mnt = vfs_submount(mntpt, &afs_fs_type, devname, options); _debug("--- mount result %p ---", mnt); free_page((unsigned long) devname); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 1278335ce366..79fbd85db4ba 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -436,8 +436,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, memcpy(&wq->name, &qstr, sizeof(struct qstr)); wq->dev = autofs4_get_dev(sbi); wq->ino = autofs4_get_ino(sbi); - wq->uid = current_real_cred()->uid; - wq->gid = current_real_cred()->gid; + wq->uid = current_cred()->uid; + wq->gid = current_cred()->gid; wq->pid = pid; wq->tgid = tgid; wq->status = -EINTR; /* Status return if interrupted */ diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index ec9dbbcca3b9..9156be545b0f 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -245,7 +245,8 @@ compose_mount_options_err: * @fullpath: full path in UNC format * @ref: server's referral */ -static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, +static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt, + struct cifs_sb_info *cifs_sb, const char *fullpath, const struct dfs_info3_param *ref) { struct vfsmount *mnt; @@ -259,7 +260,7 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, if (IS_ERR(mountdata)) return (struct vfsmount *)mountdata; - mnt = vfs_kern_mount(&cifs_fs_type, 0, devname, mountdata); + mnt = vfs_submount(mntpt, &cifs_fs_type, devname, mountdata); kfree(mountdata); kfree(devname); return mnt; @@ -334,7 +335,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) mnt = ERR_PTR(-EINVAL); break; } - mnt = cifs_dfs_do_refmount(cifs_sb, + mnt = cifs_dfs_do_refmount(mntpt, cifs_sb, full_path, referrals + i); cifs_dbg(FYI, "%s:
cifs_dfs_do_refmount:%s , mnt:%p\n", __func__, referrals[i].node_name, mnt); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index f17fcf89e18e..1e30f74a9527 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -187,9 +187,9 @@ static const struct super_operations debugfs_super_operations = { static struct vfsmount *debugfs_automount(struct path *path) { - struct vfsmount *(*f)(void *); - f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata; - return f(d_inode(path->dentry)->i_private); + debugfs_automount_t f; + f = (debugfs_automount_t)path->dentry->d_fsdata; + return f(path->dentry, d_inode(path->dentry)->i_private); } static const struct dentry_operations debugfs_dops = { @@ -504,7 +504,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir); */ struct dentry *debugfs_create_automount(const char *name, struct dentry *parent, - struct vfsmount *(*f)(void *), + debugfs_automount_t f, void *data) { struct dentry *dentry = start_creating(name, parent); diff --git a/fs/namei.c b/fs/namei.c index 6fa3e9138fe4..da689c9c005e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1100,7 +1100,6 @@ static int follow_automount(struct path *path, struct nameidata *nd, bool *need_mntput) { struct vfsmount *mnt; - const struct cred *old_cred; int err; if (!path->dentry->d_op || !path->dentry->d_op->d_automount) @@ -1129,9 +1128,7 @@ static int follow_automount(struct path *path, struct nameidata *nd, if (nd->total_link_count >= 40) return -ELOOP; - old_cred = override_creds(&init_cred); mnt = path->dentry->d_op->d_automount(path); - revert_creds(old_cred); if (IS_ERR(mnt)) { /* * The filesystem is allowed to return -EISDIR here to indicate diff --git a/fs/namespace.c b/fs/namespace.c index 487ba30bb5c6..089a6b23135a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -989,6 +989,21 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void } EXPORT_SYMBOL_GPL(vfs_kern_mount); +struct vfsmount * +vfs_submount(const struct dentry *mountpoint, struct file_system_type *type, + const char *name, void *data) +{ + /* Until it is worked out how to pass the user namespace + * through from the parent mount to the submount don't support + * unprivileged mounts with submounts. 
+ */ + if (mountpoint->d_sb->s_user_ns != &init_user_ns) + return ERR_PTR(-EPERM); + + return vfs_kern_mount(type, MS_SUBMOUNT, name, data); +} +EXPORT_SYMBOL_GPL(vfs_submount); + static struct mount *clone_mnt(struct mount *old, struct dentry *root, int flag) { @@ -2794,7 +2809,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | - MS_STRICTATIME | MS_NOREMOTELOCK); + MS_STRICTATIME | MS_NOREMOTELOCK | MS_SUBMOUNT); if (flags & MS_REMOUNT) retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 5551e8ef67fd..e49d831c4e85 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -226,7 +226,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, const char *devname, struct nfs_clone_mount *mountdata) { - return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata); + return vfs_submount(mountdata->dentry, &nfs_xdev_fs_type, devname, mountdata); } /** diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index d21104912676..d8b040bd9814 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -279,7 +279,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, mountdata->hostname, mountdata->mnt_path); - mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata); + mnt = vfs_submount(mountdata->dentry, &nfs4_referral_fs_type, page, mountdata); if (!IS_ERR(mnt)) break; } diff --git a/fs/super.c b/fs/super.c index 1709ed029a2c..4185844f7a12 100644 --- a/fs/super.c +++ b/fs/super.c @@ -469,7 +469,7 @@ struct super_block *sget_userns(struct file_system_type *type, struct super_block *old; int err; - if (!(flags & MS_KERNMOUNT) && + if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) && !(type->fs_flags & FS_USERNS_MOUNT) && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -499,7 +499,7 @@ retry: } if (!s) { spin_unlock(&sb_lock); - s = alloc_super(type, flags, user_ns); + s = alloc_super(type, (flags & ~MS_SUBMOUNT), user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; @@ -540,8 +540,15 @@ struct super_block *sget(struct file_system_type *type, { struct user_namespace *user_ns = current_user_ns(); + /* We don't yet pass the user namespace of the parent + * mount through to here so always use &init_user_ns + * until that changes. 
+ */ + if (flags & MS_SUBMOUNT) + user_ns = &init_user_ns; + /* Ensure the requestor has permissions over the target filesystem */ - if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) + if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) && !ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); return sget_userns(type, test, set, flags, user_ns, data); diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 014cc564d1c4..233006be30aa 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -97,9 +97,10 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent); struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, const char *dest); +typedef struct vfsmount *(*debugfs_automount_t)(struct dentry *, void *); struct dentry *debugfs_create_automount(const char *name, struct dentry *parent, - struct vfsmount *(*f)(void *), + debugfs_automount_t f, void *data); void debugfs_remove(struct dentry *dentry); diff --git a/include/linux/mount.h b/include/linux/mount.h index c6f55158d5e5..8e0352af06b7 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -90,6 +90,9 @@ struct file_system_type; extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data); +extern struct vfsmount *vfs_submount(const struct dentry *mountpoint, + struct file_system_type *type, + const char *name, void *data); extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list); extern void mark_mounts_for_expiry(struct list_head *mounts); diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 36da93fbf188..048a85e9f017 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -132,6 +132,7 @@ struct inodes_stat_t { #define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */ /* These sb flags are internal to the kernel */ +#define MS_SUBMOUNT (1<<26) #define MS_NOREMOTELOCK (1<<27) #define MS_NOSEC (1<<28) #define MS_BORN (1<<29) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d7449783987a..310f0ea0d1a2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7503,7 +7503,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) ftrace_init_tracefs(tr, d_tracer); } -static struct vfsmount *trace_automount(void *ingore) +static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) { struct vfsmount *mnt; struct file_system_type *type; @@ -7516,7 +7516,7 @@ static struct vfsmount *trace_automount(void *ingore) type = get_fs_type("tracefs"); if (!type) return NULL; - mnt = vfs_kern_mount(type, 0, "tracefs", NULL); + mnt = vfs_submount(mntpt, type, "tracefs", NULL); put_filesystem(type); if (IS_ERR(mnt)) return NULL; -- cgit v1.2.3 From a428d314ebcf65842fd64ad850c02c280586e74d Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 31 Jan 2017 14:53:19 -0800 Subject: blktrace: make do_blk_trace_setup() static This isn't used outside of blktrace.c anymore. 
Fixes: 62c2a7d969f3 ("block: push BKL into blktrace ioctls") Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/blktrace_api.h | 4 ---- kernel/trace/blktrace.c | 6 +++--- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 341f9418bd68..d2e908586e3d 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -30,9 +30,6 @@ struct blk_trace { extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *); extern void blk_trace_shutdown(struct request_queue *); -extern int do_blk_trace_setup(struct request_queue *q, char *name, - dev_t dev, struct block_device *bdev, - struct blk_user_trace_setup *buts); extern __printf(2, 3) void __trace_note_message(struct blk_trace *, const char *fmt, ...); @@ -80,7 +77,6 @@ extern struct attribute_group blk_trace_attr_group; #else /* !CONFIG_BLK_DEV_IO_TRACE */ # define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) # define blk_trace_shutdown(q) do { } while (0) -# define do_blk_trace_setup(q, name, dev, bdev, buts) (-ENOTTY) # define blk_add_driver_data(q, rq, data, len) do {} while (0) # define blk_trace_setup(q, name, dev, bdev, arg) (-ENOTTY) # define blk_trace_startstop(q, start) (-ENOTTY) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e33050e3ea1c..84763e0c83cf 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -433,9 +433,9 @@ static void blk_trace_setup_lba(struct blk_trace *bt, /* * Setup everything required to start tracing */ -int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, - struct blk_user_trace_setup *buts) +static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, + struct blk_user_trace_setup *buts) { struct blk_trace *bt = NULL; struct dentry *dir = NULL; -- cgit v1.2.3 From 18fbda91c6370d520278db9ee1e768b59ef5c4ab Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 31 Jan 2017 14:53:20 -0800 Subject: block: use same block debugfs directory for blk-mq and blktrace When I added the blk-mq debugging information to debugfs, I didn't notice that blktrace also creates a "block" directory in debugfs. Make them use the same dentry, now created in the core block code. Based on a patch from Jens. 
Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-core.c | 9 +++++++++ block/blk-mq-debugfs.c | 12 +++--------- block/blk-mq.c | 2 -- block/blk-mq.h | 5 ----- block/blk.h | 4 ++++ kernel/trace/blktrace.c | 18 +++++------------- 6 files changed, 21 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/block/blk-core.c b/block/blk-core.c index a5726e01f839..76c42f559df8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -33,6 +33,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -42,6 +43,10 @@ #include "blk-mq-sched.h" #include "blk-wbt.h" +#ifdef CONFIG_DEBUG_FS +struct dentry *blk_debugfs_root; +#endif + EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); @@ -3441,5 +3446,9 @@ int __init blk_dev_init(void) blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); +#ifdef CONFIG_DEBUG_FS + blk_debugfs_root = debugfs_create_dir("block", NULL); +#endif + return 0; } diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 1077008690b2..f6d917977b33 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -19,6 +19,7 @@ #include #include +#include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" @@ -28,8 +29,6 @@ struct blk_mq_debugfs_attr { const struct file_operations *fops; }; -static struct dentry *block_debugfs_root; - static int blk_mq_debugfs_seq_open(struct inode *inode, struct file *file, const struct seq_operations *ops) { @@ -665,10 +664,10 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { int blk_mq_debugfs_register(struct request_queue *q, const char *name) { - if (!block_debugfs_root) + if (!blk_debugfs_root) return -ENOENT; - q->debugfs_dir = debugfs_create_dir(name, block_debugfs_root); + q->debugfs_dir = debugfs_create_dir(name, blk_debugfs_root); if (!q->debugfs_dir) goto err; @@ -771,8 +770,3 @@ void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) debugfs_remove_recursive(q->mq_debugfs_dir); q->mq_debugfs_dir = NULL; } - -void blk_mq_debugfs_init(void) -{ - block_debugfs_root = debugfs_create_dir("block", NULL); -} diff --git a/block/blk-mq.c b/block/blk-mq.c index 3d76e860f126..e9957df05700 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2817,8 +2817,6 @@ void blk_mq_enable_hotplug(void) static int __init blk_mq_init(void) { - blk_mq_debugfs_init(); - cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); diff --git a/block/blk-mq.h b/block/blk-mq.h index b52abd62b1b0..24b2256186f3 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -85,16 +85,11 @@ extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); * debugfs helpers */ #ifdef CONFIG_BLK_DEBUG_FS -void blk_mq_debugfs_init(void); int blk_mq_debugfs_register(struct request_queue *q, const char *name); void blk_mq_debugfs_unregister(struct request_queue *q); int blk_mq_debugfs_register_hctxs(struct request_queue *q); void blk_mq_debugfs_unregister_hctxs(struct request_queue *q); #else -static inline void blk_mq_debugfs_init(void) -{ -} - static inline int blk_mq_debugfs_register(struct request_queue *q, const char *name) { diff --git a/block/blk.h b/block/blk.h index c1bd4bf9e645..6aa53a4aad88 100644 --- a/block/blk.h +++ b/block/blk.h @@ -14,6 +14,10 @@ /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) +#ifdef CONFIG_DEBUG_FS +extern struct dentry *blk_debugfs_root; +#endif + struct 
blk_flush_queue { unsigned int flush_queue_delayed:1; unsigned int flush_pending_idx:1; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 84763e0c83cf..8cea91d248d9 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -28,6 +28,8 @@ #include #include +#include "../../block/blk.h" + #include #include "trace_output.h" @@ -292,9 +294,6 @@ record_it: local_irq_restore(flags); } -static struct dentry *blk_tree_root; -static DEFINE_MUTEX(blk_tree_mutex); - static void blk_trace_free(struct blk_trace *bt) { debugfs_remove(bt->msg_file); @@ -468,17 +467,10 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = -ENOENT; - mutex_lock(&blk_tree_mutex); - if (!blk_tree_root) { - blk_tree_root = debugfs_create_dir("block", NULL); - if (!blk_tree_root) { - mutex_unlock(&blk_tree_mutex); - goto err; - } - } - mutex_unlock(&blk_tree_mutex); + if (!blk_debugfs_root) + goto err; - dir = debugfs_create_dir(buts->name, blk_tree_root); + dir = debugfs_create_dir(buts->name, blk_debugfs_root); if (!dir) goto err; -- cgit v1.2.3 From 6ac93117ab009d3901ed5d3d0f79056eb5fc0afd Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 31 Jan 2017 14:53:22 -0800 Subject: blktrace: use existing disk debugfs directory We may already have a directory to put the blktrace stuff in if 1. The disk uses blk-mq 2. CONFIG_BLK_DEBUG_FS is enabled 3. We are tracing the whole disk and not a partition Instead of hardcoding this very specific case, let's use the new debugfs_lookup(). If the directory exists, we use it, otherwise we create one and clean it up later. Fixes: 07e4fead45e6 ("blk-mq: create debugfs directory tree") Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 8cea91d248d9..b2058a7f94bd 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -470,12 +470,12 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!blk_debugfs_root) goto err; - dir = debugfs_create_dir(buts->name, blk_debugfs_root); - + dir = debugfs_lookup(buts->name, blk_debugfs_root); + if (!dir) + bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); if (!dir) goto err; - bt->dir = dir; bt->dev = dev; atomic_set(&bt->dropped, 0); INIT_LIST_HEAD(&bt->running_list); @@ -517,9 +517,12 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (atomic_inc_return(&blk_probes_ref) == 1) blk_register_tracepoints(); - return 0; + ret = 0; err: - blk_trace_free(bt); + if (dir && !bt->dir) + dput(dir); + if (ret) + blk_trace_free(bt); return ret; } -- cgit v1.2.3 From 968ebff1efde6948564308836ecf1ef57de4e106 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 29 Jan 2017 14:35:20 -0500 Subject: cgroup, perf_event: make perf_event controller work on cgroup2 hierarchy perf_event is a utility controller whose primary role is identifying cgroup membership to filter perf events; however, because it also tracks some per-css state, it can't be replaced by pure cgroup membership test. Mark the controller as implicitly enabled on the default hierarchy so that perf events can always be filtered based on cgroup v2 path as long as the controller is not mounted on a legacy hierarchy. "perf record" is updated accordingly so that it searches for both v1 and v2 hierarchies. 
A v1 hierarchy is used if perf_event is mounted on it; otherwise, it uses the v2 hierarchy. v2: Doc updated to reflect more flexible rebinding behavior. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- Documentation/cgroup-v2.txt | 12 ++++++++++++ kernel/events/core.c | 6 ++++++ tools/perf/util/cgroup.c | 26 +++++++++++++++++++------- 3 files changed, 37 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index f6f54107fb4d..227ce4883720 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -49,6 +49,8 @@ CONTENTS 5-3-2. Writeback 5-4. PID 5-4-1. PID Interface Files + 5-5. Misc + 5-5-1. perf_event 6. Namespace 6-1. Basics 6-2. The Root and Views @@ -1160,6 +1162,16 @@ through fork() or clone(). These will return -EAGAIN if the creation of a new process would cause a cgroup policy to be violated. +5-5. Misc + +5-5-1. perf_event + +perf_event controller, if not mounted on a legacy hierarchy, is +automatically enabled on the v2 hierarchy so that perf events can +always be filtered by cgroup v2 path. The controller can still be +moved to a legacy hierarchy after v2 hierarchy is populated. + + 6. Namespace 6-1. Basics diff --git a/kernel/events/core.c b/kernel/events/core.c index ab15509fab8c..d72128dce1e0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10792,5 +10792,11 @@ struct cgroup_subsys perf_event_cgrp_subsys = { .css_alloc = perf_cgroup_css_alloc, .css_free = perf_cgroup_css_free, .attach = perf_cgroup_attach, + /* + * Implicitly enable on dfl hierarchy so that perf events can + * always be filtered by cgroup2 path as long as perf_event + * controller is not mounted on a legacy hierarchy. 
+ */ + .implicit_on_dfl = true, }; #endif /* CONFIG_CGROUP_PERF */ diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c index 8fdee24725a7..eafbf11442b2 100644 --- a/tools/perf/util/cgroup.c +++ b/tools/perf/util/cgroup.c @@ -12,8 +12,8 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen) { FILE *fp; char mountpoint[PATH_MAX + 1], tokens[PATH_MAX + 1], type[PATH_MAX + 1]; + char path_v1[PATH_MAX + 1], path_v2[PATH_MAX + 2], *path; char *token, *saved_ptr = NULL; - int found = 0; fp = fopen("/proc/mounts", "r"); if (!fp) @@ -24,31 +24,43 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen) * and inspect every cgroupfs mount point to find one that has * perf_event subsystem */ + path_v1[0] = '\0'; + path_v2[0] = '\0'; + while (fscanf(fp, "%*s %"STR(PATH_MAX)"s %"STR(PATH_MAX)"s %" STR(PATH_MAX)"s %*d %*d\n", mountpoint, type, tokens) == 3) { - if (!strcmp(type, "cgroup")) { + if (!path_v1[0] && !strcmp(type, "cgroup")) { token = strtok_r(tokens, ",", &saved_ptr); while (token != NULL) { if (!strcmp(token, "perf_event")) { - found = 1; + strcpy(path_v1, mountpoint); break; } token = strtok_r(NULL, ",", &saved_ptr); } } - if (found) + + if (!path_v2[0] && !strcmp(type, "cgroup2")) + strcpy(path_v2, mountpoint); + + if (path_v1[0] && path_v2[0]) break; } fclose(fp); - if (!found) + + if (path_v1[0]) + path = path_v1; + else if (path_v2[0]) + path = path_v2; + else return -1; - if (strlen(mountpoint) < maxlen) { - strcpy(buf, mountpoint); + if (strlen(path) < maxlen) { + strcpy(buf, path); return 0; } return -1; -- cgit v1.2.3 From 576dd464505fc53d501bb94569db76f220104d28 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Jan 2017 11:29:54 -0500 Subject: cgroup: drop the matching uid requirement on migration for cgroup v2 Along with the write access to the cgroup.procs or tasks file, cgroup has required the writer's euid, unless root, to match [s]uid of the target process or task. On cgroup v1, this is necessary because there's nothing preventing a delegatee from pulling in tasks or processes from all over the system. If a user has a cgroup subdirectory delegated to it, the user would have write access to the cgroup.procs or tasks file. If there are no further checks than file write access check, the user would be able to pull processes from all over the system into its subhierarchy which is clearly not the intended behavior. The matching [s]uid requirement partially prevents this problem by allowing a delegatee to pull in the processes that belongs to it. This isn't a sufficient protection however, because a user would still be able to jump processes across two disjoint sub-hierarchies that has been delegated to them. cgroup v2 resolves the issue by requiring the writer to have access to the common ancestor of the cgroup.procs file of the source and target cgroups. This confines each delegatee to their own sub-hierarchy proper and bases all permission decisions on the cgroup filesystem rather than having to pull in explicit uid matching. cgroup v2 has still been applying the matching [s]uid requirement just for historical reasons. On cgroup2, the requirement doesn't serve any purpose while unnecessarily complicating the permission model. Let's drop it. 
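To make the common-ancestor rule concrete, here is a hedged illustration of a delegation setup (the mount point, cgroup names and user are hypothetical, mirroring the layout described in the message above; a real delegation would also chown the relevant "cgroup.procs" files):

# mount -t cgroup2 none /sys/fs/cgroup
# mkdir -p /sys/fs/cgroup/C0/C00 /sys/fs/cgroup/C0/C01 /sys/fs/cgroup/C1/C10
# chown -R u0 /sys/fs/cgroup/C0 /sys/fs/cgroup/C1	# delegate C0 and C1 to u0

With this setup u0 can move processes between C00 and C01, because it can write the "cgroup.procs" of their common ancestor C0; writing a PID from C10 into "C00/cgroup.procs" still fails with -EACCES, because the common ancestor of C10 and C00 is the root cgroup, whose "cgroup.procs" u0 cannot write.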
Signed-off-by: Tejun Heo --- Documentation/cgroup-v2.txt | 12 +++++------- kernel/cgroup/cgroup.c | 27 ++++++++++++++------------- 2 files changed, 19 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 227ce4883720..1d101423ca92 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -332,14 +332,12 @@ a process with a non-root euid to migrate a target process into a cgroup by writing its PID to the "cgroup.procs" file, the following conditions must be met. -- The writer's euid must match either uid or suid of the target process. - - The writer must have write access to the "cgroup.procs" file. - The writer must have write access to the "cgroup.procs" file of the common ancestor of the source and destination cgroups. -The above three constraints ensure that while a delegatee may migrate +The above two constraints ensure that while a delegatee may migrate processes around freely in the delegated sub-hierarchy it can't pull in from or push out to outside the sub-hierarchy. @@ -354,10 +352,10 @@ all processes under C0 and C1 belong to U0. Let's also say U0 wants to write the PID of a process which is currently in C10 into "C00/cgroup.procs". U0 has write access to the -file and uid match on the process; however, the common ancestor of the -source cgroup C10 and the destination cgroup C00 is above the points -of delegation and U0 would not have write access to its "cgroup.procs" -files and thus the write will be denied with -EACCES. +file; however, the common ancestor of the source cgroup C10 and the +destination cgroup C00 is above the points of delegation and U0 would +not have write access to its "cgroup.procs" files and thus the write +will be denied with -EACCES. 2-6. Guidelines diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a99b15f9b577..fe374f803b20 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2349,20 +2349,9 @@ static int cgroup_procs_write_permission(struct task_struct *task, struct cgroup *dst_cgrp, struct kernfs_open_file *of) { - const struct cred *cred = current_cred(); - const struct cred *tcred = get_task_cred(task); int ret = 0; - /* - * even if we're attaching all tasks in the thread group, we only - * need to check permissions on one of them. - */ - if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && - !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) - ret = -EACCES; - - if (!ret && cgroup_on_dfl(dst_cgrp)) { + if (cgroup_on_dfl(dst_cgrp)) { struct super_block *sb = of->file->f_path.dentry->d_sb; struct cgroup *cgrp; struct inode *inode; @@ -2380,9 +2369,21 @@ static int cgroup_procs_write_permission(struct task_struct *task, ret = inode_permission(inode, MAY_WRITE); iput(inode); } + } else { + const struct cred *cred = current_cred(); + const struct cred *tcred = get_task_cred(task); + + /* + * even if we're attaching all tasks in the thread group, + * we only need to check permissions on one of them. + */ + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && + !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid)) + ret = -EACCES; + put_cred(tcred); } - put_cred(tcred); return ret; } -- cgit v1.2.3 From 0f1b92cbdd0309afae0af1963e8cccddb3d2eaff Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 30 Jan 2017 18:06:11 +0300 Subject: introduce the walk_process_tree() helper Add the new helper to walk the process tree, the next patch adds a user. 
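As a hedged usage sketch (the visitor below is invented for illustration, not the user added by the next patch): the visitor's return value steers the walk, with a positive value descending into the child's subtree, zero skipping it, and a negative value aborting the walk.

/* Hypothetical example: count all descendants of a task. */
static int count_visitor(struct task_struct *p, void *data)
{
	(*(int *)data)++;
	return 1;	/* keep descending into p's subtree */
}

	int count = 0;
	walk_process_tree(current, count_visitor, &count);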
Note that it visits the group leaders only; proc_visitor can do for_each_thread itself, or we can trivially extend walk_process_tree() to do this. Signed-off-by: Oleg Nesterov Signed-off-by: Pavel Tikhomirov Signed-off-by: Eric W. Biederman --- include/linux/sched.h | 3 +++ kernel/fork.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index d2334229167f..6261bfc12853 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3053,6 +3053,9 @@ extern bool current_is_single_threaded(void); #define for_each_process_thread(p, t) \ for_each_process(p) for_each_thread(p, t) +typedef int (*proc_visitor)(struct task_struct *p, void *data); +void walk_process_tree(struct task_struct *top, proc_visitor, void *); + static inline int get_nr_threads(struct task_struct *tsk) { return tsk->signal->nr_threads; diff --git a/kernel/fork.c b/kernel/fork.c index 11c5c8ab827c..135b7a49ad59 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2053,6 +2053,38 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, } #endif +void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data) +{ + struct task_struct *leader, *parent, *child; + int res; + + read_lock(&tasklist_lock); + leader = top = top->group_leader; +down: + for_each_thread(leader, parent) { + list_for_each_entry(child, &parent->children, sibling) { + res = visitor(child, data); + if (res) { + if (res < 0) + goto out; + leader = child; + goto down; + } +up: + ; + } + } + + if (leader != top) { + child = leader; + parent = child->real_parent; + leader = parent->group_leader; + goto up; + } +out: + read_unlock(&tasklist_lock); +} + #ifndef ARCH_MIN_MMSTRUCT_ALIGN #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif -- cgit v1.2.3 From 749860ce242798fb090557a5a7868dee40af9268 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Mon, 30 Jan 2017 18:06:12 +0300 Subject: prctl: propagate has_child_subreaper flag to every descendant If a process forks some children while it has the is_child_subreaper flag enabled, they will inherit the has_child_subreaper flag (the first group); children forked while is_child_subreaper is disabled will not inherit it (the second group). So a child-subreaper does not reparent all of its descendants when their parents die. Having these two differently behaving groups can lead to confusion. It is also a problem for CRIU: when we restore a process tree we need to somehow determine which descendants belong to which group and, much harder, put them exactly into those groups. To simplify this we can propagate the has_child_subreaper flag on PR_SET_CHILD_SUBREAPER, walking all descendants of the child-subreaper to set the has_child_subreaper flag. In the common case, when a process like systemd first sets itself to be a child-subreaper and only after that forks its services, we will have a zero-length list of descendants to walk. Testing with a binary subtree of 2^15 processes, prctl took < 0.007 sec and has shown a close to linear dependency (~0.2 * n usec) on lower numbers of processes. Moreover, I doubt someone intentionally pre-forks the children which should reparent to init before becoming a subreaper, because one of our ancestors might have had the is_child_subreaper flag while forking our sub-tree, in which case our children will all inherit the has_child_subreaper flag, and we have no way to influence that. And the only way to check whether we lack the has_child_subreaper flag is to create some children, kill them, and see where they reparent to.
We use the walk_process_tree() helper to walk the subtree, thanks to Oleg! Timing seems to be the same. Optimize: a) When a descendant already has the has_child_subreaper flag, all of its subtree already has it too. * For a) to hold, we need to move the has_child_subreaper inheritance under the same tasklist_lock as adding the task to its ->real_parent->children; without that, a process can inherit a zero has_child_subreaper, then we set its parent's flag to 1 and check that the parent has no more children, and only afterwards is the child with the wrong flag added to the tree. * Also make this inheritance clearer by using real_parent instead of current: on clone(CLONE_PARENT), if current has is_child_subreaper and real_parent has neither is_child_subreaper nor has_child_subreaper, the child would get the has_child_subreaper flag set without actually having a subreaper among its ancestors. b) When some descendant is a child_reaper, its subtree is in a different pidns from us (the original child-subreaper) and processes from other pidns will never reparent to us. So we can skip their (a, b) subtrees in the walk. v2: switch to the walk_process_tree() general helper, move has_child_subreaper inheritance v3: remove csr_descendant leftover, change current to real_parent in has_child_subreaper inheritance v4: small commit message fix Fixes: ebec18a6d3aa ("prctl: add PR_{SET,GET}_CHILD_SUBREAPER to allow simple process supervision") Signed-off-by: Pavel Tikhomirov Reviewed-by: Oleg Nesterov Signed-off-by: Eric W. Biederman --- kernel/fork.c | 10 +++++++--- kernel/sys.c | 22 ++++++++++++++++++++++ 2 files changed, 29 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 135b7a49ad59..c814e590cf76 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1367,9 +1367,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; - sig->has_child_subreaper = current->signal->has_child_subreaper || - current->signal->is_child_subreaper; - mutex_init(&sig->cred_guard_mutex); return 0; @@ -1800,6 +1797,13 @@ static __latent_entropy struct task_struct *copy_process( p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); + /* + * Inherit has_child_subreaper flag under the same + * tasklist_lock with adding child to the process tree + * for propagate_has_child_subreaper optimization. + */ + p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper || + p->real_parent->signal->is_child_subreaper; list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); attach_pid(p, PIDTYPE_PGID); diff --git a/kernel/sys.c b/kernel/sys.c index 842914ef7de4..0e4d566a6f16 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2063,6 +2063,24 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) } #endif +static int propagate_has_child_subreaper(struct task_struct *p, void *data) +{ + /* + * If task has has_child_subreaper - all its decendants + * already have these flag too and new decendants will + * inherit it on fork, skip them. + * + * If we've found child_reaper - skip descendants in + * it's subtree as they will never get out pidns.
+ */ if (p->signal->has_child_subreaper || is_child_reaper(task_pid(p))) return 0; + p->signal->has_child_subreaper = 1; + return 1; +} + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2214,6 +2232,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, break; case PR_SET_CHILD_SUBREAPER: me->signal->is_child_subreaper = !!arg2; + if (!arg2) + break; + + walk_process_tree(me, propagate_has_child_subreaper, NULL); break; case PR_GET_CHILD_SUBREAPER: error = put_user(me->signal->is_child_subreaper, -- cgit v1.2.3 From 2b0cce0e190f8f0b37fe8102ae657f5d9eb0976d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 1 Feb 2017 12:19:33 -0500 Subject: tracing: Add ftrace_hash_key() helper function Replace the couple of use cases that have small logic to produce the ftrace function key id with a helper function. No need for duplicate code. Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2d554a02241d..89240f62061c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1185,6 +1185,15 @@ struct ftrace_page { static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; +static __always_inline unsigned long +ftrace_hash_key(struct ftrace_hash *hash, unsigned long ip) +{ + if (hash->size_bits > 0) + return hash_long(ip, hash->size_bits); + + return 0; +} + struct ftrace_func_entry * ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) { @@ -1195,11 +1204,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) if (ftrace_hash_empty(hash)) return NULL; - if (hash->size_bits > 0) - key = hash_long(ip, hash->size_bits); - else - key = 0; - + key = ftrace_hash_key(hash, ip); hhd = &hash->buckets[key]; hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) { @@ -1215,11 +1220,7 @@ static void __add_hash_entry(struct ftrace_hash *hash, struct hlist_head *hhd; unsigned long key; - if (hash->size_bits) - key = hash_long(entry->ip, hash->size_bits); - else - key = 0; - + key = ftrace_hash_key(hash, entry->ip); hhd = &hash->buckets[key]; hlist_add_head(&entry->hlist, hhd); hash->count++; -- cgit v1.2.3 From 2b2c279c814112e15577757ec593aa78465e2e56 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 1 Feb 2017 15:37:07 -0500 Subject: ftrace: Create a slight optimization on searching the ftrace_hash This is a micro-optimization, but as it has to deal with a fast path of the function tracer, these optimizations can be noticed. The ftrace_lookup_ip() returns true if the given ip is found in the hash. If it's not found or the hash is NULL, it returns false. But there are some cases where a NULL hash counts as a true result, and in those cases ftrace_hash_empty() is tested before calling ftrace_lookup_ip(). But as ftrace_lookup_ip() tests that first, that adds a few extra unneeded instructions in those cases. A new static "always_inlined" function is created that does not perform the hash empty test. This must only be used by callers that do the check first anyway, as an empty or NULL hash could cause a crash if a lookup is performed on it. Also add kernel doc for the ftrace_lookup_ip() main function.
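The resulting shape is the familiar guarded fast path pattern; here is a hedged, self-contained sketch of it (the hash/entry types and helper names are invented for illustration, not the kernel's):

struct entry { unsigned long ip; struct entry *next; };
struct hash { struct entry **buckets; int size_bits; int count; };

static inline int hash_empty(struct hash *h) { return !h || !h->count; }

/* Must only be used after hash_empty() has already been tested. */
static inline struct entry *__lookup(struct hash *h, unsigned long ip)
{
	struct entry *e = h->buckets[ip & ((1UL << h->size_bits) - 1)];

	while (e && e->ip != ip)
		e = e->next;
	return e;
}

struct entry *lookup(struct hash *h, unsigned long ip)
{
	if (hash_empty(h))
		return NULL;	/* the one cheap test, done here */
	return __lookup(h, ip);
}

Callers that already know the hash is non-empty call __lookup() directly and skip the redundant test.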
Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 89240f62061c..1595df0d7d79 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1194,16 +1194,14 @@ ftrace_hash_key(struct ftrace_hash *hash, unsigned long ip) return 0; } -struct ftrace_func_entry * -ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) +/* Only use this function if ftrace_hash_empty() has already been tested */ +static __always_inline struct ftrace_func_entry * +__ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) { unsigned long key; struct ftrace_func_entry *entry; struct hlist_head *hhd; - if (ftrace_hash_empty(hash)) - return NULL; - key = ftrace_hash_key(hash, ip); hhd = &hash->buckets[key]; @@ -1214,6 +1212,25 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) return NULL; } +/** + * ftrace_lookup_ip - Test to see if an ip exists in an ftrace_hash + * @hash: The hash to look at + * @ip: The instruction pointer to test + * + * Search a given @hash to see if a given instruction pointer (@ip) + * exists in it. + * + * Returns the entry that holds the @ip if found. NULL otherwise. + */ +struct ftrace_func_entry * +ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) +{ + if (ftrace_hash_empty(hash)) + return NULL; + + return __ftrace_lookup_ip(hash, ip); +} + static void __add_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry) { @@ -1463,9 +1480,9 @@ static bool hash_contains_ip(unsigned long ip, * notrace hash is considered not in the notrace hash. */ return (ftrace_hash_empty(hash->filter_hash) || - ftrace_lookup_ip(hash->filter_hash, ip)) && + __ftrace_lookup_ip(hash->filter_hash, ip)) && (ftrace_hash_empty(hash->notrace_hash) || - !ftrace_lookup_ip(hash->notrace_hash, ip)); + !__ftrace_lookup_ip(hash->notrace_hash, ip)); } /* @@ -2877,7 +2894,7 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) /* The function must be in the filter */ if (!ftrace_hash_empty(ops->func_hash->filter_hash) && - !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) + !__ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) return 0; /* If in notrace hash, we ignore it too */ -- cgit v1.2.3 From 555fc7813eeada1738eca42aa9f9ebafd7dc23e6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 2 Feb 2017 10:15:22 -0500 Subject: ftrace: Replace (void *)1 with a meaningful macro name FTRACE_GRAPH_EMPTY When the set_graph_function or set_graph_notrace contains no records, a banner is displayed of either "#### all functions enabled ####" or "#### all functions disabled ####" respectively. To tell the seq operations to do this, (void *)1 is passed as a return value. Instead of using a hardcoded meaningless variable, define it as a macro. 
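The same sentinel-pointer idiom works for any seq_file iterator that needs a distinct "empty, print a banner" state alongside NULL ("stop"); a hedged sketch with invented helper names (nothing_to_show(), first_entry(), entry_name() are hypothetical):

#define ITER_EMPTY ((void *)1)	/* non-NULL sentinel, never dereferenced */

static void *my_start(struct seq_file *m, loff_t *pos)
{
	if (nothing_to_show() && !*pos)
		return ITER_EMPTY;	/* tell ->show to print the banner */
	return first_entry(*pos);	/* NULL ends the iteration */
}

static int my_show(struct seq_file *m, void *v)
{
	if (v == ITER_EMPTY) {
		seq_puts(m, "#### all functions enabled ####\n");
		return 0;
	}
	seq_printf(m, "%s\n", entry_name(v));
	return 0;
}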
Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1595df0d7d79..a9cfc8713198 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4561,6 +4561,8 @@ enum graph_filter_type { GRAPH_FILTER_FUNCTION, }; +#define FTRACE_GRAPH_EMPTY ((void *)1) + struct ftrace_graph_data { struct ftrace_hash *hash; struct ftrace_func_entry *entry; @@ -4616,7 +4618,7 @@ static void *g_start(struct seq_file *m, loff_t *pos) /* Nothing, tell g_show to print all functions are enabled */ if (ftrace_hash_empty(fgd->hash) && !*pos) - return (void *)1; + return FTRACE_GRAPH_EMPTY; fgd->idx = 0; fgd->entry = NULL; @@ -4635,7 +4637,7 @@ static int g_show(struct seq_file *m, void *v) if (!entry) return 0; - if (entry == (void *)1) { + if (entry == FTRACE_GRAPH_EMPTY) { struct ftrace_graph_data *fgd = m->private; if (fgd->type == GRAPH_FILTER_FUNCTION) -- cgit v1.2.3 From d4ad9a1ccac31a04a32b5e7547b70428830e0218 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 2 Feb 2017 14:03:39 -0500 Subject: ftrace: Reset fgd->hash in ftrace_graph_write() fgd->hash is saved and then freed, but is never reset to either ftrace_graph_hash nor ftrace_graph_notrace_hash. But if multiple writes are performed, then the freed hash could be accessed again. # cd /sys/kernel/debug/tracing # head -1000 available_filter_functions > /tmp/funcs # cat /tmp/funcs > set_graph_function Causes: general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC Modules linked in: [...] CPU: 2 PID: 1337 Comm: cat Not tainted 4.10.0-rc2-test-00010-g6b052e9 #32 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v02.05 05/07/2012 task: ffff880113a12200 task.stack: ffffc90001940000 RIP: 0010:free_ftrace_hash+0x7c/0x160 RSP: 0018:ffffc90001943db0 EFLAGS: 00010246 RAX: 6b6b6b6b6b6b6b6b RBX: 6b6b6b6b6b6b6b6b RCX: 6b6b6b6b6b6b6b6b RDX: 0000000000000002 RSI: 0000000000000001 RDI: ffff8800ce1e1d40 RBP: ffff8800ce1e1d50 R08: 0000000000000000 R09: 0000000000006400 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: ffff8800ce1e1d40 R14: 0000000000004000 R15: 0000000000000001 FS: 00007f9408a07740(0000) GS:ffff88011e500000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000aee1f0 CR3: 0000000116bb4000 CR4: 00000000001406e0 Call Trace: ? ftrace_graph_write+0x150/0x190 ? __vfs_write+0x1f6/0x210 ? __audit_syscall_entry+0x17f/0x200 ? rw_verify_area+0xdb/0x210 ? _cond_resched+0x2b/0x50 ? __sb_start_write+0xb4/0x130 ? vfs_write+0x1c8/0x330 ? SyS_write+0x62/0xf0 ? do_syscall_64+0xa3/0x1b0 ? 
entry_SYSCALL64_slow_path+0x25/0x25 Code: 01 48 85 db 0f 84 92 00 00 00 b8 01 00 00 00 d3 e0 85 c0 7e 3f 83 e8 01 48 8d 6f 10 45 31 e4 4c 8d 34 c5 08 00 00 00 49 8b 45 08 <4a> 8b 34 20 48 85 f6 74 13 48 8b 1e 48 89 ef e8 20 fa ff ff 48 RIP: free_ftrace_hash+0x7c/0x160 RSP: ffffc90001943db0 ---[ end trace 999b48216bf4b393 ]--- Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a9cfc8713198..b7df0dcf8652 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4858,10 +4858,13 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, if (!new_hash) ret = -ENOMEM; - if (fgd->type == GRAPH_FILTER_FUNCTION) + if (fgd->type == GRAPH_FILTER_FUNCTION) { rcu_assign_pointer(ftrace_graph_hash, new_hash); - else + fgd->hash = ftrace_graph_hash; + } else { rcu_assign_pointer(ftrace_graph_notrace_hash, new_hash); + fgd->hash = ftrace_graph_notrace_hash; + } mutex_unlock(&graph_lock); -- cgit v1.2.3 From ae98d27afc3bde5a48f440d905317602a5cfb0d2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 2 Feb 2017 16:59:06 -0500 Subject: ftrace: Have set_graph_functions handle write with RDWR Since reading the set_graph_functions uses seq functions, which sets the file->private_data pointer to a seq_file descriptor. On writes the ftrace_graph_data descriptor is set to file->private_data. But if the file is opened for RDWR, the ftrace_graph_write() will incorrectly use the file->private_data descriptor instead of ((struct seq_file *)file->private_data)->private pointer, and this can crash the kernel. Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b7df0dcf8652..0233c8cb45f4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4842,6 +4842,12 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) return -ENOMEM; + /* Read mode uses seq functions */ + if (file->f_mode & FMODE_READ) { + struct seq_file *m = file->private_data; + fgd = m->private; + } + read = trace_get_user(&parser, ubuf, cnt, ppos); if (read >= 0 && trace_parser_loaded((&parser))) { -- cgit v1.2.3 From 0e684b6578ee463ecb5c9a1cd0c20069f063d9f0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 2 Feb 2017 17:58:18 -0500 Subject: tracing: Reset parser->buffer to allow multiple "puts" trace_parser_put() simply frees the allocated parser buffer. But it does not reset the pointer that was freed. This means that if trace_parser_put() is called on the same parser more than once, it will corrupt the allocation system. Setting parser->buffer to NULL after free allows it to be called more than once without any ill effect. 
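The fix is an instance of the general free-and-NULL idiom; a hedged sketch (the struct is invented for illustration):

struct parser { char *buffer; };

void parser_put(struct parser *p)
{
	kfree(p->buffer);
	p->buffer = NULL;	/* kfree(NULL) is a no-op, so a second
				 * parser_put() call is now harmless */
}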
Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d7449783987a..4589b67168fc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1193,6 +1193,7 @@ int trace_parser_get_init(struct trace_parser *parser, int size) void trace_parser_put(struct trace_parser *parser) { kfree(parser->buffer); + parser->buffer = NULL; } /* -- cgit v1.2.3 From 649b988b12ddb9aed16047a3d9bb4d7bfdb47221 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 2 Feb 2017 20:16:29 -0500 Subject: ftrace: Do not hold references of ftrace_graph_{notrace_}hash out of graph_lock The hashes ftrace_graph_hash and ftrace_graph_notrace_hash are modified with the graph_lock held. Holding a pointer to them and passing them along can lead to a use of a stale pointer (fgd->hash). Move assigning the pointer and its use to within the holding of the lock. Note, they are rcu_sched protected data, and other instances of referencing them are done with preemption disabled. But the file manipulation code must be protected by the lock. The fgd->hash pointer is set to NULL when the lock is being released. Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0233c8cb45f4..b3a4896ef78a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4616,6 +4616,13 @@ static void *g_start(struct seq_file *m, loff_t *pos) mutex_lock(&graph_lock); + if (fgd->type == GRAPH_FILTER_FUNCTION) + fgd->hash = rcu_dereference_protected(ftrace_graph_hash, + lockdep_is_held(&graph_lock)); + else + fgd->hash = rcu_dereference_protected(ftrace_graph_notrace_hash, + lockdep_is_held(&graph_lock)); + /* Nothing, tell g_show to print all functions are enabled */ if (ftrace_hash_empty(fgd->hash) && !*pos) return FTRACE_GRAPH_EMPTY; @@ -4695,6 +4702,14 @@ __ftrace_graph_open(struct inode *inode, struct file *file, out: fgd->new_hash = new_hash; + + /* + * All uses of fgd->hash must be taken with the graph_lock + * held. The graph_lock is going to be released, so force + * fgd->hash to be reinitialized when it is taken again.
+ */ + fgd->hash = NULL; + return ret; } @@ -4713,7 +4728,8 @@ ftrace_graph_open(struct inode *inode, struct file *file) mutex_lock(&graph_lock); - fgd->hash = ftrace_graph_hash; + fgd->hash = rcu_dereference_protected(ftrace_graph_hash, + lockdep_is_held(&graph_lock)); fgd->type = GRAPH_FILTER_FUNCTION; fgd->seq_ops = &ftrace_graph_seq_ops; @@ -4740,7 +4756,8 @@ ftrace_graph_notrace_open(struct inode *inode, struct file *file) mutex_lock(&graph_lock); - fgd->hash = ftrace_graph_notrace_hash; + fgd->hash = rcu_dereference_protected(ftrace_graph_notrace_hash, + lockdep_is_held(&graph_lock)); fgd->type = GRAPH_FILTER_NOTRACE; fgd->seq_ops = &ftrace_graph_seq_ops; @@ -4859,17 +4876,18 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, ret = ftrace_graph_set_hash(fgd->new_hash, parser.buffer); - old_hash = fgd->hash; new_hash = __ftrace_hash_move(fgd->new_hash); if (!new_hash) ret = -ENOMEM; if (fgd->type == GRAPH_FILTER_FUNCTION) { + old_hash = rcu_dereference_protected(ftrace_graph_hash, + lockdep_is_held(&graph_lock)); rcu_assign_pointer(ftrace_graph_hash, new_hash); - fgd->hash = ftrace_graph_hash; } else { + old_hash = rcu_dereference_protected(ftrace_graph_notrace_hash, + lockdep_is_held(&graph_lock)); rcu_assign_pointer(ftrace_graph_notrace_hash, new_hash); - fgd->hash = ftrace_graph_notrace_hash; } mutex_unlock(&graph_lock); -- cgit v1.2.3 From e704eff3ff5138a462443dcd64d071165df18782 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 2 Feb 2017 20:34:37 -0500 Subject: ftrace: Have set_graph_function handle multiple functions in one write Currently, only one function can be written to set_graph_function and set_graph_notrace. Only the last function in the list will be saved, even though other functions will be added and then removed. Change the behavior to be the same as set_ftrace_function so as to allow multiple functions to be written. If any one fails, none of them will be added. The addition of the functions is done at the end, when the file is closed.
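After this change a write behaves like set_ftrace_filter; for example (the function names below are placeholders for entries from available_filter_functions):

# cd /sys/kernel/debug/tracing
# echo 'func_a func_b func_c' > set_graph_function
# cat /tmp/funcs > set_graph_function

All listed functions are applied together when the file is closed; if any name is invalid, none of them are added.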
Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 105 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 64 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b3a4896ef78a..0c0609326391 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4564,12 +4564,13 @@ enum graph_filter_type { #define FTRACE_GRAPH_EMPTY ((void *)1) struct ftrace_graph_data { - struct ftrace_hash *hash; - struct ftrace_func_entry *entry; - int idx; /* for hash table iteration */ - enum graph_filter_type type; - struct ftrace_hash *new_hash; - const struct seq_operations *seq_ops; + struct ftrace_hash *hash; + struct ftrace_func_entry *entry; + int idx; /* for hash table iteration */ + enum graph_filter_type type; + struct ftrace_hash *new_hash; + const struct seq_operations *seq_ops; + struct trace_parser parser; }; static void * @@ -4676,6 +4677,9 @@ __ftrace_graph_open(struct inode *inode, struct file *file, if (file->f_mode & FMODE_WRITE) { const int size_bits = FTRACE_HASH_DEFAULT_BITS; + if (trace_parser_get_init(&fgd->parser, FTRACE_BUFF_MAX)) + return -ENOMEM; + if (file->f_flags & O_TRUNC) new_hash = alloc_ftrace_hash(size_bits); else @@ -4701,6 +4705,9 @@ __ftrace_graph_open(struct inode *inode, struct file *file, file->private_data = fgd; out: + if (ret < 0 && file->f_mode & FMODE_WRITE) + trace_parser_put(&fgd->parser); + fgd->new_hash = new_hash; /* @@ -4773,6 +4780,9 @@ static int ftrace_graph_release(struct inode *inode, struct file *file) { struct ftrace_graph_data *fgd; + struct ftrace_hash *old_hash, *new_hash; + struct trace_parser *parser; + int ret = 0; if (file->f_mode & FMODE_READ) { struct seq_file *m = file->private_data; @@ -4783,10 +4793,50 @@ ftrace_graph_release(struct inode *inode, struct file *file) fgd = file->private_data; } + + if (file->f_mode & FMODE_WRITE) { + + parser = &fgd->parser; + + if (trace_parser_loaded((parser))) { + parser->buffer[parser->idx] = 0; + ret = ftrace_graph_set_hash(fgd->new_hash, + parser->buffer); + } + + trace_parser_put(parser); + + new_hash = __ftrace_hash_move(fgd->new_hash); + if (!new_hash) { + ret = -ENOMEM; + goto out; + } + + mutex_lock(&graph_lock); + + if (fgd->type == GRAPH_FILTER_FUNCTION) { + old_hash = rcu_dereference_protected(ftrace_graph_hash, + lockdep_is_held(&graph_lock)); + rcu_assign_pointer(ftrace_graph_hash, new_hash); + } else { + old_hash = rcu_dereference_protected(ftrace_graph_notrace_hash, + lockdep_is_held(&graph_lock)); + rcu_assign_pointer(ftrace_graph_notrace_hash, new_hash); + } + + mutex_unlock(&graph_lock); + + /* Wait till all users are no longer using the old hash */ + synchronize_sched(); + + free_ftrace_hash(old_hash); + } + + out: kfree(fgd->new_hash); kfree(fgd); - return 0; + return ret; } static int @@ -4848,61 +4898,34 @@ static ssize_t ftrace_graph_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_parser parser; ssize_t read, ret = 0; struct ftrace_graph_data *fgd = file->private_data; - struct ftrace_hash *old_hash, *new_hash; + struct trace_parser *parser; if (!cnt) return 0; - if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) - return -ENOMEM; - /* Read mode uses seq functions */ if (file->f_mode & FMODE_READ) { struct seq_file *m = file->private_data; fgd = m->private; } - read = trace_get_user(&parser, ubuf, cnt, ppos); + parser = &fgd->parser; - if (read >= 0 && trace_parser_loaded((&parser))) { - parser.buffer[parser.idx] = 
0; + read = trace_get_user(parser, ubuf, cnt, ppos); - mutex_lock(&graph_lock); + if (read >= 0 && trace_parser_loaded(parser) && + !trace_parser_cont(parser)) { - /* we allow only one expression at a time */ ret = ftrace_graph_set_hash(fgd->new_hash, - parser.buffer); - - new_hash = __ftrace_hash_move(fgd->new_hash); - if (!new_hash) - ret = -ENOMEM; - - if (fgd->type == GRAPH_FILTER_FUNCTION) { - old_hash = rcu_dereference_protected(ftrace_graph_hash, - lockdep_is_held(&graph_lock)); - rcu_assign_pointer(ftrace_graph_hash, new_hash); - } else { - old_hash = rcu_dereference_protected(ftrace_graph_notrace_hash, - lockdep_is_held(&graph_lock)); - rcu_assign_pointer(ftrace_graph_notrace_hash, new_hash); - } - - mutex_unlock(&graph_lock); - - /* Wait till all users are no longer using the old hash */ - synchronize_sched(); - - free_ftrace_hash(old_hash); + parser->buffer); + trace_parser_clear(parser); } if (!ret) ret = read; - trace_parser_put(&parser); - return ret; }
-- cgit v1.2.3
From 3898fac1f488c76e0eef5b5267b4ba8112a82ac4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 2 Feb 2017 17:09:54 +0100 Subject: trace: rename trace_print_hex_seq arg and add kdoc
Steven suggested improving trace_print_hex_seq() a bit after commit 2acae0d5b0f7 ("trace: add variant without spacing in trace_print_hex_seq") in two ways: i) by adding a kdoc comment for the helper function itself and ii) by renaming the 'spacing' argument to 'concatenate' to better denote that no spaces are added between the hex bytes.
Suggested-by: Steven Rostedt Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/trace_events.h | 2 +- include/trace/trace_events.h | 4 ++-- kernel/trace/trace_output.c | 15 +++++++++++++-- 3 files changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel')
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index cfa475a0e9ca..0f165507495c 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -34,7 +34,7 @@ const char *trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, const char *trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int len, - bool spacing); + bool concatenate); const char *trace_print_array_seq(struct trace_seq *p, const void *buf, int count,
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 9f684629c3cf..5c06f4af8323 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -298,11 +298,11 @@ TRACE_MAKE_SYSTEM_STR(); #undef __print_hex #define __print_hex(buf, buf_len) \ - trace_print_hex_seq(p, buf, buf_len, true) + trace_print_hex_seq(p, buf, buf_len, false) #undef __print_hex_str #define __print_hex_str(buf, buf_len) \ - trace_print_hex_seq(p, buf, buf_len, false) + trace_print_hex_seq(p, buf, buf_len, true) #undef __print_array #define __print_array(array, count, el_size) \
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 30a144b1b9ee..aea6a1218c7d 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -162,15 +162,26 @@ trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, } EXPORT_SYMBOL_GPL(trace_print_bitmask_seq); +/** + * trace_print_hex_seq - print buffer as hex sequence + * @p: trace seq struct to write to + * @buf: The buffer to print + * @buf_len: Length of @buf in bytes + * @concatenate: Print @buf as single hex string or with spacing + * + * Prints the passed buffer as a hex sequence either as a whole, + * single hex string if @concatenate is true or with spacing after + * each byte in case @concatenate is false. + */ const char * trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len, - bool spacing) + bool concatenate) { int i; const char *ret = trace_seq_buffer_ptr(p); for (i = 0; i < buf_len; i++) - trace_seq_printf(p, "%s%2.2x", !spacing || i == 0 ? "" : " ", + trace_seq_printf(p, "%s%2.2x", concatenate || i == 0 ? "" : " ", buf[i]); trace_seq_putc(p, 0);
-- cgit v1.2.3
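The effect of the renamed flag, sketched with a made-up three-byte buffer (the byte values are assumptions for the example; both macros are used inside a TP_printk() where the trace_seq pointer p is in scope):

    /* buf = { 0x1a, 0x2b, 0x3c } */
    __print_hex(buf, 3);      /* concatenate=false -> "1a 2b 3c" */
    __print_hex_str(buf, 3);  /* concatenate=true  -> "1a2b3c"   */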
From 668802c25729a8e3423015c33c05f1c3be3858e9 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 30 Jan 2017 12:57:43 -0500 Subject: tick/broadcast: Reduce lock cacheline contention
It was observed that on an Intel x86 system without the ARAT (Always running APIC timer) feature, with a fairly large number of CPUs and with CPUs coming in and out of intel_idle frequently, the lock contention on tick_broadcast_lock can become significant. To reduce contention, the lock is put into its own cacheline and all the cpumask_var_t variables are put into the __read_mostly section. Running the SP benchmark of the NAS Parallel Benchmarks on a 4-socket 16-core 32-thread Nehalem system, the performance number improved from 3353.94 Mop/s to 3469.31 Mop/s when this patch was applied on a 4.9.6 kernel. This is a 3.4% improvement.
Signed-off-by: Waiman Long Cc: "Peter Zijlstra (Intel)" Cc: Andrew Morton Link: http://lkml.kernel.org/r/1485799063-20857-1-git-send-email-longman@redhat.com Signed-off-by: Thomas Gleixner --- include/linux/cpumask.h | 7 ++++++- kernel/time/tick-broadcast.c | 15 ++++++++------- 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'kernel')
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index c717f5ea88cb..23c1a6d09ec5 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -649,11 +649,15 @@ static inline size_t cpumask_size(void) * used. Please use this_cpu_cpumask_var_t in those cases. The direct use * of this_cpu_ptr() or this_cpu_read() will lead to failures when the * other type of cpumask_var_t implementation is configured. + + * + * Please also note that __cpumask_var_read_mostly can be used to declare + * a cpumask_var_t variable itself (not its content) as read mostly.
*/ #ifdef CONFIG_CPUMASK_OFFSTACK typedef struct cpumask *cpumask_var_t; -#define this_cpu_cpumask_var_ptr(x) this_cpu_read(x) +#define this_cpu_cpumask_var_ptr(x) this_cpu_read(x) +#define __cpumask_var_read_mostly __read_mostly bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags); @@ -667,6 +671,7 @@ void free_bootmem_cpumask_var(cpumask_var_t mask); typedef struct cpumask cpumask_var_t[1]; #define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x) +#define __cpumask_var_read_mostly static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) {
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 3109204c87cc..244c93544150 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -29,12 +29,13 @@ */ static struct tick_device tick_broadcast_device; -static cpumask_var_t tick_broadcast_mask; -static cpumask_var_t tick_broadcast_on; -static cpumask_var_t tmpmask; -static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); +static cpumask_var_t tick_broadcast_mask __cpumask_var_read_mostly; +static cpumask_var_t tick_broadcast_on __cpumask_var_read_mostly; +static cpumask_var_t tmpmask __cpumask_var_read_mostly; static int tick_broadcast_forced; +static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock); + #ifdef CONFIG_TICK_ONESHOT static void tick_broadcast_clear_oneshot(int cpu); static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); @@ -517,9 +518,9 @@ void tick_resume_broadcast(void) #ifdef CONFIG_TICK_ONESHOT -static cpumask_var_t tick_broadcast_oneshot_mask; -static cpumask_var_t tick_broadcast_pending_mask; -static cpumask_var_t tick_broadcast_force_mask; +static cpumask_var_t tick_broadcast_oneshot_mask __cpumask_var_read_mostly; +static cpumask_var_t tick_broadcast_pending_mask __cpumask_var_read_mostly; +static cpumask_var_t tick_broadcast_force_mask __cpumask_var_read_mostly; /* * Exposed for debugging: see timer_list.c
-- cgit v1.2.3
From 63dfef75ed75364901d7caa52c6420cec3e73519 Mon Sep 17 00:00:00 2001 From: William Tu Date: Sat, 4 Feb 2017 08:37:29 -0800 Subject: bpf: enable verifier to add 0 to packet ptr
The patch fixes the case when adding a zero value to the packet pointer. The zero value could come from a src_reg of type BPF_K or CONST_IMM. The patch fixes both cases; otherwise the verifier reports the following error: [...] R0=imm0,min_value=0,max_value=0 R1=pkt(id=0,off=0,r=4) R2=pkt_end R3=fp-12 R4=imm4,min_value=4,max_value=4 R5=pkt(id=0,off=4,r=4) 269: (bf) r2 = r0 // r2 becomes imm0 270: (77) r2 >>= 3 271: (bf) r4 = r1 // r4 becomes pkt ptr 272: (0f) r4 += r2 // r4 += 0 addition of negative constant to packet pointer is not allowed
Signed-off-by: William Tu Signed-off-by: Mihai Budiu Cc: Daniel Borkmann Cc: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- kernel/bpf/verifier.c | 2 +- tools/testing/selftests/bpf/test_verifier.c | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fb3513b35c0b..1a754e5d2695 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1397,7 +1397,7 @@ static int check_packet_ptr_add(struct bpf_verifier_env *env, imm = insn->imm; add_imm: - if (imm <= 0) { + if (imm < 0) { verbose("addition of negative constant to packet pointer is not allowed\n"); return -EACCES; } diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index df194e1d56c2..71f6407cde60 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -2403,6 +2403,29 @@ static struct bpf_test tests[] = { .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, + { + "direct packet access: test14 (pkt_ptr += 0, CONST_IMM, good access)", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 22), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 7), + BPF_MOV64_IMM(BPF_REG_5, 12), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_5, 4), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_5), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_6, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, { "helper access to packet: test1, valid packet_ptr range", .insns = { -- cgit v1.2.3 From d1ccc66df8bfe30ee2533ceef40633d65154b43d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 1 Feb 2017 11:46:42 +0100 Subject: sched/core: Clean up comments Refresh the comments in the core scheduler code: - Capitalize sentences consistently - Capitalize 'CPU' consistently - ... and other small details. Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 358 ++++++++++++++++++++++++++-------------------------- 1 file changed, 182 insertions(+), 176 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 10e18faaa632..87cf7ba82bd5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1,31 +1,10 @@ /* * kernel/sched/core.c * - * Kernel scheduler and related syscalls + * Core kernel scheduler code and related syscalls * * Copyright (C) 1991-2002 Linus Torvalds - * - * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and - * make semaphores SMP safe - * 1998-11-19 Implemented schedule_timeout() and related stuff - * by Andrea Arcangeli - * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: - * hybrid priority-list and round-robin design with - * an array-switch method of distributing timeslices - * and per-CPU runqueues. Cleanups and useful suggestions - * by Davide Libenzi, preemptible kernel bits by Robert Love. - * 2003-09-03 Interactivity tuning by Con Kolivas. - * 2004-04-02 Scheduler domains code by Nick Piggin - * 2007-04-15 Work begun on replacing all interactivity tuning with a - * fair scheduling design by Con Kolivas. 
- * 2007-05-05 Load balancing (smp-nice) and other improvements - * by Peter Williams - * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith - * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri - * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, - * Thomas Gleixner, Mike Kravetz */ - #include #include #include @@ -143,7 +122,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; /* - * period over which we measure -rt task cpu usage in us. + * period over which we measure -rt task CPU usage in us. * default: 1s */ unsigned int sysctl_sched_rt_period = 1000000; @@ -156,7 +135,7 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; -/* cpus with isolated domains */ +/* CPUs with isolated domains */ cpumask_var_t cpu_isolated_map; /* @@ -224,7 +203,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) * If we observe the old cpu in task_rq_lock, the acquire of * the old rq->lock will fully serialize against the stores. * - * If we observe the new cpu in task_rq_lock, the acquire will + * If we observe the new CPU in task_rq_lock, the acquire will * pair with the WMB to ensure we must then also see migrating. */ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { @@ -461,7 +440,7 @@ void wake_up_q(struct wake_q_head *head) task = container_of(node, struct task_struct, wake_q); BUG_ON(!task); - /* task can safely be re-inserted now */ + /* Task can safely be re-inserted now: */ node = node->next; task->wake_q.next = NULL; @@ -519,12 +498,12 @@ void resched_cpu(int cpu) #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ_COMMON /* - * In the semi idle case, use the nearest busy cpu for migrating timers - * from an idle cpu. This is good for power-savings. + * In the semi idle case, use the nearest busy CPU for migrating timers + * from an idle CPU. This is good for power-savings. * * We don't do similar optimization for completely idle system, as - * selecting an idle cpu will add more delays to the timers than intended - * (as that cpu's timer base may not be uptodate wrt jiffies etc). + * selecting an idle CPU will add more delays to the timers than intended + * (as that CPU's timer base may not be uptodate wrt jiffies etc). */ int get_nohz_timer_target(void) { @@ -553,6 +532,7 @@ unlock: rcu_read_unlock(); return cpu; } + /* * When add_timer_on() enqueues a timer into the timer wheel of an * idle CPU then this timer might expire before the next timer event @@ -1021,7 +1001,7 @@ struct migration_arg { }; /* - * Move (not current) task off this cpu, onto dest cpu. We're doing + * Move (not current) task off this CPU, onto the destination CPU. We're doing * this because either it can't run here any more (set_cpus_allowed() * away from this CPU, or CPU going down), or because we're * attempting to rebalance this task on exec (sched_exec). @@ -1055,8 +1035,8 @@ static int migration_cpu_stop(void *data) struct rq *rq = this_rq(); /* - * The original target cpu might have gone down and we might - * be on another cpu but it doesn't matter. + * The original target CPU might have gone down and we might + * be on another CPU but it doesn't matter. */ local_irq_disable(); /* @@ -1174,7 +1154,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (p->flags & PF_KTHREAD) { /* * For kernel threads that do indeed end up on online && - * !active we want to ensure they are strict per-cpu threads. 
+ * !active we want to ensure they are strict per-CPU threads. */ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && !cpumask_intersects(new_mask, cpu_active_mask) && @@ -1279,7 +1259,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) /* * Task isn't running anymore; make it appear like we migrated * it before it went to sleep. This means on wakeup we make the - * previous cpu our target instead of where it really is. + * previous CPU our target instead of where it really is. */ p->wake_cpu = cpu; } @@ -1511,12 +1491,12 @@ EXPORT_SYMBOL_GPL(kick_process); * * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, * see __set_cpus_allowed_ptr(). At this point the newly online - * cpu isn't yet part of the sched domains, and balancing will not + * CPU isn't yet part of the sched domains, and balancing will not * see it. * - * - on cpu-down we clear cpu_active() to mask the sched domains and + * - on CPU-down we clear cpu_active() to mask the sched domains and * avoid the load balancer to place new tasks on the to be removed - * cpu. Existing tasks will remain running there and will be taken + * CPU. Existing tasks will remain running there and will be taken * off. * * This means that fallback selection must not select !active CPUs. @@ -1532,9 +1512,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) int dest_cpu; /* - * If the node that the cpu is on has been offlined, cpu_to_node() - * will return -1. There is no cpu on the node, and we should - * select the cpu on the other node. + * If the node that the CPU is on has been offlined, cpu_to_node() + * will return -1. There is no CPU on the node, and we should + * select the CPU on the other node. */ if (nid != -1) { nodemask = cpumask_of_node(nid); @@ -1566,7 +1546,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) state = possible; break; } - /* fall-through */ + /* Fall-through */ case possible: do_set_cpus_allowed(p, cpu_possible_mask); state = fail; @@ -1610,7 +1590,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) /* * In order not to call set_task_cpu() on a blocking task we need * to rely on ttwu() to place the task on a valid ->cpus_allowed - * cpu. + * CPU. * * Since this is common to all placement strategies, this lives here. 
* @@ -1684,7 +1664,7 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl activate_task(rq, p, en_flags); p->on_rq = TASK_ON_RQ_QUEUED; - /* if a worker is waking up, notify workqueue */ + /* If a worker is waking up, notify the workqueue: */ if (p->flags & PF_WQ_WORKER) wq_worker_waking_up(p, cpu_of(rq)); } @@ -1867,7 +1847,7 @@ void wake_up_if_idle(int cpu) raw_spin_lock_irqsave(&rq->lock, flags); if (is_idle_task(rq->curr)) smp_send_reschedule(cpu); - /* Else cpu is not in idle, do nothing here */ + /* Else CPU is not idle, do nothing here: */ raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -1888,7 +1868,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) #if defined(CONFIG_SMP) if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { - sched_clock_cpu(cpu); /* sync clocks x-cpu */ + sched_clock_cpu(cpu); /* Sync clocks across CPUs */ ttwu_queue_remote(p, cpu, wake_flags); return; } @@ -1907,8 +1887,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * MIGRATION * * The basic program-order guarantee on SMP systems is that when a task [t] - * migrates, all its activity on its old cpu [c0] happens-before any subsequent - * execution on its new cpu [c1]. + * migrates, all its activity on its old CPU [c0] happens-before any subsequent + * execution on its new CPU [c1]. * * For migration (of runnable tasks) this is provided by the following means: * @@ -1919,7 +1899,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * * Transitivity guarantees that B happens after A and C after B. * Note: we only require RCpc transitivity. - * Note: the cpu doing B need not be c0 or c1 + * Note: the CPU doing B need not be c0 or c1 * * Example: * @@ -2027,7 +2007,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) trace_sched_waking(p); - success = 1; /* we're going to change ->state */ + /* We're going to change ->state: */ + success = 1; cpu = task_cpu(p); /* @@ -2076,7 +2057,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) smp_rmb(); /* - * If the owning (remote) cpu is still in the middle of schedule() with + * If the owning (remote) CPU is still in the middle of schedule() with * this task as prev, wait until its done referencing the task. * * Pairs with the smp_store_release() in finish_lock_switch(). @@ -2448,7 +2429,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) */ raw_spin_lock_irqsave(&p->pi_lock, flags); /* - * We're setting the cpu for the first time, we don't migrate, + * We're setting the CPU for the first time, we don't migrate, * so use __set_task_cpu(). */ __set_task_cpu(p, cpu); @@ -2591,7 +2572,7 @@ void wake_up_new_task(struct task_struct *p) /* * Fork balancing, do it here and not earlier because: * - cpus_allowed can change in the fork path - * - any previously selected cpu might disappear through hotplug + * - any previously selected CPU might disappear through hotplug * * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, * as we're not fully set-up yet. @@ -2945,7 +2926,7 @@ unsigned long nr_running(void) } /* - * Check if only the current task is running on the cpu. + * Check if only the current task is running on the CPU. 
* * Caution: this function does not check that the caller has disabled * preemption, thus the result might have a time-of-check-to-time-of-use @@ -3104,8 +3085,8 @@ unsigned long long task_sched_runtime(struct task_struct *p) * So we have a optimization chance when the task's delta_exec is 0. * Reading ->on_cpu is racy, but this is ok. * - * If we race with it leaving cpu, we'll take a lock. So we're correct. - * If we race with it entering cpu, unaccounted time is 0. This is + * If we race with it leaving CPU, we'll take a lock. So we're correct. + * If we race with it entering CPU, unaccounted time is 0. This is * indistinguishable from the read occurring a few cycles earlier. * If we see ->on_cpu without ->on_rq, the task is leaving, and has * been accounted, so we're correct here as well. @@ -3333,7 +3314,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (unlikely(p == RETRY_TASK)) goto again; - /* assumes fair_sched_class->next == idle_sched_class */ + /* Assumes fair_sched_class->next == idle_sched_class */ if (unlikely(!p)) p = idle_sched_class.pick_next_task(rq, prev, rf); @@ -3350,7 +3331,8 @@ again: } } - BUG(); /* the idle class will always have a runnable task */ + /* The idle class should always have a runnable task: */ + BUG(); } /* @@ -3421,7 +3403,8 @@ static void __sched notrace __schedule(bool preempt) raw_spin_lock(&rq->lock); rq_pin_lock(rq, &rf); - rq->clock_update_flags <<= 1; /* promote REQ to ACT */ + /* Promote REQ to ACT */ + rq->clock_update_flags <<= 1; switch_count = &prev->nivcsw; if (!preempt && prev->state) { @@ -3465,7 +3448,9 @@ static void __sched notrace __schedule(bool preempt) ++*switch_count; trace_sched_switch(preempt, prev, next); - rq = context_switch(rq, prev, next, &rf); /* unlocks the rq */ + + /* Also unlocks the rq: */ + rq = context_switch(rq, prev, next, &rf); } else { rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); rq_unpin_lock(rq, &rf); @@ -3492,14 +3477,18 @@ void __noreturn do_task_dead(void) smp_mb(); raw_spin_unlock_wait(¤t->pi_lock); - /* causes final put_task_struct in finish_task_switch(). */ + /* Causes final put_task_struct in finish_task_switch(): */ __set_current_state(TASK_DEAD); - current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ + + /* Tell freezer to ignore us: */ + current->flags |= PF_NOFREEZE; + __schedule(false); BUG(); - /* Avoid "noreturn function does return". */ + + /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ for (;;) - cpu_relax(); /* For when BUG is null */ + cpu_relax(); } static inline void sched_submit_work(struct task_struct *tsk) @@ -3792,7 +3781,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) check_class_changed(rq, p, prev_class, oldprio); out_unlock: - preempt_disable(); /* avoid rq from going away on us */ + /* Avoid rq from going away on us: */ + preempt_disable(); __task_rq_unlock(rq, &rf); balance_callback(rq); @@ -3862,7 +3852,7 @@ EXPORT_SYMBOL(set_user_nice); */ int can_nice(const struct task_struct *p, const int nice) { - /* convert nice value [19,-20] to rlimit style value [1,40] */ + /* Convert nice value [19,-20] to rlimit style value [1,40]: */ int nice_rlim = nice_to_rlimit(nice); return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || @@ -3918,7 +3908,7 @@ int task_prio(const struct task_struct *p) } /** - * idle_cpu - is a given cpu idle currently? + * idle_cpu - is a given CPU idle currently? * @cpu: the processor in question. * * Return: 1 if the CPU is currently idle. 0 otherwise. 
@@ -3942,10 +3932,10 @@ int idle_cpu(int cpu) } /** - * idle_task - return the idle task for a given cpu. + * idle_task - return the idle task for a given CPU. * @cpu: the processor in question. * - * Return: The idle task for the cpu @cpu. + * Return: The idle task for the CPU @cpu. */ struct task_struct *idle_task(int cpu) { @@ -4111,7 +4101,7 @@ __checkparam_dl(const struct sched_attr *attr) } /* - * check the target process has a UID that matches the current process's + * Check the target process has a UID that matches the current process's: */ static bool check_same_owner(struct task_struct *p) { @@ -4126,8 +4116,7 @@ static bool check_same_owner(struct task_struct *p) return match; } -static bool dl_param_changed(struct task_struct *p, - const struct sched_attr *attr) +static bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) { struct sched_dl_entity *dl_se = &p->dl; @@ -4154,10 +4143,10 @@ static int __sched_setscheduler(struct task_struct *p, int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; struct rq *rq; - /* may grab non-irq protected spin_locks */ + /* May grab non-irq protected spin_locks: */ BUG_ON(in_interrupt()); recheck: - /* double check policy once rq lock held */ + /* Double check policy once rq lock held: */ if (policy < 0) { reset_on_fork = p->sched_reset_on_fork; policy = oldpolicy = p->policy; @@ -4197,11 +4186,11 @@ recheck: unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); - /* can't set/change the rt policy */ + /* Can't set/change the rt policy: */ if (policy != p->policy && !rlim_rtprio) return -EPERM; - /* can't increase priority */ + /* Can't increase priority: */ if (attr->sched_priority > p->rt_priority && attr->sched_priority > rlim_rtprio) return -EPERM; @@ -4225,11 +4214,11 @@ recheck: return -EPERM; } - /* can't change other user's priorities */ + /* Can't change other user's priorities: */ if (!check_same_owner(p)) return -EPERM; - /* Normal users shall not reset the sched_reset_on_fork flag */ + /* Normal users shall not reset the sched_reset_on_fork flag: */ if (p->sched_reset_on_fork && !reset_on_fork) return -EPERM; } @@ -4241,7 +4230,7 @@ recheck: } /* - * make sure no PI-waiters arrive (or leave) while we are + * Make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: * * To be able to change p->policy safely, the appropriate @@ -4251,7 +4240,7 @@ recheck: update_rq_clock(rq); /* - * Changing the policy of the stop threads its a very bad idea + * Changing the policy of the stop threads its a very bad idea: */ if (p == rq->stop) { task_rq_unlock(rq, p, &rf); @@ -4307,7 +4296,7 @@ change: #endif } - /* recheck policy now with rq lock held */ + /* Re-check policy now with rq lock held: */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; task_rq_unlock(rq, p, &rf); @@ -4364,15 +4353,15 @@ change: set_curr_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); - preempt_disable(); /* avoid rq from going away on us */ + + /* Avoid rq from going away on us: */ + preempt_disable(); task_rq_unlock(rq, p, &rf); if (pi) rt_mutex_adjust_pi(p); - /* - * Run balance callbacks after we've adjusted the PI chain. - */ + /* Run balance callbacks after we've adjusted the PI chain: */ balance_callback(rq); preempt_enable(); @@ -4465,8 +4454,7 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) /* * Mimics kernel/events/core.c perf_copy_attr(). 
*/ -static int sched_copy_attr(struct sched_attr __user *uattr, - struct sched_attr *attr) +static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) { u32 size; int ret; @@ -4474,19 +4462,19 @@ static int sched_copy_attr(struct sched_attr __user *uattr, if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) return -EFAULT; - /* - * zero the full structure, so that a short copy will be nice. - */ + /* Zero the full structure, so that a short copy will be nice: */ memset(attr, 0, sizeof(*attr)); ret = get_user(size, &uattr->size); if (ret) return ret; - if (size > PAGE_SIZE) /* silly large */ + /* Bail out on silly large: */ + if (size > PAGE_SIZE) goto err_size; - if (!size) /* abi compat */ + /* ABI compatibility quirk: */ + if (!size) size = SCHED_ATTR_SIZE_VER0; if (size < SCHED_ATTR_SIZE_VER0) @@ -4521,7 +4509,7 @@ static int sched_copy_attr(struct sched_attr __user *uattr, return -EFAULT; /* - * XXX: do we want to be lenient like existing syscalls; or do we want + * XXX: Do we want to be lenient like existing syscalls; or do we want * to be strict and return an error on out-of-bounds values? */ attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); @@ -4541,10 +4529,8 @@ err_size: * * Return: 0 on success. An error code otherwise. */ -SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, - struct sched_param __user *, param) +SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) { - /* negative values for policy are not valid */ if (policy < 0) return -EINVAL; @@ -4854,10 +4840,10 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, } /** - * sys_sched_setaffinity - set the cpu affinity of a process + * sys_sched_setaffinity - set the CPU affinity of a process * @pid: pid of the process * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to the new cpu mask + * @user_mask_ptr: user-space pointer to the new CPU mask * * Return: 0 on success. An error code otherwise. */ @@ -4905,10 +4891,10 @@ out_unlock: } /** - * sys_sched_getaffinity - get the cpu affinity of a process + * sys_sched_getaffinity - get the CPU affinity of a process * @pid: pid of the process * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to hold the current cpu mask + * @user_mask_ptr: user-space pointer to hold the current CPU mask * * Return: size of CPU mask copied to user_mask_ptr on success. An * error code otherwise. @@ -5036,7 +5022,7 @@ EXPORT_SYMBOL(__cond_resched_softirq); * Typical broken usage is: * * while (!event) - * yield(); + * yield(); * * where one assumes that yield() will let 'the other' process run that will * make event true. If the current task is a SCHED_FIFO task that will never @@ -5351,7 +5337,7 @@ void init_idle_bootup_task(struct task_struct *idle) /** * init_idle - set up an idle thread for a given CPU * @idle: task in question - * @cpu: cpu the idle task belongs to + * @cpu: CPU the idle task belongs to * * NOTE: this function does not set the idle thread's NEED_RESCHED * flag, to make booting more robust. @@ -5382,7 +5368,7 @@ void init_idle(struct task_struct *idle, int cpu) #endif /* * We're having a chicken and egg problem, even though we are - * holding rq->lock, the cpu isn't yet set to this cpu so the + * holding rq->lock, the CPU isn't yet set to this CPU so the * lockdep check in task_group() will fail. * * Similar case to sched_fork(). 
/ Alternatively we could @@ -5447,7 +5433,7 @@ int task_can_attach(struct task_struct *p, /* * Kthreads which disallow setaffinity shouldn't be moved - * to a new cpuset; we don't want to change their cpu + * to a new cpuset; we don't want to change their CPU * affinity and isolating such threads by their set of * allowed nodes is unnecessary. Thus, cpusets are not * applicable for such threads. This prevents checking for @@ -5548,7 +5534,7 @@ void sched_setnuma(struct task_struct *p, int nid) #ifdef CONFIG_HOTPLUG_CPU /* - * Ensures that the idle task is using init_mm right before its cpu goes + * Ensure that the idle task is using init_mm right before its CPU goes * offline. */ void idle_task_exit(void) @@ -5632,13 +5618,13 @@ static void migrate_tasks(struct rq *dead_rq) for (;;) { /* * There's this thread running, bail when that's the only - * remaining thread. + * remaining thread: */ if (rq->nr_running == 1) break; /* - * pick_next_task assumes pinned rq->lock. + * pick_next_task() assumes pinned rq->lock: */ rq_pin_lock(rq, &rf); next = pick_next_task(rq, &fake_task, &rf); @@ -5730,7 +5716,8 @@ static void set_cpu_rq_start_time(unsigned int cpu) rq->age_stamp = sched_clock_cpu(cpu); } -static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ +/* Protected by sched_domains_mutex: */ +static cpumask_var_t sched_domains_tmpmask; #ifdef CONFIG_SCHED_DEBUG @@ -5997,7 +5984,7 @@ out: } /* - * By default the system creates a single root-domain with all cpus as + * By default the system creates a single root-domain with all CPUs as * members (mimicking the global state we have today). */ struct root_domain def_root_domain; @@ -6083,9 +6070,9 @@ static void destroy_sched_domains(struct sched_domain *sd) * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this * allows us to avoid some pointer chasing select_idle_sibling(). * - * Also keep a unique ID per domain (we use the first cpu number in + * Also keep a unique ID per domain (we use the first CPU number in * the cpumask of the domain), this allows us to quickly tell if - * two cpus are in the same cache domain, see cpus_share_cache(). + * two CPUs are in the same cache domain, see cpus_share_cache(). */ DEFINE_PER_CPU(struct sched_domain *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); @@ -6170,7 +6157,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) update_top_cache_domain(cpu); } -/* Setup the mask of cpus configured for isolated domains */ +/* Setup the mask of CPUs configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { int ret; @@ -6207,8 +6194,7 @@ enum s_alloc { * * In that case build_sched_domains() will have terminated the iteration early * and our sibling sd spans will be empty. Domains should always include the - * cpu they're built on, so check that. - * + * CPU they're built on, so check that. */ static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) { @@ -6227,7 +6213,7 @@ static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) } /* - * Return the canonical balance cpu for this group, this is the first cpu + * Return the canonical balance CPU for this group, this is the first CPU * of this group that's also in the iteration mask. */ int group_balance_cpu(struct sched_group *sg) @@ -6287,7 +6273,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) /* * Make sure the first group of this domain contains the - * canonical balance cpu. 
Otherwise the sched_domain iteration + * canonical balance CPU. Otherwise the sched_domain iteration * breaks. See update_sg_lb_stats(). */ if ((!groups && cpumask_test_cpu(cpu, sg_span)) || @@ -6322,7 +6308,9 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) if (sg) { *sg = *per_cpu_ptr(sdd->sg, cpu); (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); - atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */ + + /* For claim_allocations: */ + atomic_set(&(*sg)->sgc->ref, 1); } return cpu; @@ -6456,10 +6444,10 @@ static void set_domain_attribute(struct sched_domain *sd, } else request = attr->relax_domain_level; if (request < sd->level) { - /* turn off idle balance on this domain */ + /* Turn off idle balance on this domain: */ sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); } else { - /* turn on idle balance on this domain */ + /* Turn on idle balance on this domain: */ sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); } } @@ -6473,18 +6461,21 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, switch (what) { case sa_rootdomain: if (!atomic_read(&d->rd->refcount)) - free_rootdomain(&d->rd->rcu); /* fall through */ + free_rootdomain(&d->rd->rcu); + /* Fall through */ case sa_sd: - free_percpu(d->sd); /* fall through */ + free_percpu(d->sd); + /* Fall through */ case sa_sd_storage: - __sdt_free(cpu_map); /* fall through */ + __sdt_free(cpu_map); + /* Fall through */ case sa_none: break; } } -static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, - const struct cpumask *cpu_map) +static enum s_alloc +__visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) { memset(d, 0, sizeof(*d)); @@ -6883,7 +6874,7 @@ static void sched_init_numa(void) /* * Now for each level, construct a mask per node which contains all - * cpus of nodes that are that many hops away from us. + * CPUs of nodes that are that many hops away from us. */ for (i = 0; i < level; i++) { sched_domains_numa_masks[i] = @@ -7103,11 +7094,11 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, } /* - * Build sched domains for a given set of cpus and attach the sched domains - * to the individual cpus + * Build sched domains for a given set of CPUs and attach the sched domains + * to the individual CPUs */ -static int build_sched_domains(const struct cpumask *cpu_map, - struct sched_domain_attr *attr) +static int +build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) { enum s_alloc alloc_state; struct sched_domain *sd; @@ -7119,7 +7110,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, if (alloc_state != sa_rootdomain) goto error; - /* Set up domains for cpus specified by the cpu_map. 
*/ + /* Set up domains for CPUs specified by the cpu_map: */ for_each_cpu(i, cpu_map) { struct sched_domain_topology_level *tl; @@ -7185,21 +7176,25 @@ error: return ret; } -static cpumask_var_t *doms_cur; /* current sched domains */ -static int ndoms_cur; /* number of sched domains in 'doms_cur' */ -static struct sched_domain_attr *dattr_cur; - /* attribues of custom domains in 'doms_cur' */ +/* Current sched domains: */ +static cpumask_var_t *doms_cur; + +/* Number of sched domains in 'doms_cur': */ +static int ndoms_cur; + +/* Attribues of custom domains in 'doms_cur' */ +static struct sched_domain_attr *dattr_cur; /* - * Special case: If a kmalloc of a doms_cur partition (array of + * Special case: If a kmalloc() of a doms_cur partition (array of * cpumask) fails, then fallback to a single sched domain, * as determined by the single cpumask fallback_doms. */ -static cpumask_var_t fallback_doms; +static cpumask_var_t fallback_doms; /* * arch_update_cpu_topology lets virtualized architectures update the - * cpu core maps. It is supposed to return 1 if the topology changed + * CPU core maps. It is supposed to return 1 if the topology changed * or 0 if it stayed the same. */ int __weak arch_update_cpu_topology(void) @@ -7234,7 +7229,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. - * For now this just excludes isolated cpus, but could be used to + * For now this just excludes isolated CPUs, but could be used to * exclude other special cases in the future. */ static int init_sched_domains(const struct cpumask *cpu_map) @@ -7254,8 +7249,8 @@ static int init_sched_domains(const struct cpumask *cpu_map) } /* - * Detach sched domains from a group of cpus specified in cpu_map - * These cpus will now be attached to the NULL domain + * Detach sched domains from a group of CPUs specified in cpu_map + * These CPUs will now be attached to the NULL domain */ static void detach_destroy_domains(const struct cpumask *cpu_map) { @@ -7273,7 +7268,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, { struct sched_domain_attr tmp; - /* fast path */ + /* Fast path: */ if (!new && !cur) return 1; @@ -7317,22 +7312,22 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], mutex_lock(&sched_domains_mutex); - /* always unregister in case we don't destroy any domains */ + /* Always unregister in case we don't destroy any domains: */ unregister_sched_domain_sysctl(); - /* Let architecture update cpu core mappings. */ + /* Let the architecture update CPU core mappings: */ new_topology = arch_update_cpu_topology(); n = doms_new ? ndoms_new : 0; - /* Destroy deleted domains */ + /* Destroy deleted domains: */ for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n && !new_topology; j++) { if (cpumask_equal(doms_cur[i], doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; } - /* no match - a current sched domain not in new doms_new[] */ + /* No match - a current sched domain not in new doms_new[] */ detach_destroy_domains(doms_cur[i]); match1: ; @@ -7346,23 +7341,24 @@ match1: WARN_ON_ONCE(dattr_new); } - /* Build new domains */ + /* Build new domains: */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < n && !new_topology; j++) { if (cpumask_equal(doms_new[i], doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; } - /* no match - add a new doms_new */ + /* No match - add a new doms_new */ build_sched_domains(doms_new[i], dattr_new ? 
dattr_new + i : NULL); match2: ; } - /* Remember the new sched domains */ + /* Remember the new sched domains: */ if (doms_cur != &fallback_doms) free_sched_domains(doms_cur, ndoms_cur); - kfree(dattr_cur); /* kfree(NULL) is safe */ + + kfree(dattr_cur); doms_cur = doms_new; dattr_cur = dattr_new; ndoms_cur = ndoms_new; @@ -7372,7 +7368,10 @@ match2: mutex_unlock(&sched_domains_mutex); } -static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ +/* + * used to mark begin/end of suspend/resume: + */ +static int num_cpus_frozen; /* * Update cpusets according to cpu_active mask. If cpusets are @@ -7449,7 +7448,7 @@ int sched_cpu_activate(unsigned int cpu) * Put the rq online, if not already. This happens: * * 1) In the early boot process, because we build the real domains - * after all cpus have been brought up. + * after all CPUs have been brought up. * * 2) At runtime, if cpuset_cpu_active() fails to rebuild the * domains. @@ -7564,7 +7563,7 @@ void __init sched_init_smp(void) /* * There's no userspace yet to cause hotplug operations; hence all the - * cpu masks are stable and all blatant races in the below code cannot + * CPU masks are stable and all blatant races in the below code cannot * happen. */ mutex_lock(&sched_domains_mutex); @@ -7684,10 +7683,8 @@ void __init sched_init(void) } #endif /* CONFIG_CPUMASK_OFFSTACK */ - init_rt_bandwidth(&def_rt_bandwidth, - global_rt_period(), global_rt_runtime()); - init_dl_bandwidth(&def_dl_bandwidth, - global_rt_period(), global_rt_runtime()); + init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); + init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime()); #ifdef CONFIG_SMP init_defrootdomain(); @@ -7723,18 +7720,18 @@ void __init sched_init(void) INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; /* - * How much cpu bandwidth does root_task_group get? + * How much CPU bandwidth does root_task_group get? * * In case of task-groups formed thr' the cgroup filesystem, it - * gets 100% of the cpu resources in the system. This overall - * system cpu resource is divided among the tasks of + * gets 100% of the CPU resources in the system. This overall + * system CPU resource is divided among the tasks of * root_task_group and its child task-groups in a fair manner, * based on each entity's (task or task-group's) weight * (se->load.weight). * * In other words, if root_task_group has 10 tasks of weight * 1024) and two child groups A0 and A1 (of weight 1024 each), - * then A0's share of the cpu resource is: + * then A0's share of the CPU resource is: * * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% * @@ -7843,10 +7840,14 @@ EXPORT_SYMBOL(__might_sleep); void ___might_sleep(const char *file, int line, int preempt_offset) { - static unsigned long prev_jiffy; /* ratelimiting */ + /* Ratelimiting timestamp: */ + static unsigned long prev_jiffy; + unsigned long preempt_disable_ip; - rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. 
*/ + /* WARN_ON_ONCE() by default, no rate limit required: */ + rcu_sleep_check(); + if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && !is_idle_task(current)) || system_state != SYSTEM_RUNNING || oops_in_progress) @@ -7855,7 +7856,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset) return; prev_jiffy = jiffies; - /* Save this before calling printk(), since that will clobber it */ + /* Save this before calling printk(), since that will clobber it: */ preempt_disable_ip = get_preempt_disable_ip(current); printk(KERN_ERR @@ -7934,7 +7935,7 @@ void normalize_rt_tasks(void) */ /** - * curr_task - return the current task for a given cpu. + * curr_task - return the current task for a given CPU. * @cpu: the processor in question. * * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! @@ -7950,13 +7951,13 @@ struct task_struct *curr_task(int cpu) #ifdef CONFIG_IA64 /** - * set_curr_task - set the current task for a given cpu. + * set_curr_task - set the current task for a given CPU. * @cpu: the processor in question. * @p: the task pointer to set. * * Description: This function must only be used when non-maskable interrupts * are serviced on a separate stack. It allows the architecture to switch the - * notion of the current task on a cpu in a non-blocking manner. This function + * notion of the current task on a CPU in a non-blocking manner. This function * must be called with all CPU's synchronized, and interrupts disabled, the * and caller must save the original value of the current task (see * curr_task() above) and restore that value before reenabling interrupts and @@ -8012,7 +8013,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) spin_lock_irqsave(&task_group_lock, flags); list_add_rcu(&tg->list, &task_groups); - WARN_ON(!parent); /* root should already exist */ + /* Root should already exist: */ + WARN_ON(!parent); tg->parent = parent; INIT_LIST_HEAD(&tg->children); @@ -8025,13 +8027,13 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) /* rcu callback to free various structures associated with a task group */ static void sched_free_group_rcu(struct rcu_head *rhp) { - /* now it should be safe to free those cfs_rqs */ + /* Now it should be safe to free those cfs_rqs: */ sched_free_group(container_of(rhp, struct task_group, rcu)); } void sched_destroy_group(struct task_group *tg) { - /* wait for possible concurrent references to cfs_rqs complete */ + /* Wait for possible concurrent references to cfs_rqs complete: */ call_rcu(&tg->rcu, sched_free_group_rcu); } @@ -8039,7 +8041,7 @@ void sched_offline_group(struct task_group *tg) { unsigned long flags; - /* end participation in shares distribution */ + /* End participation in shares distribution: */ unregister_fair_sched_group(tg); spin_lock_irqsave(&task_group_lock, flags); @@ -8468,8 +8470,10 @@ int sched_rr_handler(struct ctl_table *table, int write, mutex_lock(&mutex); ret = proc_dointvec(table, write, buffer, lenp, ppos); - /* make sure that internally we keep jiffies */ - /* also, writing zero resets timeslice to default */ + /* + * Make sure that internally we keep jiffies. + * Also, writing zero resets the timeslice to default: + */ if (!ret && write) { sched_rr_timeslice = sysctl_sched_rr_timeslice <= 0 ? 
RR_TIMESLICE : @@ -8654,9 +8658,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) cfs_b->quota = quota; __refill_cfs_bandwidth_runtime(cfs_b); - /* restart the period timer (if active) to handle new period expiry */ + + /* Restart the period timer (if active) to handle new period expiry: */ if (runtime_enabled) start_cfs_bandwidth(cfs_b); + raw_spin_unlock_irq(&cfs_b->lock); for_each_online_cpu(i) { @@ -8794,8 +8800,8 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) parent_quota = parent_b->hierarchical_quota; /* - * ensure max(child_quota) <= parent_quota, inherit when no - * limit is set + * Ensure max(child_quota) <= parent_quota, inherit when no + * limit is set: */ if (quota == RUNTIME_INF) quota = parent_quota; @@ -8904,7 +8910,7 @@ static struct cftype cpu_files[] = { .write_u64 = cpu_rt_period_write_uint, }, #endif - { } /* terminate */ + { } /* Terminate */ }; struct cgroup_subsys cpu_cgrp_subsys = {
-- cgit v1.2.3
From 535b9552bb81eebe112ee7bd34ee498857b0c26b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 1 Feb 2017 12:29:21 +0100 Subject: sched/rq_clock: Consolidate the ordering of the rq_clock methods
update_rq_clock_task() and update_rq_clock() were unnecessarily spread across core.c, requiring an extra prototype line. Move them next to each other and in the proper order.
Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 153 ++++++++++++++++++++++++++-------------------------- 1 file changed, 78 insertions(+), 75 deletions(-) (limited to 'kernel')
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 87cf7ba82bd5..a400190792b9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -73,27 +73,6 @@ DEFINE_MUTEX(sched_domains_mutex); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -static void update_rq_clock_task(struct rq *rq, s64 delta); - -void update_rq_clock(struct rq *rq) -{ - s64 delta; - - lockdep_assert_held(&rq->lock); - - if (rq->clock_update_flags & RQCF_ACT_SKIP) - return; - -#ifdef CONFIG_SCHED_DEBUG - rq->clock_update_flags |= RQCF_UPDATED; -#endif - delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; - if (delta < 0) - return; - rq->clock += delta; - update_rq_clock_task(rq, delta); -} - /* * Debugging: various feature bits */ @@ -218,6 +197,84 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) } } +/* + * RQ-clock updating methods: + */ + +static void update_rq_clock_task(struct rq *rq, s64 delta) +{ +/* + * In theory, the compile should just see 0 here, and optimize out the call + * to sched_rt_avg_update. But I don't trust it... + */ +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + s64 steal = 0, irq_delta = 0; +#endif +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; + + /* + * Since irq_time is only updated on {soft,}irq_exit, we might run into + * this case when a previous update_rq_clock() happened inside a + * {soft,}irq region. + * + * When this happens, we stop ->clock_task and only update the + * prev_irq_time stamp to account for the part that fit, so that a next + * update will consume the rest. This ensures ->clock_task is + * monotonic. + * + * It does however cause some slight miss-attribution of {soft,}irq + * time, a more accurate solution would be to update the irq_time using + * the current rq->clock timestamp, except that would require using + * atomic ops.
+ */ + if (irq_delta > delta) + irq_delta = delta; + + rq->prev_irq_time += irq_delta; + delta -= irq_delta; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + if (static_key_false((&paravirt_steal_rq_enabled))) { + steal = paravirt_steal_clock(cpu_of(rq)); + steal -= rq->prev_steal_time_rq; + + if (unlikely(steal > delta)) + steal = delta; + + rq->prev_steal_time_rq += steal; + delta -= steal; + } +#endif + + rq->clock_task += delta; + +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) + sched_rt_avg_update(rq, irq_delta + steal); +#endif +} + +void update_rq_clock(struct rq *rq) +{ + s64 delta; + + lockdep_assert_held(&rq->lock); + + if (rq->clock_update_flags & RQCF_ACT_SKIP) + return; + +#ifdef CONFIG_SCHED_DEBUG + rq->clock_update_flags |= RQCF_UPDATED; +#endif + delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; + if (delta < 0) + return; + rq->clock += delta; + update_rq_clock_task(rq, delta); +} + + #ifdef CONFIG_SCHED_HRTICK /* * Use HR-timers to deliver accurate preemption points. @@ -767,60 +824,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dequeue_task(rq, p, flags); } -static void update_rq_clock_task(struct rq *rq, s64 delta) -{ -/* - * In theory, the compile should just see 0 here, and optimize out the call - * to sched_rt_avg_update. But I don't trust it... - */ -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) - s64 steal = 0, irq_delta = 0; -#endif -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; - - /* - * Since irq_time is only updated on {soft,}irq_exit, we might run into - * this case when a previous update_rq_clock() happened inside a - * {soft,}irq region. - * - * When this happens, we stop ->clock_task and only update the - * prev_irq_time stamp to account for the part that fit, so that a next - * update will consume the rest. This ensures ->clock_task is - * monotonic. - * - * It does however cause some slight miss-attribution of {soft,}irq - * time, a more accurate solution would be to update the irq_time using - * the current rq->clock timestamp, except that would require using - * atomic ops. - */ - if (irq_delta > delta) - irq_delta = delta; - - rq->prev_irq_time += irq_delta; - delta -= irq_delta; -#endif -#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - if (static_key_false((&paravirt_steal_rq_enabled))) { - steal = paravirt_steal_clock(cpu_of(rq)); - steal -= rq->prev_steal_time_rq; - - if (unlikely(steal > delta)) - steal = delta; - - rq->prev_steal_time_rq += steal; - delta -= steal; - } -#endif - - rq->clock_task += delta; - -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) - if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) - sched_rt_avg_update(rq, irq_delta + steal); -#endif -} - void sched_set_stop_task(int cpu, struct task_struct *stop) { struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
-- cgit v1.2.3
From 004172bdad644327dc7a6543186b9d7b529ee944 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 1 Feb 2017 12:01:04 +0100 Subject: sched/core: Remove unnecessary #include headers
Over the years sched/core.c accumulated over 50 #include lines, 40 of which are superfluous. (!)
Removing them decreases the preprocessed .c file (.i) size noticeably: triton:~/tip> wc -l kernel/sched/core.i Before: 76387 kernel/sched/core.i After: 75896 kernel/sched/core.i Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 59 +++++++++-------------------------------------------- 1 file changed, 10 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a400190792b9..1cea6c61fb01 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5,63 +5,24 @@ * * Copyright (C) 1991-2002 Linus Torvalds */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include #include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include #include #include -#include -#include + +#include +#include +#include +#include +#include #include -#include +#include +#include +#include #include #include -#include -#ifdef CONFIG_PARAVIRT -#include -#endif #include "sched.h" #include "../workqueue_internal.h" -- cgit v1.2.3 From f2cb13609d5397cdd747f3ed6fb651233851717d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 1 Feb 2017 13:10:18 +0100 Subject: sched/topology: Split out scheduler topology code from core.c into topology.c Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/Makefile | 2 +- kernel/sched/core.c | 1659 +---------------------------------------------- kernel/sched/sched.h | 23 +- kernel/sched/topology.c | 1658 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 1684 insertions(+), 1658 deletions(-) create mode 100644 kernel/sched/topology.c (limited to 'kernel') diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 5e59b832ae2b..130ce8ac725b 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -18,7 +18,7 @@ endif obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o swait.o completion.o idle.o -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1cea6c61fb01..e4aa470ed454 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -31,7 +31,6 @@ #define CREATE_TRACE_POINTS #include -DEFINE_MUTEX(sched_domains_mutex); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); /* @@ -5446,7 +5445,7 @@ out: #ifdef CONFIG_SMP -static bool sched_smp_initialized __read_mostly; +bool sched_smp_initialized __read_mostly; #ifdef CONFIG_NUMA_BALANCING /* Migrate current task p to target_cpu */ @@ -5643,7 +5642,7 @@ static void migrate_tasks(struct rq *dead_rq) } #endif /* CONFIG_HOTPLUG_CPU */ -static void set_rq_online(struct rq *rq) +void set_rq_online(struct rq *rq) { if (!rq->online) { const struct sched_class *class; @@ -5658,7 +5657,7 @@ static void set_rq_online(struct rq *rq) } } -static void set_rq_offline(struct rq *rq) +void set_rq_offline(struct rq *rq) { if (rq->online) { const struct sched_class *class; @@ -5680,1658 +5679,6 
@@ static void set_cpu_rq_start_time(unsigned int cpu) rq->age_stamp = sched_clock_cpu(cpu); } -/* Protected by sched_domains_mutex: */ -static cpumask_var_t sched_domains_tmpmask; - -#ifdef CONFIG_SCHED_DEBUG - -static __read_mostly int sched_debug_enabled; - -static int __init sched_debug_setup(char *str) -{ - sched_debug_enabled = 1; - - return 0; -} -early_param("sched_debug", sched_debug_setup); - -static inline bool sched_debug(void) -{ - return sched_debug_enabled; -} - -static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, - struct cpumask *groupmask) -{ - struct sched_group *group = sd->groups; - - cpumask_clear(groupmask); - - printk(KERN_DEBUG "%*s domain %d: ", level, "", level); - - if (!(sd->flags & SD_LOAD_BALANCE)) { - printk("does not load-balance\n"); - if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" - " has parent"); - return -1; - } - - printk(KERN_CONT "span %*pbl level %s\n", - cpumask_pr_args(sched_domain_span(sd)), sd->name); - - if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { - printk(KERN_ERR "ERROR: domain->span does not contain " - "CPU%d\n", cpu); - } - if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { - printk(KERN_ERR "ERROR: domain->groups does not contain" - " CPU%d\n", cpu); - } - - printk(KERN_DEBUG "%*s groups:", level + 1, ""); - do { - if (!group) { - printk("\n"); - printk(KERN_ERR "ERROR: group is NULL\n"); - break; - } - - if (!cpumask_weight(sched_group_cpus(group))) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: empty group\n"); - break; - } - - if (!(sd->flags & SD_OVERLAP) && - cpumask_intersects(groupmask, sched_group_cpus(group))) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: repeated CPUs\n"); - break; - } - - cpumask_or(groupmask, groupmask, sched_group_cpus(group)); - - printk(KERN_CONT " %*pbl", - cpumask_pr_args(sched_group_cpus(group))); - if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { - printk(KERN_CONT " (cpu_capacity = %lu)", - group->sgc->capacity); - } - - group = group->next; - } while (group != sd->groups); - printk(KERN_CONT "\n"); - - if (!cpumask_equal(sched_domain_span(sd), groupmask)) - printk(KERN_ERR "ERROR: groups don't span domain->span\n"); - - if (sd->parent && - !cpumask_subset(groupmask, sched_domain_span(sd->parent))) - printk(KERN_ERR "ERROR: parent span is not a superset " - "of domain->span\n"); - return 0; -} - -static void sched_domain_debug(struct sched_domain *sd, int cpu) -{ - int level = 0; - - if (!sched_debug_enabled) - return; - - if (!sd) { - printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); - return; - } - - printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); - - for (;;) { - if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) - break; - level++; - sd = sd->parent; - if (!sd) - break; - } -} -#else /* !CONFIG_SCHED_DEBUG */ - -# define sched_debug_enabled 0 -# define sched_domain_debug(sd, cpu) do { } while (0) -static inline bool sched_debug(void) -{ - return false; -} -#endif /* CONFIG_SCHED_DEBUG */ - -static int sd_degenerate(struct sched_domain *sd) -{ - if (cpumask_weight(sched_domain_span(sd)) == 1) - return 1; - - /* Following flags need at least 2 groups */ - if (sd->flags & (SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_SHARE_CPUCAPACITY | - SD_ASYM_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES | - SD_SHARE_POWERDOMAIN)) { - if (sd->groups != sd->groups->next) - return 0; - } - - /* Following flags don't use groups */ - if (sd->flags & (SD_WAKE_AFFINE)) 
- return 0; - - return 1; -} - -static int -sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) -{ - unsigned long cflags = sd->flags, pflags = parent->flags; - - if (sd_degenerate(parent)) - return 1; - - if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) - return 0; - - /* Flags needing groups don't count if only 1 group in parent */ - if (parent->groups == parent->groups->next) { - pflags &= ~(SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_ASYM_CPUCAPACITY | - SD_SHARE_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES | - SD_PREFER_SIBLING | - SD_SHARE_POWERDOMAIN); - if (nr_node_ids == 1) - pflags &= ~SD_SERIALIZE; - } - if (~cflags & pflags) - return 0; - - return 1; -} - -static void free_rootdomain(struct rcu_head *rcu) -{ - struct root_domain *rd = container_of(rcu, struct root_domain, rcu); - - cpupri_cleanup(&rd->cpupri); - cpudl_cleanup(&rd->cpudl); - free_cpumask_var(rd->dlo_mask); - free_cpumask_var(rd->rto_mask); - free_cpumask_var(rd->online); - free_cpumask_var(rd->span); - kfree(rd); -} - -static void rq_attach_root(struct rq *rq, struct root_domain *rd) -{ - struct root_domain *old_rd = NULL; - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - - if (rq->rd) { - old_rd = rq->rd; - - if (cpumask_test_cpu(rq->cpu, old_rd->online)) - set_rq_offline(rq); - - cpumask_clear_cpu(rq->cpu, old_rd->span); - - /* - * If we dont want to free the old_rd yet then - * set old_rd to NULL to skip the freeing later - * in this function: - */ - if (!atomic_dec_and_test(&old_rd->refcount)) - old_rd = NULL; - } - - atomic_inc(&rd->refcount); - rq->rd = rd; - - cpumask_set_cpu(rq->cpu, rd->span); - if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) - set_rq_online(rq); - - raw_spin_unlock_irqrestore(&rq->lock, flags); - - if (old_rd) - call_rcu_sched(&old_rd->rcu, free_rootdomain); -} - -static int init_rootdomain(struct root_domain *rd) -{ - memset(rd, 0, sizeof(*rd)); - - if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) - goto out; - if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) - goto free_span; - if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) - goto free_online; - if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) - goto free_dlo_mask; - - init_dl_bw(&rd->dl_bw); - if (cpudl_init(&rd->cpudl) != 0) - goto free_rto_mask; - - if (cpupri_init(&rd->cpupri) != 0) - goto free_cpudl; - return 0; - -free_cpudl: - cpudl_cleanup(&rd->cpudl); -free_rto_mask: - free_cpumask_var(rd->rto_mask); -free_dlo_mask: - free_cpumask_var(rd->dlo_mask); -free_online: - free_cpumask_var(rd->online); -free_span: - free_cpumask_var(rd->span); -out: - return -ENOMEM; -} - -/* - * By default the system creates a single root-domain with all CPUs as - * members (mimicking the global state we have today). 
- */ -struct root_domain def_root_domain; - -static void init_defrootdomain(void) -{ - init_rootdomain(&def_root_domain); - - atomic_set(&def_root_domain.refcount, 1); -} - -static struct root_domain *alloc_rootdomain(void) -{ - struct root_domain *rd; - - rd = kmalloc(sizeof(*rd), GFP_KERNEL); - if (!rd) - return NULL; - - if (init_rootdomain(rd) != 0) { - kfree(rd); - return NULL; - } - - return rd; -} - -static void free_sched_groups(struct sched_group *sg, int free_sgc) -{ - struct sched_group *tmp, *first; - - if (!sg) - return; - - first = sg; - do { - tmp = sg->next; - - if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) - kfree(sg->sgc); - - kfree(sg); - sg = tmp; - } while (sg != first); -} - -static void destroy_sched_domain(struct sched_domain *sd) -{ - /* - * If its an overlapping domain it has private groups, iterate and - * nuke them all. - */ - if (sd->flags & SD_OVERLAP) { - free_sched_groups(sd->groups, 1); - } else if (atomic_dec_and_test(&sd->groups->ref)) { - kfree(sd->groups->sgc); - kfree(sd->groups); - } - if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) - kfree(sd->shared); - kfree(sd); -} - -static void destroy_sched_domains_rcu(struct rcu_head *rcu) -{ - struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); - - while (sd) { - struct sched_domain *parent = sd->parent; - destroy_sched_domain(sd); - sd = parent; - } -} - -static void destroy_sched_domains(struct sched_domain *sd) -{ - if (sd) - call_rcu(&sd->rcu, destroy_sched_domains_rcu); -} - -/* - * Keep a special pointer to the highest sched_domain that has - * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this - * allows us to avoid some pointer chasing select_idle_sibling(). - * - * Also keep a unique ID per domain (we use the first CPU number in - * the cpumask of the domain), this allows us to quickly tell if - * two CPUs are in the same cache domain, see cpus_share_cache(). - */ -DEFINE_PER_CPU(struct sched_domain *, sd_llc); -DEFINE_PER_CPU(int, sd_llc_size); -DEFINE_PER_CPU(int, sd_llc_id); -DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); -DEFINE_PER_CPU(struct sched_domain *, sd_numa); -DEFINE_PER_CPU(struct sched_domain *, sd_asym); - -static void update_top_cache_domain(int cpu) -{ - struct sched_domain_shared *sds = NULL; - struct sched_domain *sd; - int id = cpu; - int size = 1; - - sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); - if (sd) { - id = cpumask_first(sched_domain_span(sd)); - size = cpumask_weight(sched_domain_span(sd)); - sds = sd->shared; - } - - rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); - per_cpu(sd_llc_size, cpu) = size; - per_cpu(sd_llc_id, cpu) = id; - rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); - - sd = lowest_flag_domain(cpu, SD_NUMA); - rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); - - sd = highest_flag_domain(cpu, SD_ASYM_PACKING); - rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); -} - -/* - * Attach the domain 'sd' to 'cpu' as its base domain. Callers must - * hold the hotplug lock. - */ -static void -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) -{ - struct rq *rq = cpu_rq(cpu); - struct sched_domain *tmp; - - /* Remove the sched domains which do not contribute to scheduling. 
*/ - for (tmp = sd; tmp; ) { - struct sched_domain *parent = tmp->parent; - if (!parent) - break; - - if (sd_parent_degenerate(tmp, parent)) { - tmp->parent = parent->parent; - if (parent->parent) - parent->parent->child = tmp; - /* - * Transfer SD_PREFER_SIBLING down in case of a - * degenerate parent; the spans match for this - * so the property transfers. - */ - if (parent->flags & SD_PREFER_SIBLING) - tmp->flags |= SD_PREFER_SIBLING; - destroy_sched_domain(parent); - } else - tmp = tmp->parent; - } - - if (sd && sd_degenerate(sd)) { - tmp = sd; - sd = sd->parent; - destroy_sched_domain(tmp); - if (sd) - sd->child = NULL; - } - - sched_domain_debug(sd, cpu); - - rq_attach_root(rq, rd); - tmp = rq->sd; - rcu_assign_pointer(rq->sd, sd); - destroy_sched_domains(tmp); - - update_top_cache_domain(cpu); -} - -/* Setup the mask of CPUs configured for isolated domains */ -static int __init isolated_cpu_setup(char *str) -{ - int ret; - - alloc_bootmem_cpumask_var(&cpu_isolated_map); - ret = cpulist_parse(str, cpu_isolated_map); - if (ret) { - pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); - return 0; - } - return 1; -} -__setup("isolcpus=", isolated_cpu_setup); - -struct s_data { - struct sched_domain ** __percpu sd; - struct root_domain *rd; -}; - -enum s_alloc { - sa_rootdomain, - sa_sd, - sa_sd_storage, - sa_none, -}; - -/* - * Build an iteration mask that can exclude certain CPUs from the upwards - * domain traversal. - * - * Asymmetric node setups can result in situations where the domain tree is of - * unequal depth, make sure to skip domains that already cover the entire - * range. - * - * In that case build_sched_domains() will have terminated the iteration early - * and our sibling sd spans will be empty. Domains should always include the - * CPU they're built on, so check that. - */ -static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) -{ - const struct cpumask *span = sched_domain_span(sd); - struct sd_data *sdd = sd->private; - struct sched_domain *sibling; - int i; - - for_each_cpu(i, span) { - sibling = *per_cpu_ptr(sdd->sd, i); - if (!cpumask_test_cpu(i, sched_domain_span(sibling))) - continue; - - cpumask_set_cpu(i, sched_group_mask(sg)); - } -} - -/* - * Return the canonical balance CPU for this group, this is the first CPU - * of this group that's also in the iteration mask. - */ -int group_balance_cpu(struct sched_group *sg) -{ - return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); -} - -static int -build_overlap_sched_groups(struct sched_domain *sd, int cpu) -{ - struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; - const struct cpumask *span = sched_domain_span(sd); - struct cpumask *covered = sched_domains_tmpmask; - struct sd_data *sdd = sd->private; - struct sched_domain *sibling; - int i; - - cpumask_clear(covered); - - for_each_cpu(i, span) { - struct cpumask *sg_span; - - if (cpumask_test_cpu(i, covered)) - continue; - - sibling = *per_cpu_ptr(sdd->sd, i); - - /* See the comment near build_group_mask(). 
*/ - if (!cpumask_test_cpu(i, sched_domain_span(sibling))) - continue; - - sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, cpu_to_node(cpu)); - - if (!sg) - goto fail; - - sg_span = sched_group_cpus(sg); - if (sibling->child) - cpumask_copy(sg_span, sched_domain_span(sibling->child)); - else - cpumask_set_cpu(i, sg_span); - - cpumask_or(covered, covered, sg_span); - - sg->sgc = *per_cpu_ptr(sdd->sgc, i); - if (atomic_inc_return(&sg->sgc->ref) == 1) - build_group_mask(sd, sg); - - /* - * Initialize sgc->capacity such that even if we mess up the - * domains and no possible iteration will get us here, we won't - * die on a /0 trap. - */ - sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); - sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; - - /* - * Make sure the first group of this domain contains the - * canonical balance CPU. Otherwise the sched_domain iteration - * breaks. See update_sg_lb_stats(). - */ - if ((!groups && cpumask_test_cpu(cpu, sg_span)) || - group_balance_cpu(sg) == cpu) - groups = sg; - - if (!first) - first = sg; - if (last) - last->next = sg; - last = sg; - last->next = first; - } - sd->groups = groups; - - return 0; - -fail: - free_sched_groups(first, 0); - - return -ENOMEM; -} - -static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) -{ - struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); - struct sched_domain *child = sd->child; - - if (child) - cpu = cpumask_first(sched_domain_span(child)); - - if (sg) { - *sg = *per_cpu_ptr(sdd->sg, cpu); - (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); - - /* For claim_allocations: */ - atomic_set(&(*sg)->sgc->ref, 1); - } - - return cpu; -} - -/* - * build_sched_groups will build a circular linked list of the groups - * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_capacity to 0. - * - * Assumes the sched_domain tree is fully constructed - */ -static int -build_sched_groups(struct sched_domain *sd, int cpu) -{ - struct sched_group *first = NULL, *last = NULL; - struct sd_data *sdd = sd->private; - const struct cpumask *span = sched_domain_span(sd); - struct cpumask *covered; - int i; - - get_group(cpu, sdd, &sd->groups); - atomic_inc(&sd->groups->ref); - - if (cpu != cpumask_first(span)) - return 0; - - lockdep_assert_held(&sched_domains_mutex); - covered = sched_domains_tmpmask; - - cpumask_clear(covered); - - for_each_cpu(i, span) { - struct sched_group *sg; - int group, j; - - if (cpumask_test_cpu(i, covered)) - continue; - - group = get_group(i, sdd, &sg); - cpumask_setall(sched_group_mask(sg)); - - for_each_cpu(j, span) { - if (get_group(j, sdd, NULL) != group) - continue; - - cpumask_set_cpu(j, covered); - cpumask_set_cpu(j, sched_group_cpus(sg)); - } - - if (!first) - first = sg; - if (last) - last->next = sg; - last = sg; - } - last->next = first; - - return 0; -} - -/* - * Initialize sched groups cpu_capacity. - * - * cpu_capacity indicates the capacity of sched group, which is used while - * distributing the load between different sched groups in a sched domain. - * Typically cpu_capacity for all the groups in a sched domain will be same - * unless there are asymmetries in the topology. If there are asymmetries, - * group having more cpu_capacity will pickup more load compared to the - * group having less cpu_capacity. 
- */ -static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) -{ - struct sched_group *sg = sd->groups; - - WARN_ON(!sg); - - do { - int cpu, max_cpu = -1; - - sg->group_weight = cpumask_weight(sched_group_cpus(sg)); - - if (!(sd->flags & SD_ASYM_PACKING)) - goto next; - - for_each_cpu(cpu, sched_group_cpus(sg)) { - if (max_cpu < 0) - max_cpu = cpu; - else if (sched_asym_prefer(cpu, max_cpu)) - max_cpu = cpu; - } - sg->asym_prefer_cpu = max_cpu; - -next: - sg = sg->next; - } while (sg != sd->groups); - - if (cpu != group_balance_cpu(sg)) - return; - - update_group_capacity(sd, cpu); -} - -/* - * Initializers for schedule domains - * Non-inlined to reduce accumulated stack pressure in build_sched_domains() - */ - -static int default_relax_domain_level = -1; -int sched_domain_level_max; - -static int __init setup_relax_domain_level(char *str) -{ - if (kstrtoint(str, 0, &default_relax_domain_level)) - pr_warn("Unable to set relax_domain_level\n"); - - return 1; -} -__setup("relax_domain_level=", setup_relax_domain_level); - -static void set_domain_attribute(struct sched_domain *sd, - struct sched_domain_attr *attr) -{ - int request; - - if (!attr || attr->relax_domain_level < 0) { - if (default_relax_domain_level < 0) - return; - else - request = default_relax_domain_level; - } else - request = attr->relax_domain_level; - if (request < sd->level) { - /* Turn off idle balance on this domain: */ - sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); - } else { - /* Turn on idle balance on this domain: */ - sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); - } -} - -static void __sdt_free(const struct cpumask *cpu_map); -static int __sdt_alloc(const struct cpumask *cpu_map); - -static void __free_domain_allocs(struct s_data *d, enum s_alloc what, - const struct cpumask *cpu_map) -{ - switch (what) { - case sa_rootdomain: - if (!atomic_read(&d->rd->refcount)) - free_rootdomain(&d->rd->rcu); - /* Fall through */ - case sa_sd: - free_percpu(d->sd); - /* Fall through */ - case sa_sd_storage: - __sdt_free(cpu_map); - /* Fall through */ - case sa_none: - break; - } -} - -static enum s_alloc -__visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) -{ - memset(d, 0, sizeof(*d)); - - if (__sdt_alloc(cpu_map)) - return sa_sd_storage; - d->sd = alloc_percpu(struct sched_domain *); - if (!d->sd) - return sa_sd_storage; - d->rd = alloc_rootdomain(); - if (!d->rd) - return sa_sd; - return sa_rootdomain; -} - -/* - * NULL the sd_data elements we've used to build the sched_domain and - * sched_group structure so that the subsequent __free_domain_allocs() - * will not free the data we're using. - */ -static void claim_allocations(int cpu, struct sched_domain *sd) -{ - struct sd_data *sdd = sd->private; - - WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); - *per_cpu_ptr(sdd->sd, cpu) = NULL; - - if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) - *per_cpu_ptr(sdd->sds, cpu) = NULL; - - if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) - *per_cpu_ptr(sdd->sg, cpu) = NULL; - - if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) - *per_cpu_ptr(sdd->sgc, cpu) = NULL; -} - -#ifdef CONFIG_NUMA -static int sched_domains_numa_levels; -enum numa_topology_type sched_numa_topology_type; -static int *sched_domains_numa_distance; -int sched_max_numa_distance; -static struct cpumask ***sched_domains_numa_masks; -static int sched_domains_curr_level; -#endif - -/* - * SD_flags allowed in topology descriptions. 
- * - * These flags are purely descriptive of the topology and do not prescribe - * behaviour. Behaviour is artificial and mapped in the below sd_init() - * function: - * - * SD_SHARE_CPUCAPACITY - describes SMT topologies - * SD_SHARE_PKG_RESOURCES - describes shared caches - * SD_NUMA - describes NUMA topologies - * SD_SHARE_POWERDOMAIN - describes shared power domain - * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies - * - * Odd one out, which beside describing the topology has a quirk also - * prescribes the desired behaviour that goes along with it: - * - * SD_ASYM_PACKING - describes SMT quirks - */ -#define TOPOLOGY_SD_FLAGS \ - (SD_SHARE_CPUCAPACITY | \ - SD_SHARE_PKG_RESOURCES | \ - SD_NUMA | \ - SD_ASYM_PACKING | \ - SD_ASYM_CPUCAPACITY | \ - SD_SHARE_POWERDOMAIN) - -static struct sched_domain * -sd_init(struct sched_domain_topology_level *tl, - const struct cpumask *cpu_map, - struct sched_domain *child, int cpu) -{ - struct sd_data *sdd = &tl->data; - struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); - int sd_id, sd_weight, sd_flags = 0; - -#ifdef CONFIG_NUMA - /* - * Ugly hack to pass state to sd_numa_mask()... - */ - sched_domains_curr_level = tl->numa_level; -#endif - - sd_weight = cpumask_weight(tl->mask(cpu)); - - if (tl->sd_flags) - sd_flags = (*tl->sd_flags)(); - if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, - "wrong sd_flags in topology description\n")) - sd_flags &= ~TOPOLOGY_SD_FLAGS; - - *sd = (struct sched_domain){ - .min_interval = sd_weight, - .max_interval = 2*sd_weight, - .busy_factor = 32, - .imbalance_pct = 125, - - .cache_nice_tries = 0, - .busy_idx = 0, - .idle_idx = 0, - .newidle_idx = 0, - .wake_idx = 0, - .forkexec_idx = 0, - - .flags = 1*SD_LOAD_BALANCE - | 1*SD_BALANCE_NEWIDLE - | 1*SD_BALANCE_EXEC - | 1*SD_BALANCE_FORK - | 0*SD_BALANCE_WAKE - | 1*SD_WAKE_AFFINE - | 0*SD_SHARE_CPUCAPACITY - | 0*SD_SHARE_PKG_RESOURCES - | 0*SD_SERIALIZE - | 0*SD_PREFER_SIBLING - | 0*SD_NUMA - | sd_flags - , - - .last_balance = jiffies, - .balance_interval = sd_weight, - .smt_gain = 0, - .max_newidle_lb_cost = 0, - .next_decay_max_lb_cost = jiffies, - .child = child, -#ifdef CONFIG_SCHED_DEBUG - .name = tl->name, -#endif - }; - - cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); - sd_id = cpumask_first(sched_domain_span(sd)); - - /* - * Convert topological properties into behaviour. - */ - - if (sd->flags & SD_ASYM_CPUCAPACITY) { - struct sched_domain *t = sd; - - for_each_lower_domain(t) - t->flags |= SD_BALANCE_WAKE; - } - - if (sd->flags & SD_SHARE_CPUCAPACITY) { - sd->flags |= SD_PREFER_SIBLING; - sd->imbalance_pct = 110; - sd->smt_gain = 1178; /* ~15% */ - - } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { - sd->imbalance_pct = 117; - sd->cache_nice_tries = 1; - sd->busy_idx = 2; - -#ifdef CONFIG_NUMA - } else if (sd->flags & SD_NUMA) { - sd->cache_nice_tries = 2; - sd->busy_idx = 3; - sd->idle_idx = 2; - - sd->flags |= SD_SERIALIZE; - if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { - sd->flags &= ~(SD_BALANCE_EXEC | - SD_BALANCE_FORK | - SD_WAKE_AFFINE); - } - -#endif - } else { - sd->flags |= SD_PREFER_SIBLING; - sd->cache_nice_tries = 1; - sd->busy_idx = 2; - sd->idle_idx = 1; - } - - /* - * For all levels sharing cache; connect a sched_domain_shared - * instance. - */ - if (sd->flags & SD_SHARE_PKG_RESOURCES) { - sd->shared = *per_cpu_ptr(sdd->sds, sd_id); - atomic_inc(&sd->shared->ref); - atomic_set(&sd->shared->nr_busy_cpus, sd_weight); - } - - sd->private = sdd; - - return sd; -} - -/* - * Topology list, bottom-up. 
- */ -static struct sched_domain_topology_level default_topology[] = { -#ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, -#endif -#ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, -#endif - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, - { NULL, }, -}; - -static struct sched_domain_topology_level *sched_domain_topology = - default_topology; - -#define for_each_sd_topology(tl) \ - for (tl = sched_domain_topology; tl->mask; tl++) - -void set_sched_topology(struct sched_domain_topology_level *tl) -{ - if (WARN_ON_ONCE(sched_smp_initialized)) - return; - - sched_domain_topology = tl; -} - -#ifdef CONFIG_NUMA - -static const struct cpumask *sd_numa_mask(int cpu) -{ - return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; -} - -static void sched_numa_warn(const char *str) -{ - static int done = false; - int i,j; - - if (done) - return; - - done = true; - - printk(KERN_WARNING "ERROR: %s\n\n", str); - - for (i = 0; i < nr_node_ids; i++) { - printk(KERN_WARNING " "); - for (j = 0; j < nr_node_ids; j++) - printk(KERN_CONT "%02d ", node_distance(i,j)); - printk(KERN_CONT "\n"); - } - printk(KERN_WARNING "\n"); -} - -bool find_numa_distance(int distance) -{ - int i; - - if (distance == node_distance(0, 0)) - return true; - - for (i = 0; i < sched_domains_numa_levels; i++) { - if (sched_domains_numa_distance[i] == distance) - return true; - } - - return false; -} - -/* - * A system can have three types of NUMA topology: - * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system - * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes - * NUMA_BACKPLANE: nodes can reach other nodes through a backplane - * - * The difference between a glueless mesh topology and a backplane - * topology lies in whether communication between not directly - * connected nodes goes through intermediary nodes (where programs - * could run), or through backplane controllers. This affects - * placement of programs. - * - * The type of topology can be discerned with the following tests: - * - If the maximum distance between any nodes is 1 hop, the system - * is directly connected. - * - If for two nodes A and B, located N > 1 hops away from each other, - * there is an intermediary node C, which is < N hops away from both - * nodes A and B, the system is a glueless mesh. - */ -static void init_numa_topology_type(void) -{ - int a, b, c, n; - - n = sched_max_numa_distance; - - if (sched_domains_numa_levels <= 1) { - sched_numa_topology_type = NUMA_DIRECT; - return; - } - - for_each_online_node(a) { - for_each_online_node(b) { - /* Find two nodes furthest removed from each other. */ - if (node_distance(a, b) < n) - continue; - - /* Is there an intermediary node between a and b? */ - for_each_online_node(c) { - if (node_distance(a, c) < n && - node_distance(b, c) < n) { - sched_numa_topology_type = - NUMA_GLUELESS_MESH; - return; - } - } - - sched_numa_topology_type = NUMA_BACKPLANE; - return; - } - } -} - -static void sched_init_numa(void) -{ - int next_distance, curr_distance = node_distance(0, 0); - struct sched_domain_topology_level *tl; - int level = 0; - int i, j, k; - - sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); - if (!sched_domains_numa_distance) - return; - - /* - * O(nr_nodes^2) deduplicating selection sort -- in order to find the - * unique distances in the node_distance() table. 
- * - * Assumes node_distance(0,j) includes all distances in - * node_distance(i,j) in order to avoid cubic time. - */ - next_distance = curr_distance; - for (i = 0; i < nr_node_ids; i++) { - for (j = 0; j < nr_node_ids; j++) { - for (k = 0; k < nr_node_ids; k++) { - int distance = node_distance(i, k); - - if (distance > curr_distance && - (distance < next_distance || - next_distance == curr_distance)) - next_distance = distance; - - /* - * While not a strong assumption it would be nice to know - * about cases where if node A is connected to B, B is not - * equally connected to A. - */ - if (sched_debug() && node_distance(k, i) != distance) - sched_numa_warn("Node-distance not symmetric"); - - if (sched_debug() && i && !find_numa_distance(distance)) - sched_numa_warn("Node-0 not representative"); - } - if (next_distance != curr_distance) { - sched_domains_numa_distance[level++] = next_distance; - sched_domains_numa_levels = level; - curr_distance = next_distance; - } else break; - } - - /* - * In case of sched_debug() we verify the above assumption. - */ - if (!sched_debug()) - break; - } - - if (!level) - return; - - /* - * 'level' contains the number of unique distances, excluding the - * identity distance node_distance(i,i). - * - * The sched_domains_numa_distance[] array includes the actual distance - * numbers. - */ - - /* - * Here, we should temporarily reset sched_domains_numa_levels to 0. - * If it fails to allocate memory for array sched_domains_numa_masks[][], - * the array will contain less then 'level' members. This could be - * dangerous when we use it to iterate array sched_domains_numa_masks[][] - * in other functions. - * - * We reset it to 'level' at the end of this function. - */ - sched_domains_numa_levels = 0; - - sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); - if (!sched_domains_numa_masks) - return; - - /* - * Now for each level, construct a mask per node which contains all - * CPUs of nodes that are that many hops away from us. - */ - for (i = 0; i < level; i++) { - sched_domains_numa_masks[i] = - kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); - if (!sched_domains_numa_masks[i]) - return; - - for (j = 0; j < nr_node_ids; j++) { - struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); - if (!mask) - return; - - sched_domains_numa_masks[i][j] = mask; - - for_each_node(k) { - if (node_distance(j, k) > sched_domains_numa_distance[i]) - continue; - - cpumask_or(mask, mask, cpumask_of_node(k)); - } - } - } - - /* Compute default topology size */ - for (i = 0; sched_domain_topology[i].mask; i++); - - tl = kzalloc((i + level + 1) * - sizeof(struct sched_domain_topology_level), GFP_KERNEL); - if (!tl) - return; - - /* - * Copy the default topology bits.. - */ - for (i = 0; sched_domain_topology[i].mask; i++) - tl[i] = sched_domain_topology[i]; - - /* - * .. and append 'j' levels of NUMA goodness. 
- */ - for (j = 0; j < level; i++, j++) { - tl[i] = (struct sched_domain_topology_level){ - .mask = sd_numa_mask, - .sd_flags = cpu_numa_flags, - .flags = SDTL_OVERLAP, - .numa_level = j, - SD_INIT_NAME(NUMA) - }; - } - - sched_domain_topology = tl; - - sched_domains_numa_levels = level; - sched_max_numa_distance = sched_domains_numa_distance[level - 1]; - - init_numa_topology_type(); -} - -static void sched_domains_numa_masks_set(unsigned int cpu) -{ - int node = cpu_to_node(cpu); - int i, j; - - for (i = 0; i < sched_domains_numa_levels; i++) { - for (j = 0; j < nr_node_ids; j++) { - if (node_distance(j, node) <= sched_domains_numa_distance[i]) - cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); - } - } -} - -static void sched_domains_numa_masks_clear(unsigned int cpu) -{ - int i, j; - - for (i = 0; i < sched_domains_numa_levels; i++) { - for (j = 0; j < nr_node_ids; j++) - cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); - } -} - -#else -static inline void sched_init_numa(void) { } -static void sched_domains_numa_masks_set(unsigned int cpu) { } -static void sched_domains_numa_masks_clear(unsigned int cpu) { } -#endif /* CONFIG_NUMA */ - -static int __sdt_alloc(const struct cpumask *cpu_map) -{ - struct sched_domain_topology_level *tl; - int j; - - for_each_sd_topology(tl) { - struct sd_data *sdd = &tl->data; - - sdd->sd = alloc_percpu(struct sched_domain *); - if (!sdd->sd) - return -ENOMEM; - - sdd->sds = alloc_percpu(struct sched_domain_shared *); - if (!sdd->sds) - return -ENOMEM; - - sdd->sg = alloc_percpu(struct sched_group *); - if (!sdd->sg) - return -ENOMEM; - - sdd->sgc = alloc_percpu(struct sched_group_capacity *); - if (!sdd->sgc) - return -ENOMEM; - - for_each_cpu(j, cpu_map) { - struct sched_domain *sd; - struct sched_domain_shared *sds; - struct sched_group *sg; - struct sched_group_capacity *sgc; - - sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), - GFP_KERNEL, cpu_to_node(j)); - if (!sd) - return -ENOMEM; - - *per_cpu_ptr(sdd->sd, j) = sd; - - sds = kzalloc_node(sizeof(struct sched_domain_shared), - GFP_KERNEL, cpu_to_node(j)); - if (!sds) - return -ENOMEM; - - *per_cpu_ptr(sdd->sds, j) = sds; - - sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, cpu_to_node(j)); - if (!sg) - return -ENOMEM; - - sg->next = sg; - - *per_cpu_ptr(sdd->sg, j) = sg; - - sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), - GFP_KERNEL, cpu_to_node(j)); - if (!sgc) - return -ENOMEM; - - *per_cpu_ptr(sdd->sgc, j) = sgc; - } - } - - return 0; -} - -static void __sdt_free(const struct cpumask *cpu_map) -{ - struct sched_domain_topology_level *tl; - int j; - - for_each_sd_topology(tl) { - struct sd_data *sdd = &tl->data; - - for_each_cpu(j, cpu_map) { - struct sched_domain *sd; - - if (sdd->sd) { - sd = *per_cpu_ptr(sdd->sd, j); - if (sd && (sd->flags & SD_OVERLAP)) - free_sched_groups(sd->groups, 0); - kfree(*per_cpu_ptr(sdd->sd, j)); - } - - if (sdd->sds) - kfree(*per_cpu_ptr(sdd->sds, j)); - if (sdd->sg) - kfree(*per_cpu_ptr(sdd->sg, j)); - if (sdd->sgc) - kfree(*per_cpu_ptr(sdd->sgc, j)); - } - free_percpu(sdd->sd); - sdd->sd = NULL; - free_percpu(sdd->sds); - sdd->sds = NULL; - free_percpu(sdd->sg); - sdd->sg = NULL; - free_percpu(sdd->sgc); - sdd->sgc = NULL; - } -} - -struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *child, int cpu) -{ - struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); - - 
if (child) { - sd->level = child->level + 1; - sched_domain_level_max = max(sched_domain_level_max, sd->level); - child->parent = sd; - - if (!cpumask_subset(sched_domain_span(child), - sched_domain_span(sd))) { - pr_err("BUG: arch topology borken\n"); -#ifdef CONFIG_SCHED_DEBUG - pr_err(" the %s domain not a subset of the %s domain\n", - child->name, sd->name); -#endif - /* Fixup, ensure @sd has at least @child cpus. */ - cpumask_or(sched_domain_span(sd), - sched_domain_span(sd), - sched_domain_span(child)); - } - - } - set_domain_attribute(sd, attr); - - return sd; -} - -/* - * Build sched domains for a given set of CPUs and attach the sched domains - * to the individual CPUs - */ -static int -build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) -{ - enum s_alloc alloc_state; - struct sched_domain *sd; - struct s_data d; - struct rq *rq = NULL; - int i, ret = -ENOMEM; - - alloc_state = __visit_domain_allocation_hell(&d, cpu_map); - if (alloc_state != sa_rootdomain) - goto error; - - /* Set up domains for CPUs specified by the cpu_map: */ - for_each_cpu(i, cpu_map) { - struct sched_domain_topology_level *tl; - - sd = NULL; - for_each_sd_topology(tl) { - sd = build_sched_domain(tl, cpu_map, attr, sd, i); - if (tl == sched_domain_topology) - *per_cpu_ptr(d.sd, i) = sd; - if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) - sd->flags |= SD_OVERLAP; - if (cpumask_equal(cpu_map, sched_domain_span(sd))) - break; - } - } - - /* Build the groups for the domains */ - for_each_cpu(i, cpu_map) { - for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { - sd->span_weight = cpumask_weight(sched_domain_span(sd)); - if (sd->flags & SD_OVERLAP) { - if (build_overlap_sched_groups(sd, i)) - goto error; - } else { - if (build_sched_groups(sd, i)) - goto error; - } - } - } - - /* Calculate CPU capacity for physical packages and nodes */ - for (i = nr_cpumask_bits-1; i >= 0; i--) { - if (!cpumask_test_cpu(i, cpu_map)) - continue; - - for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { - claim_allocations(i, sd); - init_sched_groups_capacity(i, sd); - } - } - - /* Attach the domains */ - rcu_read_lock(); - for_each_cpu(i, cpu_map) { - rq = cpu_rq(i); - sd = *per_cpu_ptr(d.sd, i); - - /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ - if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) - WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); - - cpu_attach_domain(sd, d.rd, i); - } - rcu_read_unlock(); - - if (rq && sched_debug_enabled) { - pr_info("span: %*pbl (max cpu_capacity = %lu)\n", - cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); - } - - ret = 0; -error: - __free_domain_allocs(&d, alloc_state, cpu_map); - return ret; -} - -/* Current sched domains: */ -static cpumask_var_t *doms_cur; - -/* Number of sched domains in 'doms_cur': */ -static int ndoms_cur; - -/* Attribues of custom domains in 'doms_cur' */ -static struct sched_domain_attr *dattr_cur; - -/* - * Special case: If a kmalloc() of a doms_cur partition (array of - * cpumask) fails, then fallback to a single sched domain, - * as determined by the single cpumask fallback_doms. - */ -static cpumask_var_t fallback_doms; - -/* - * arch_update_cpu_topology lets virtualized architectures update the - * CPU core maps. It is supposed to return 1 if the topology changed - * or 0 if it stayed the same. 
- */ -int __weak arch_update_cpu_topology(void) -{ - return 0; -} - -cpumask_var_t *alloc_sched_domains(unsigned int ndoms) -{ - int i; - cpumask_var_t *doms; - - doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); - if (!doms) - return NULL; - for (i = 0; i < ndoms; i++) { - if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { - free_sched_domains(doms, i); - return NULL; - } - } - return doms; -} - -void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) -{ - unsigned int i; - for (i = 0; i < ndoms; i++) - free_cpumask_var(doms[i]); - kfree(doms); -} - -/* - * Set up scheduler domains and groups. Callers must hold the hotplug lock. - * For now this just excludes isolated CPUs, but could be used to - * exclude other special cases in the future. - */ -static int init_sched_domains(const struct cpumask *cpu_map) -{ - int err; - - arch_update_cpu_topology(); - ndoms_cur = 1; - doms_cur = alloc_sched_domains(ndoms_cur); - if (!doms_cur) - doms_cur = &fallback_doms; - cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); - err = build_sched_domains(doms_cur[0], NULL); - register_sched_domain_sysctl(); - - return err; -} - -/* - * Detach sched domains from a group of CPUs specified in cpu_map - * These CPUs will now be attached to the NULL domain - */ -static void detach_destroy_domains(const struct cpumask *cpu_map) -{ - int i; - - rcu_read_lock(); - for_each_cpu(i, cpu_map) - cpu_attach_domain(NULL, &def_root_domain, i); - rcu_read_unlock(); -} - -/* handle null as "default" */ -static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, - struct sched_domain_attr *new, int idx_new) -{ - struct sched_domain_attr tmp; - - /* Fast path: */ - if (!new && !cur) - return 1; - - tmp = SD_ATTR_INIT; - return !memcmp(cur ? (cur + idx_cur) : &tmp, - new ? (new + idx_new) : &tmp, - sizeof(struct sched_domain_attr)); -} - -/* - * Partition sched domains as specified by the 'ndoms_new' - * cpumasks in the array doms_new[] of cpumasks. This compares - * doms_new[] to the current sched domain partitioning, doms_cur[]. - * It destroys each deleted domain and builds each new domain. - * - * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. - * The masks don't intersect (don't overlap.) We should setup one - * sched domain for each mask. CPUs not in any of the cpumasks will - * not be load balanced. If the same cpumask appears both in the - * current 'doms_cur' domains and in the new 'doms_new', we can leave - * it as it is. - * - * The passed in 'doms_new' should be allocated using - * alloc_sched_domains. This routine takes ownership of it and will - * free_sched_domains it when done with it. If the caller failed the - * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, - * and partition_sched_domains() will fallback to the single partition - * 'fallback_doms', it also forces the domains to be rebuilt. - * - * If doms_new == NULL it will be replaced with cpu_online_mask. - * ndoms_new == 0 is a special case for destroying existing domains, - * and it will not create the default domain. - * - * Call with hotplug lock held - */ -void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) -{ - int i, j, n; - int new_topology; - - mutex_lock(&sched_domains_mutex); - - /* Always unregister in case we don't destroy any domains: */ - unregister_sched_domain_sysctl(); - - /* Let the architecture update CPU core mappings: */ - new_topology = arch_update_cpu_topology(); - - n = doms_new ? 
ndoms_new : 0; - - /* Destroy deleted domains: */ - for (i = 0; i < ndoms_cur; i++) { - for (j = 0; j < n && !new_topology; j++) { - if (cpumask_equal(doms_cur[i], doms_new[j]) - && dattrs_equal(dattr_cur, i, dattr_new, j)) - goto match1; - } - /* No match - a current sched domain not in new doms_new[] */ - detach_destroy_domains(doms_cur[i]); -match1: - ; - } - - n = ndoms_cur; - if (doms_new == NULL) { - n = 0; - doms_new = &fallback_doms; - cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); - WARN_ON_ONCE(dattr_new); - } - - /* Build new domains: */ - for (i = 0; i < ndoms_new; i++) { - for (j = 0; j < n && !new_topology; j++) { - if (cpumask_equal(doms_new[i], doms_cur[j]) - && dattrs_equal(dattr_new, i, dattr_cur, j)) - goto match2; - } - /* No match - add a new doms_new */ - build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); -match2: - ; - } - - /* Remember the new sched domains: */ - if (doms_cur != &fallback_doms) - free_sched_domains(doms_cur, ndoms_cur); - - kfree(dattr_cur); - doms_cur = doms_new; - dattr_cur = dattr_new; - ndoms_cur = ndoms_new; - - register_sched_domain_sysctl(); - - mutex_unlock(&sched_domains_mutex); -} - /* * used to mark begin/end of suspend/resume: */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8ff5cc539e8a..17ed94b9b413 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -223,7 +223,7 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; } -extern struct mutex sched_domains_mutex; +extern void init_dl_bw(struct dl_bw *dl_b); #ifdef CONFIG_CGROUP_SCHED @@ -584,6 +584,13 @@ struct root_domain { }; extern struct root_domain def_root_domain; +extern struct mutex sched_domains_mutex; +extern cpumask_var_t fallback_doms; +extern cpumask_var_t sched_domains_tmpmask; + +extern void init_defrootdomain(void); +extern int init_sched_domains(const struct cpumask *cpu_map); +extern void rq_attach_root(struct rq *rq, struct root_domain *rd); #endif /* CONFIG_SMP */ @@ -886,6 +893,16 @@ extern int sched_max_numa_distance; extern bool find_numa_distance(int distance); #endif +#ifdef CONFIG_NUMA +extern void sched_init_numa(void); +extern void sched_domains_numa_masks_set(unsigned int cpu); +extern void sched_domains_numa_masks_clear(unsigned int cpu); +#else +static inline void sched_init_numa(void) { } +static inline void sched_domains_numa_masks_set(unsigned int cpu) { } +static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } +#endif + #ifdef CONFIG_NUMA_BALANCING /* The regions in numa_faults array from task_struct */ enum numa_faults_stats { @@ -1752,6 +1769,10 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) __release(rq2->lock); } +extern void set_rq_online (struct rq *rq); +extern void set_rq_offline(struct rq *rq); +extern bool sched_smp_initialized; + #else /* CONFIG_SMP */ /* diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c new file mode 100644 index 000000000000..1b0b4fb12837 --- /dev/null +++ b/kernel/sched/topology.c @@ -0,0 +1,1658 @@ +/* + * Scheduler topology setup/handling methods + */ +#include +#include + +#include "sched.h" + +DEFINE_MUTEX(sched_domains_mutex); + +/* Protected by sched_domains_mutex: */ +cpumask_var_t sched_domains_tmpmask; + +#ifdef CONFIG_SCHED_DEBUG + +static __read_mostly int sched_debug_enabled; + +static int __init sched_debug_setup(char *str) +{ + sched_debug_enabled = 1; + + return 0; +} +early_param("sched_debug", sched_debug_setup); + 
+static inline bool sched_debug(void) +{ + return sched_debug_enabled; +} + +static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, + struct cpumask *groupmask) +{ + struct sched_group *group = sd->groups; + + cpumask_clear(groupmask); + + printk(KERN_DEBUG "%*s domain %d: ", level, "", level); + + if (!(sd->flags & SD_LOAD_BALANCE)) { + printk("does not load-balance\n"); + if (sd->parent) + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" + " has parent"); + return -1; + } + + printk(KERN_CONT "span %*pbl level %s\n", + cpumask_pr_args(sched_domain_span(sd)), sd->name); + + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { + printk(KERN_ERR "ERROR: domain->span does not contain " + "CPU%d\n", cpu); + } + if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { + printk(KERN_ERR "ERROR: domain->groups does not contain" + " CPU%d\n", cpu); + } + + printk(KERN_DEBUG "%*s groups:", level + 1, ""); + do { + if (!group) { + printk("\n"); + printk(KERN_ERR "ERROR: group is NULL\n"); + break; + } + + if (!cpumask_weight(sched_group_cpus(group))) { + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: empty group\n"); + break; + } + + if (!(sd->flags & SD_OVERLAP) && + cpumask_intersects(groupmask, sched_group_cpus(group))) { + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: repeated CPUs\n"); + break; + } + + cpumask_or(groupmask, groupmask, sched_group_cpus(group)); + + printk(KERN_CONT " %*pbl", + cpumask_pr_args(sched_group_cpus(group))); + if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { + printk(KERN_CONT " (cpu_capacity = %lu)", + group->sgc->capacity); + } + + group = group->next; + } while (group != sd->groups); + printk(KERN_CONT "\n"); + + if (!cpumask_equal(sched_domain_span(sd), groupmask)) + printk(KERN_ERR "ERROR: groups don't span domain->span\n"); + + if (sd->parent && + !cpumask_subset(groupmask, sched_domain_span(sd->parent))) + printk(KERN_ERR "ERROR: parent span is not a superset " + "of domain->span\n"); + return 0; +} + +static void sched_domain_debug(struct sched_domain *sd, int cpu) +{ + int level = 0; + + if (!sched_debug_enabled) + return; + + if (!sd) { + printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); + return; + } + + printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + + for (;;) { + if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) + break; + level++; + sd = sd->parent; + if (!sd) + break; + } +} +#else /* !CONFIG_SCHED_DEBUG */ + +# define sched_debug_enabled 0 +# define sched_domain_debug(sd, cpu) do { } while (0) +static inline bool sched_debug(void) +{ + return false; +} +#endif /* CONFIG_SCHED_DEBUG */ + +static int sd_degenerate(struct sched_domain *sd) +{ + if (cpumask_weight(sched_domain_span(sd)) == 1) + return 1; + + /* Following flags need at least 2 groups */ + if (sd->flags & (SD_LOAD_BALANCE | + SD_BALANCE_NEWIDLE | + SD_BALANCE_FORK | + SD_BALANCE_EXEC | + SD_SHARE_CPUCAPACITY | + SD_ASYM_CPUCAPACITY | + SD_SHARE_PKG_RESOURCES | + SD_SHARE_POWERDOMAIN)) { + if (sd->groups != sd->groups->next) + return 0; + } + + /* Following flags don't use groups */ + if (sd->flags & (SD_WAKE_AFFINE)) + return 0; + + return 1; +} + +static int +sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) +{ + unsigned long cflags = sd->flags, pflags = parent->flags; + + if (sd_degenerate(parent)) + return 1; + + if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) + return 0; + + /* Flags needing groups don't count if only 1 group in parent */ + if (parent->groups == 
parent->groups->next) { + pflags &= ~(SD_LOAD_BALANCE | + SD_BALANCE_NEWIDLE | + SD_BALANCE_FORK | + SD_BALANCE_EXEC | + SD_ASYM_CPUCAPACITY | + SD_SHARE_CPUCAPACITY | + SD_SHARE_PKG_RESOURCES | + SD_PREFER_SIBLING | + SD_SHARE_POWERDOMAIN); + if (nr_node_ids == 1) + pflags &= ~SD_SERIALIZE; + } + if (~cflags & pflags) + return 0; + + return 1; +} + +static void free_rootdomain(struct rcu_head *rcu) +{ + struct root_domain *rd = container_of(rcu, struct root_domain, rcu); + + cpupri_cleanup(&rd->cpupri); + cpudl_cleanup(&rd->cpudl); + free_cpumask_var(rd->dlo_mask); + free_cpumask_var(rd->rto_mask); + free_cpumask_var(rd->online); + free_cpumask_var(rd->span); + kfree(rd); +} + +void rq_attach_root(struct rq *rq, struct root_domain *rd) +{ + struct root_domain *old_rd = NULL; + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + + if (rq->rd) { + old_rd = rq->rd; + + if (cpumask_test_cpu(rq->cpu, old_rd->online)) + set_rq_offline(rq); + + cpumask_clear_cpu(rq->cpu, old_rd->span); + + /* + * If we dont want to free the old_rd yet then + * set old_rd to NULL to skip the freeing later + * in this function: + */ + if (!atomic_dec_and_test(&old_rd->refcount)) + old_rd = NULL; + } + + atomic_inc(&rd->refcount); + rq->rd = rd; + + cpumask_set_cpu(rq->cpu, rd->span); + if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) + set_rq_online(rq); + + raw_spin_unlock_irqrestore(&rq->lock, flags); + + if (old_rd) + call_rcu_sched(&old_rd->rcu, free_rootdomain); +} + +static int init_rootdomain(struct root_domain *rd) +{ + memset(rd, 0, sizeof(*rd)); + + if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) + goto out; + if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) + goto free_span; + if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) + goto free_online; + if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + goto free_dlo_mask; + + init_dl_bw(&rd->dl_bw); + if (cpudl_init(&rd->cpudl) != 0) + goto free_rto_mask; + + if (cpupri_init(&rd->cpupri) != 0) + goto free_cpudl; + return 0; + +free_cpudl: + cpudl_cleanup(&rd->cpudl); +free_rto_mask: + free_cpumask_var(rd->rto_mask); +free_dlo_mask: + free_cpumask_var(rd->dlo_mask); +free_online: + free_cpumask_var(rd->online); +free_span: + free_cpumask_var(rd->span); +out: + return -ENOMEM; +} + +/* + * By default the system creates a single root-domain with all CPUs as + * members (mimicking the global state we have today). + */ +struct root_domain def_root_domain; + +void init_defrootdomain(void) +{ + init_rootdomain(&def_root_domain); + + atomic_set(&def_root_domain.refcount, 1); +} + +static struct root_domain *alloc_rootdomain(void) +{ + struct root_domain *rd; + + rd = kmalloc(sizeof(*rd), GFP_KERNEL); + if (!rd) + return NULL; + + if (init_rootdomain(rd) != 0) { + kfree(rd); + return NULL; + } + + return rd; +} + +static void free_sched_groups(struct sched_group *sg, int free_sgc) +{ + struct sched_group *tmp, *first; + + if (!sg) + return; + + first = sg; + do { + tmp = sg->next; + + if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) + kfree(sg->sgc); + + kfree(sg); + sg = tmp; + } while (sg != first); +} + +static void destroy_sched_domain(struct sched_domain *sd) +{ + /* + * If its an overlapping domain it has private groups, iterate and + * nuke them all. 
+ */ + if (sd->flags & SD_OVERLAP) { + free_sched_groups(sd->groups, 1); + } else if (atomic_dec_and_test(&sd->groups->ref)) { + kfree(sd->groups->sgc); + kfree(sd->groups); + } + if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) + kfree(sd->shared); + kfree(sd); +} + +static void destroy_sched_domains_rcu(struct rcu_head *rcu) +{ + struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); + + while (sd) { + struct sched_domain *parent = sd->parent; + destroy_sched_domain(sd); + sd = parent; + } +} + +static void destroy_sched_domains(struct sched_domain *sd) +{ + if (sd) + call_rcu(&sd->rcu, destroy_sched_domains_rcu); +} + +/* + * Keep a special pointer to the highest sched_domain that has + * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this + * allows us to avoid some pointer chasing select_idle_sibling(). + * + * Also keep a unique ID per domain (we use the first CPU number in + * the cpumask of the domain), this allows us to quickly tell if + * two CPUs are in the same cache domain, see cpus_share_cache(). + */ +DEFINE_PER_CPU(struct sched_domain *, sd_llc); +DEFINE_PER_CPU(int, sd_llc_size); +DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); +DEFINE_PER_CPU(struct sched_domain *, sd_numa); +DEFINE_PER_CPU(struct sched_domain *, sd_asym); + +static void update_top_cache_domain(int cpu) +{ + struct sched_domain_shared *sds = NULL; + struct sched_domain *sd; + int id = cpu; + int size = 1; + + sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); + if (sd) { + id = cpumask_first(sched_domain_span(sd)); + size = cpumask_weight(sched_domain_span(sd)); + sds = sd->shared; + } + + rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); + per_cpu(sd_llc_size, cpu) = size; + per_cpu(sd_llc_id, cpu) = id; + rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); + + sd = lowest_flag_domain(cpu, SD_NUMA); + rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); + + sd = highest_flag_domain(cpu, SD_ASYM_PACKING); + rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); +} + +/* + * Attach the domain 'sd' to 'cpu' as its base domain. Callers must + * hold the hotplug lock. + */ +static void +cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct sched_domain *tmp; + + /* Remove the sched domains which do not contribute to scheduling. */ + for (tmp = sd; tmp; ) { + struct sched_domain *parent = tmp->parent; + if (!parent) + break; + + if (sd_parent_degenerate(tmp, parent)) { + tmp->parent = parent->parent; + if (parent->parent) + parent->parent->child = tmp; + /* + * Transfer SD_PREFER_SIBLING down in case of a + * degenerate parent; the spans match for this + * so the property transfers. 
+ */ + if (parent->flags & SD_PREFER_SIBLING) + tmp->flags |= SD_PREFER_SIBLING; + destroy_sched_domain(parent); + } else + tmp = tmp->parent; + } + + if (sd && sd_degenerate(sd)) { + tmp = sd; + sd = sd->parent; + destroy_sched_domain(tmp); + if (sd) + sd->child = NULL; + } + + sched_domain_debug(sd, cpu); + + rq_attach_root(rq, rd); + tmp = rq->sd; + rcu_assign_pointer(rq->sd, sd); + destroy_sched_domains(tmp); + + update_top_cache_domain(cpu); +} + +/* Setup the mask of CPUs configured for isolated domains */ +static int __init isolated_cpu_setup(char *str) +{ + int ret; + + alloc_bootmem_cpumask_var(&cpu_isolated_map); + ret = cpulist_parse(str, cpu_isolated_map); + if (ret) { + pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); + return 0; + } + return 1; +} +__setup("isolcpus=", isolated_cpu_setup); + +struct s_data { + struct sched_domain ** __percpu sd; + struct root_domain *rd; +}; + +enum s_alloc { + sa_rootdomain, + sa_sd, + sa_sd_storage, + sa_none, +}; + +/* + * Build an iteration mask that can exclude certain CPUs from the upwards + * domain traversal. + * + * Asymmetric node setups can result in situations where the domain tree is of + * unequal depth, make sure to skip domains that already cover the entire + * range. + * + * In that case build_sched_domains() will have terminated the iteration early + * and our sibling sd spans will be empty. Domains should always include the + * CPU they're built on, so check that. + */ +static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) +{ + const struct cpumask *span = sched_domain_span(sd); + struct sd_data *sdd = sd->private; + struct sched_domain *sibling; + int i; + + for_each_cpu(i, span) { + sibling = *per_cpu_ptr(sdd->sd, i); + if (!cpumask_test_cpu(i, sched_domain_span(sibling))) + continue; + + cpumask_set_cpu(i, sched_group_mask(sg)); + } +} + +/* + * Return the canonical balance CPU for this group, this is the first CPU + * of this group that's also in the iteration mask. + */ +int group_balance_cpu(struct sched_group *sg) +{ + return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); +} + +static int +build_overlap_sched_groups(struct sched_domain *sd, int cpu) +{ + struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; + const struct cpumask *span = sched_domain_span(sd); + struct cpumask *covered = sched_domains_tmpmask; + struct sd_data *sdd = sd->private; + struct sched_domain *sibling; + int i; + + cpumask_clear(covered); + + for_each_cpu(i, span) { + struct cpumask *sg_span; + + if (cpumask_test_cpu(i, covered)) + continue; + + sibling = *per_cpu_ptr(sdd->sd, i); + + /* See the comment near build_group_mask(). */ + if (!cpumask_test_cpu(i, sched_domain_span(sibling))) + continue; + + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, cpu_to_node(cpu)); + + if (!sg) + goto fail; + + sg_span = sched_group_cpus(sg); + if (sibling->child) + cpumask_copy(sg_span, sched_domain_span(sibling->child)); + else + cpumask_set_cpu(i, sg_span); + + cpumask_or(covered, covered, sg_span); + + sg->sgc = *per_cpu_ptr(sdd->sgc, i); + if (atomic_inc_return(&sg->sgc->ref) == 1) + build_group_mask(sd, sg); + + /* + * Initialize sgc->capacity such that even if we mess up the + * domains and no possible iteration will get us here, we won't + * die on a /0 trap. 
+ */ + sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); + sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; + + /* + * Make sure the first group of this domain contains the + * canonical balance CPU. Otherwise the sched_domain iteration + * breaks. See update_sg_lb_stats(). + */ + if ((!groups && cpumask_test_cpu(cpu, sg_span)) || + group_balance_cpu(sg) == cpu) + groups = sg; + + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; + last->next = first; + } + sd->groups = groups; + + return 0; + +fail: + free_sched_groups(first, 0); + + return -ENOMEM; +} + +static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) +{ + struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); + struct sched_domain *child = sd->child; + + if (child) + cpu = cpumask_first(sched_domain_span(child)); + + if (sg) { + *sg = *per_cpu_ptr(sdd->sg, cpu); + (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); + + /* For claim_allocations: */ + atomic_set(&(*sg)->sgc->ref, 1); + } + + return cpu; +} + +/* + * build_sched_groups will build a circular linked list of the groups + * covered by the given span, and will set each group's ->cpumask correctly, + * and ->cpu_capacity to 0. + * + * Assumes the sched_domain tree is fully constructed + */ +static int +build_sched_groups(struct sched_domain *sd, int cpu) +{ + struct sched_group *first = NULL, *last = NULL; + struct sd_data *sdd = sd->private; + const struct cpumask *span = sched_domain_span(sd); + struct cpumask *covered; + int i; + + get_group(cpu, sdd, &sd->groups); + atomic_inc(&sd->groups->ref); + + if (cpu != cpumask_first(span)) + return 0; + + lockdep_assert_held(&sched_domains_mutex); + covered = sched_domains_tmpmask; + + cpumask_clear(covered); + + for_each_cpu(i, span) { + struct sched_group *sg; + int group, j; + + if (cpumask_test_cpu(i, covered)) + continue; + + group = get_group(i, sdd, &sg); + cpumask_setall(sched_group_mask(sg)); + + for_each_cpu(j, span) { + if (get_group(j, sdd, NULL) != group) + continue; + + cpumask_set_cpu(j, covered); + cpumask_set_cpu(j, sched_group_cpus(sg)); + } + + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; + } + last->next = first; + + return 0; +} + +/* + * Initialize sched groups cpu_capacity. + * + * cpu_capacity indicates the capacity of sched group, which is used while + * distributing the load between different sched groups in a sched domain. + * Typically cpu_capacity for all the groups in a sched domain will be same + * unless there are asymmetries in the topology. If there are asymmetries, + * group having more cpu_capacity will pickup more load compared to the + * group having less cpu_capacity. 
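+ *
+ * (Editor's note, not part of the original patch: on an asymmetric
+ * big.LITTLE system, for instance, the group of 'big' CPUs reports
+ * more cpu_capacity and is therefore handed proportionally more
+ * load than the 'LITTLE' group.)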
+ */ +static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) +{ + struct sched_group *sg = sd->groups; + + WARN_ON(!sg); + + do { + int cpu, max_cpu = -1; + + sg->group_weight = cpumask_weight(sched_group_cpus(sg)); + + if (!(sd->flags & SD_ASYM_PACKING)) + goto next; + + for_each_cpu(cpu, sched_group_cpus(sg)) { + if (max_cpu < 0) + max_cpu = cpu; + else if (sched_asym_prefer(cpu, max_cpu)) + max_cpu = cpu; + } + sg->asym_prefer_cpu = max_cpu; + +next: + sg = sg->next; + } while (sg != sd->groups); + + if (cpu != group_balance_cpu(sg)) + return; + + update_group_capacity(sd, cpu); +} + +/* + * Initializers for schedule domains + * Non-inlined to reduce accumulated stack pressure in build_sched_domains() + */ + +static int default_relax_domain_level = -1; +int sched_domain_level_max; + +static int __init setup_relax_domain_level(char *str) +{ + if (kstrtoint(str, 0, &default_relax_domain_level)) + pr_warn("Unable to set relax_domain_level\n"); + + return 1; +} +__setup("relax_domain_level=", setup_relax_domain_level); + +static void set_domain_attribute(struct sched_domain *sd, + struct sched_domain_attr *attr) +{ + int request; + + if (!attr || attr->relax_domain_level < 0) { + if (default_relax_domain_level < 0) + return; + else + request = default_relax_domain_level; + } else + request = attr->relax_domain_level; + if (request < sd->level) { + /* Turn off idle balance on this domain: */ + sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); + } else { + /* Turn on idle balance on this domain: */ + sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); + } +} + +static void __sdt_free(const struct cpumask *cpu_map); +static int __sdt_alloc(const struct cpumask *cpu_map); + +static void __free_domain_allocs(struct s_data *d, enum s_alloc what, + const struct cpumask *cpu_map) +{ + switch (what) { + case sa_rootdomain: + if (!atomic_read(&d->rd->refcount)) + free_rootdomain(&d->rd->rcu); + /* Fall through */ + case sa_sd: + free_percpu(d->sd); + /* Fall through */ + case sa_sd_storage: + __sdt_free(cpu_map); + /* Fall through */ + case sa_none: + break; + } +} + +static enum s_alloc +__visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) +{ + memset(d, 0, sizeof(*d)); + + if (__sdt_alloc(cpu_map)) + return sa_sd_storage; + d->sd = alloc_percpu(struct sched_domain *); + if (!d->sd) + return sa_sd_storage; + d->rd = alloc_rootdomain(); + if (!d->rd) + return sa_sd; + return sa_rootdomain; +} + +/* + * NULL the sd_data elements we've used to build the sched_domain and + * sched_group structure so that the subsequent __free_domain_allocs() + * will not free the data we're using. + */ +static void claim_allocations(int cpu, struct sched_domain *sd) +{ + struct sd_data *sdd = sd->private; + + WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); + *per_cpu_ptr(sdd->sd, cpu) = NULL; + + if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) + *per_cpu_ptr(sdd->sds, cpu) = NULL; + + if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) + *per_cpu_ptr(sdd->sg, cpu) = NULL; + + if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) + *per_cpu_ptr(sdd->sgc, cpu) = NULL; +} + +#ifdef CONFIG_NUMA +static int sched_domains_numa_levels; +enum numa_topology_type sched_numa_topology_type; +static int *sched_domains_numa_distance; +int sched_max_numa_distance; +static struct cpumask ***sched_domains_numa_masks; +static int sched_domains_curr_level; +#endif + +/* + * SD_flags allowed in topology descriptions. 
+ * + * These flags are purely descriptive of the topology and do not prescribe + * behaviour. Behaviour is artificial and mapped in the below sd_init() + * function: + * + * SD_SHARE_CPUCAPACITY - describes SMT topologies + * SD_SHARE_PKG_RESOURCES - describes shared caches + * SD_NUMA - describes NUMA topologies + * SD_SHARE_POWERDOMAIN - describes shared power domain + * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies + * + * Odd one out, which beside describing the topology has a quirk also + * prescribes the desired behaviour that goes along with it: + * + * SD_ASYM_PACKING - describes SMT quirks + */ +#define TOPOLOGY_SD_FLAGS \ + (SD_SHARE_CPUCAPACITY | \ + SD_SHARE_PKG_RESOURCES | \ + SD_NUMA | \ + SD_ASYM_PACKING | \ + SD_ASYM_CPUCAPACITY | \ + SD_SHARE_POWERDOMAIN) + +static struct sched_domain * +sd_init(struct sched_domain_topology_level *tl, + const struct cpumask *cpu_map, + struct sched_domain *child, int cpu) +{ + struct sd_data *sdd = &tl->data; + struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); + int sd_id, sd_weight, sd_flags = 0; + +#ifdef CONFIG_NUMA + /* + * Ugly hack to pass state to sd_numa_mask()... + */ + sched_domains_curr_level = tl->numa_level; +#endif + + sd_weight = cpumask_weight(tl->mask(cpu)); + + if (tl->sd_flags) + sd_flags = (*tl->sd_flags)(); + if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, + "wrong sd_flags in topology description\n")) + sd_flags &= ~TOPOLOGY_SD_FLAGS; + + *sd = (struct sched_domain){ + .min_interval = sd_weight, + .max_interval = 2*sd_weight, + .busy_factor = 32, + .imbalance_pct = 125, + + .cache_nice_tries = 0, + .busy_idx = 0, + .idle_idx = 0, + .newidle_idx = 0, + .wake_idx = 0, + .forkexec_idx = 0, + + .flags = 1*SD_LOAD_BALANCE + | 1*SD_BALANCE_NEWIDLE + | 1*SD_BALANCE_EXEC + | 1*SD_BALANCE_FORK + | 0*SD_BALANCE_WAKE + | 1*SD_WAKE_AFFINE + | 0*SD_SHARE_CPUCAPACITY + | 0*SD_SHARE_PKG_RESOURCES + | 0*SD_SERIALIZE + | 0*SD_PREFER_SIBLING + | 0*SD_NUMA + | sd_flags + , + + .last_balance = jiffies, + .balance_interval = sd_weight, + .smt_gain = 0, + .max_newidle_lb_cost = 0, + .next_decay_max_lb_cost = jiffies, + .child = child, +#ifdef CONFIG_SCHED_DEBUG + .name = tl->name, +#endif + }; + + cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); + sd_id = cpumask_first(sched_domain_span(sd)); + + /* + * Convert topological properties into behaviour. + */ + + if (sd->flags & SD_ASYM_CPUCAPACITY) { + struct sched_domain *t = sd; + + for_each_lower_domain(t) + t->flags |= SD_BALANCE_WAKE; + } + + if (sd->flags & SD_SHARE_CPUCAPACITY) { + sd->flags |= SD_PREFER_SIBLING; + sd->imbalance_pct = 110; + sd->smt_gain = 1178; /* ~15% */ + + } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { + sd->imbalance_pct = 117; + sd->cache_nice_tries = 1; + sd->busy_idx = 2; + +#ifdef CONFIG_NUMA + } else if (sd->flags & SD_NUMA) { + sd->cache_nice_tries = 2; + sd->busy_idx = 3; + sd->idle_idx = 2; + + sd->flags |= SD_SERIALIZE; + if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { + sd->flags &= ~(SD_BALANCE_EXEC | + SD_BALANCE_FORK | + SD_WAKE_AFFINE); + } + +#endif + } else { + sd->flags |= SD_PREFER_SIBLING; + sd->cache_nice_tries = 1; + sd->busy_idx = 2; + sd->idle_idx = 1; + } + + /* + * For all levels sharing cache; connect a sched_domain_shared + * instance. + */ + if (sd->flags & SD_SHARE_PKG_RESOURCES) { + sd->shared = *per_cpu_ptr(sdd->sds, sd_id); + atomic_inc(&sd->shared->ref); + atomic_set(&sd->shared->nr_busy_cpus, sd_weight); + } + + sd->private = sdd; + + return sd; +} + +/* + * Topology list, bottom-up. 
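+ *
+ * (Editor's note, not part of the original patch: on a typical
+ * two-socket SMT machine each CPU thus gets an SMT domain of its
+ * hardware threads, an MC domain of the cores sharing its LLC and
+ * a DIE domain covering the package; sched_init_numa() appends the
+ * NUMA levels at boot.)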
+ */ +static struct sched_domain_topology_level default_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, +#endif +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, +#endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + +static struct sched_domain_topology_level *sched_domain_topology = + default_topology; + +#define for_each_sd_topology(tl) \ + for (tl = sched_domain_topology; tl->mask; tl++) + +void set_sched_topology(struct sched_domain_topology_level *tl) +{ + if (WARN_ON_ONCE(sched_smp_initialized)) + return; + + sched_domain_topology = tl; +} + +#ifdef CONFIG_NUMA + +static const struct cpumask *sd_numa_mask(int cpu) +{ + return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; +} + +static void sched_numa_warn(const char *str) +{ + static int done = false; + int i,j; + + if (done) + return; + + done = true; + + printk(KERN_WARNING "ERROR: %s\n\n", str); + + for (i = 0; i < nr_node_ids; i++) { + printk(KERN_WARNING " "); + for (j = 0; j < nr_node_ids; j++) + printk(KERN_CONT "%02d ", node_distance(i,j)); + printk(KERN_CONT "\n"); + } + printk(KERN_WARNING "\n"); +} + +bool find_numa_distance(int distance) +{ + int i; + + if (distance == node_distance(0, 0)) + return true; + + for (i = 0; i < sched_domains_numa_levels; i++) { + if (sched_domains_numa_distance[i] == distance) + return true; + } + + return false; +} + +/* + * A system can have three types of NUMA topology: + * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system + * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes + * NUMA_BACKPLANE: nodes can reach other nodes through a backplane + * + * The difference between a glueless mesh topology and a backplane + * topology lies in whether communication between not directly + * connected nodes goes through intermediary nodes (where programs + * could run), or through backplane controllers. This affects + * placement of programs. + * + * The type of topology can be discerned with the following tests: + * - If the maximum distance between any nodes is 1 hop, the system + * is directly connected. + * - If for two nodes A and B, located N > 1 hops away from each other, + * there is an intermediary node C, which is < N hops away from both + * nodes A and B, the system is a glueless mesh. + */ +static void init_numa_topology_type(void) +{ + int a, b, c, n; + + n = sched_max_numa_distance; + + if (sched_domains_numa_levels <= 1) { + sched_numa_topology_type = NUMA_DIRECT; + return; + } + + for_each_online_node(a) { + for_each_online_node(b) { + /* Find two nodes furthest removed from each other. */ + if (node_distance(a, b) < n) + continue; + + /* Is there an intermediary node between a and b? */ + for_each_online_node(c) { + if (node_distance(a, c) < n && + node_distance(b, c) < n) { + sched_numa_topology_type = + NUMA_GLUELESS_MESH; + return; + } + } + + sched_numa_topology_type = NUMA_BACKPLANE; + return; + } + } +} + +void sched_init_numa(void) +{ + int next_distance, curr_distance = node_distance(0, 0); + struct sched_domain_topology_level *tl; + int level = 0; + int i, j, k; + + sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); + if (!sched_domains_numa_distance) + return; + + /* + * O(nr_nodes^2) deduplicating selection sort -- in order to find the + * unique distances in the node_distance() table. + * + * Assumes node_distance(0,j) includes all distances in + * node_distance(i,j) in order to avoid cubic time. 
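+ *
+ * (Editor's worked example, not part of the original patch: on a
+ * two-node machine with the common SLIT table { 10 20 / 20 10 },
+ * the scan below records the single remote distance 20, so
+ * level == 1 and one NUMA topology level gets appended further
+ * down.)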
+ */
+ next_distance = curr_distance;
+ for (i = 0; i < nr_node_ids; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ for (k = 0; k < nr_node_ids; k++) {
+ int distance = node_distance(i, k);
+
+ if (distance > curr_distance &&
+ (distance < next_distance ||
+ next_distance == curr_distance))
+ next_distance = distance;
+
+ /*
+ * While not a strong assumption, it would be nice to know
+ * about cases where node A is connected to B but B is not
+ * equally connected to A.
+ */
+ if (sched_debug() && node_distance(k, i) != distance)
+ sched_numa_warn("Node-distance not symmetric");
+
+ if (sched_debug() && i && !find_numa_distance(distance))
+ sched_numa_warn("Node-0 not representative");
+ }
+ if (next_distance != curr_distance) {
+ sched_domains_numa_distance[level++] = next_distance;
+ sched_domains_numa_levels = level;
+ curr_distance = next_distance;
+ } else break;
+ }
+
+ /*
+ * In case of sched_debug() we verify the above assumption.
+ */
+ if (!sched_debug())
+ break;
+ }
+
+ if (!level)
+ return;
+
+ /*
+ * 'level' contains the number of unique distances, excluding the
+ * identity distance node_distance(i,i).
+ *
+ * The sched_domains_numa_distance[] array includes the actual distance
+ * numbers.
+ */
+
+ /*
+ * Here, we should temporarily reset sched_domains_numa_levels to 0.
+ * If it fails to allocate memory for array sched_domains_numa_masks[][],
+ * the array will contain fewer than 'level' members. This could be
+ * dangerous when we use it to iterate array sched_domains_numa_masks[][]
+ * in other functions.
+ *
+ * We reset it to 'level' at the end of this function.
+ */
+ sched_domains_numa_levels = 0;
+
+ sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+ if (!sched_domains_numa_masks)
+ return;
+
+ /*
+ * Now for each level, construct a mask per node which contains all
+ * CPUs of nodes that are that many hops away from us.
+ */
+ for (i = 0; i < level; i++) {
+ sched_domains_numa_masks[i] =
+ kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+ if (!sched_domains_numa_masks[i])
+ return;
+
+ for (j = 0; j < nr_node_ids; j++) {
+ struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+ if (!mask)
+ return;
+
+ sched_domains_numa_masks[i][j] = mask;
+
+ for_each_node(k) {
+ if (node_distance(j, k) > sched_domains_numa_distance[i])
+ continue;
+
+ cpumask_or(mask, mask, cpumask_of_node(k));
+ }
+ }
+ }
+
+ /* Compute default topology size */
+ for (i = 0; sched_domain_topology[i].mask; i++);
+
+ tl = kzalloc((i + level + 1) *
+ sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+ if (!tl)
+ return;
+
+ /*
+ * Copy the default topology bits..
+ */
+ for (i = 0; sched_domain_topology[i].mask; i++)
+ tl[i] = sched_domain_topology[i];
+
+ /*
+ * .. and append 'j' levels of NUMA goodness.
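+ *
+ * (Editor's note, not part of the original patch: afterwards tl[]
+ * reads { SMT, MC, DIE, NUMA(0) .. NUMA(level - 1) }, and the one
+ * extra zeroed slot from the kzalloc() above terminates the list
+ * for for_each_sd_topology().)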
+ */ + for (j = 0; j < level; i++, j++) { + tl[i] = (struct sched_domain_topology_level){ + .mask = sd_numa_mask, + .sd_flags = cpu_numa_flags, + .flags = SDTL_OVERLAP, + .numa_level = j, + SD_INIT_NAME(NUMA) + }; + } + + sched_domain_topology = tl; + + sched_domains_numa_levels = level; + sched_max_numa_distance = sched_domains_numa_distance[level - 1]; + + init_numa_topology_type(); +} + +void sched_domains_numa_masks_set(unsigned int cpu) +{ + int node = cpu_to_node(cpu); + int i, j; + + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) { + if (node_distance(j, node) <= sched_domains_numa_distance[i]) + cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); + } + } +} + +void sched_domains_numa_masks_clear(unsigned int cpu) +{ + int i, j; + + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) + cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); + } +} + +#endif /* CONFIG_NUMA */ + +static int __sdt_alloc(const struct cpumask *cpu_map) +{ + struct sched_domain_topology_level *tl; + int j; + + for_each_sd_topology(tl) { + struct sd_data *sdd = &tl->data; + + sdd->sd = alloc_percpu(struct sched_domain *); + if (!sdd->sd) + return -ENOMEM; + + sdd->sds = alloc_percpu(struct sched_domain_shared *); + if (!sdd->sds) + return -ENOMEM; + + sdd->sg = alloc_percpu(struct sched_group *); + if (!sdd->sg) + return -ENOMEM; + + sdd->sgc = alloc_percpu(struct sched_group_capacity *); + if (!sdd->sgc) + return -ENOMEM; + + for_each_cpu(j, cpu_map) { + struct sched_domain *sd; + struct sched_domain_shared *sds; + struct sched_group *sg; + struct sched_group_capacity *sgc; + + sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sd) + return -ENOMEM; + + *per_cpu_ptr(sdd->sd, j) = sd; + + sds = kzalloc_node(sizeof(struct sched_domain_shared), + GFP_KERNEL, cpu_to_node(j)); + if (!sds) + return -ENOMEM; + + *per_cpu_ptr(sdd->sds, j) = sds; + + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sg) + return -ENOMEM; + + sg->next = sg; + + *per_cpu_ptr(sdd->sg, j) = sg; + + sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sgc) + return -ENOMEM; + + *per_cpu_ptr(sdd->sgc, j) = sgc; + } + } + + return 0; +} + +static void __sdt_free(const struct cpumask *cpu_map) +{ + struct sched_domain_topology_level *tl; + int j; + + for_each_sd_topology(tl) { + struct sd_data *sdd = &tl->data; + + for_each_cpu(j, cpu_map) { + struct sched_domain *sd; + + if (sdd->sd) { + sd = *per_cpu_ptr(sdd->sd, j); + if (sd && (sd->flags & SD_OVERLAP)) + free_sched_groups(sd->groups, 0); + kfree(*per_cpu_ptr(sdd->sd, j)); + } + + if (sdd->sds) + kfree(*per_cpu_ptr(sdd->sds, j)); + if (sdd->sg) + kfree(*per_cpu_ptr(sdd->sg, j)); + if (sdd->sgc) + kfree(*per_cpu_ptr(sdd->sgc, j)); + } + free_percpu(sdd->sd); + sdd->sd = NULL; + free_percpu(sdd->sds); + sdd->sds = NULL; + free_percpu(sdd->sg); + sdd->sg = NULL; + free_percpu(sdd->sgc); + sdd->sgc = NULL; + } +} + +struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *child, int cpu) +{ + struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); + + if (child) { + sd->level = child->level + 1; + sched_domain_level_max = max(sched_domain_level_max, sd->level); + child->parent = sd; + + if (!cpumask_subset(sched_domain_span(child), + 
sched_domain_span(sd))) {
+ pr_err("BUG: arch topology borken\n");
+#ifdef CONFIG_SCHED_DEBUG
+ pr_err(" the %s domain not a subset of the %s domain\n",
+ child->name, sd->name);
+#endif
+ /* Fixup, ensure @sd has at least @child cpus. */
+ cpumask_or(sched_domain_span(sd),
+ sched_domain_span(sd),
+ sched_domain_span(child));
+ }
+
+ }
+ set_domain_attribute(sd, attr);
+
+ return sd;
+}
+
+/*
+ * Build sched domains for a given set of CPUs and attach the sched domains
+ * to the individual CPUs
+ */
+static int
+build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
+{
+ enum s_alloc alloc_state;
+ struct sched_domain *sd;
+ struct s_data d;
+ struct rq *rq = NULL;
+ int i, ret = -ENOMEM;
+
+ alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
+ if (alloc_state != sa_rootdomain)
+ goto error;
+
+ /* Set up domains for CPUs specified by the cpu_map: */
+ for_each_cpu(i, cpu_map) {
+ struct sched_domain_topology_level *tl;
+
+ sd = NULL;
+ for_each_sd_topology(tl) {
+ sd = build_sched_domain(tl, cpu_map, attr, sd, i);
+ if (tl == sched_domain_topology)
+ *per_cpu_ptr(d.sd, i) = sd;
+ if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
+ sd->flags |= SD_OVERLAP;
+ if (cpumask_equal(cpu_map, sched_domain_span(sd)))
+ break;
+ }
+ }
+
+ /* Build the groups for the domains */
+ for_each_cpu(i, cpu_map) {
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ sd->span_weight = cpumask_weight(sched_domain_span(sd));
+ if (sd->flags & SD_OVERLAP) {
+ if (build_overlap_sched_groups(sd, i))
+ goto error;
+ } else {
+ if (build_sched_groups(sd, i))
+ goto error;
+ }
+ }
+ }
+
+ /* Calculate CPU capacity for physical packages and nodes */
+ for (i = nr_cpumask_bits-1; i >= 0; i--) {
+ if (!cpumask_test_cpu(i, cpu_map))
+ continue;
+
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ claim_allocations(i, sd);
+ init_sched_groups_capacity(i, sd);
+ }
+ }
+
+ /* Attach the domains */
+ rcu_read_lock();
+ for_each_cpu(i, cpu_map) {
+ rq = cpu_rq(i);
+ sd = *per_cpu_ptr(d.sd, i);
+
+ /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
+ if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
+ WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
+
+ cpu_attach_domain(sd, d.rd, i);
+ }
+ rcu_read_unlock();
+
+ if (rq && sched_debug_enabled) {
+ pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
+ cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
+ }
+
+ ret = 0;
+error:
+ __free_domain_allocs(&d, alloc_state, cpu_map);
+ return ret;
+}
+
+/* Current sched domains: */
+static cpumask_var_t *doms_cur;
+
+/* Number of sched domains in 'doms_cur': */
+static int ndoms_cur;
+
+/* Attributes of custom domains in 'doms_cur' */
+static struct sched_domain_attr *dattr_cur;
+
+/*
+ * Special case: If a kmalloc() of a doms_cur partition (array of
+ * cpumask) fails, then fall back to a single sched domain,
+ * as determined by the single cpumask fallback_doms.
+ */
+cpumask_var_t fallback_doms;
+
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * CPU core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
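+ *
+ * (Editor's note, not part of the original patch: s390 and powerpc,
+ * for example, override this weak hook so that hypervisor-driven
+ * topology changes make partition_sched_domains() rebuild the
+ * domains.)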
+ */ +int __weak arch_update_cpu_topology(void) +{ + return 0; +} + +cpumask_var_t *alloc_sched_domains(unsigned int ndoms) +{ + int i; + cpumask_var_t *doms; + + doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); + if (!doms) + return NULL; + for (i = 0; i < ndoms; i++) { + if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { + free_sched_domains(doms, i); + return NULL; + } + } + return doms; +} + +void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) +{ + unsigned int i; + for (i = 0; i < ndoms; i++) + free_cpumask_var(doms[i]); + kfree(doms); +} + +/* + * Set up scheduler domains and groups. Callers must hold the hotplug lock. + * For now this just excludes isolated CPUs, but could be used to + * exclude other special cases in the future. + */ +int init_sched_domains(const struct cpumask *cpu_map) +{ + int err; + + arch_update_cpu_topology(); + ndoms_cur = 1; + doms_cur = alloc_sched_domains(ndoms_cur); + if (!doms_cur) + doms_cur = &fallback_doms; + cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); + err = build_sched_domains(doms_cur[0], NULL); + register_sched_domain_sysctl(); + + return err; +} + +/* + * Detach sched domains from a group of CPUs specified in cpu_map + * These CPUs will now be attached to the NULL domain + */ +static void detach_destroy_domains(const struct cpumask *cpu_map) +{ + int i; + + rcu_read_lock(); + for_each_cpu(i, cpu_map) + cpu_attach_domain(NULL, &def_root_domain, i); + rcu_read_unlock(); +} + +/* handle null as "default" */ +static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, + struct sched_domain_attr *new, int idx_new) +{ + struct sched_domain_attr tmp; + + /* Fast path: */ + if (!new && !cur) + return 1; + + tmp = SD_ATTR_INIT; + return !memcmp(cur ? (cur + idx_cur) : &tmp, + new ? (new + idx_new) : &tmp, + sizeof(struct sched_domain_attr)); +} + +/* + * Partition sched domains as specified by the 'ndoms_new' + * cpumasks in the array doms_new[] of cpumasks. This compares + * doms_new[] to the current sched domain partitioning, doms_cur[]. + * It destroys each deleted domain and builds each new domain. + * + * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. + * The masks don't intersect (don't overlap.) We should setup one + * sched domain for each mask. CPUs not in any of the cpumasks will + * not be load balanced. If the same cpumask appears both in the + * current 'doms_cur' domains and in the new 'doms_new', we can leave + * it as it is. + * + * The passed in 'doms_new' should be allocated using + * alloc_sched_domains. This routine takes ownership of it and will + * free_sched_domains it when done with it. If the caller failed the + * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, + * and partition_sched_domains() will fallback to the single partition + * 'fallback_doms', it also forces the domains to be rebuilt. + * + * If doms_new == NULL it will be replaced with cpu_online_mask. + * ndoms_new == 0 is a special case for destroying existing domains, + * and it will not create the default domain. + * + * Call with hotplug lock held + */ +void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new) +{ + int i, j, n; + int new_topology; + + mutex_lock(&sched_domains_mutex); + + /* Always unregister in case we don't destroy any domains: */ + unregister_sched_domain_sysctl(); + + /* Let the architecture update CPU core mappings: */ + new_topology = arch_update_cpu_topology(); + + n = doms_new ? 
ndoms_new : 0; + + /* Destroy deleted domains: */ + for (i = 0; i < ndoms_cur; i++) { + for (j = 0; j < n && !new_topology; j++) { + if (cpumask_equal(doms_cur[i], doms_new[j]) + && dattrs_equal(dattr_cur, i, dattr_new, j)) + goto match1; + } + /* No match - a current sched domain not in new doms_new[] */ + detach_destroy_domains(doms_cur[i]); +match1: + ; + } + + n = ndoms_cur; + if (doms_new == NULL) { + n = 0; + doms_new = &fallback_doms; + cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); + WARN_ON_ONCE(dattr_new); + } + + /* Build new domains: */ + for (i = 0; i < ndoms_new; i++) { + for (j = 0; j < n && !new_topology; j++) { + if (cpumask_equal(doms_new[i], doms_cur[j]) + && dattrs_equal(dattr_new, i, dattr_cur, j)) + goto match2; + } + /* No match - add a new doms_new */ + build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); +match2: + ; + } + + /* Remember the new sched domains: */ + if (doms_cur != &fallback_doms) + free_sched_domains(doms_cur, ndoms_cur); + + kfree(dattr_cur); + doms_cur = doms_new; + dattr_cur = dattr_new; + ndoms_cur = ndoms_new; + + register_sched_domain_sysctl(); + + mutex_unlock(&sched_domains_mutex); +} + -- cgit v1.2.3 From 0f5bf6d0afe4be6e1391908ff2d6dc9730e91550 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Mon, 6 Feb 2017 16:31:58 -0800 Subject: arch: Rename CONFIG_DEBUG_RODATA and CONFIG_DEBUG_MODULE_RONX Both of these options are poorly named. The features they provide are necessary for system security and should not be considered debug only. Change the names to CONFIG_STRICT_KERNEL_RWX and CONFIG_STRICT_MODULE_RWX to better describe what these options do. Signed-off-by: Laura Abbott Acked-by: Jessica Yu Signed-off-by: Kees Cook --- Documentation/DocBook/kgdb.tmpl | 8 ++++---- Documentation/security/self-protection.txt | 4 ++-- arch/Kconfig | 4 ++-- arch/arm/configs/aspeed_g4_defconfig | 4 ++-- arch/arm/configs/aspeed_g5_defconfig | 4 ++-- arch/arm/include/asm/cacheflush.h | 2 +- arch/arm/kernel/patch.c | 4 ++-- arch/arm/kernel/vmlinux.lds.S | 8 ++++---- arch/arm/mm/Kconfig | 2 +- arch/arm/mm/init.c | 4 ++-- arch/arm64/Kconfig.debug | 2 +- arch/arm64/kernel/insn.c | 2 +- arch/parisc/configs/712_defconfig | 1 - arch/parisc/configs/c3000_defconfig | 1 - arch/parisc/mm/init.c | 2 +- include/linux/filter.h | 4 ++-- include/linux/init.h | 4 ++-- include/linux/module.h | 2 +- init/main.c | 4 ++-- kernel/configs/android-recommended.config | 2 +- kernel/module.c | 6 +++--- kernel/power/hibernate.c | 2 +- kernel/power/power.h | 4 ++-- kernel/power/snapshot.c | 4 ++-- 24 files changed, 41 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/Documentation/DocBook/kgdb.tmpl b/Documentation/DocBook/kgdb.tmpl index f3abca7ec53d..856ac20bf367 100644 --- a/Documentation/DocBook/kgdb.tmpl +++ b/Documentation/DocBook/kgdb.tmpl @@ -115,12 +115,12 @@ If the architecture that you are using supports the kernel option - CONFIG_DEBUG_RODATA, you should consider turning it off. This + CONFIG_STRICT_KERNEL_RWX, you should consider turning it off. This option will prevent the use of software breakpoints because it marks certain regions of the kernel's memory space as read-only. If kgdb supports it for the architecture you are using, you can use hardware breakpoints if you desire to run with the - CONFIG_DEBUG_RODATA option turned on, else you need to turn off + CONFIG_STRICT_KERNEL_RWX option turned on, else you need to turn off this option. 
@@ -135,7 +135,7 @@ Here is an example set of .config symbols to enable or disable for kgdb: - # CONFIG_DEBUG_RODATA is not set + # CONFIG_STRICT_KERNEL_RWX is not set CONFIG_FRAME_POINTER=y CONFIG_KGDB=y CONFIG_KGDB_SERIAL_CONSOLE=y @@ -166,7 +166,7 @@ Here is an example set of .config symbols to enable/disable kdb: - # CONFIG_DEBUG_RODATA is not set + # CONFIG_STRICT_KERNEL_RWX is not set CONFIG_FRAME_POINTER=y CONFIG_KGDB=y CONFIG_KGDB_SERIAL_CONSOLE=y diff --git a/Documentation/security/self-protection.txt b/Documentation/security/self-protection.txt index f41dd00e8b98..141acfebe6ef 100644 --- a/Documentation/security/self-protection.txt +++ b/Documentation/security/self-protection.txt @@ -51,8 +51,8 @@ kernel, they are implemented in a way where the memory is temporarily made writable during the update, and then returned to the original permissions.) -In support of this are (the poorly named) CONFIG_DEBUG_RODATA and -CONFIG_DEBUG_SET_MODULE_RONX, which seek to make sure that code is not +In support of this are CONFIG_STRICT_KERNEL_RWX and +CONFIG_STRICT_MODULE_RWX, which seek to make sure that code is not writable, data is not executable, and read-only data is neither writable nor executable. diff --git a/arch/Kconfig b/arch/Kconfig index 3f8b8be3036f..33f5a555c32a 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -790,7 +790,7 @@ config ARCH_OPTIONAL_KERNEL_RWX_DEFAULT config ARCH_HAS_STRICT_KERNEL_RWX def_bool n -config DEBUG_RODATA +config STRICT_KERNEL_RWX bool "Make kernel text and rodata read-only" if ARCH_OPTIONAL_KERNEL_RWX depends on ARCH_HAS_STRICT_KERNEL_RWX default !ARCH_OPTIONAL_KERNEL_RWX || ARCH_OPTIONAL_KERNEL_RWX_DEFAULT @@ -806,7 +806,7 @@ config DEBUG_RODATA config ARCH_HAS_STRICT_MODULE_RWX def_bool n -config DEBUG_SET_MODULE_RONX +config STRICT_MODULE_RWX bool "Set loadable kernel module data as NX and text as RO" if ARCH_OPTIONAL_KERNEL_RWX depends on ARCH_HAS_STRICT_MODULE_RWX && MODULES default !ARCH_OPTIONAL_KERNEL_RWX || ARCH_OPTIONAL_KERNEL_RWX_DEFAULT diff --git a/arch/arm/configs/aspeed_g4_defconfig b/arch/arm/configs/aspeed_g4_defconfig index ca39c04fec6b..05b99bc1c1ce 100644 --- a/arch/arm/configs/aspeed_g4_defconfig +++ b/arch/arm/configs/aspeed_g4_defconfig @@ -25,7 +25,6 @@ CONFIG_MODULE_UNLOAD=y # CONFIG_ARCH_MULTI_V7 is not set CONFIG_ARCH_ASPEED=y CONFIG_MACH_ASPEED_G4=y -CONFIG_DEBUG_RODATA=y CONFIG_AEABI=y CONFIG_UACCESS_WITH_MEMCPY=y CONFIG_SECCOMP=y @@ -79,7 +78,8 @@ CONFIG_DEBUG_LL_UART_8250=y CONFIG_DEBUG_UART_PHYS=0x1e784000 CONFIG_DEBUG_UART_VIRT=0xe8784000 CONFIG_EARLY_PRINTK=y -CONFIG_DEBUG_SET_MODULE_RONX=y +CONFIG_STRICT_MODULE_RWX=y +CONFIG_STRICT_KERNEL_RWX=y # CONFIG_XZ_DEC_X86 is not set # CONFIG_XZ_DEC_POWERPC is not set # CONFIG_XZ_DEC_IA64 is not set diff --git a/arch/arm/configs/aspeed_g5_defconfig b/arch/arm/configs/aspeed_g5_defconfig index 4f366b0370e9..05a16d53d03c 100644 --- a/arch/arm/configs/aspeed_g5_defconfig +++ b/arch/arm/configs/aspeed_g5_defconfig @@ -26,7 +26,6 @@ CONFIG_ARCH_MULTI_V6=y # CONFIG_ARCH_MULTI_V7 is not set CONFIG_ARCH_ASPEED=y CONFIG_MACH_ASPEED_G5=y -CONFIG_DEBUG_RODATA=y CONFIG_AEABI=y CONFIG_UACCESS_WITH_MEMCPY=y CONFIG_SECCOMP=y @@ -81,7 +80,8 @@ CONFIG_DEBUG_LL_UART_8250=y CONFIG_DEBUG_UART_PHYS=0x1e784000 CONFIG_DEBUG_UART_VIRT=0xe8784000 CONFIG_EARLY_PRINTK=y -CONFIG_DEBUG_SET_MODULE_RONX=y +CONFIG_STRICT_MODULE_RWX=y +CONFIG_STRICT_KERNEL_RWX=y # CONFIG_XZ_DEC_X86 is not set # CONFIG_XZ_DEC_POWERPC is not set # CONFIG_XZ_DEC_IA64 is not set diff --git a/arch/arm/include/asm/cacheflush.h 
b/arch/arm/include/asm/cacheflush.h index bdd283bc5842..02454fa15d2c 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -490,7 +490,7 @@ static inline int set_memory_x(unsigned long addr, int numpages) { return 0; } static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } #endif -#ifdef CONFIG_DEBUG_RODATA +#ifdef CONFIG_STRICT_KERNEL_RWX void set_kernel_text_rw(void); void set_kernel_text_ro(void); #else diff --git a/arch/arm/kernel/patch.c b/arch/arm/kernel/patch.c index 69bda1a5707e..020560b2dcb7 100644 --- a/arch/arm/kernel/patch.c +++ b/arch/arm/kernel/patch.c @@ -24,9 +24,9 @@ static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags) bool module = !core_kernel_text(uintaddr); struct page *page; - if (module && IS_ENABLED(CONFIG_DEBUG_SET_MODULE_RONX)) + if (module && IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) page = vmalloc_to_page(addr); - else if (!module && IS_ENABLED(CONFIG_DEBUG_RODATA)) + else if (!module && IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) page = virt_to_page(addr); else return addr; diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index f7f55df0bf7b..ce18007f9e4e 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -97,7 +97,7 @@ SECTIONS HEAD_TEXT } -#ifdef CONFIG_DEBUG_RODATA +#ifdef CONFIG_STRICT_KERNEL_RWX . = ALIGN(1<len * sizeof(fprog->filter[0])) -#ifdef CONFIG_DEBUG_SET_MODULE_RONX +#ifdef CONFIG_STRICT_MODULE_RWX static inline void bpf_prog_lock_ro(struct bpf_prog *fp) { set_memory_ro((unsigned long)fp, fp->pages); @@ -561,7 +561,7 @@ static inline void bpf_prog_lock_ro(struct bpf_prog *fp) static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) { } -#endif /* CONFIG_DEBUG_SET_MODULE_RONX */ +#endif /* CONFIG_STRICT_MODULE_RWX */ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); static inline int sk_filter(struct sock *sk, struct sk_buff *skb) diff --git a/include/linux/init.h b/include/linux/init.h index 885c3e6d0f9d..79af0962fd52 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -126,10 +126,10 @@ void prepare_namespace(void); void __init load_default_modules(void); int __init init_rootfs(void); -#if defined(CONFIG_DEBUG_RODATA) || defined(CONFIG_DEBUG_SET_MODULE_RONX) +#if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_STRICT_MODULE_RWX) extern bool rodata_enabled; #endif -#ifdef CONFIG_DEBUG_RODATA +#ifdef CONFIG_STRICT_KERNEL_RWX void mark_rodata_ro(void); #endif diff --git a/include/linux/module.h b/include/linux/module.h index 7c84273d60b9..d5afd142818f 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -764,7 +764,7 @@ extern int module_sysfs_initialized; #define __MODULE_STRING(x) __stringify(x) -#ifdef CONFIG_DEBUG_SET_MODULE_RONX +#ifdef CONFIG_STRICT_MODULE_RWX extern void set_all_modules_text_rw(void); extern void set_all_modules_text_ro(void); extern void module_enable_ro(const struct module *mod, bool after_init); diff --git a/init/main.c b/init/main.c index b0c9d6facef9..0b7bae29eef6 100644 --- a/init/main.c +++ b/init/main.c @@ -925,7 +925,7 @@ static int try_to_run_init_process(const char *init_filename) static noinline void __init kernel_init_freeable(void); -#if defined(CONFIG_DEBUG_RODATA) || defined(CONFIG_DEBUG_SET_MODULE_RONX) +#if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_STRICT_MODULE_RWX) bool rodata_enabled __ro_after_init = true; static int __init set_debug_rodata(char *str) { @@ -934,7 +934,7 @@ static int __init 
set_debug_rodata(char *str) __setup("rodata=", set_debug_rodata); #endif -#ifdef CONFIG_DEBUG_RODATA +#ifdef CONFIG_STRICT_KERNEL_RWX static void mark_readonly(void) { if (rodata_enabled) diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index 297756be369c..99127edc5204 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -11,7 +11,7 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 CONFIG_COMPACTION=y -CONFIG_DEBUG_RODATA=y +CONFIG_STRICT_KERNEL_RWX=y CONFIG_DM_CRYPT=y CONFIG_DM_UEVENT=y CONFIG_DM_VERITY=y diff --git a/kernel/module.c b/kernel/module.c index 5088784c0cf9..e71478569273 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -74,9 +74,9 @@ /* * Modules' sections will be aligned on page boundaries * to ensure complete separation of code and data, but - * only when CONFIG_DEBUG_SET_MODULE_RONX=y + * only when CONFIG_STRICT_MODULE_RWX=y */ -#ifdef CONFIG_DEBUG_SET_MODULE_RONX +#ifdef CONFIG_STRICT_MODULE_RWX # define debug_align(X) ALIGN(X, PAGE_SIZE) #else # define debug_align(X) (X) @@ -1847,7 +1847,7 @@ static void mod_sysfs_teardown(struct module *mod) mod_sysfs_fini(mod); } -#ifdef CONFIG_DEBUG_SET_MODULE_RONX +#ifdef CONFIG_STRICT_MODULE_RWX /* * LKM RO/NX protection: protect module's text/ro-data * from modification and any data from execution. diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b26dbc48c75b..86385af1080f 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1156,7 +1156,7 @@ static int __init hibernate_setup(char *str) } else if (!strncmp(str, "no", 2)) { noresume = 1; nohibernate = 1; - } else if (IS_ENABLED(CONFIG_DEBUG_RODATA) + } else if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) && !strncmp(str, "protect_image", 13)) { enable_restore_image_protection(); } diff --git a/kernel/power/power.h b/kernel/power/power.h index 1dfa0da827d3..7fdc40d31b7d 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -61,12 +61,12 @@ extern int hibernation_snapshot(int platform_mode); extern int hibernation_restore(int platform_mode); extern int hibernation_platform_enter(void); -#ifdef CONFIG_DEBUG_RODATA +#ifdef CONFIG_STRICT_KERNEL_RWX /* kernel/power/snapshot.c */ extern void enable_restore_image_protection(void); #else static inline void enable_restore_image_protection(void) {} -#endif /* CONFIG_DEBUG_RODATA */ +#endif /* CONFIG_STRICT_KERNEL_RWX */ #else /* !CONFIG_HIBERNATION */ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 2d8e2b227db8..905d5bbd595f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -38,7 +38,7 @@ #include "power.h" -#ifdef CONFIG_DEBUG_RODATA +#ifdef CONFIG_STRICT_KERNEL_RWX static bool hibernate_restore_protection; static bool hibernate_restore_protection_active; @@ -73,7 +73,7 @@ static inline void hibernate_restore_protection_begin(void) {} static inline void hibernate_restore_protection_end(void) {} static inline void hibernate_restore_protect_page(void *page_address) {} static inline void hibernate_restore_unprotect_page(void *page_address) {} -#endif /* CONFIG_DEBUG_RODATA */ +#endif /* CONFIG_STRICT_KERNEL_RWX */ static int swsusp_page_is_free(struct page *); static void swsusp_set_page_forbidden(struct page *); -- cgit v1.2.3 From 1051408f7ecdcd1baf86f5dd5fdc44740be3b23d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 1 Feb 2017 18:42:41 +0100 Subject: sched/autogroup: Rename auto_group.[ch] to autogroup.[ch] The names are 
all 'autogroup', not 'auto_group' - so rename the kernel/sched/auto_group.[ch] to match the existing nomenclature. Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Thomas Gleixner Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/Makefile | 2 +- kernel/sched/auto_group.c | 271 ---------------------------------------------- kernel/sched/auto_group.h | 64 ----------- kernel/sched/autogroup.c | 271 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/autogroup.h | 64 +++++++++++ kernel/sched/sched.h | 2 +- 6 files changed, 337 insertions(+), 337 deletions(-) delete mode 100644 kernel/sched/auto_group.c delete mode 100644 kernel/sched/auto_group.h create mode 100644 kernel/sched/autogroup.c create mode 100644 kernel/sched/autogroup.h (limited to 'kernel') diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 130ce8ac725b..89ab6758667b 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,7 +19,7 @@ obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o swait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o -obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o +obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c deleted file mode 100644 index da39489d2d80..000000000000 --- a/kernel/sched/auto_group.c +++ /dev/null @@ -1,271 +0,0 @@ -#include "sched.h" - -#include -#include -#include -#include -#include -#include - -unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; -static struct autogroup autogroup_default; -static atomic_t autogroup_seq_nr; - -void __init autogroup_init(struct task_struct *init_task) -{ - autogroup_default.tg = &root_task_group; - kref_init(&autogroup_default.kref); - init_rwsem(&autogroup_default.lock); - init_task->signal->autogroup = &autogroup_default; -} - -void autogroup_free(struct task_group *tg) -{ - kfree(tg->autogroup); -} - -static inline void autogroup_destroy(struct kref *kref) -{ - struct autogroup *ag = container_of(kref, struct autogroup, kref); - -#ifdef CONFIG_RT_GROUP_SCHED - /* We've redirected RT tasks to the root task group... 
*/ - ag->tg->rt_se = NULL; - ag->tg->rt_rq = NULL; -#endif - sched_offline_group(ag->tg); - sched_destroy_group(ag->tg); -} - -static inline void autogroup_kref_put(struct autogroup *ag) -{ - kref_put(&ag->kref, autogroup_destroy); -} - -static inline struct autogroup *autogroup_kref_get(struct autogroup *ag) -{ - kref_get(&ag->kref); - return ag; -} - -static inline struct autogroup *autogroup_task_get(struct task_struct *p) -{ - struct autogroup *ag; - unsigned long flags; - - if (!lock_task_sighand(p, &flags)) - return autogroup_kref_get(&autogroup_default); - - ag = autogroup_kref_get(p->signal->autogroup); - unlock_task_sighand(p, &flags); - - return ag; -} - -static inline struct autogroup *autogroup_create(void) -{ - struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); - struct task_group *tg; - - if (!ag) - goto out_fail; - - tg = sched_create_group(&root_task_group); - - if (IS_ERR(tg)) - goto out_free; - - kref_init(&ag->kref); - init_rwsem(&ag->lock); - ag->id = atomic_inc_return(&autogroup_seq_nr); - ag->tg = tg; -#ifdef CONFIG_RT_GROUP_SCHED - /* - * Autogroup RT tasks are redirected to the root task group - * so we don't have to move tasks around upon policy change, - * or flail around trying to allocate bandwidth on the fly. - * A bandwidth exception in __sched_setscheduler() allows - * the policy change to proceed. - */ - free_rt_sched_group(tg); - tg->rt_se = root_task_group.rt_se; - tg->rt_rq = root_task_group.rt_rq; -#endif - tg->autogroup = ag; - - sched_online_group(tg, &root_task_group); - return ag; - -out_free: - kfree(ag); -out_fail: - if (printk_ratelimit()) { - printk(KERN_WARNING "autogroup_create: %s failure.\n", - ag ? "sched_create_group()" : "kmalloc()"); - } - - return autogroup_kref_get(&autogroup_default); -} - -bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) -{ - if (tg != &root_task_group) - return false; - /* - * If we race with autogroup_move_group() the caller can use the old - * value of signal->autogroup but in this case sched_move_task() will - * be called again before autogroup_kref_put(). - * - * However, there is no way sched_autogroup_exit_task() could tell us - * to avoid autogroup->tg, so we abuse PF_EXITING flag for this case. - */ - if (p->flags & PF_EXITING) - return false; - - return true; -} - -void sched_autogroup_exit_task(struct task_struct *p) -{ - /* - * We are going to call exit_notify() and autogroup_move_group() can't - * see this thread after that: we can no longer use signal->autogroup. - * See the PF_EXITING check in task_wants_autogroup(). - */ - sched_move_task(p); -} - -static void -autogroup_move_group(struct task_struct *p, struct autogroup *ag) -{ - struct autogroup *prev; - struct task_struct *t; - unsigned long flags; - - BUG_ON(!lock_task_sighand(p, &flags)); - - prev = p->signal->autogroup; - if (prev == ag) { - unlock_task_sighand(p, &flags); - return; - } - - p->signal->autogroup = autogroup_kref_get(ag); - /* - * We can't avoid sched_move_task() after we changed signal->autogroup, - * this process can already run with task_group() == prev->tg or we can - * race with cgroup code which can read autogroup = prev under rq->lock. - * In the latter case for_each_thread() can not miss a migrating thread, - * cpu_cgroup_attach() must not be possible after cgroup_exit() and it - * can't be removed from thread list, we hold ->siglock. - * - * If an exiting thread was already removed from thread list we rely on - * sched_autogroup_exit_task(). 
- */ - for_each_thread(p, t) - sched_move_task(t); - - unlock_task_sighand(p, &flags); - autogroup_kref_put(prev); -} - -/* Allocates GFP_KERNEL, cannot be called under any spinlock */ -void sched_autogroup_create_attach(struct task_struct *p) -{ - struct autogroup *ag = autogroup_create(); - - autogroup_move_group(p, ag); - /* drop extra reference added by autogroup_create() */ - autogroup_kref_put(ag); -} -EXPORT_SYMBOL(sched_autogroup_create_attach); - -/* Cannot be called under siglock. Currently has no users */ -void sched_autogroup_detach(struct task_struct *p) -{ - autogroup_move_group(p, &autogroup_default); -} -EXPORT_SYMBOL(sched_autogroup_detach); - -void sched_autogroup_fork(struct signal_struct *sig) -{ - sig->autogroup = autogroup_task_get(current); -} - -void sched_autogroup_exit(struct signal_struct *sig) -{ - autogroup_kref_put(sig->autogroup); -} - -static int __init setup_autogroup(char *str) -{ - sysctl_sched_autogroup_enabled = 0; - - return 1; -} - -__setup("noautogroup", setup_autogroup); - -#ifdef CONFIG_PROC_FS - -int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) -{ - static unsigned long next = INITIAL_JIFFIES; - struct autogroup *ag; - unsigned long shares; - int err; - - if (nice < MIN_NICE || nice > MAX_NICE) - return -EINVAL; - - err = security_task_setnice(current, nice); - if (err) - return err; - - if (nice < 0 && !can_nice(current, nice)) - return -EPERM; - - /* this is a heavy operation taking global locks.. */ - if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) - return -EAGAIN; - - next = HZ / 10 + jiffies; - ag = autogroup_task_get(p); - shares = scale_load(sched_prio_to_weight[nice + 20]); - - down_write(&ag->lock); - err = sched_group_set_shares(ag->tg, shares); - if (!err) - ag->nice = nice; - up_write(&ag->lock); - - autogroup_kref_put(ag); - - return err; -} - -void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) -{ - struct autogroup *ag = autogroup_task_get(p); - - if (!task_group_is_autogroup(ag->tg)) - goto out; - - down_read(&ag->lock); - seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); - up_read(&ag->lock); - -out: - autogroup_kref_put(ag); -} -#endif /* CONFIG_PROC_FS */ - -#ifdef CONFIG_SCHED_DEBUG -int autogroup_path(struct task_group *tg, char *buf, int buflen) -{ - if (!task_group_is_autogroup(tg)) - return 0; - - return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); -} -#endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h deleted file mode 100644 index 890c95f2587a..000000000000 --- a/kernel/sched/auto_group.h +++ /dev/null @@ -1,64 +0,0 @@ -#ifdef CONFIG_SCHED_AUTOGROUP - -#include -#include - -struct autogroup { - /* - * reference doesn't mean how many thread attach to this - * autogroup now. It just stands for the number of task - * could use this autogroup. 
- */ - struct kref kref; - struct task_group *tg; - struct rw_semaphore lock; - unsigned long id; - int nice; -}; - -extern void autogroup_init(struct task_struct *init_task); -extern void autogroup_free(struct task_group *tg); - -static inline bool task_group_is_autogroup(struct task_group *tg) -{ - return !!tg->autogroup; -} - -extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); - -static inline struct task_group * -autogroup_task_group(struct task_struct *p, struct task_group *tg) -{ - int enabled = READ_ONCE(sysctl_sched_autogroup_enabled); - - if (enabled && task_wants_autogroup(p, tg)) - return p->signal->autogroup->tg; - - return tg; -} - -extern int autogroup_path(struct task_group *tg, char *buf, int buflen); - -#else /* !CONFIG_SCHED_AUTOGROUP */ - -static inline void autogroup_init(struct task_struct *init_task) { } -static inline void autogroup_free(struct task_group *tg) { } -static inline bool task_group_is_autogroup(struct task_group *tg) -{ - return 0; -} - -static inline struct task_group * -autogroup_task_group(struct task_struct *p, struct task_group *tg) -{ - return tg; -} - -#ifdef CONFIG_SCHED_DEBUG -static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) -{ - return 0; -} -#endif - -#endif /* CONFIG_SCHED_AUTOGROUP */ diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c new file mode 100644 index 000000000000..da39489d2d80 --- /dev/null +++ b/kernel/sched/autogroup.c @@ -0,0 +1,271 @@ +#include "sched.h" + +#include +#include +#include +#include +#include +#include + +unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; +static struct autogroup autogroup_default; +static atomic_t autogroup_seq_nr; + +void __init autogroup_init(struct task_struct *init_task) +{ + autogroup_default.tg = &root_task_group; + kref_init(&autogroup_default.kref); + init_rwsem(&autogroup_default.lock); + init_task->signal->autogroup = &autogroup_default; +} + +void autogroup_free(struct task_group *tg) +{ + kfree(tg->autogroup); +} + +static inline void autogroup_destroy(struct kref *kref) +{ + struct autogroup *ag = container_of(kref, struct autogroup, kref); + +#ifdef CONFIG_RT_GROUP_SCHED + /* We've redirected RT tasks to the root task group... */ + ag->tg->rt_se = NULL; + ag->tg->rt_rq = NULL; +#endif + sched_offline_group(ag->tg); + sched_destroy_group(ag->tg); +} + +static inline void autogroup_kref_put(struct autogroup *ag) +{ + kref_put(&ag->kref, autogroup_destroy); +} + +static inline struct autogroup *autogroup_kref_get(struct autogroup *ag) +{ + kref_get(&ag->kref); + return ag; +} + +static inline struct autogroup *autogroup_task_get(struct task_struct *p) +{ + struct autogroup *ag; + unsigned long flags; + + if (!lock_task_sighand(p, &flags)) + return autogroup_kref_get(&autogroup_default); + + ag = autogroup_kref_get(p->signal->autogroup); + unlock_task_sighand(p, &flags); + + return ag; +} + +static inline struct autogroup *autogroup_create(void) +{ + struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); + struct task_group *tg; + + if (!ag) + goto out_fail; + + tg = sched_create_group(&root_task_group); + + if (IS_ERR(tg)) + goto out_free; + + kref_init(&ag->kref); + init_rwsem(&ag->lock); + ag->id = atomic_inc_return(&autogroup_seq_nr); + ag->tg = tg; +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Autogroup RT tasks are redirected to the root task group + * so we don't have to move tasks around upon policy change, + * or flail around trying to allocate bandwidth on the fly. 
+ * A bandwidth exception in __sched_setscheduler() allows + * the policy change to proceed. + */ + free_rt_sched_group(tg); + tg->rt_se = root_task_group.rt_se; + tg->rt_rq = root_task_group.rt_rq; +#endif + tg->autogroup = ag; + + sched_online_group(tg, &root_task_group); + return ag; + +out_free: + kfree(ag); +out_fail: + if (printk_ratelimit()) { + printk(KERN_WARNING "autogroup_create: %s failure.\n", + ag ? "sched_create_group()" : "kmalloc()"); + } + + return autogroup_kref_get(&autogroup_default); +} + +bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) +{ + if (tg != &root_task_group) + return false; + /* + * If we race with autogroup_move_group() the caller can use the old + * value of signal->autogroup but in this case sched_move_task() will + * be called again before autogroup_kref_put(). + * + * However, there is no way sched_autogroup_exit_task() could tell us + * to avoid autogroup->tg, so we abuse PF_EXITING flag for this case. + */ + if (p->flags & PF_EXITING) + return false; + + return true; +} + +void sched_autogroup_exit_task(struct task_struct *p) +{ + /* + * We are going to call exit_notify() and autogroup_move_group() can't + * see this thread after that: we can no longer use signal->autogroup. + * See the PF_EXITING check in task_wants_autogroup(). + */ + sched_move_task(p); +} + +static void +autogroup_move_group(struct task_struct *p, struct autogroup *ag) +{ + struct autogroup *prev; + struct task_struct *t; + unsigned long flags; + + BUG_ON(!lock_task_sighand(p, &flags)); + + prev = p->signal->autogroup; + if (prev == ag) { + unlock_task_sighand(p, &flags); + return; + } + + p->signal->autogroup = autogroup_kref_get(ag); + /* + * We can't avoid sched_move_task() after we changed signal->autogroup, + * this process can already run with task_group() == prev->tg or we can + * race with cgroup code which can read autogroup = prev under rq->lock. + * In the latter case for_each_thread() can not miss a migrating thread, + * cpu_cgroup_attach() must not be possible after cgroup_exit() and it + * can't be removed from thread list, we hold ->siglock. + * + * If an exiting thread was already removed from thread list we rely on + * sched_autogroup_exit_task(). + */ + for_each_thread(p, t) + sched_move_task(t); + + unlock_task_sighand(p, &flags); + autogroup_kref_put(prev); +} + +/* Allocates GFP_KERNEL, cannot be called under any spinlock */ +void sched_autogroup_create_attach(struct task_struct *p) +{ + struct autogroup *ag = autogroup_create(); + + autogroup_move_group(p, ag); + /* drop extra reference added by autogroup_create() */ + autogroup_kref_put(ag); +} +EXPORT_SYMBOL(sched_autogroup_create_attach); + +/* Cannot be called under siglock. 
Currently has no users */ +void sched_autogroup_detach(struct task_struct *p) +{ + autogroup_move_group(p, &autogroup_default); +} +EXPORT_SYMBOL(sched_autogroup_detach); + +void sched_autogroup_fork(struct signal_struct *sig) +{ + sig->autogroup = autogroup_task_get(current); +} + +void sched_autogroup_exit(struct signal_struct *sig) +{ + autogroup_kref_put(sig->autogroup); +} + +static int __init setup_autogroup(char *str) +{ + sysctl_sched_autogroup_enabled = 0; + + return 1; +} + +__setup("noautogroup", setup_autogroup); + +#ifdef CONFIG_PROC_FS + +int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) +{ + static unsigned long next = INITIAL_JIFFIES; + struct autogroup *ag; + unsigned long shares; + int err; + + if (nice < MIN_NICE || nice > MAX_NICE) + return -EINVAL; + + err = security_task_setnice(current, nice); + if (err) + return err; + + if (nice < 0 && !can_nice(current, nice)) + return -EPERM; + + /* this is a heavy operation taking global locks.. */ + if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) + return -EAGAIN; + + next = HZ / 10 + jiffies; + ag = autogroup_task_get(p); + shares = scale_load(sched_prio_to_weight[nice + 20]); + + down_write(&ag->lock); + err = sched_group_set_shares(ag->tg, shares); + if (!err) + ag->nice = nice; + up_write(&ag->lock); + + autogroup_kref_put(ag); + + return err; +} + +void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) +{ + struct autogroup *ag = autogroup_task_get(p); + + if (!task_group_is_autogroup(ag->tg)) + goto out; + + down_read(&ag->lock); + seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); + up_read(&ag->lock); + +out: + autogroup_kref_put(ag); +} +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_SCHED_DEBUG +int autogroup_path(struct task_group *tg, char *buf, int buflen) +{ + if (!task_group_is_autogroup(tg)) + return 0; + + return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); +} +#endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h new file mode 100644 index 000000000000..890c95f2587a --- /dev/null +++ b/kernel/sched/autogroup.h @@ -0,0 +1,64 @@ +#ifdef CONFIG_SCHED_AUTOGROUP + +#include +#include + +struct autogroup { + /* + * reference doesn't mean how many thread attach to this + * autogroup now. It just stands for the number of task + * could use this autogroup. 
+ */ + struct kref kref; + struct task_group *tg; + struct rw_semaphore lock; + unsigned long id; + int nice; +}; + +extern void autogroup_init(struct task_struct *init_task); +extern void autogroup_free(struct task_group *tg); + +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return !!tg->autogroup; +} + +extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + int enabled = READ_ONCE(sysctl_sched_autogroup_enabled); + + if (enabled && task_wants_autogroup(p, tg)) + return p->signal->autogroup->tg; + + return tg; +} + +extern int autogroup_path(struct task_group *tg, char *buf, int buflen); + +#else /* !CONFIG_SCHED_AUTOGROUP */ + +static inline void autogroup_init(struct task_struct *init_task) { } +static inline void autogroup_free(struct task_group *tg) { } +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return 0; +} + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + return tg; +} + +#ifdef CONFIG_SCHED_DEBUG +static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) +{ + return 0; +} +#endif + +#endif /* CONFIG_SCHED_AUTOGROUP */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 17ed94b9b413..71b10a9b73cf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1069,7 +1069,7 @@ static inline void sched_ttwu_pending(void) { } #endif /* CONFIG_SMP */ #include "stats.h" -#include "auto_group.h" +#include "autogroup.h" #ifdef CONFIG_CGROUP_SCHED -- cgit v1.2.3 From bd66a89249892acc9d938ba4956066b21403fa5f Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 27 Dec 2016 23:16:04 +0900 Subject: printk: use vprintk_func in vprintk() vprintk(), just like printk(), should use the per-cpu printk_func instead of a direct vprintk_emit() call, just in case vprintk() is ever called from NMI, or from any other context that can deadlock in printk(). Link: http://lkml.kernel.org/r/20161227141611.940-2-sergey.senozhatsky@gmail.com Cc: Andrew Morton Cc: Linus Torvalds Cc: Jan Kara Cc: Tejun Heo Cc: Calvin Owens Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Peter Hurley Cc: linux-kernel@vger.kernel.org Signed-off-by: Sergey Senozhatsky Reviewed-by: Steven Rostedt Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 8b2696420abb..106843a83b63 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1803,7 +1803,7 @@ EXPORT_SYMBOL(vprintk_emit); asmlinkage int vprintk(const char *fmt, va_list args) { - return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); + return vprintk_func(fmt, args); } EXPORT_SYMBOL(vprintk); -- cgit v1.2.3 From f92bac3b141b8233e34ddf32d227e12bfba07b48 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 27 Dec 2016 23:16:05 +0900 Subject: printk: rename nmi.c and exported api A preparation patch for printk_safe work. No functional change. - rename nmi.c to printk_safe.c - add a `printk_safe' prefix to some of the exported functions (those used by both printk-safe and printk-nmi).
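A note for readers following this series: the mechanism all of these printk patches build on is the function-pointer dispatch visible in the vprintk() hunk above. Below is a minimal user-space sketch of that pattern; the names mirror the kernel's, but the single global buffer and the hand-rolled flush are illustrative stand-ins for the kernel's per-CPU storage and irq_work-driven flushing, so treat this as a sketch rather than the kernel code.

#include <stdarg.h>
#include <stdio.h>

typedef int (*printk_func_t)(const char *fmt, va_list args);

/* Default target: in the kernel this path stores into the main logbuf. */
static int vprintk_default(const char *fmt, va_list args)
{
	return vprintf(fmt, args);
}

/*
 * NMI target: append to a buffer (a single global here, per-CPU in the
 * kernel) that is flushed from a safe context later. Truncation handling
 * is omitted for brevity.
 */
static char nmi_buf[256];
static int nmi_len;

static int vprintk_nmi(const char *fmt, va_list args)
{
	int add = vsnprintf(nmi_buf + nmi_len, sizeof(nmi_buf) - nmi_len,
			    fmt, args);

	if (add > 0)
		nmi_len += add;
	return add;
}

static printk_func_t printk_func = vprintk_default;

/* What printk()/vprintk() boil down to after the patch above. */
static int my_printk(const char *fmt, ...)
{
	va_list args;
	int ret;

	va_start(args, fmt);
	ret = printk_func(fmt, args);	/* the vprintk_func() indirection */
	va_end(args);
	return ret;
}

int main(void)
{
	my_printk("direct path\n");
	printk_func = vprintk_nmi;	/* what printk_nmi_enter() does */
	my_printk("stored while in NMI\n");
	printk_func = vprintk_default;	/* what printk_nmi_exit() does */
	printf("flushed later: %.*s", nmi_len, nmi_buf);
	return 0;
}

Swapping the pointer on NMI entry and restoring it on exit is what printk_nmi_enter()/printk_nmi_exit() do at this point in the series; a later patch replaces the pointer with a per-CPU context mask.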
Link: http://lkml.kernel.org/r/20161227141611.940-3-sergey.senozhatsky@gmail.com Cc: Andrew Morton Cc: Linus Torvalds Cc: Jan Kara Cc: Tejun Heo Cc: Calvin Owens Cc: Steven Rostedt Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Peter Hurley Cc: linux-kernel@vger.kernel.org Signed-off-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- include/linux/printk.h | 12 +- init/Kconfig | 16 +-- init/main.c | 2 +- kernel/kexec_core.c | 2 +- kernel/panic.c | 4 +- kernel/printk/Makefile | 2 +- kernel/printk/nmi.c | 291 -------------------------------------------- kernel/printk/printk_safe.c | 291 ++++++++++++++++++++++++++++++++++++++++++++ lib/nmi_backtrace.c | 2 +- 9 files changed, 312 insertions(+), 310 deletions(-) delete mode 100644 kernel/printk/nmi.c create mode 100644 kernel/printk/printk_safe.c (limited to 'kernel') diff --git a/include/linux/printk.h b/include/linux/printk.h index 3472cc6b7a60..37e933eeffb2 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -147,17 +147,17 @@ void early_printk(const char *s, ...) { } #endif #ifdef CONFIG_PRINTK_NMI -extern void printk_nmi_init(void); +extern void printk_safe_init(void); extern void printk_nmi_enter(void); extern void printk_nmi_exit(void); -extern void printk_nmi_flush(void); -extern void printk_nmi_flush_on_panic(void); +extern void printk_safe_flush(void); +extern void printk_safe_flush_on_panic(void); #else -static inline void printk_nmi_init(void) { } +static inline void printk_safe_init(void) { } static inline void printk_nmi_enter(void) { } static inline void printk_nmi_exit(void) { } -static inline void printk_nmi_flush(void) { } -static inline void printk_nmi_flush_on_panic(void) { } +static inline void printk_safe_flush(void) { } +static inline void printk_safe_flush_on_panic(void) { } #endif /* PRINTK_NMI */ #ifdef CONFIG_PRINTK diff --git a/init/Kconfig b/init/Kconfig index 223b734abccd..760b7d0bc9d7 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -875,17 +875,19 @@ config LOG_CPU_MAX_BUF_SHIFT 13 => 8 KB for each CPU 12 => 4 KB for each CPU -config NMI_LOG_BUF_SHIFT - int "Temporary per-CPU NMI log buffer size (12 => 4KB, 13 => 8KB)" +config PRINTK_SAFE_LOG_BUF_SHIFT + int "Temporary per-CPU printk log buffer size (12 => 4KB, 13 => 8KB)" range 10 21 default 13 - depends on PRINTK_NMI + depends on PRINTK help - Select the size of a per-CPU buffer where NMI messages are temporary - stored. They are copied to the main log buffer in a safe context - to avoid a deadlock. The value defines the size as a power of 2. + Select the size of an alternate printk per-CPU buffer where messages + printed from unsafe contexts are temporarily stored. One example would + be NMI messages, another is printk recursion. The messages are + copied to the main log buffer in a safe context to avoid a deadlock. + The value defines the size as a power of 2. - NMI messages are rare and limited. The largest one is when + Those messages are rare and limited. The largest one is when a backtrace is printed. It usually fits into 4KB. Select 8KB if you want to be on the safe side.
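As a worked example of the sizing described in the help text above: the option is a power-of-two exponent, so the default of 13 reserves 8 KB per CPU, out of which a small header is carved (the buffer definition later in this series subtracts sizeof(atomic_t) and sizeof(struct irq_work) from the total). The sketch below only illustrates the arithmetic; the header sizes are typical x86_64 values and are an assumption, not part of the patch.

#include <stdio.h>

int main(void)
{
	unsigned int shift = 13;		/* CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT */
	unsigned long total = 1UL << shift;	/* 8192 bytes per CPU */
	unsigned long hdr = 4 + 32;		/* assumed sizeof(atomic_t) + sizeof(struct irq_work) */

	printf("text space per CPU: %lu of %lu bytes\n", total - hdr, total);
	return 0;
}

With the default shift this leaves roughly 8156 bytes of text space per CPU, comfortably above the ~4 KB that a single backtrace needs.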
diff --git a/init/main.c b/init/main.c index b0c9d6facef9..b4ca17d9bdeb 100644 --- a/init/main.c +++ b/init/main.c @@ -580,7 +580,7 @@ asmlinkage __visible void __init start_kernel(void) timekeeping_init(); time_init(); sched_clock_postinit(); - printk_nmi_init(); + printk_safe_init(); perf_event_init(); profile_init(); call_function_init(); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 5617cc412444..14bb9eb76665 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -916,7 +916,7 @@ void crash_kexec(struct pt_regs *regs) old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); if (old_cpu == PANIC_CPU_INVALID) { /* This is the 1st CPU which comes here, so go ahead. */ - printk_nmi_flush_on_panic(); + printk_safe_flush_on_panic(); __crash_kexec(regs); /* diff --git a/kernel/panic.c b/kernel/panic.c index c51edaa04fce..8c8efcd310e7 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -188,7 +188,7 @@ void panic(const char *fmt, ...) * Bypass the panic_cpu check and call __crash_kexec directly. */ if (!_crash_kexec_post_notifiers) { - printk_nmi_flush_on_panic(); + printk_safe_flush_on_panic(); __crash_kexec(NULL); /* @@ -213,7 +213,7 @@ void panic(const char *fmt, ...) atomic_notifier_call_chain(&panic_notifier_list, 0, buf); /* Call flush even twice. It tries harder with a single online CPU */ - printk_nmi_flush_on_panic(); + printk_safe_flush_on_panic(); kmsg_dump(KMSG_DUMP_PANIC); /* diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index abb0042a427b..607928119f26 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,3 +1,3 @@ obj-y = printk.o -obj-$(CONFIG_PRINTK_NMI) += nmi.o +obj-$(CONFIG_PRINTK_NMI) += printk_safe.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c deleted file mode 100644 index f011aaef583c..000000000000 --- a/kernel/printk/nmi.c +++ /dev/null @@ -1,291 +0,0 @@ -/* - * nmi.c - Safe printk in NMI context - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - */ - -#include -#include -#include -#include -#include -#include -#include - -#include "internal.h" - -/* - * printk() could not take logbuf_lock in NMI context. Instead, - * it uses an alternative implementation that temporary stores - * the strings into a per-CPU buffer. The content of the buffer - * is later flushed into the main ring buffer via IRQ work. - * - * The alternative implementation is chosen transparently - * via @printk_func per-CPU variable. - * - * The implementation allows to flush the strings also from another CPU. - * There are situations when we want to make sure that all buffers - * were handled or when IRQs are blocked. 
- */ -DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; -static int printk_nmi_irq_ready; -atomic_t nmi_message_lost; - -#define NMI_LOG_BUF_LEN ((1 << CONFIG_NMI_LOG_BUF_SHIFT) - \ - sizeof(atomic_t) - sizeof(struct irq_work)) - -struct nmi_seq_buf { - atomic_t len; /* length of written data */ - struct irq_work work; /* IRQ work that flushes the buffer */ - unsigned char buffer[NMI_LOG_BUF_LEN]; -}; -static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); - -/* - * Safe printk() for NMI context. It uses a per-CPU buffer to - * store the message. NMIs are not nested, so there is always only - * one writer running. But the buffer might get flushed from another - * CPU, so we need to be careful. - */ -static int vprintk_nmi(const char *fmt, va_list args) -{ - struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); - int add = 0; - size_t len; - -again: - len = atomic_read(&s->len); - - /* The trailing '\0' is not counted into len. */ - if (len >= sizeof(s->buffer) - 1) { - atomic_inc(&nmi_message_lost); - return 0; - } - - /* - * Make sure that all old data have been read before the buffer was - * reseted. This is not needed when we just append data. - */ - if (!len) - smp_rmb(); - - add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); - - /* - * Do it once again if the buffer has been flushed in the meantime. - * Note that atomic_cmpxchg() is an implicit memory barrier that - * makes sure that the data were written before updating s->len. - */ - if (atomic_cmpxchg(&s->len, len, len + add) != len) - goto again; - - /* Get flushed in a more safe context. */ - if (add && printk_nmi_irq_ready) { - /* Make sure that IRQ work is really initialized. */ - smp_rmb(); - irq_work_queue(&s->work); - } - - return add; -} - -static void printk_nmi_flush_line(const char *text, int len) -{ - /* - * The buffers are flushed in NMI only on panic. The messages must - * go only into the ring buffer at this stage. Consoles will get - * explicitly called later when a crashdump is not generated. - */ - if (in_nmi()) - printk_deferred("%.*s", len, text); - else - printk("%.*s", len, text); - -} - -/* printk part of the temporary buffer line by line */ -static int printk_nmi_flush_buffer(const char *start, size_t len) -{ - const char *c, *end; - bool header; - - c = start; - end = start + len; - header = true; - - /* Print line by line. */ - while (c < end) { - if (*c == '\n') { - printk_nmi_flush_line(start, c - start + 1); - start = ++c; - header = true; - continue; - } - - /* Handle continuous lines or missing new line. */ - if ((c + 1 < end) && printk_get_level(c)) { - if (header) { - c = printk_skip_level(c); - continue; - } - - printk_nmi_flush_line(start, c - start); - start = c++; - header = true; - continue; - } - - header = false; - c++; - } - - /* Check if there was a partial line. Ignore pure header. */ - if (start < end && !header) { - static const char newline[] = KERN_CONT "\n"; - - printk_nmi_flush_line(start, end - start); - printk_nmi_flush_line(newline, strlen(newline)); - } - - return len; -} - -/* - * Flush data from the associated per_CPU buffer. The function - * can be called either via IRQ work or independently. - */ -static void __printk_nmi_flush(struct irq_work *work) -{ - static raw_spinlock_t read_lock = - __RAW_SPIN_LOCK_INITIALIZER(read_lock); - struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work); - unsigned long flags; - size_t len; - int i; - - /* - * The lock has two functions. 
First, one reader has to flush all - * available message to make the lockless synchronization with - * writers easier. Second, we do not want to mix messages from - * different CPUs. This is especially important when printing - * a backtrace. - */ - raw_spin_lock_irqsave(&read_lock, flags); - - i = 0; -more: - len = atomic_read(&s->len); - - /* - * This is just a paranoid check that nobody has manipulated - * the buffer an unexpected way. If we printed something then - * @len must only increase. Also it should never overflow the - * buffer size. - */ - if ((i && i >= len) || len > sizeof(s->buffer)) { - const char *msg = "printk_nmi_flush: internal error\n"; - - printk_nmi_flush_line(msg, strlen(msg)); - len = 0; - } - - if (!len) - goto out; /* Someone else has already flushed the buffer. */ - - /* Make sure that data has been written up to the @len */ - smp_rmb(); - i += printk_nmi_flush_buffer(s->buffer + i, len - i); - - /* - * Check that nothing has got added in the meantime and truncate - * the buffer. Note that atomic_cmpxchg() is an implicit memory - * barrier that makes sure that the data were copied before - * updating s->len. - */ - if (atomic_cmpxchg(&s->len, len, 0) != len) - goto more; - -out: - raw_spin_unlock_irqrestore(&read_lock, flags); -} - -/** - * printk_nmi_flush - flush all per-cpu nmi buffers. - * - * The buffers are flushed automatically via IRQ work. This function - * is useful only when someone wants to be sure that all buffers have - * been flushed at some point. - */ -void printk_nmi_flush(void) -{ - int cpu; - - for_each_possible_cpu(cpu) - __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work); -} - -/** - * printk_nmi_flush_on_panic - flush all per-cpu nmi buffers when the system - * goes down. - * - * Similar to printk_nmi_flush() but it can be called even in NMI context when - * the system goes down. It does the best effort to get NMI messages into - * the main ring buffer. - * - * Note that it could try harder when there is only one CPU online. - */ -void printk_nmi_flush_on_panic(void) -{ - /* - * Make sure that we could access the main ring buffer. - * Do not risk a double release when more CPUs are up. - */ - if (in_nmi() && raw_spin_is_locked(&logbuf_lock)) { - if (num_online_cpus() > 1) - return; - - debug_locks_off(); - raw_spin_lock_init(&logbuf_lock); - } - - printk_nmi_flush(); -} - -void __init printk_nmi_init(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu); - - init_irq_work(&s->work, __printk_nmi_flush); - } - - /* Make sure that IRQ works are initialized before enabling. */ - smp_wmb(); - printk_nmi_irq_ready = 1; - - /* Flush pending messages that did not have scheduled IRQ works. */ - printk_nmi_flush(); -} - -void printk_nmi_enter(void) -{ - this_cpu_write(printk_func, vprintk_nmi); -} - -void printk_nmi_exit(void) -{ - this_cpu_write(printk_func, vprintk_default); -} diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c new file mode 100644 index 000000000000..fc80359dcd78 --- /dev/null +++ b/kernel/printk/printk_safe.c @@ -0,0 +1,291 @@ +/* + * printk_safe.c - Safe printk in NMI context + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* + * printk() could not take logbuf_lock in NMI context. Instead, + * it uses an alternative implementation that temporary stores + * the strings into a per-CPU buffer. The content of the buffer + * is later flushed into the main ring buffer via IRQ work. + * + * The alternative implementation is chosen transparently + * via @printk_func per-CPU variable. + * + * The implementation allows to flush the strings also from another CPU. + * There are situations when we want to make sure that all buffers + * were handled or when IRQs are blocked. + */ +DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; +static int printk_safe_irq_ready; +atomic_t nmi_message_lost; + +#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \ + sizeof(atomic_t) - sizeof(struct irq_work)) + +struct printk_safe_seq_buf { + atomic_t len; /* length of written data */ + struct irq_work work; /* IRQ work that flushes the buffer */ + unsigned char buffer[SAFE_LOG_BUF_LEN]; +}; +static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq); + +/* + * Safe printk() for NMI context. It uses a per-CPU buffer to + * store the message. NMIs are not nested, so there is always only + * one writer running. But the buffer might get flushed from another + * CPU, so we need to be careful. + */ +static int vprintk_nmi(const char *fmt, va_list args) +{ + struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq); + int add = 0; + size_t len; + +again: + len = atomic_read(&s->len); + + /* The trailing '\0' is not counted into len. */ + if (len >= sizeof(s->buffer) - 1) { + atomic_inc(&nmi_message_lost); + return 0; + } + + /* + * Make sure that all old data have been read before the buffer was + * reseted. This is not needed when we just append data. + */ + if (!len) + smp_rmb(); + + add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); + + /* + * Do it once again if the buffer has been flushed in the meantime. + * Note that atomic_cmpxchg() is an implicit memory barrier that + * makes sure that the data were written before updating s->len. + */ + if (atomic_cmpxchg(&s->len, len, len + add) != len) + goto again; + + /* Get flushed in a more safe context. */ + if (add && printk_safe_irq_ready) { + /* Make sure that IRQ work is really initialized. */ + smp_rmb(); + irq_work_queue(&s->work); + } + + return add; +} + +static void printk_safe_flush_line(const char *text, int len) +{ + /* + * The buffers are flushed in NMI only on panic. The messages must + * go only into the ring buffer at this stage. Consoles will get + * explicitly called later when a crashdump is not generated. + */ + if (in_nmi()) + printk_deferred("%.*s", len, text); + else + printk("%.*s", len, text); +} + +/* printk part of the temporary buffer line by line */ +static int printk_safe_flush_buffer(const char *start, size_t len) +{ + const char *c, *end; + bool header; + + c = start; + end = start + len; + header = true; + + /* Print line by line. 
*/ + while (c < end) { + if (*c == '\n') { + printk_safe_flush_line(start, c - start + 1); + start = ++c; + header = true; + continue; + } + + /* Handle continuous lines or missing new line. */ + if ((c + 1 < end) && printk_get_level(c)) { + if (header) { + c = printk_skip_level(c); + continue; + } + + printk_safe_flush_line(start, c - start); + start = c++; + header = true; + continue; + } + + header = false; + c++; + } + + /* Check if there was a partial line. Ignore pure header. */ + if (start < end && !header) { + static const char newline[] = KERN_CONT "\n"; + + printk_safe_flush_line(start, end - start); + printk_safe_flush_line(newline, strlen(newline)); + } + + return len; +} + +/* + * Flush data from the associated per_CPU buffer. The function + * can be called either via IRQ work or independently. + */ +static void __printk_safe_flush(struct irq_work *work) +{ + static raw_spinlock_t read_lock = + __RAW_SPIN_LOCK_INITIALIZER(read_lock); + struct printk_safe_seq_buf *s = + container_of(work, struct printk_safe_seq_buf, work); + unsigned long flags; + size_t len; + int i; + + /* + * The lock has two functions. First, one reader has to flush all + * available message to make the lockless synchronization with + * writers easier. Second, we do not want to mix messages from + * different CPUs. This is especially important when printing + * a backtrace. + */ + raw_spin_lock_irqsave(&read_lock, flags); + + i = 0; +more: + len = atomic_read(&s->len); + + /* + * This is just a paranoid check that nobody has manipulated + * the buffer an unexpected way. If we printed something then + * @len must only increase. Also it should never overflow the + * buffer size. + */ + if ((i && i >= len) || len > sizeof(s->buffer)) { + const char *msg = "printk_safe_flush: internal error\n"; + + printk_safe_flush_line(msg, strlen(msg)); + len = 0; + } + + if (!len) + goto out; /* Someone else has already flushed the buffer. */ + + /* Make sure that data has been written up to the @len */ + smp_rmb(); + i += printk_safe_flush_buffer(s->buffer + i, len - i); + + /* + * Check that nothing has got added in the meantime and truncate + * the buffer. Note that atomic_cmpxchg() is an implicit memory + * barrier that makes sure that the data were copied before + * updating s->len. + */ + if (atomic_cmpxchg(&s->len, len, 0) != len) + goto more; + +out: + raw_spin_unlock_irqrestore(&read_lock, flags); +} + +/** + * printk_safe_flush - flush all per-cpu nmi buffers. + * + * The buffers are flushed automatically via IRQ work. This function + * is useful only when someone wants to be sure that all buffers have + * been flushed at some point. + */ +void printk_safe_flush(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + __printk_safe_flush(&per_cpu(nmi_print_seq, cpu).work); +} + +/** + * printk_safe_flush_on_panic - flush all per-cpu nmi buffers when the system + * goes down. + * + * Similar to printk_safe_flush() but it can be called even in NMI context when + * the system goes down. It does the best effort to get NMI messages into + * the main ring buffer. + * + * Note that it could try harder when there is only one CPU online. + */ +void printk_safe_flush_on_panic(void) +{ + /* + * Make sure that we could access the main ring buffer. + * Do not risk a double release when more CPUs are up. 
+ */ + if (in_nmi() && raw_spin_is_locked(&logbuf_lock)) { + if (num_online_cpus() > 1) + return; + + debug_locks_off(); + raw_spin_lock_init(&logbuf_lock); + } + + printk_safe_flush(); +} + +void __init printk_safe_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct printk_safe_seq_buf *s = &per_cpu(nmi_print_seq, cpu); + + init_irq_work(&s->work, __printk_safe_flush); + } + + /* Make sure that IRQ works are initialized before enabling. */ + smp_wmb(); + printk_safe_irq_ready = 1; + + /* Flush pending messages that did not have scheduled IRQ works. */ + printk_safe_flush(); +} + +void printk_nmi_enter(void) +{ + this_cpu_write(printk_func, vprintk_nmi); +} + +void printk_nmi_exit(void) +{ + this_cpu_write(printk_func, vprintk_default); +} diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index 75554754eadf..5f7999eacad5 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -77,7 +77,7 @@ void nmi_trigger_cpumask_backtrace(const cpumask_t *mask, * Force flush any remote buffers that might be stuck in IRQ context * and therefore could not run their irq_work. */ - printk_nmi_flush(); + printk_safe_flush(); clear_bit_unlock(0, &backtrace_flag); put_cpu(); -- cgit v1.2.3 From 099f1c84c0052ec1b27f1c3942eed5830d86bdbb Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 27 Dec 2016 23:16:06 +0900 Subject: printk: introduce per-cpu safe_print seq buffer This patch extends the idea of NMI per-cpu buffers to regions that may cause recursive printk() calls and possible deadlocks. Namely, printk() can't handle printk calls from schedule code or printk() calls from lock debugging code (spin_dump() for instance), because those may be called with `sem->lock' already taken or any other `critical' locks (p->pi_lock, etc.). An example of deadlock can be vprintk_emit() console_unlock() up() << raw_spin_lock_irqsave(&sem->lock, flags); wake_up_process() try_to_wake_up() ttwu_queue() ttwu_activate() activate_task() enqueue_task() enqueue_task_fair() cfs_rq_of() task_of() WARN_ON_ONCE(!entity_is_task(se)) vprintk_emit() console_trylock() down_trylock() raw_spin_lock_irqsave(&sem->lock, flags) ^^^^ deadlock and some other cases. Just like in the NMI implementation, the solution uses a per-cpu `printk_func' pointer to 'redirect' printk() calls to a 'safe' callback that stores messages in a per-cpu buffer and flushes them back to the logbuf buffer later. Usage example: printk() printk_safe_enter_irqsave(flags) // // any printk() call from here will end up in vprintk_safe(), // that stores messages in a special per-CPU buffer. // printk_safe_exit_irqrestore(flags) The 'redirection' mechanism, though, has been reworked, as suggested by Petr Mladek. Instead of using a per-cpu @printk_func callback we now keep a per-cpu printk-context variable and call either the default or the nmi vprintk function depending on its value. printk_nmi_enter/exit and printk_safe_enter/exit thus just set/clear the corresponding bits in the printk-context variable. The patch only adds printk_safe support, we don't use it yet.
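The reworked 'redirection' described above reduces to a bit test on a per-CPU counter. The sketch below condenses that dispatch logic; the masks and function roles come from the patch that follows, but the per-CPU accessors are replaced by a plain variable and the vprintk targets by strings, so this is an illustration of the logic rather than the kernel code.

#include <stdio.h>

#define PRINTK_SAFE_CONTEXT_MASK 0x7fffffff
#define PRINTK_NMI_CONTEXT_MASK  0x80000000

static unsigned int printk_context;	/* per-CPU in the kernel */

/* printk_nmi_enter()/exit() toggle the dedicated NMI bit ... */
static void nmi_enter(void)  { printk_context |= PRINTK_NMI_CONTEXT_MASK; }
static void nmi_exit(void)   { printk_context &= ~PRINTK_NMI_CONTEXT_MASK; }

/* ... while __printk_safe_enter()/exit() count nesting in the low bits. */
static void safe_enter(void) { printk_context++; }
static void safe_exit(void)  { printk_context--; }

/* The dispatch vprintk_func() performs: the NMI bit wins over safe. */
static const char *dispatch(void)
{
	if (printk_context & PRINTK_NMI_CONTEXT_MASK)
		return "vprintk_nmi";		/* per-CPU NMI buffer */
	if (printk_context & PRINTK_SAFE_CONTEXT_MASK)
		return "vprintk_safe";		/* per-CPU safe buffer */
	return "vprintk_default";		/* regular logbuf path */
}

int main(void)
{
	printf("%s\n", dispatch());	/* vprintk_default */
	safe_enter();
	printf("%s\n", dispatch());	/* vprintk_safe */
	nmi_enter();
	printf("%s\n", dispatch());	/* vprintk_nmi */
	nmi_exit();
	safe_exit();
	return 0;
}

Nesting works because safe sections only touch the low bits while the NMI bit is set and cleared independently, so an NMI arriving inside a safe section still takes the NMI path.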
Link: http://lkml.kernel.org/r/20161227141611.940-4-sergey.senozhatsky@gmail.com Cc: Andrew Morton Cc: Linus Torvalds Cc: Jan Kara Cc: Tejun Heo Cc: Calvin Owens Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Peter Hurley Cc: linux-kernel@vger.kernel.org Signed-off-by: Sergey Senozhatsky Signed-off-by: Petr Mladek Reviewed-by: Steven Rostedt (VMware) --- include/linux/printk.h | 21 +++++--- kernel/printk/Makefile | 2 +- kernel/printk/internal.h | 76 ++++++++++++++++++-------- kernel/printk/printk.c | 3 -- kernel/printk/printk_safe.c | 128 +++++++++++++++++++++++++++++++++++--------- 5 files changed, 172 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/include/linux/printk.h b/include/linux/printk.h index 37e933eeffb2..571257e0f53d 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -147,17 +147,11 @@ void early_printk(const char *s, ...) { } #endif #ifdef CONFIG_PRINTK_NMI -extern void printk_safe_init(void); extern void printk_nmi_enter(void); extern void printk_nmi_exit(void); -extern void printk_safe_flush(void); -extern void printk_safe_flush_on_panic(void); #else -static inline void printk_safe_init(void) { } static inline void printk_nmi_enter(void) { } static inline void printk_nmi_exit(void) { } -static inline void printk_safe_flush(void) { } -static inline void printk_safe_flush_on_panic(void) { } #endif /* PRINTK_NMI */ #ifdef CONFIG_PRINTK @@ -209,6 +203,9 @@ void __init setup_log_buf(int early); __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...); void dump_stack_print_info(const char *log_lvl); void show_regs_print_info(const char *log_lvl); +extern void printk_safe_init(void); +extern void printk_safe_flush(void); +extern void printk_safe_flush_on_panic(void); #else static inline __printf(1, 0) int vprintk(const char *s, va_list args) @@ -268,6 +265,18 @@ static inline void dump_stack_print_info(const char *log_lvl) static inline void show_regs_print_info(const char *log_lvl) { } + +static inline void printk_safe_init(void) +{ +} + +static inline void printk_safe_flush(void) +{ +} + +static inline void printk_safe_flush_on_panic(void) +{ +} #endif extern asmlinkage void dump_stack(void) __cold; diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 607928119f26..4a2ffc39eb95 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,3 +1,3 @@ obj-y = printk.o -obj-$(CONFIG_PRINTK_NMI) += printk_safe.o +obj-$(CONFIG_PRINTK) += printk_safe.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 7fd2838fa417..c65e36509f3b 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -16,26 +16,8 @@ */ #include -typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args); - -int __printf(1, 0) vprintk_default(const char *fmt, va_list args); - #ifdef CONFIG_PRINTK_NMI -extern raw_spinlock_t logbuf_lock; - -/* - * printk() could not take logbuf_lock in NMI context. Instead, - * it temporary stores the strings into a per-CPU buffer. - * The alternative implementation is chosen transparently - * via per-CPU variable. 
- */ -DECLARE_PER_CPU(printk_func_t, printk_func); -static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) -{ - return this_cpu_read(printk_func)(fmt, args); -} - extern atomic_t nmi_message_lost; static inline int get_nmi_message_lost(void) { @@ -44,14 +26,62 @@ static inline int get_nmi_message_lost(void) #else /* CONFIG_PRINTK_NMI */ -static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) -{ - return vprintk_default(fmt, args); -} - static inline int get_nmi_message_lost(void) { return 0; } #endif /* CONFIG_PRINTK_NMI */ + +#ifdef CONFIG_PRINTK + +#define PRINTK_SAFE_CONTEXT_MASK 0x7fffffff +#define PRINTK_NMI_CONTEXT_MASK 0x80000000 + +extern raw_spinlock_t logbuf_lock; + +__printf(1, 0) int vprintk_default(const char *fmt, va_list args); +__printf(1, 0) int vprintk_func(const char *fmt, va_list args); +void __printk_safe_enter(void); +void __printk_safe_exit(void); + +#define printk_safe_enter_irqsave(flags) \ + do { \ + local_irq_save(flags); \ + __printk_safe_enter(); \ + } while (0) + +#define printk_safe_exit_irqrestore(flags) \ + do { \ + __printk_safe_exit(); \ + local_irq_restore(flags); \ + } while (0) + +#define printk_safe_enter_irq() \ + do { \ + local_irq_disable(); \ + __printk_safe_enter(); \ + } while (0) + +#define printk_safe_exit_irq() \ + do { \ + __printk_safe_exit(); \ + local_irq_enable(); \ + } while (0) + +#else + +__printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; } + +/* + * In !PRINTK builds we still export logbuf_lock spin_lock, console_sem + * semaphore and some of console functions (console_unlock()/etc.), so + * printk-safe must preserve the existing local IRQ guarantees. + */ +#define printk_safe_enter_irqsave(flags) local_irq_save(flags) +#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags) + +#define printk_safe_enter_irq() local_irq_disable() +#define printk_safe_exit_irq() local_irq_enable() + +#endif /* CONFIG_PRINTK */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 106843a83b63..f73046f7a6df 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1902,9 +1902,6 @@ static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size) { return 0; } static bool suppress_message_printing(int level) { return false; } -/* Still needs to be defined for users */ -DEFINE_PER_CPU(printk_func_t, printk_func); - #endif /* CONFIG_PRINTK */ #ifdef CONFIG_EARLY_PRINTK diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index fc80359dcd78..efc89a4e9df5 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -1,5 +1,5 @@ /* - * printk_safe.c - Safe printk in NMI context + * printk_safe.c - Safe printk for printk-deadlock-prone contexts * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -32,13 +32,13 @@ * is later flushed into the main ring buffer via IRQ work. * * The alternative implementation is chosen transparently - * via @printk_func per-CPU variable. + * by examinig current printk() context mask stored in @printk_context + * per-CPU variable. * * The implementation allows to flush the strings also from another CPU. * There are situations when we want to make sure that all buffers * were handled or when IRQs are blocked. 
*/ -DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; static int printk_safe_irq_ready; atomic_t nmi_message_lost; @@ -50,18 +50,28 @@ struct printk_safe_seq_buf { struct irq_work work; /* IRQ work that flushes the buffer */ unsigned char buffer[SAFE_LOG_BUF_LEN]; }; + +static DEFINE_PER_CPU(struct printk_safe_seq_buf, safe_print_seq); +static DEFINE_PER_CPU(int, printk_context); + +#ifdef CONFIG_PRINTK_NMI static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq); +#endif /* - * Safe printk() for NMI context. It uses a per-CPU buffer to - * store the message. NMIs are not nested, so there is always only - * one writer running. But the buffer might get flushed from another - * CPU, so we need to be careful. + * Add a message to per-CPU context-dependent buffer. NMI and printk-safe + * have dedicated buffers, because otherwise printk-safe preempted by + * NMI-printk would have overwritten the NMI messages. + * + * The messages are fushed from irq work (or from panic()), possibly, + * from other CPU, concurrently with printk_safe_log_store(). Should this + * happen, printk_safe_log_store() will notice the buffer->len mismatch + * and repeat the write. */ -static int vprintk_nmi(const char *fmt, va_list args) +static int printk_safe_log_store(struct printk_safe_seq_buf *s, + const char *fmt, va_list args) { - struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq); - int add = 0; + int add; size_t len; again: @@ -74,8 +84,8 @@ again: } /* - * Make sure that all old data have been read before the buffer was - * reseted. This is not needed when we just append data. + * Make sure that all old data have been read before the buffer + * was reset. This is not needed when we just append data. */ if (!len) smp_rmb(); @@ -161,7 +171,7 @@ static int printk_safe_flush_buffer(const char *start, size_t len) } /* - * Flush data from the associated per_CPU buffer. The function + * Flush data from the associated per-CPU buffer. The function * can be called either via IRQ work or independently. */ static void __printk_safe_flush(struct irq_work *work) @@ -231,8 +241,12 @@ void printk_safe_flush(void) { int cpu; - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { +#ifdef CONFIG_PRINTK_NMI __printk_safe_flush(&per_cpu(nmi_print_seq, cpu).work); +#endif + __printk_safe_flush(&per_cpu(safe_print_seq, cpu).work); + } } /** @@ -262,14 +276,88 @@ void printk_safe_flush_on_panic(void) printk_safe_flush(); } +#ifdef CONFIG_PRINTK_NMI +/* + * Safe printk() for NMI context. It uses a per-CPU buffer to + * store the message. NMIs are not nested, so there is always only + * one writer running. But the buffer might get flushed from another + * CPU, so we need to be careful. + */ +static int vprintk_nmi(const char *fmt, va_list args) +{ + struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq); + + return printk_safe_log_store(s, fmt, args); +} + +void printk_nmi_enter(void) +{ + this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); +} + +void printk_nmi_exit(void) +{ + this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK); +} + +#else + +static int vprintk_nmi(const char *fmt, va_list args) +{ + return 0; +} + +#endif /* CONFIG_PRINTK_NMI */ + +/* + * Lock-less printk(), to avoid deadlocks should the printk() recurse + * into itself. It uses a per-CPU buffer to store the message, just like + * NMI. 
+ */ +static int vprintk_safe(const char *fmt, va_list args) +{ + struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq); + + return printk_safe_log_store(s, fmt, args); +} + +/* Can be preempted by NMI. */ +void __printk_safe_enter(void) +{ + this_cpu_inc(printk_context); +} + +/* Can be preempted by NMI. */ +void __printk_safe_exit(void) +{ + this_cpu_dec(printk_context); +} + +__printf(1, 0) int vprintk_func(const char *fmt, va_list args) +{ + if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) + return vprintk_nmi(fmt, args); + + if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) + return vprintk_safe(fmt, args); + + return vprintk_default(fmt, args); +} + void __init printk_safe_init(void) { int cpu; for_each_possible_cpu(cpu) { - struct printk_safe_seq_buf *s = &per_cpu(nmi_print_seq, cpu); + struct printk_safe_seq_buf *s; + + s = &per_cpu(safe_print_seq, cpu); + init_irq_work(&s->work, __printk_safe_flush); +#ifdef CONFIG_PRINTK_NMI + s = &per_cpu(nmi_print_seq, cpu); init_irq_work(&s->work, __printk_safe_flush); +#endif } /* Make sure that IRQ works are initialized before enabling. */ @@ -279,13 +367,3 @@ void __init printk_safe_init(void) /* Flush pending messages that did not have scheduled IRQ works. */ printk_safe_flush(); } - -void printk_nmi_enter(void) -{ - this_cpu_write(printk_func, vprintk_nmi); -} - -void printk_nmi_exit(void) -{ - this_cpu_write(printk_func, vprintk_default); -} -- cgit v1.2.3 From 7acac3445acde1c94054cde69ab53503d296c393 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 7 Feb 2017 01:42:53 +0900 Subject: printk: always use deferred printk when flush printk_safe lines Always use printk_deferred() in printk_safe_flush_line(). Flushing can be done from NMI or printk_safe contexts (when we are in panic), so we can't call console drivers, yet still want to store the messages in the logbuf buffer. Therefore we use a deferred printk version. Link: http://lkml.kernel.org/r/20170206164253.GA463@tigerII.localdomain Cc: Andrew Morton Cc: Linus Torvalds Cc: Jan Kara Cc: Tejun Heo Cc: Calvin Owens Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Peter Hurley Cc: linux-kernel@vger.kernel.org Signed-off-by: Sergey Senozhatsky Suggested-by: Petr Mladek Signed-off-by: Petr Mladek Reviewed-by: Steven Rostedt (VMware) --- kernel/printk/printk_safe.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index efc89a4e9df5..5214d326d3ba 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -110,17 +110,15 @@ again: return add; } -static void printk_safe_flush_line(const char *text, int len) +static inline void printk_safe_flush_line(const char *text, int len) { /* - * The buffers are flushed in NMI only on panic. The messages must - * go only into the ring buffer at this stage. Consoles will get - * explicitly called later when a crashdump is not generated. + * Avoid any console drivers calls from here, because we may be + * in NMI or printk_safe context (when in panic). The messages + * must go only into the ring buffer at this stage. Consoles will + * get explicitly called later when a crashdump is not generated. 
*/ - if (in_nmi()) - printk_deferred("%.*s", len, text); - else - printk("%.*s", len, text); + printk_deferred("%.*s", len, text); } /* printk part of the temporary buffer line by line */ -- cgit v1.2.3 From ddb9baa822265b55afffd9815a2758a4b70006c1 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 27 Dec 2016 23:16:08 +0900 Subject: printk: report lost messages in printk safe/nmi contexts Account lost messages in printk-safe and printk-safe-nmi contexts and report those numbers during printk_safe_flush(). The patch also moves the lost-message counter into struct `printk_safe_seq_buf' instead of having dedicated static counters - this simplifies the code. Link: http://lkml.kernel.org/r/20161227141611.940-6-sergey.senozhatsky@gmail.com Cc: Andrew Morton Cc: Linus Torvalds Cc: Steven Rostedt Cc: Jan Kara Cc: Tejun Heo Cc: Calvin Owens Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Peter Hurley Cc: linux-kernel@vger.kernel.org Signed-off-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/internal.h | 17 ----------------- kernel/printk/printk.c | 10 ---------- kernel/printk/printk_safe.c | 38 ++++++++++++++++++++++++++++---------- 3 files changed, 28 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index c65e36509f3b..1db044f808b7 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -16,23 +16,6 @@ */ #include <linux/percpu.h> -#ifdef CONFIG_PRINTK_NMI - -extern atomic_t nmi_message_lost; -static inline int get_nmi_message_lost(void) -{ - return atomic_xchg(&nmi_message_lost, 0); -} - -#else /* CONFIG_PRINTK_NMI */ - -static inline int get_nmi_message_lost(void) -{ - return 0; -} - -#endif /* CONFIG_PRINTK_NMI */ - #ifdef CONFIG_PRINTK #define PRINTK_SAFE_CONTEXT_MASK 0x7fffffff diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f73046f7a6df..6bb77eff502d 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1677,7 +1677,6 @@ asmlinkage int vprintk_emit(int facility, int level, unsigned long flags; int this_cpu; int printed_len = 0; - int nmi_message_lost; bool in_sched = false; /* cpu currently holding logbuf_lock in this function */ static unsigned int logbuf_cpu = UINT_MAX; @@ -1728,15 +1727,6 @@ asmlinkage int vprintk_emit(int facility, int level, strlen(recursion_msg)); } - nmi_message_lost = get_nmi_message_lost(); - if (unlikely(nmi_message_lost)) { - text_len = scnprintf(textbuf, sizeof(textbuf), - "BAD LUCK: lost %d message(s) from NMI context!", - nmi_message_lost); - printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, - NULL, 0, textbuf, text_len); - } - /* * The printf needs to come first; we need the syslog * prefix which might be passed-in as a parameter. diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 5214d326d3ba..033e50a7d706 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -40,13 +40,15 @@ * were handled or when IRQs are blocked.
*/ static int printk_safe_irq_ready; -atomic_t nmi_message_lost; #define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \ - sizeof(atomic_t) - sizeof(struct irq_work)) + sizeof(atomic_t) - \ + sizeof(atomic_t) - \ + sizeof(struct irq_work)) struct printk_safe_seq_buf { atomic_t len; /* length of written data */ + atomic_t message_lost; struct irq_work work; /* IRQ work that flushes the buffer */ unsigned char buffer[SAFE_LOG_BUF_LEN]; }; @@ -58,6 +60,16 @@ static DEFINE_PER_CPU(int, printk_context); static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq); #endif +/* Get flushed in a more safe context. */ +static void queue_flush_work(struct printk_safe_seq_buf *s) +{ + if (printk_safe_irq_ready) { + /* Make sure that IRQ work is really initialized. */ + smp_rmb(); + irq_work_queue(&s->work); + } +} + /* * Add a message to per-CPU context-dependent buffer. NMI and printk-safe * have dedicated buffers, because otherwise printk-safe preempted by @@ -79,7 +91,8 @@ again: /* The trailing '\0' is not counted into len. */ if (len >= sizeof(s->buffer) - 1) { - atomic_inc(&nmi_message_lost); + atomic_inc(&s->message_lost); + queue_flush_work(s); return 0; } @@ -91,6 +104,8 @@ again: smp_rmb(); add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); + if (!add) + return 0; /* * Do it once again if the buffer has been flushed in the meantime. @@ -100,13 +115,7 @@ again: if (atomic_cmpxchg(&s->len, len, len + add) != len) goto again; - /* Get flushed in a more safe context. */ - if (add && printk_safe_irq_ready) { - /* Make sure that IRQ work is really initialized. */ - smp_rmb(); - irq_work_queue(&s->work); - } - + queue_flush_work(s); return add; } @@ -168,6 +177,14 @@ static int printk_safe_flush_buffer(const char *start, size_t len) return len; } +static void report_message_lost(struct printk_safe_seq_buf *s) +{ + int lost = atomic_xchg(&s->message_lost, 0); + + if (lost) + printk_deferred("Lost %d message(s)!\n", lost); +} + /* * Flush data from the associated per-CPU buffer. The function * can be called either via IRQ work or independently. @@ -225,6 +242,7 @@ more: goto more; out: + report_message_lost(s); raw_spin_unlock_irqrestore(&read_lock, flags); } -- cgit v1.2.3 From f975237b76827956fe13ecfe993a319158e2c303 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 27 Dec 2016 23:16:09 +0900 Subject: printk: use printk_safe buffers in printk Use printk_safe per-CPU buffers in printk recursion-prone blocks: -- around logbuf_lock protected sections in vprintk_emit() and console_unlock() -- around down_trylock_console_sem() and up_console_sem() Note that this solution addresses deadlocks caused by printk() recursive calls only. That is vprintk_emit() and console_unlock(). The rest will be converted in a followup patch. Another thing to note is that we now keep lockdep enabled in printk, because we are protected against the printk recursion caused by lockdep in vprintk_emit() by the printk-safe mechanism - we first switch to per-CPU buffers and only then access the deadlock-prone locks. Examples: 1) printk() from logbuf_lock spin_lock section Assume the following code: printk() raw_spin_lock(&logbuf_lock); WARN_ON(1); raw_spin_unlock(&logbuf_lock); which now produces: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 366 at kernel/printk/printk.c:1811 vprintk_emit CPU: 0 PID: 366 Comm: bash Call Trace: warn_slowpath_null+0x1d/0x1f vprintk_emit+0x1cd/0x438 vprintk_default+0x1d/0x1f printk+0x48/0x50 [..] 
2) printk() from semaphore sem->lock spin_lock section Assume the following code printk() console_trylock() down_trylock() raw_spin_lock_irqsave(&sem->lock, flags); WARN_ON(1); raw_spin_unlock_irqrestore(&sem->lock, flags); which now produces: ------------[ cut here ]------------ WARNING: CPU: 1 PID: 363 at kernel/locking/semaphore.c:141 down_trylock CPU: 1 PID: 363 Comm: bash Call Trace: warn_slowpath_null+0x1d/0x1f down_trylock+0x3d/0x62 ? vprintk_emit+0x3f9/0x414 console_trylock+0x31/0xeb vprintk_emit+0x3f9/0x414 vprintk_default+0x1d/0x1f printk+0x48/0x50 [..] 3) printk() from console_unlock() Assume the following code: printk() console_unlock() raw_spin_lock(&logbuf_lock); WARN_ON(1); raw_spin_unlock(&logbuf_lock); which now produces: ------------[ cut here ]------------ WARNING: CPU: 1 PID: 329 at kernel/printk/printk.c:2384 console_unlock CPU: 1 PID: 329 Comm: bash Call Trace: warn_slowpath_null+0x18/0x1a console_unlock+0x12d/0x559 ? trace_hardirqs_on_caller+0x16d/0x189 ? trace_hardirqs_on+0xd/0xf vprintk_emit+0x363/0x374 vprintk_default+0x18/0x1a printk+0x43/0x4b [..] 4) printk() from try_to_wake_up() Assume the following code: printk() console_unlock() up() try_to_wake_up() raw_spin_lock_irqsave(&p->pi_lock, flags); WARN_ON(1); raw_spin_unlock_irqrestore(&p->pi_lock, flags); which now produces: ------------[ cut here ]------------ WARNING: CPU: 3 PID: 363 at kernel/sched/core.c:2028 try_to_wake_up CPU: 3 PID: 363 Comm: bash Call Trace: warn_slowpath_null+0x1d/0x1f try_to_wake_up+0x7f/0x4f7 wake_up_process+0x15/0x17 __up.isra.0+0x56/0x63 up+0x32/0x42 __up_console_sem+0x37/0x55 console_unlock+0x21e/0x4c2 vprintk_emit+0x41c/0x462 vprintk_default+0x1d/0x1f printk+0x48/0x50 [..] 5) printk() from call_console_drivers() Assume the following code: printk() console_unlock() call_console_drivers() ... WARN_ON(1); which now produces: ------------[ cut here ]------------ WARNING: CPU: 2 PID: 305 at kernel/printk/printk.c:1604 call_console_drivers CPU: 2 PID: 305 Comm: bash Call Trace: warn_slowpath_null+0x18/0x1a call_console_drivers.isra.6.constprop.16+0x3a/0xb0 console_unlock+0x471/0x48e vprintk_emit+0x1f4/0x206 vprintk_default+0x18/0x1a vprintk_func+0x6e/0x70 printk+0x3e/0x46 [..] 6) unsupported placeholder in printk() format now prints an actual warning from vscnprintf(), instead of 'BUG: recent printk recursion!'. ------------[ cut here ]------------ WARNING: CPU: 5 PID: 337 at lib/vsprintf.c:1900 format_decode Please remove unsupported % in format string CPU: 5 PID: 337 Comm: bash Call Trace: dump_stack+0x4f/0x65 __warn+0xc2/0xdd warn_slowpath_fmt+0x4b/0x53 format_decode+0x22c/0x308 vsnprintf+0x89/0x3b7 vscnprintf+0xd/0x26 vprintk_emit+0xb4/0x238 vprintk_default+0x1d/0x1f vprintk_func+0x6c/0x73 printk+0x43/0x4b [..] 
Link: http://lkml.kernel.org/r/20161227141611.940-7-sergey.senozhatsky@gmail.com Cc: Andrew Morton Cc: Linus Torvalds Cc: Jan Kara Cc: Tejun Heo Cc: Calvin Owens Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Peter Hurley Cc: linux-kernel@vger.kernel.org Signed-off-by: Sergey Senozhatsky Signed-off-by: Petr Mladek Reviewed-by: Steven Rostedt (VMware) --- kernel/printk/printk.c | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 6bb77eff502d..f426a2b12a9b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -213,17 +213,36 @@ static int nr_ext_console_drivers; static int __down_trylock_console_sem(unsigned long ip) { - if (down_trylock(&console_sem)) + int lock_failed; + unsigned long flags; + + /* + * Here and in __up_console_sem() we need to be in safe mode, + * because spindump/WARN/etc from under console ->lock will + * deadlock in printk()->down_trylock_console_sem() otherwise. + */ + printk_safe_enter_irqsave(flags); + lock_failed = down_trylock(&console_sem); + printk_safe_exit_irqrestore(flags); + + if (lock_failed) return 1; mutex_acquire(&console_lock_dep_map, 0, 1, ip); return 0; } #define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_) -#define up_console_sem() do { \ - mutex_release(&console_lock_dep_map, 1, _RET_IP_);\ - up(&console_sem);\ -} while (0) +static void __up_console_sem(unsigned long ip) +{ + unsigned long flags; + + mutex_release(&console_lock_dep_map, 1, ip); + + printk_safe_enter_irqsave(flags); + up(&console_sem); + printk_safe_exit_irqrestore(flags); +} +#define up_console_sem() __up_console_sem(_RET_IP_) /* * This is used for debugging the mess that is the VT code by @@ -1689,7 +1708,7 @@ asmlinkage int vprintk_emit(int facility, int level, boot_delay_msec(level); printk_delay(); - local_irq_save(flags); + printk_safe_enter_irqsave(flags); this_cpu = smp_processor_id(); /* @@ -1705,13 +1724,12 @@ asmlinkage int vprintk_emit(int facility, int level, */ if (!oops_in_progress && !lockdep_recursing(current)) { recursion_bug = true; - local_irq_restore(flags); + printk_safe_exit_irqrestore(flags); return 0; } zap_locks(); } - lockdep_off(); /* This stops the holder of console_sem just where we want him */ raw_spin_lock(&logbuf_lock); logbuf_cpu = this_cpu; @@ -1771,12 +1789,10 @@ asmlinkage int vprintk_emit(int facility, int level, logbuf_cpu = UINT_MAX; raw_spin_unlock(&logbuf_lock); - lockdep_on(); - local_irq_restore(flags); + printk_safe_exit_irqrestore(flags); /* If called from the scheduler, we can not call up(). */ if (!in_sched) { - lockdep_off(); /* * Try to acquire and then immediately release the console * semaphore. 
The release will print out buffers and wake up @@ -1784,7 +1800,6 @@ asmlinkage int vprintk_emit(int facility, int level, */ if (console_trylock()) console_unlock(); - lockdep_on(); } return printed_len; @@ -2209,7 +2224,8 @@ again: size_t len; int level; - raw_spin_lock_irqsave(&logbuf_lock, flags); + printk_safe_enter_irqsave(flags); + raw_spin_lock(&logbuf_lock); if (seen_seq != log_next_seq) { wake_klogd = true; seen_seq = log_next_seq; @@ -2259,7 +2275,7 @@ skip: stop_critical_timings(); /* don't trace print latency */ call_console_drivers(level, ext_text, ext_len, text, len); start_critical_timings(); - local_irq_restore(flags); + printk_safe_exit_irqrestore(flags); if (do_cond_resched) cond_resched(); @@ -2282,7 +2298,8 @@ skip: */ raw_spin_lock(&logbuf_lock); retry = console_seq != log_next_seq; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + raw_spin_unlock(&logbuf_lock); + printk_safe_exit_irqrestore(flags); if (retry && console_trylock()) goto again; -- cgit v1.2.3 From 8b1742c9c2071520efacf44e974dc03421360471 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 27 Dec 2016 23:16:10 +0900 Subject: printk: remove zap_locks() function We use printk-safe now, which makes the printk-recursion detection code in vprintk_emit() unreachable. The tricky thing here is that, apart from detecting and reporting printk recursions, that code also used to call zap_locks() in case of panic() from the same CPU. However, zap_locks() does not look to be needed anymore: 1) Since commit 08d78658f393 ("panic: release stale console lock to always get the logbuf printed out") panic flushing of `logbuf' to console ignores the state of `console_sem' by doing panic() console_trylock(); console_unlock(); 2) Since commit cf9b1106c81c ("printk/nmi: flush NMI messages on the system panic") panic attempts to zap the `logbuf_lock' spin_lock to successfully flush nmi messages to `logbuf'. Basically, it seems that we either already do what zap_locks() used to do but in other places or we ignore the state of the lock. The only remaining difference is that we don't re-init the console semaphore in printk_safe_flush_on_panic(), but this is not necessary because we don't call console drivers from printk_safe_flush_on_panic() due to the fact that we are using a deferred printk() version (as was suggested by Petr Mladek). Link: http://lkml.kernel.org/r/20161227141611.940-8-sergey.senozhatsky@gmail.com Cc: Andrew Morton Cc: Linus Torvalds Cc: Steven Rostedt Cc: Jan Kara Cc: Tejun Heo Cc: Calvin Owens Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Peter Hurley Cc: linux-kernel@vger.kernel.org Signed-off-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 61 -------------------------------------------------- 1 file changed, 61 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f426a2b12a9b..e17d384f8c6b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1557,28 +1557,6 @@ static void call_console_drivers(int level, } } -/* - * Zap console related locks when oopsing. - * To leave time for slow consoles to print a full oops, - * only zap at most once every 30 seconds.
- */ -static void zap_locks(void) -{ - static unsigned long oops_timestamp; - - if (time_after_eq(jiffies, oops_timestamp) && - !time_after(jiffies, oops_timestamp + 30 * HZ)) - return; - - oops_timestamp = jiffies; - - debug_locks_off(); - /* If a crash is occurring, make sure we can't deadlock */ - raw_spin_lock_init(&logbuf_lock); - /* And make sure that we print immediately */ - sema_init(&console_sem, 1); -} - int printk_delay_msec __read_mostly; static inline void printk_delay(void) @@ -1688,17 +1666,13 @@ asmlinkage int vprintk_emit(int facility, int level, const char *dict, size_t dictlen, const char *fmt, va_list args) { - static bool recursion_bug; static char textbuf[LOG_LINE_MAX]; char *text = textbuf; size_t text_len = 0; enum log_flags lflags = 0; unsigned long flags; - int this_cpu; int printed_len = 0; bool in_sched = false; - /* cpu currently holding logbuf_lock in this function */ - static unsigned int logbuf_cpu = UINT_MAX; if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; @@ -1709,42 +1683,8 @@ asmlinkage int vprintk_emit(int facility, int level, printk_delay(); printk_safe_enter_irqsave(flags); - this_cpu = smp_processor_id(); - - /* - * Ouch, printk recursed into itself! - */ - if (unlikely(logbuf_cpu == this_cpu)) { - /* - * If a crash is occurring during printk() on this CPU, - * then try to get the crash message out but make sure - * we can't deadlock. Otherwise just return to avoid the - * recursion and return - but flag the recursion so that - * it can be printed at the next appropriate moment: - */ - if (!oops_in_progress && !lockdep_recursing(current)) { - recursion_bug = true; - printk_safe_exit_irqrestore(flags); - return 0; - } - zap_locks(); - } - /* This stops the holder of console_sem just where we want him */ raw_spin_lock(&logbuf_lock); - logbuf_cpu = this_cpu; - - if (unlikely(recursion_bug)) { - static const char recursion_msg[] = - "BUG: recent printk recursion!"; - - recursion_bug = false; - /* emit KERN_CRIT message */ - printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, - NULL, 0, recursion_msg, - strlen(recursion_msg)); - } - /* * The printf needs to come first; we need the syslog * prefix which might be passed-in as a parameter. @@ -1787,7 +1727,6 @@ asmlinkage int vprintk_emit(int facility, int level, printed_len += log_output(facility, level, lflags, dict, dictlen, text, text_len); - logbuf_cpu = UINT_MAX; raw_spin_unlock(&logbuf_lock); printk_safe_exit_irqrestore(flags); -- cgit v1.2.3 From de6fcbdb68b29b475f4e6368cd465fd778ea421f Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 27 Dec 2016 23:16:11 +0900 Subject: printk: convert the rest to printk-safe This patch converts the rest of logbuf users (which are out of printk recursion case, but can deadlock in printk). 
To make printk-safe usage easier the patch introduces 4 helper macros: - logbuf_lock_irq()/logbuf_unlock_irq() lock/unlock the logbuf lock and disable/enable local IRQ - logbuf_lock_irqsave(flags)/logbuf_unlock_irqrestore(flags) lock/unlock the logbuf lock and saves/restores local IRQ state Link: http://lkml.kernel.org/r/20161227141611.940-9-sergey.senozhatsky@gmail.com Cc: Andrew Morton Cc: Linus Torvalds Cc: Steven Rostedt Cc: Jan Kara Cc: Tejun Heo Cc: Calvin Owens Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Peter Hurley Cc: linux-kernel@vger.kernel.org Signed-off-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 103 +++++++++++++++++++++++++++++++------------------ 1 file changed, 65 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index e17d384f8c6b..080843148e8d 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -370,6 +370,34 @@ __packed __aligned(4) */ DEFINE_RAW_SPINLOCK(logbuf_lock); +/* + * Helper macros to lock/unlock logbuf_lock and switch between + * printk-safe/unsafe modes. + */ +#define logbuf_lock_irq() \ + do { \ + printk_safe_enter_irq(); \ + raw_spin_lock(&logbuf_lock); \ + } while (0) + +#define logbuf_unlock_irq() \ + do { \ + raw_spin_unlock(&logbuf_lock); \ + printk_safe_exit_irq(); \ + } while (0) + +#define logbuf_lock_irqsave(flags) \ + do { \ + printk_safe_enter_irqsave(flags); \ + raw_spin_lock(&logbuf_lock); \ + } while (0) + +#define logbuf_unlock_irqrestore(flags) \ + do { \ + raw_spin_unlock(&logbuf_lock); \ + printk_safe_exit_irqrestore(flags); \ + } while (0) + #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); /* the next printk record to read by syslog(READ) or /proc/kmsg */ @@ -801,20 +829,21 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, ret = mutex_lock_interruptible(&user->lock); if (ret) return ret; - raw_spin_lock_irq(&logbuf_lock); + + logbuf_lock_irq(); while (user->seq == log_next_seq) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); goto out; } - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); ret = wait_event_interruptible(log_wait, user->seq != log_next_seq); if (ret) goto out; - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); } if (user->seq < log_first_seq) { @@ -822,7 +851,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, user->idx = log_first_idx; user->seq = log_first_seq; ret = -EPIPE; - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); goto out; } @@ -835,7 +864,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, user->idx = log_next(user->idx); user->seq++; - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); if (len > count) { ret = -EINVAL; @@ -862,7 +891,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) if (offset) return -ESPIPE; - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); switch (whence) { case SEEK_SET: /* the first record */ @@ -886,7 +915,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) default: ret = -EINVAL; } - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); return ret; } @@ -900,7 +929,7 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); if (user->seq < log_next_seq) { /* return error when data has vanished underneath us */ if (user->seq < 
log_first_seq) @@ -908,7 +937,7 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) else ret = POLLIN|POLLRDNORM; } - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); return ret; } @@ -938,10 +967,10 @@ static int devkmsg_open(struct inode *inode, struct file *file) mutex_init(&user->lock); - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); user->idx = log_first_idx; user->seq = log_first_seq; - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); file->private_data = user; return 0; @@ -1083,13 +1112,13 @@ void __init setup_log_buf(int early) return; } - raw_spin_lock_irqsave(&logbuf_lock, flags); + logbuf_lock_irqsave(flags); log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; free = __LOG_BUF_LEN - log_next_idx; memcpy(log_buf, __log_buf, __LOG_BUF_LEN); - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); pr_info("log_buf_len: %d bytes\n", log_buf_len); pr_info("early log buf free: %d(%d%%)\n", @@ -1267,7 +1296,7 @@ static int syslog_print(char __user *buf, int size) size_t n; size_t skip; - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); if (syslog_seq < log_first_seq) { /* messages are gone, move to first one */ syslog_seq = log_first_seq; @@ -1275,7 +1304,7 @@ static int syslog_print(char __user *buf, int size) syslog_partial = 0; } if (syslog_seq == log_next_seq) { - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); break; } @@ -1294,7 +1323,7 @@ static int syslog_print(char __user *buf, int size) syslog_partial += n; } else n = 0; - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); if (!n) break; @@ -1323,7 +1352,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) if (!text) return -ENOMEM; - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); if (buf) { u64 next_seq; u64 seq; @@ -1371,12 +1400,12 @@ static int syslog_print_all(char __user *buf, int size, bool clear) idx = log_next(idx); seq++; - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); if (copy_to_user(buf + len, text, textlen)) len = -EFAULT; else len += textlen; - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); if (seq < log_first_seq) { /* messages are gone, move to next one */ @@ -1390,7 +1419,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) clear_seq = log_next_seq; clear_idx = log_next_idx; } - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); kfree(text); return len; @@ -1477,7 +1506,7 @@ int do_syslog(int type, char __user *buf, int len, int source) break; /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); if (syslog_seq < log_first_seq) { /* messages are gone, move to first one */ syslog_seq = log_first_seq; @@ -1505,7 +1534,7 @@ int do_syslog(int type, char __user *buf, int len, int source) } error -= syslog_partial; } - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); break; /* Size of the log buffer */ case SYSLOG_ACTION_SIZE_BUFFER: @@ -1682,9 +1711,8 @@ asmlinkage int vprintk_emit(int facility, int level, boot_delay_msec(level); printk_delay(); - printk_safe_enter_irqsave(flags); /* This stops the holder of console_sem just where we want him */ - raw_spin_lock(&logbuf_lock); + logbuf_lock_irqsave(flags); /* * The printf needs to come first; we need the syslog * prefix which might be passed-in as a parameter. 
@@ -1727,8 +1755,7 @@ asmlinkage int vprintk_emit(int facility, int level, printed_len += log_output(facility, level, lflags, dict, dictlen, text, text_len); - raw_spin_unlock(&logbuf_lock); - printk_safe_exit_irqrestore(flags); + logbuf_unlock_irqrestore(flags); /* If called from the scheduler, we can not call up(). */ if (!in_sched) { @@ -2501,10 +2528,10 @@ void register_console(struct console *newcon) * console_unlock(); will print out the buffered messages * for us. */ - raw_spin_lock_irqsave(&logbuf_lock, flags); + logbuf_lock_irqsave(flags); console_seq = syslog_seq; console_idx = syslog_idx; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); /* * We're about to replay the log buffer. Only do this to the * just-registered console to avoid excessive message spam to @@ -2803,12 +2830,12 @@ void kmsg_dump(enum kmsg_dump_reason reason) /* initialize iterator with data about the stored records */ dumper->active = true; - raw_spin_lock_irqsave(&logbuf_lock, flags); + logbuf_lock_irqsave(flags); dumper->cur_seq = clear_seq; dumper->cur_idx = clear_idx; dumper->next_seq = log_next_seq; dumper->next_idx = log_next_idx; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); /* invoke dumper which will iterate over records */ dumper->dump(dumper, reason); @@ -2893,9 +2920,9 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, unsigned long flags; bool ret; - raw_spin_lock_irqsave(&logbuf_lock, flags); + logbuf_lock_irqsave(flags); ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); return ret; } @@ -2934,7 +2961,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, if (!dumper->active) goto out; - raw_spin_lock_irqsave(&logbuf_lock, flags); + logbuf_lock_irqsave(flags); if (dumper->cur_seq < log_first_seq) { /* messages are gone, move to first available one */ dumper->cur_seq = log_first_seq; @@ -2943,7 +2970,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, /* last entry */ if (dumper->cur_seq >= dumper->next_seq) { - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); goto out; } @@ -2985,7 +3012,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, dumper->next_seq = next_seq; dumper->next_idx = next_idx; ret = true; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); out: if (len) *len = l; @@ -3023,9 +3050,9 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper) { unsigned long flags; - raw_spin_lock_irqsave(&logbuf_lock, flags); + logbuf_lock_irqsave(flags); kmsg_dump_rewind_nolock(dumper); - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); -- cgit v1.2.3 From d9c23523ed98a3acaa0bfd8fef143595d6aa631d Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Sat, 24 Dec 2016 23:09:01 +0900 Subject: printk: drop call_console_drivers() unused param We do suppress_message_printing() check before we call call_console_drivers() now, so `level' param is not needed anymore. 
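For context, suppression is decided purely from the record's level, so the caller can apply the check to msg->level itself and the callee no longer needs the parameter. A simplified sketch of the check (the real helper also honours the ignore_loglevel boot parameter):

/* Simplified sketch, not the verbatim kernel helper. */
static bool suppress_message_printing(int level)
{
	return level >= console_loglevel;
}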
Link: http://lkml.kernel.org/r/20161224140902.1962-2-sergey.senozhatsky@gmail.com Cc: Andrew Morton Cc: Steven Rostedt Cc: Peter Hurley Cc: linux-kernel@vger.kernel.org Signed-off-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 080843148e8d..7180088cbb23 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1558,8 +1558,7 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) * log_buf[start] to log_buf[end - 1]. * The console_lock must be held. */ -static void call_console_drivers(int level, - const char *ext_text, size_t ext_len, +static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) { struct console *con; @@ -1866,8 +1865,7 @@ static ssize_t msg_print_ext_header(char *buf, size_t size, static ssize_t msg_print_ext_body(char *buf, size_t size, char *dict, size_t dict_len, char *text, size_t text_len) { return 0; } -static void call_console_drivers(int level, - const char *ext_text, size_t ext_len, +static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) {} static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size) { return 0; } @@ -2188,7 +2186,6 @@ again: struct printk_log *msg; size_t ext_len = 0; size_t len; - int level; printk_safe_enter_irqsave(flags); raw_spin_lock(&logbuf_lock); @@ -2212,8 +2209,7 @@ skip: break; msg = log_from_idx(console_idx); - level = msg->level; - if (suppress_message_printing(level)) { + if (suppress_message_printing(msg->level)) { /* * Skip record we have buffered and already printed * directly to the console when we received it, and @@ -2239,7 +2235,7 @@ skip: raw_spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ - call_console_drivers(level, ext_text, ext_len, text, len); + call_console_drivers(ext_text, ext_len, text, len); start_critical_timings(); printk_safe_exit_irqrestore(flags); -- cgit v1.2.3 From c502faf94153bd0fedc5389a936f728a659cc6ab Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 8 Feb 2017 01:19:43 +0100 Subject: bpf, lpm: fix overflows in trie_alloc checks Cap the maximum (total) value size and bail out if larger than KMALLOC_MAX_SIZE as otherwise it doesn't make any sense to proceed further, since we're guaranteed to fail to allocate elements anyway in lpm_trie_node_alloc(); likelihood of failure is still high for large values, though, similarly to the htab case in non-prealloc mode. Next, make sure that cost vars are really u64 instead of size_t, so that we don't overflow on 32 bit and charge only tiny map.pages against memlock while allowing huge max_entries; cap also the max cost like we do with other map types. Fixes: b95a5c4db09b ("bpf: add a longest prefix match trie map implementation") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- kernel/bpf/lpm_trie.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 144e9763102f..e0f6a0bd279b 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -394,10 +394,21 @@ static int trie_delete_elem(struct bpf_map *map, void *key) return -ENOSYS; } +#define LPM_DATA_SIZE_MAX 256 +#define LPM_DATA_SIZE_MIN 1 + +#define LPM_VAL_SIZE_MAX (KMALLOC_MAX_SIZE - LPM_DATA_SIZE_MAX - \ + sizeof(struct lpm_trie_node)) +#define LPM_VAL_SIZE_MIN 1 + +#define LPM_KEY_SIZE(X) (sizeof(struct bpf_lpm_trie_key) + (X)) +#define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX) +#define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) + static struct bpf_map *trie_alloc(union bpf_attr *attr) { - size_t cost, cost_per_node; struct lpm_trie *trie; + u64 cost = sizeof(*trie), cost_per_node; int ret; if (!capable(CAP_SYS_ADMIN)) @@ -406,9 +417,10 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->map_flags != BPF_F_NO_PREALLOC || - attr->key_size < sizeof(struct bpf_lpm_trie_key) + 1 || - attr->key_size > sizeof(struct bpf_lpm_trie_key) + 256 || - attr->value_size == 0) + attr->key_size < LPM_KEY_SIZE_MIN || + attr->key_size > LPM_KEY_SIZE_MAX || + attr->value_size < LPM_VAL_SIZE_MIN || + attr->value_size > LPM_VAL_SIZE_MAX) return ERR_PTR(-EINVAL); trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN); @@ -426,18 +438,24 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) cost_per_node = sizeof(struct lpm_trie_node) + attr->value_size + trie->data_size; - cost = sizeof(*trie) + attr->max_entries * cost_per_node; + cost += (u64) attr->max_entries * cost_per_node; + if (cost >= U32_MAX - PAGE_SIZE) { + ret = -E2BIG; + goto out_err; + } + trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; ret = bpf_map_precharge_memlock(trie->map.pages); - if (ret) { - kfree(trie); - return ERR_PTR(ret); - } + if (ret) + goto out_err; raw_spin_lock_init(&trie->lock); return &trie->map; +out_err: + kfree(trie); + return ERR_PTR(ret); } static void trie_free(struct bpf_map *map) -- cgit v1.2.3 From 8a293be0d6fa0720809db6ac35a0552c51710cd2 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sat, 23 Jul 2016 14:01:45 -0400 Subject: core: migrate exception table users off module.h and onto extable.h These files were including module.h for exception table related functions. We've now separated that content out into its own file "extable.h" so now move over to that and where possible, avoid all the extra header content in module.h that we don't really need to compile these non-modular files. Note: init/main.c still needs module.h for __init_or_module kernel/extable.c still needs module.h for is_module_text_address ...and so we don't get the benefit of removing module.h from the cpp feed for these two files, unlike the almost universal 1:1 exchange of module.h for extable.h we were able to do in the arch dirs. 
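In other words, a typical arch-side conversion was a one-line include swap of this shape (header names spelled out here for clarity, since the compressed diff context below elides them):

-#include <linux/module.h>	/* pulled in only for the exception table helpers */
+#include <linux/extable.h>	/* search_exception_tables() and friends */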
Cc: Rusty Russell Cc: Andrew Morton Cc: Linus Torvalds Acked-by: Jessica Yu Signed-off-by: Paul Gortmaker --- init/main.c | 1 + kernel/extable.c | 1 + kernel/module.c | 1 + 3 files changed, 3 insertions(+) (limited to 'kernel') diff --git a/init/main.c b/init/main.c index b0c9d6facef9..5b244bc1c176 100644 --- a/init/main.c +++ b/init/main.c @@ -12,6 +12,7 @@ #define DEBUG /* Enable initcall_debug */ #include +#include #include #include #include diff --git a/kernel/extable.c b/kernel/extable.c index e3beec4a2339..b25d9901e431 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -17,6 +17,7 @@ */ #include #include +#include #include #include #include diff --git a/kernel/module.c b/kernel/module.c index 38d4270925d4..ded5a4abc43a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -17,6 +17,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include +#include #include #include #include -- cgit v1.2.3 From fc62d0207ae0ebc9d19df68394c0dc925b4a92d1 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Wed, 8 Feb 2017 01:24:14 +0530 Subject: kprobes: Introduce weak variant of kprobe_exceptions_notify() kprobe_exceptions_notify() is not used on some of the architectures such as arm[64] and powerpc anymore. Introduce a weak variant for such architectures. Signed-off-by: Naveen N. Rao Acked-by: Masami Hiramatsu Signed-off-by: Michael Ellerman --- kernel/kprobes.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 43460104f119..60a702a05684 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1705,6 +1705,12 @@ void unregister_kprobes(struct kprobe **kps, int num) } EXPORT_SYMBOL_GPL(unregister_kprobes); +int __weak __kprobes kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return NOTIFY_DONE; +} + static struct notifier_block kprobe_exceptions_nb = { .notifier_call = kprobe_exceptions_notify, .priority = 0x7fffffff /* we need to be notified first */ -- cgit v1.2.3 From bb3bac2ca9a3a5b7fa601781adf70167a0449d75 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 6 Feb 2017 11:04:26 -0500 Subject: sched/core: Remove unlikely() annotation from sched_move_task() The check for 'running' in sched_move_task() has an unlikely() around it. That is, it is unlikely that the task being moved is running. That used to be true. But with a couple of recent updates, it is now likely that the task will be running. The first change came from ea86cb4b7621 ("sched/cgroup: Fix cpu_cgroup_fork() handling") that moved around the use case of sched_move_task() in do_fork() where the call is now done after the task is woken (hence it is running). The second change came from 8e5bfa8c1f84 ("sched/autogroup: Do not use autogroup->tg in zombie threads") where sched_move_task() is called by the exit path, by the task that is exiting. Hence it too is running.
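For background: likely()/unlikely() only steer the compiler's branch layout through __builtin_expect(), so a stale hint costs branch mispredictions and poor code placement rather than correctness. Their kernel definitions (slightly simplified, omitting the branch-profiling variants) are:

#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)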
Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Link: http://lkml.kernel.org/r/20170206110426.27ca6426@gandalf.local.home Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e4aa470ed454..34e2291a9a6c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6404,14 +6404,14 @@ void sched_move_task(struct task_struct *tsk) if (queued) dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); - if (unlikely(running)) + if (running) put_prev_task(rq, tsk); sched_change_group(tsk, TASK_MOVE_GROUP); if (queued) enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); - if (unlikely(running)) + if (running) set_curr_task(rq, tsk); task_rq_unlock(rq, tsk, &rf); -- cgit v1.2.3 From 9ccbfbb157a38921702402281ca7be530b4c3669 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 26 Jan 2017 11:40:56 +0200 Subject: perf/core: Do error out on a kernel filter on an exclude_filter event It is currently possible to configure a kernel address filter for an event that excludes the kernel from its traces (attr.exclude_kernel==1). While in reality this doesn't make sense, the SET_FILTER ioctl() should return an error in such a case; currently it does not. Furthermore, it will still silently discard the filter and any potentially valid filters that came with it. This patch makes the SET_FILTER ioctl() error out in such cases. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mathieu Poirier Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Will Deacon Cc: vince@deater.net Link: http://lkml.kernel.org/r/20170126094057.13805-3-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 88676ff98c0f..1730995c31ec 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8260,6 +8260,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, * attribute. */ if (state == IF_STATE_END) { + ret = -EINVAL; if (kernel && event->attr.exclude_kernel) goto fail; -- cgit v1.2.3 From 6ce77bfd6cedbff61eabf8837dc0901bb671cc86 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 26 Jan 2017 11:40:57 +0200 Subject: perf/core: Allow kernel filters on CPU events While supporting file-based address filters for CPU events requires some extra context switch handling, kernel address filters are easy, since the kernel mapping is preserved across address spaces. It is also useful as it permits tracing scheduling paths of the kernel. This patch allows setting up kernel filters for CPU events.
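A hypothetical userspace sketch of what this enables is below; the event fd setup is elided, the address range is made up, and in practice the perf tool resolves a kernel symbol into such a range before issuing the ioctl:

#include <sys/ioctl.h>
#include <linux/perf_event.h>

/* Attach a kernel address filter to an already-opened CPU-wide event. */
static int set_kernel_addr_filter(int perf_fd)
{
	/* made-up kernel text range; the format is "filter <start>/<size>" */
	const char *filter = "filter 0xffffffff810a0000/0x200";

	return ioctl(perf_fd, PERF_EVENT_IOC_SET_FILTER, filter);
}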
Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Mathieu Poirier Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Will Deacon Cc: vince@deater.net Link: http://lkml.kernel.org/r/20170126094057.13805-4-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 ++ kernel/events/core.c | 42 ++++++++++++++++++++++++++++-------------- 2 files changed, 30 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5c58e93c130c..000fdb211c7d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -482,6 +482,7 @@ struct perf_addr_filter { * @list: list of filters for this event * @lock: spinlock that serializes accesses to the @list and event's * (and its children's) filter generations. + * @nr_file_filters: number of file-based filters * * A child event will use parent's @list (and therefore @lock), so they are * bundled together; see perf_event_addr_filters(). @@ -489,6 +490,7 @@ struct perf_addr_filter { struct perf_addr_filters_head { struct list_head list; raw_spinlock_t lock; + unsigned int nr_file_filters; }; /** diff --git a/kernel/events/core.c b/kernel/events/core.c index 1730995c31ec..a8664247b3e4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8090,6 +8090,9 @@ static void perf_event_addr_filters_apply(struct perf_event *event) if (task == TASK_TOMBSTONE) return; + if (!ifh->nr_file_filters) + return; + mm = get_task_mm(event->ctx->task); if (!mm) goto restart; @@ -8268,6 +8271,18 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, if (!filename) goto fail; + /* + * For now, we only support file-based filters + * in per-task events; doing so for CPU-wide + * events requires additional context switching + * trickery, since same object code will be + * mapped at different virtual addresses in + * different processes. + */ + ret = -EOPNOTSUPP; + if (!event->ctx->task) + goto fail_free_name; + /* look up the path and grab its inode */ ret = kern_path(filename, LOOKUP_FOLLOW, &path); if (ret) @@ -8283,6 +8298,8 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, !S_ISREG(filter->inode->i_mode)) /* free_filters_list() will iput() */ goto fail; + + event->addr_filters.nr_file_filters++; } /* ready to consume more filters */ @@ -8322,24 +8339,13 @@ perf_event_set_addr_filter(struct perf_event *event, char *filter_str) if (WARN_ON_ONCE(event->parent)) return -EINVAL; - /* - * For now, we only support filtering in per-task events; doing so - * for CPU-wide events requires additional context switching trickery, - * since same object code will be mapped at different virtual - * addresses in different processes. 
- */ - if (!event->ctx->task) - return -EOPNOTSUPP; - ret = perf_event_parse_addr_filter(event, filter_str, &filters); if (ret) - return ret; + goto fail_clear_files; ret = event->pmu->addr_filters_validate(&filters); - if (ret) { - free_filters_list(&filters); - return ret; - } + if (ret) + goto fail_free_filters; /* remove existing filters, if any */ perf_addr_filters_splice(event, &filters); @@ -8347,6 +8353,14 @@ perf_event_set_addr_filter(struct perf_event *event, char *filter_str) /* install new filters */ perf_event_for_each_child(event, perf_event_addr_filters_apply); + return ret; + +fail_free_filters: + free_filters_list(&filters); + +fail_clear_files: + event->addr_filters.nr_file_filters = 0; + return ret; } -- cgit v1.2.3 From f9af456a61ecfbef8233c5046a9e347c9b98ba05 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Fri, 13 Jan 2017 11:42:04 +0900 Subject: lockdep: Fix incorrect condition to print bug msgs for MAX_LOCKDEP_CHAIN_HLOCKS Bug messages and stack dump for MAX_LOCKDEP_CHAIN_HLOCKS should only be printed once. Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sasha Levin Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1484275324-28192-1-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 7c38f8f3d97b..bf6072524756 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2203,7 +2203,7 @@ cache_hit: * Important for check_no_collision(). */ if (unlikely(nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)) { - if (debug_locks_off_graph_unlock()) + if (!debug_locks_off_graph_unlock()) return 0; print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!"); -- cgit v1.2.3 From bc88c10d7e6900916f5e1ba3829d66a9de92b633 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 8 Feb 2017 14:46:48 -0500 Subject: locking/spinlock/debug: Remove spinlock lockup detection code The current spinlock lockup detection code can sometimes produce false positives because of the unfairness of the locking algorithm itself. So the lockup detection code is now removed. Instead, we are relying on the NMI watchdog to detect potential lockups. We won't have lockup detection if the watchdog isn't running. The commented-out read-write lock lockup detection code is also removed. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Sasha Levin Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1486583208-11038-1-git-send-email-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/spinlock_debug.c | 86 +++-------------------------------------- 1 file changed, 5 insertions(+), 81 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 0374a596cffa..9aa0fccd5d43 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -103,38 +103,14 @@ static inline void debug_spin_unlock(raw_spinlock_t *lock) lock->owner_cpu = -1; } -static void __spin_lock_debug(raw_spinlock_t *lock) -{ - u64 i; - u64 loops = loops_per_jiffy * HZ; - - for (i = 0; i < loops; i++) { - if (arch_spin_trylock(&lock->raw_lock)) - return; - __delay(1); - } - /* lockup suspected: */ - spin_dump(lock, "lockup suspected"); -#ifdef CONFIG_SMP - trigger_all_cpu_backtrace(); -#endif - - /* - * The trylock above was causing a livelock. Give the lower level arch - * specific lock code a chance to acquire the lock. We have already - * printed a warning/backtrace at this point. The non-debug arch - * specific code might actually succeed in acquiring the lock. If it is - * not successful, the end-result is the same - there is no forward - * progress. - */ - arch_spin_lock(&lock->raw_lock); -} - +/* + * We are now relying on the NMI watchdog to detect lockup instead of doing + * the detection here with an unfair lock which can cause problem of its own. + */ void do_raw_spin_lock(raw_spinlock_t *lock) { debug_spin_lock_before(lock); - if (unlikely(!arch_spin_trylock(&lock->raw_lock))) - __spin_lock_debug(lock); + arch_spin_lock(&lock->raw_lock); debug_spin_lock_after(lock); } @@ -172,32 +148,6 @@ static void rwlock_bug(rwlock_t *lock, const char *msg) #define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg) -#if 0 /* __write_lock_debug() can lock up - maybe this can too? 
*/ -static void __read_lock_debug(rwlock_t *lock) -{ - u64 i; - u64 loops = loops_per_jiffy * HZ; - int print_once = 1; - - for (;;) { - for (i = 0; i < loops; i++) { - if (arch_read_trylock(&lock->raw_lock)) - return; - __delay(1); - } - /* lockup suspected: */ - if (print_once) { - print_once = 0; - printk(KERN_EMERG "BUG: read-lock lockup on CPU#%d, " - "%s/%d, %p\n", - raw_smp_processor_id(), current->comm, - current->pid, lock); - dump_stack(); - } - } -} -#endif - void do_raw_read_lock(rwlock_t *lock) { RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); @@ -247,32 +197,6 @@ static inline void debug_write_unlock(rwlock_t *lock) lock->owner_cpu = -1; } -#if 0 /* This can cause lockups */ -static void __write_lock_debug(rwlock_t *lock) -{ - u64 i; - u64 loops = loops_per_jiffy * HZ; - int print_once = 1; - - for (;;) { - for (i = 0; i < loops; i++) { - if (arch_write_trylock(&lock->raw_lock)) - return; - __delay(1); - } - /* lockup suspected: */ - if (print_once) { - print_once = 0; - printk(KERN_EMERG "BUG: write-lock lockup on CPU#%d, " - "%s/%d, %p\n", - raw_smp_processor_id(), current->comm, - current->pid, lock); - dump_stack(); - } - } -} -#endif - void do_raw_write_lock(rwlock_t *lock) { debug_write_lock_before(lock); -- cgit v1.2.3 From dfb4357da6ddbdf57d583ba64361c9d792b0e0b1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 8 Feb 2017 11:26:59 -0800 Subject: time: Remove CONFIG_TIMER_STATS Currently CONFIG_TIMER_STATS exposes process information across namespaces: kernel/time/timer_list.c print_timer(): SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); /proc/timer_list: #11: <0000000000000000>, hrtimer_wakeup, S:01, do_nanosleep, cron/2570 Given that the tracer can give the same information, this patch entirely removes CONFIG_TIMER_STATS. Suggested-by: Thomas Gleixner Signed-off-by: Kees Cook Acked-by: John Stultz Cc: Nicolas Pitre Cc: linux-doc@vger.kernel.org Cc: Lai Jiangshan Cc: Shuah Khan Cc: Xing Gao Cc: Jonathan Corbet Cc: Jessica Frazelle Cc: kernel-hardening@lists.openwall.com Cc: Nicolas Iooss Cc: "Paul E. McKenney" Cc: Petr Mladek Cc: Richard Cochran Cc: Tejun Heo Cc: Michal Marek Cc: Josh Poimboeuf Cc: Dmitry Vyukov Cc: Oleg Nesterov Cc: "Eric W. Biederman" Cc: Olof Johansson Cc: Andrew Morton Cc: linux-api@vger.kernel.org Cc: Arjan van de Ven Link: http://lkml.kernel.org/r/20170208192659.GA32582@beast Signed-off-by: Thomas Gleixner --- Documentation/timers/timer_stats.txt | 73 ------ include/linux/hrtimer.h | 11 - include/linux/timer.h | 45 ---- kernel/kthread.c | 1 - kernel/time/Makefile | 1 - kernel/time/hrtimer.c | 38 ---- kernel/time/timer.c | 48 +--- kernel/time/timer_list.c | 10 - kernel/time/timer_stats.c | 425 ----------------------------------- kernel/workqueue.c | 2 - lib/Kconfig.debug | 14 -- 11 files changed, 2 insertions(+), 666 deletions(-) delete mode 100644 Documentation/timers/timer_stats.txt delete mode 100644 kernel/time/timer_stats.c (limited to 'kernel') diff --git a/Documentation/timers/timer_stats.txt b/Documentation/timers/timer_stats.txt deleted file mode 100644 index de835ee97455..000000000000 --- a/Documentation/timers/timer_stats.txt +++ /dev/null @@ -1,73 +0,0 @@ -timer_stats - timer usage statistics ------------------------------------- - -timer_stats is a debugging facility to make the timer (ab)usage in a Linux -system visible to kernel and userspace developers. If enabled in the config -but not used it has almost zero runtime overhead, and a relatively small -data structure overhead. 
Even if collection is enabled runtime all the -locking is per-CPU and lookup is hashed. - -timer_stats should be used by kernel and userspace developers to verify that -their code does not make unduly use of timers. This helps to avoid unnecessary -wakeups, which should be avoided to optimize power consumption. - -It can be enabled by CONFIG_TIMER_STATS in the "Kernel hacking" configuration -section. - -timer_stats collects information about the timer events which are fired in a -Linux system over a sample period: - -- the pid of the task(process) which initialized the timer -- the name of the process which initialized the timer -- the function where the timer was initialized -- the callback function which is associated to the timer -- the number of events (callbacks) - -timer_stats adds an entry to /proc: /proc/timer_stats - -This entry is used to control the statistics functionality and to read out the -sampled information. - -The timer_stats functionality is inactive on bootup. - -To activate a sample period issue: -# echo 1 >/proc/timer_stats - -To stop a sample period issue: -# echo 0 >/proc/timer_stats - -The statistics can be retrieved by: -# cat /proc/timer_stats - -While sampling is enabled, each readout from /proc/timer_stats will see -newly updated statistics. Once sampling is disabled, the sampled information -is kept until a new sample period is started. This allows multiple readouts. - -Sample output of /proc/timer_stats: - -Timerstats sample period: 3.888770 s - 12, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) - 15, 1 swapper hcd_submit_urb (rh_timer_func) - 4, 959 kedac schedule_timeout (process_timeout) - 1, 0 swapper page_writeback_init (wb_timer_fn) - 28, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick) - 22, 2948 IRQ 4 tty_flip_buffer_push (delayed_work_timer_fn) - 3, 3100 bash schedule_timeout (process_timeout) - 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) - 1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) - 1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer) - 1, 2292 ip __netdev_watchdog_up (dev_watchdog) - 1, 23 events/1 do_cache_clean (delayed_work_timer_fn) -90 total events, 30.0 events/sec - -The first column is the number of events, the second column the pid, the third -column is the name of the process. The forth column shows the function which -initialized the timer and in parenthesis the callback function which was -executed on expiry. - - Thomas, Ingo - -Added flag to indicate 'deferrable timer' in /proc/timer_stats. 
A deferrable -timer will appear as follows - 10D, 1 swapper queue_delayed_work_on (delayed_work_timer_fn) - diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index cdab81ba29f8..e52b427223ba 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -88,12 +88,6 @@ enum hrtimer_restart { * @base: pointer to the timer base (per cpu and per clock) * @state: state information (See bit values above) * @is_rel: Set if the timer was armed relative - * @start_pid: timer statistics field to store the pid of the task which - * started the timer - * @start_site: timer statistics field to store the site where the timer - * was started - * @start_comm: timer statistics field to store the name of the process which - * started the timer * * The hrtimer structure must be initialized by hrtimer_init() */ @@ -104,11 +98,6 @@ struct hrtimer { struct hrtimer_clock_base *base; u8 state; u8 is_rel; -#ifdef CONFIG_TIMER_STATS - int start_pid; - void *start_site; - char start_comm[16]; -#endif }; /** diff --git a/include/linux/timer.h b/include/linux/timer.h index 51d601f192d4..5a209b84fd9e 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -20,11 +20,6 @@ struct timer_list { unsigned long data; u32 flags; -#ifdef CONFIG_TIMER_STATS - int start_pid; - void *start_site; - char start_comm[16]; -#endif #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif @@ -197,46 +192,6 @@ extern int mod_timer_pending(struct timer_list *timer, unsigned long expires); */ #define NEXT_TIMER_MAX_DELTA ((1UL << 30) - 1) -/* - * Timer-statistics info: - */ -#ifdef CONFIG_TIMER_STATS - -extern int timer_stats_active; - -extern void init_timer_stats(void); - -extern void timer_stats_update_stats(void *timer, pid_t pid, void *startf, - void *timerf, char *comm, u32 flags); - -extern void __timer_stats_timer_set_start_info(struct timer_list *timer, - void *addr); - -static inline void timer_stats_timer_set_start_info(struct timer_list *timer) -{ - if (likely(!timer_stats_active)) - return; - __timer_stats_timer_set_start_info(timer, __builtin_return_address(0)); -} - -static inline void timer_stats_timer_clear_start_info(struct timer_list *timer) -{ - timer->start_site = NULL; -} -#else -static inline void init_timer_stats(void) -{ -} - -static inline void timer_stats_timer_set_start_info(struct timer_list *timer) -{ -} - -static inline void timer_stats_timer_clear_start_info(struct timer_list *timer) -{ -} -#endif - extern void add_timer(struct timer_list *timer); extern int try_to_del_timer_sync(struct timer_list *timer); diff --git a/kernel/kthread.c b/kernel/kthread.c index 2318fba86277..8461a4372e8a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -850,7 +850,6 @@ void __kthread_queue_delayed_work(struct kthread_worker *worker, list_add(&work->node, &worker->delayed_work_list); work->worker = worker; - timer_stats_timer_set_start_info(&dwork->timer); timer->expires = jiffies + delay; add_timer(timer); } diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 976840d29a71..938dbf33ef49 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -15,6 +15,5 @@ ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) endif obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o -obj-$(CONFIG_TIMER_STATS) += timer_stats.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index c6ecedd3b839..edabde646e58 100644 --- 
a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -766,34 +766,6 @@ void hrtimers_resume(void) clock_was_set_delayed(); } -static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) -{ -#ifdef CONFIG_TIMER_STATS - if (timer->start_site) - return; - timer->start_site = __builtin_return_address(0); - memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); - timer->start_pid = current->pid; -#endif -} - -static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer) -{ -#ifdef CONFIG_TIMER_STATS - timer->start_site = NULL; -#endif -} - -static inline void timer_stats_account_hrtimer(struct hrtimer *timer) -{ -#ifdef CONFIG_TIMER_STATS - if (likely(!timer_stats_active)) - return; - timer_stats_update_stats(timer, timer->start_pid, timer->start_site, - timer->function, timer->start_comm, 0); -#endif -} - /* * Counterpart to lock_hrtimer_base above: */ @@ -932,7 +904,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest * rare case and less expensive than a smp call. */ debug_deactivate(timer); - timer_stats_hrtimer_clear_start_info(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); if (!restart) @@ -990,8 +961,6 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, /* Switch the timer base, if necessary: */ new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); - timer_stats_hrtimer_set_start_info(timer); - leftmost = enqueue_hrtimer(timer, new_base); if (!leftmost) goto unlock; @@ -1128,12 +1097,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, base = hrtimer_clockid_to_base(clock_id); timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); - -#ifdef CONFIG_TIMER_STATS - timer->start_site = NULL; - timer->start_pid = -1; - memset(timer->start_comm, 0, TASK_COMM_LEN); -#endif } /** @@ -1217,7 +1180,6 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, raw_write_seqcount_barrier(&cpu_base->seq); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); - timer_stats_account_hrtimer(timer); fn = timer->function; /* diff --git a/kernel/time/timer.c b/kernel/time/timer.c index ec33a6933eae..82a6bfa0c307 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -571,38 +571,6 @@ internal_add_timer(struct timer_base *base, struct timer_list *timer) trigger_dyntick_cpu(base, timer); } -#ifdef CONFIG_TIMER_STATS -void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) -{ - if (timer->start_site) - return; - - timer->start_site = addr; - memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); - timer->start_pid = current->pid; -} - -static void timer_stats_account_timer(struct timer_list *timer) -{ - void *site; - - /* - * start_site can be concurrently reset by - * timer_stats_timer_clear_start_info() - */ - site = READ_ONCE(timer->start_site); - if (likely(!site)) - return; - - timer_stats_update_stats(timer, timer->start_pid, site, - timer->function, timer->start_comm, - timer->flags); -} - -#else -static void timer_stats_account_timer(struct timer_list *timer) {} -#endif - #ifdef CONFIG_DEBUG_OBJECTS_TIMERS static struct debug_obj_descr timer_debug_descr; @@ -789,11 +757,6 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags, { timer->entry.pprev = NULL; timer->flags = flags | raw_smp_processor_id(); -#ifdef CONFIG_TIMER_STATS - timer->start_site = NULL; - timer->start_pid = -1; - memset(timer->start_comm, 0, TASK_COMM_LEN); -#endif lockdep_init_map(&timer->lockdep_map, name, key, 
0); } @@ -1001,8 +964,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) base = lock_timer_base(timer, &flags); } - timer_stats_timer_set_start_info(timer); - ret = detach_if_pending(timer, base, false); if (!ret && pending_only) goto out_unlock; @@ -1130,7 +1091,6 @@ void add_timer_on(struct timer_list *timer, int cpu) struct timer_base *new_base, *base; unsigned long flags; - timer_stats_timer_set_start_info(timer); BUG_ON(timer_pending(timer) || !timer->function); new_base = get_timer_cpu_base(timer->flags, cpu); @@ -1176,7 +1136,6 @@ int del_timer(struct timer_list *timer) debug_assert_init(timer); - timer_stats_timer_clear_start_info(timer); if (timer_pending(timer)) { base = lock_timer_base(timer, &flags); ret = detach_if_pending(timer, base, true); @@ -1204,10 +1163,9 @@ int try_to_del_timer_sync(struct timer_list *timer) base = lock_timer_base(timer, &flags); - if (base->running_timer != timer) { - timer_stats_timer_clear_start_info(timer); + if (base->running_timer != timer) ret = detach_if_pending(timer, base, true); - } + spin_unlock_irqrestore(&base->lock, flags); return ret; @@ -1331,7 +1289,6 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) unsigned long data; timer = hlist_entry(head->first, struct timer_list, entry); - timer_stats_account_timer(timer); base->running_timer = timer; detach_timer(timer, true); @@ -1868,7 +1825,6 @@ static void __init init_timer_cpus(void) void __init init_timers(void) { init_timer_cpus(); - init_timer_stats(); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index afe6cd1944fc..387a3a5aa388 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -62,21 +62,11 @@ static void print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, int idx, u64 now) { -#ifdef CONFIG_TIMER_STATS - char tmp[TASK_COMM_LEN + 1]; -#endif SEQ_printf(m, " #%d: ", idx); print_name_offset(m, taddr); SEQ_printf(m, ", "); print_name_offset(m, timer->function); SEQ_printf(m, ", S:%02x", timer->state); -#ifdef CONFIG_TIMER_STATS - SEQ_printf(m, ", "); - print_name_offset(m, timer->start_site); - memcpy(tmp, timer->start_comm, TASK_COMM_LEN); - tmp[TASK_COMM_LEN] = 0; - SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); -#endif SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)), diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c deleted file mode 100644 index afddded947df..000000000000 --- a/kernel/time/timer_stats.c +++ /dev/null @@ -1,425 +0,0 @@ -/* - * kernel/time/timer_stats.c - * - * Collect timer usage statistics. - * - * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar - * Copyright(C) 2006 Timesys Corp., Thomas Gleixner - * - * timer_stats is based on timer_top, a similar functionality which was part of - * Con Kolivas dyntick patch set. It was developed by Daniel Petrini at the - * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based - * on dynamic allocation of the statistics entries and linear search based - * lookup combined with a global lock, rather than the static array, hash - * and per-CPU locking which is used by timer_stats. It was written for the - * pre hrtimer kernel code and therefore did not take hrtimers into account. - * Nevertheless it provided the base for the timer_stats implementation and - * was a helpful source of inspiration. 
Kudos to Daniel and the Nokia folks - * for this effort. - * - * timer_top.c is - * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus - * Written by Daniel Petrini - * timer_top.c was released under the GNU General Public License version 2 - * - * We export the addresses and counting of timer functions being called, - * the pid and cmdline from the owner process if applicable. - * - * Start/stop data collection: - * # echo [1|0] >/proc/timer_stats - * - * Display the information collected so far: - * # cat /proc/timer_stats - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include -#include -#include - -#include - -/* - * This is our basic unit of interest: a timer expiry event identified - * by the timer, its start/expire functions and the PID of the task that - * started the timer. We count the number of times an event happens: - */ -struct entry { - /* - * Hash list: - */ - struct entry *next; - - /* - * Hash keys: - */ - void *timer; - void *start_func; - void *expire_func; - pid_t pid; - - /* - * Number of timeout events: - */ - unsigned long count; - u32 flags; - - /* - * We save the command-line string to preserve - * this information past task exit: - */ - char comm[TASK_COMM_LEN + 1]; - -} ____cacheline_aligned_in_smp; - -/* - * Spinlock protecting the tables - not taken during lookup: - */ -static DEFINE_RAW_SPINLOCK(table_lock); - -/* - * Per-CPU lookup locks for fast hash lookup: - */ -static DEFINE_PER_CPU(raw_spinlock_t, tstats_lookup_lock); - -/* - * Mutex to serialize state changes with show-stats activities: - */ -static DEFINE_MUTEX(show_mutex); - -/* - * Collection status, active/inactive: - */ -int __read_mostly timer_stats_active; - -/* - * Beginning/end timestamps of measurement: - */ -static ktime_t time_start, time_stop; - -/* - * tstat entry structs only get allocated while collection is - * active and never freed during that time - this simplifies - * things quite a bit. - * - * They get freed when a new collection period is started. 
- */ -#define MAX_ENTRIES_BITS 10 -#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS) - -static unsigned long nr_entries; -static struct entry entries[MAX_ENTRIES]; - -static atomic_t overflow_count; - -/* - * The entries are in a hash-table, for fast lookup: - */ -#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1) -#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS) -#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1) - -#define __tstat_hashfn(entry) \ - (((unsigned long)(entry)->timer ^ \ - (unsigned long)(entry)->start_func ^ \ - (unsigned long)(entry)->expire_func ^ \ - (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK) - -#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry)) - -static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly; - -static void reset_entries(void) -{ - nr_entries = 0; - memset(entries, 0, sizeof(entries)); - memset(tstat_hash_table, 0, sizeof(tstat_hash_table)); - atomic_set(&overflow_count, 0); -} - -static struct entry *alloc_entry(void) -{ - if (nr_entries >= MAX_ENTRIES) - return NULL; - - return entries + nr_entries++; -} - -static int match_entries(struct entry *entry1, struct entry *entry2) -{ - return entry1->timer == entry2->timer && - entry1->start_func == entry2->start_func && - entry1->expire_func == entry2->expire_func && - entry1->pid == entry2->pid; -} - -/* - * Look up whether an entry matching this item is present - * in the hash already. Must be called with irqs off and the - * lookup lock held: - */ -static struct entry *tstat_lookup(struct entry *entry, char *comm) -{ - struct entry **head, *curr, *prev; - - head = tstat_hashentry(entry); - curr = *head; - - /* - * The fastpath is when the entry is already hashed, - * we do this with the lookup lock held, but with the - * table lock not held: - */ - while (curr) { - if (match_entries(curr, entry)) - return curr; - - curr = curr->next; - } - /* - * Slowpath: allocate, set up and link a new hash entry: - */ - prev = NULL; - curr = *head; - - raw_spin_lock(&table_lock); - /* - * Make sure we have not raced with another CPU: - */ - while (curr) { - if (match_entries(curr, entry)) - goto out_unlock; - - prev = curr; - curr = curr->next; - } - - curr = alloc_entry(); - if (curr) { - *curr = *entry; - curr->count = 0; - curr->next = NULL; - memcpy(curr->comm, comm, TASK_COMM_LEN); - - smp_mb(); /* Ensure that curr is initialized before insert */ - - if (prev) - prev->next = curr; - else - *head = curr; - } - out_unlock: - raw_spin_unlock(&table_lock); - - return curr; -} - -/** - * timer_stats_update_stats - Update the statistics for a timer. - * @timer: pointer to either a timer_list or a hrtimer - * @pid: the pid of the task which set up the timer - * @startf: pointer to the function which did the timer setup - * @timerf: pointer to the timer callback function of the timer - * @comm: name of the process which set up the timer - * @tflags: The flags field of the timer - * - * When the timer is already registered, then the event counter is - * incremented. Otherwise the timer is registered in a free slot. 
- */ -void timer_stats_update_stats(void *timer, pid_t pid, void *startf, - void *timerf, char *comm, u32 tflags) -{ - /* - * It doesn't matter which lock we take: - */ - raw_spinlock_t *lock; - struct entry *entry, input; - unsigned long flags; - - if (likely(!timer_stats_active)) - return; - - lock = &per_cpu(tstats_lookup_lock, raw_smp_processor_id()); - - input.timer = timer; - input.start_func = startf; - input.expire_func = timerf; - input.pid = pid; - input.flags = tflags; - - raw_spin_lock_irqsave(lock, flags); - if (!timer_stats_active) - goto out_unlock; - - entry = tstat_lookup(&input, comm); - if (likely(entry)) - entry->count++; - else - atomic_inc(&overflow_count); - - out_unlock: - raw_spin_unlock_irqrestore(lock, flags); -} - -static void print_name_offset(struct seq_file *m, unsigned long addr) -{ - char symname[KSYM_NAME_LEN]; - - if (lookup_symbol_name(addr, symname) < 0) - seq_printf(m, "<%p>", (void *)addr); - else - seq_printf(m, "%s", symname); -} - -static int tstats_show(struct seq_file *m, void *v) -{ - struct timespec64 period; - struct entry *entry; - unsigned long ms; - long events = 0; - ktime_t time; - int i; - - mutex_lock(&show_mutex); - /* - * If still active then calculate up to now: - */ - if (timer_stats_active) - time_stop = ktime_get(); - - time = ktime_sub(time_stop, time_start); - - period = ktime_to_timespec64(time); - ms = period.tv_nsec / 1000000; - - seq_puts(m, "Timer Stats Version: v0.3\n"); - seq_printf(m, "Sample period: %ld.%03ld s\n", (long)period.tv_sec, ms); - if (atomic_read(&overflow_count)) - seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count)); - seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive"); - - for (i = 0; i < nr_entries; i++) { - entry = entries + i; - if (entry->flags & TIMER_DEFERRABLE) { - seq_printf(m, "%4luD, %5d %-16s ", - entry->count, entry->pid, entry->comm); - } else { - seq_printf(m, " %4lu, %5d %-16s ", - entry->count, entry->pid, entry->comm); - } - - print_name_offset(m, (unsigned long)entry->start_func); - seq_puts(m, " ("); - print_name_offset(m, (unsigned long)entry->expire_func); - seq_puts(m, ")\n"); - - events += entry->count; - } - - ms += period.tv_sec * 1000; - if (!ms) - ms = 1; - - if (events && period.tv_sec) - seq_printf(m, "%ld total events, %ld.%03ld events/sec\n", - events, events * 1000 / ms, - (events * 1000000 / ms) % 1000); - else - seq_printf(m, "%ld total events\n", events); - - mutex_unlock(&show_mutex); - - return 0; -} - -/* - * After a state change, make sure all concurrent lookup/update - * activities have stopped: - */ -static void sync_access(void) -{ - unsigned long flags; - int cpu; - - for_each_online_cpu(cpu) { - raw_spinlock_t *lock = &per_cpu(tstats_lookup_lock, cpu); - - raw_spin_lock_irqsave(lock, flags); - /* nothing */ - raw_spin_unlock_irqrestore(lock, flags); - } -} - -static ssize_t tstats_write(struct file *file, const char __user *buf, - size_t count, loff_t *offs) -{ - char ctl[2]; - - if (count != 2 || *offs) - return -EINVAL; - - if (copy_from_user(ctl, buf, count)) - return -EFAULT; - - mutex_lock(&show_mutex); - switch (ctl[0]) { - case '0': - if (timer_stats_active) { - timer_stats_active = 0; - time_stop = ktime_get(); - sync_access(); - } - break; - case '1': - if (!timer_stats_active) { - reset_entries(); - time_start = ktime_get(); - smp_mb(); - timer_stats_active = 1; - } - break; - default: - count = -EINVAL; - } - mutex_unlock(&show_mutex); - - return count; -} - -static int tstats_open(struct inode *inode, 
struct file *filp) -{ - return single_open(filp, tstats_show, NULL); -} - -static const struct file_operations tstats_fops = { - .open = tstats_open, - .read = seq_read, - .write = tstats_write, - .llseek = seq_lseek, - .release = single_release, -}; - -void __init init_timer_stats(void) -{ - int cpu; - - for_each_possible_cpu(cpu) - raw_spin_lock_init(&per_cpu(tstats_lookup_lock, cpu)); -} - -static int __init init_tstats_procfs(void) -{ - struct proc_dir_entry *pe; - - pe = proc_create("timer_stats", 0644, NULL, &tstats_fops); - if (!pe) - return -ENOMEM; - return 0; -} -__initcall(init_tstats_procfs); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1d9fb6543a66..072cbc9b175d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1523,8 +1523,6 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, return; } - timer_stats_timer_set_start_info(&dwork->timer); - dwork->wq = wq; dwork->cpu = cpu; timer->expires = jiffies + delay; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index eb9e9a7870fa..132af338d6dd 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -980,20 +980,6 @@ config DEBUG_TIMEKEEPING If unsure, say N. -config TIMER_STATS - bool "Collect kernel timers statistics" - depends on DEBUG_KERNEL && PROC_FS - help - If you say Y here, additional code will be inserted into the - timer routines to collect statistics about kernel timers being - reprogrammed. The statistics can be read from /proc/timer_stats. - The statistics collection is started by writing 1 to /proc/timer_stats, - writing 0 stops it. This feature is useful to collect information - about timer usage patterns in kernel and userspace. This feature - is lightweight if enabled in the kernel config but not activated - (it defaults to deactivated on bootup and will only be activated - if some application like powertop activates it explicitly). - config DEBUG_PREEMPT bool "Debug preemptible kernel" depends on DEBUG_KERNEL && PREEMPT && TRACE_IRQFLAGS_SUPPORT -- cgit v1.2.3 From 7551b02b94ad1daee3a79d667dc3c46d08328f87 Mon Sep 17 00:00:00 2001 From: Mars Cheng Date: Thu, 9 Feb 2017 15:50:15 +0800 Subject: timer_list: Remove useless cast when printing hrtimer_resolution is already unsigned int, not necessary to cast it when printing. Signed-off-by: Mars Cheng Cc: CC Hwang Cc: wsd_upstream@mediatek.com Cc: Loda Chou Cc: Jades Shih Cc: Miles Chen Cc: John Stultz Cc: My Chuang Cc: Matthias Brugger Cc: Yingjoe Chen Link: http://lkml.kernel.org/r/1486626615-5879-1-git-send-email-mars.cheng@mediatek.com Signed-off-by: Thomas Gleixner --- kernel/time/timer_list.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 387a3a5aa388..ff8d5c13d04b 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -117,7 +117,7 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) SEQ_printf(m, " .base: %pK\n", base); SEQ_printf(m, " .index: %d\n", base->index); - SEQ_printf(m, " .resolution: %u nsecs\n", (unsigned) hrtimer_resolution); + SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution); SEQ_printf(m, " .get_time: "); print_name_offset(m, base->get_time); -- cgit v1.2.3 From 2b5e77308f3356faad640b24af6dc5aa7233eb2d Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 10 Feb 2017 13:23:23 +0100 Subject: irqdesc: Add a resource managed version of irq_alloc_descs() Add a devres flavor of __devm_irq_alloc_descs() and corresponding helper macros. 
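A hypothetical driver probe using the new helpers might look as follows (device and descriptor count are made up; the point is that the range is released automatically when the device is unbound):

#include <linux/irq.h>
#include <linux/platform_device.h>

static int foo_probe(struct platform_device *pdev)
{
	int base;

	/* allocate 4 consecutive irq descriptors, searching from 0 */
	base = devm_irq_alloc_descs(&pdev->dev, -1, 0, 4,
				    dev_to_node(&pdev->dev));
	if (base < 0)
		return base;

	/* hardware irqs would be mapped onto base..base+3 here */
	return 0;
}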
Signed-off-by: Bartosz Golaszewski Cc: Marc Zyngier Cc: linux-doc@vger.kernel.org Cc: Jonathan Corbet Link: http://lkml.kernel.org/r/1486729403-21132-1-git-send-email-bgolaszewski@baylibre.com Signed-off-by: Thomas Gleixner --- Documentation/driver-model/devres.txt | 5 ++++ include/linux/irq.h | 19 ++++++++++++ kernel/irq/devres.c | 55 +++++++++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+) (limited to 'kernel') diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt index ca9d1eb46bc0..bf34d5b3a733 100644 --- a/Documentation/driver-model/devres.txt +++ b/Documentation/driver-model/devres.txt @@ -306,6 +306,11 @@ IRQ devm_request_any_context_irq() devm_request_irq() devm_request_threaded_irq() + devm_irq_alloc_descs() + devm_irq_alloc_desc() + devm_irq_alloc_desc_at() + devm_irq_alloc_desc_from() + devm_irq_alloc_descs_from() LED devm_led_classdev_register() diff --git a/include/linux/irq.h b/include/linux/irq.h index e79875574b39..d915caecafa1 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -715,6 +715,10 @@ unsigned int arch_dynirq_lower_bound(unsigned int from); int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, struct module *owner, const struct cpumask *affinity); +int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from, + unsigned int cnt, int node, struct module *owner, + const struct cpumask *affinity); + /* use macros to avoid needing export.h for THIS_MODULE */ #define irq_alloc_descs(irq, from, cnt, node) \ __irq_alloc_descs(irq, from, cnt, node, THIS_MODULE, NULL) @@ -731,6 +735,21 @@ int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, #define irq_alloc_descs_from(from, cnt, node) \ irq_alloc_descs(-1, from, cnt, node) +#define devm_irq_alloc_descs(dev, irq, from, cnt, node) \ + __devm_irq_alloc_descs(dev, irq, from, cnt, node, THIS_MODULE, NULL) + +#define devm_irq_alloc_desc(dev, node) \ + devm_irq_alloc_descs(dev, -1, 0, 1, node) + +#define devm_irq_alloc_desc_at(dev, at, node) \ + devm_irq_alloc_descs(dev, at, at, 1, node) + +#define devm_irq_alloc_desc_from(dev, from, node) \ + devm_irq_alloc_descs(dev, -1, from, 1, node) + +#define devm_irq_alloc_descs_from(dev, from, cnt, node) \ + devm_irq_alloc_descs(dev, -1, from, cnt, node) + void irq_free_descs(unsigned int irq, unsigned int cnt); static inline void irq_free_desc(unsigned int irq) { diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 74d90a754268..18babef39361 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -2,6 +2,7 @@ #include #include #include +#include /* * Device resource management aware IRQ request/free implementation. 
@@ -137,3 +138,57 @@ void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) free_irq(irq, dev_id); } EXPORT_SYMBOL(devm_free_irq); + +struct irq_desc_devres { + unsigned int from; + unsigned int cnt; +}; + +static void devm_irq_desc_release(struct device *dev, void *res) +{ + struct irq_desc_devres *this = res; + + irq_free_descs(this->from, this->cnt); +} + +/** + * __devm_irq_alloc_descs - Allocate and initialize a range of irq descriptors + * for a managed device + * @dev: Device to allocate the descriptors for + * @irq: Allocate for specific irq number if irq >= 0 + * @from: Start the search from this irq number + * @cnt: Number of consecutive irqs to allocate + * @node: Preferred node on which the irq descriptor should be allocated + * @owner: Owning module (can be NULL) + * @affinity: Optional pointer to an affinity mask array of size @cnt + * which hints where the irq descriptors should be allocated + * and which default affinities to use + * + * Returns the first irq number or error code. + * + * Note: Use the provided wrappers (devm_irq_alloc_desc*) for simplicity. + */ +int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from, + unsigned int cnt, int node, struct module *owner, + const struct cpumask *affinity) +{ + struct irq_desc_devres *dr; + int base; + + dr = devres_alloc(devm_irq_desc_release, sizeof(*dr), GFP_KERNEL); + if (!dr) + return -ENOMEM; + + base = __irq_alloc_descs(irq, from, cnt, node, owner, affinity); + if (base < 0) { + devres_free(dr); + return base; + } + + dr->from = base; + dr->cnt = cnt; + devres_add(dev, dr); + + return base; +} +EXPORT_SYMBOL_GPL(__devm_irq_alloc_descs); -- cgit v1.2.3 From f435da416beaacc8934fc21820d9488269b39c98 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Fri, 10 Feb 2017 09:54:16 -0700 Subject: genirq: Fix /proc/interrupts output alignment If the irq_desc being output does not have a domain associated the information following the 'name' is not aligned correctly. Signed-off-by: H Hartley Sweeten Link: http://lkml.kernel.org/r/20170210165416.5629-1-hsweeten@visionengravers.com Signed-off-by: Thomas Gleixner --- kernel/irq/proc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index feaa813b84a9..c53edad7b459 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -487,6 +487,8 @@ int show_interrupts(struct seq_file *p, void *v) } if (desc->irq_data.domain) seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq); + else + seq_printf(p, " %*s", prec, ""); #ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge"); #endif -- cgit v1.2.3 From 4c7384131c8d343c0bf79abac3b3e78596d85b10 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 8 Feb 2017 13:36:37 -0500 Subject: tracing: Have COMM event filter key be treated as a string The GLOB operation "~" should be able to work with the COMM filter key in order to trace programs with a glob. 
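In outline (a sketch, not the literal parser code), the string operators are gated on is_string_field(), which is why FILTER_COMM is added to it below:

	/* sketch: string operators are only accepted for string fields */
	if (!is_string_field(field))
		return -EINVAL;	/* previously hit for COMM with the "~" op */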
For example echo 'COMM ~ "systemd*"' > events/syscalls/filter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index afbec961eab1..d2d068b36341 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1312,7 +1312,8 @@ static inline bool is_string_field(struct ftrace_event_field *field) { return field->filter_type == FILTER_DYN_STRING || field->filter_type == FILTER_STATIC_STRING || - field->filter_type == FILTER_PTR_STRING; + field->filter_type == FILTER_PTR_STRING || + field->filter_type == FILTER_COMM; } static inline bool is_function_field(struct ftrace_event_field *field) -- cgit v1.2.3 From 5ff22646d246e23bf8056c63bed6aaf9fd22ed12 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Feb 2017 15:48:01 +0100 Subject: module: Optimize search_module_extables() While looking through the __ex_table stuff I found that we do a linear lookup of the module. Also fix up a comment. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Signed-off-by: Jessica Yu --- kernel/module.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 330f64e7e193..32d0d32abbf6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4170,22 +4170,23 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) struct module *mod; preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - if (mod->num_exentries == 0) - continue; + mod = __module_address(addr); + if (!mod) + goto out; - e = search_extable(mod->extable, - mod->extable + mod->num_exentries - 1, - addr); - if (e) - break; - } + if (!mod->num_exentries) + goto out; + + e = search_extable(mod->extable, + mod->extable + mod->num_exentries - 1, + addr); +out: preempt_enable(); - /* Now, if we found one, we are running inside it now, hence - we cannot unload the module, hence no refcnt needed. */ + /* + * Now, if we found one, we are running inside it now, hence + * we cannot unload the module, hence no refcnt needed. + */ return e; } -- cgit v1.2.3 From 899b5fbf9d3fcb721690b4d58cf58cc018517003 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 12 Feb 2017 16:31:44 +0100 Subject: genirq/devres: Use dev_name(dev) as default for devname Allow the devname parameter to be NULL and use dev_name(dev) in this case. This should be an appropriate default for most use cases. Signed-off-by: Heiner Kallweit Link: http://lkml.kernel.org/r/05c63d67-30b4-7026-02d5-ce7fb7bc185f@gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/devres.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 18babef39361..1613bfd48365 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -34,7 +34,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data) * @thread_fn: function to be called in a threaded interrupt context. 
NULL * for devices which handle everything in @handler * @irqflags: Interrupt type flags - * @devname: An ascii name for the claiming device + * @devname: An ascii name for the claiming device, dev_name(dev) if NULL * @dev_id: A cookie passed back to the handler function * * Except for the extra @dev argument, this function takes the @@ -58,6 +58,9 @@ int devm_request_threaded_irq(struct device *dev, unsigned int irq, if (!dr) return -ENOMEM; + if (!devname) + devname = dev_name(dev); + rc = request_threaded_irq(irq, handler, thread_fn, irqflags, devname, dev_id); if (rc) { @@ -81,7 +84,7 @@ EXPORT_SYMBOL(devm_request_threaded_irq); * @thread_fn: function to be called in a threaded interrupt context. NULL * for devices which handle everything in @handler * @irqflags: Interrupt type flags - * @devname: An ascii name for the claiming device + * @devname: An ascii name for the claiming device, dev_name(dev) if NULL * @dev_id: A cookie passed back to the handler function * * Except for the extra @dev argument, this function takes the @@ -104,6 +107,9 @@ int devm_request_any_context_irq(struct device *dev, unsigned int irq, if (!dr) return -ENOMEM; + if (!devname) + devname = dev_name(dev); + rc = request_any_context_irq(irq, handler, irqflags, devname, dev_id); if (rc < 0) { devres_free(dr); -- cgit v1.2.3 From 7f677633379b4abb3281cdbe7e7006f049305c03 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 10 Feb 2017 20:28:24 -0800 Subject: bpf: introduce BPF_F_ALLOW_OVERRIDE flag If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command to the given cgroup the descendent cgroup will be able to override effective bpf program that was inherited from this cgroup. By default it's not passed, therefore override is disallowed. Examples: 1. prog X attached to /A with default prog Y fails to attach to /A/B and /A/B/C Everything under /A runs prog X 2. prog X attached to /A with allow_override. prog Y fails to attach to /A/B with default (non-override) prog M attached to /A/B with allow_override. Everything under /A/B runs prog M only. 3. prog X attached to /A with allow_override. prog Y fails to attach to /A with default. The user has to detach first to switch the mode. In the future this behavior may be extended with a chain of non-overridable programs. Also fix the bug where detach from cgroup where nothing is attached was not throwing error. Return ENOENT in such case. Add several testcases and adjust libbpf. Fixes: 3007098494be ("cgroup: add support for eBPF programs") Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Tejun Heo Acked-by: Daniel Mack Signed-off-by: David S. 
Miller --- include/linux/bpf-cgroup.h | 13 ++++---- include/uapi/linux/bpf.h | 7 +++++ kernel/bpf/cgroup.c | 59 +++++++++++++++++++++++++++------- kernel/bpf/syscall.c | 20 ++++++++---- kernel/cgroup.c | 9 +++--- samples/bpf/test_cgrp2_attach.c | 2 +- samples/bpf/test_cgrp2_attach2.c | 68 +++++++++++++++++++++++++++++++++++++--- samples/bpf/test_cgrp2_sock.c | 2 +- samples/bpf/test_cgrp2_sock2.c | 2 +- tools/lib/bpf/bpf.c | 4 ++- tools/lib/bpf/bpf.h | 3 +- 11 files changed, 151 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 92bc89ae7e20..c970a25d2a49 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -21,20 +21,19 @@ struct cgroup_bpf { */ struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE]; struct bpf_prog __rcu *effective[MAX_BPF_ATTACH_TYPE]; + bool disallow_override[MAX_BPF_ATTACH_TYPE]; }; void cgroup_bpf_put(struct cgroup *cgrp); void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent); -void __cgroup_bpf_update(struct cgroup *cgrp, - struct cgroup *parent, - struct bpf_prog *prog, - enum bpf_attach_type type); +int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent, + struct bpf_prog *prog, enum bpf_attach_type type, + bool overridable); /* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */ -void cgroup_bpf_update(struct cgroup *cgrp, - struct bpf_prog *prog, - enum bpf_attach_type type); +int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, bool overridable); int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0eb0e87dbe9f..d2b0ac799d03 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -116,6 +116,12 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE +/* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command + * to the given target_fd cgroup the descendent cgroup will be able to + * override effective bpf program that was inherited from this cgroup + */ +#define BPF_F_ALLOW_OVERRIDE (1U << 0) + #define BPF_PSEUDO_MAP_FD 1 /* flags for BPF_MAP_UPDATE_ELEM command */ @@ -171,6 +177,7 @@ union bpf_attr { __u32 target_fd; /* container object to attach to */ __u32 attach_bpf_fd; /* eBPF program to attach */ __u32 attach_type; + __u32 attach_flags; }; } __attribute__((aligned(8))); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index a515f7b007c6..da0f53690295 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -52,6 +52,7 @@ void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) e = rcu_dereference_protected(parent->bpf.effective[type], lockdep_is_held(&cgroup_mutex)); rcu_assign_pointer(cgrp->bpf.effective[type], e); + cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type]; } } @@ -82,30 +83,63 @@ void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) * * Must be called with cgroup_mutex held. 
*/ -void __cgroup_bpf_update(struct cgroup *cgrp, - struct cgroup *parent, - struct bpf_prog *prog, - enum bpf_attach_type type) +int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent, + struct bpf_prog *prog, enum bpf_attach_type type, + bool new_overridable) { - struct bpf_prog *old_prog, *effective; + struct bpf_prog *old_prog, *effective = NULL; struct cgroup_subsys_state *pos; + bool overridable = true; - old_prog = xchg(cgrp->bpf.prog + type, prog); + if (parent) { + overridable = !parent->bpf.disallow_override[type]; + effective = rcu_dereference_protected(parent->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)); + } + + if (prog && effective && !overridable) + /* if parent has non-overridable prog attached, disallow + * attaching new programs to descendent cgroup + */ + return -EPERM; + + if (prog && effective && overridable != new_overridable) + /* if parent has overridable prog attached, only + * allow overridable programs in descendent cgroup + */ + return -EPERM; - effective = (!prog && parent) ? - rcu_dereference_protected(parent->bpf.effective[type], - lockdep_is_held(&cgroup_mutex)) : - prog; + old_prog = cgrp->bpf.prog[type]; + + if (prog) { + overridable = new_overridable; + effective = prog; + if (old_prog && + cgrp->bpf.disallow_override[type] == new_overridable) + /* disallow attaching non-overridable on top + * of existing overridable in this cgroup + * and vice versa + */ + return -EPERM; + } + + if (!prog && !old_prog) + /* report error when trying to detach and nothing is attached */ + return -ENOENT; + + cgrp->bpf.prog[type] = prog; css_for_each_descendant_pre(pos, &cgrp->self) { struct cgroup *desc = container_of(pos, struct cgroup, self); /* skip the subtree if the descendant has its own program */ - if (desc->bpf.prog[type] && desc != cgrp) + if (desc->bpf.prog[type] && desc != cgrp) { pos = css_rightmost_descendant(pos); - else + } else { rcu_assign_pointer(desc->bpf.effective[type], effective); + desc->bpf.disallow_override[type] = !overridable; + } } if (prog) @@ -115,6 +149,7 @@ void __cgroup_bpf_update(struct cgroup *cgrp, bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); } + return 0; } /** diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 19b6129eab23..bbb016adbaeb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -920,13 +920,14 @@ static int bpf_obj_get(const union bpf_attr *attr) #ifdef CONFIG_CGROUP_BPF -#define BPF_PROG_ATTACH_LAST_FIELD attach_type +#define BPF_PROG_ATTACH_LAST_FIELD attach_flags static int bpf_prog_attach(const union bpf_attr *attr) { + enum bpf_prog_type ptype; struct bpf_prog *prog; struct cgroup *cgrp; - enum bpf_prog_type ptype; + int ret; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -934,6 +935,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; + if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE) + return -EINVAL; + switch (attr->attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: @@ -956,10 +960,13 @@ static int bpf_prog_attach(const union bpf_attr *attr) return PTR_ERR(cgrp); } - cgroup_bpf_update(cgrp, prog, attr->attach_type); + ret = cgroup_bpf_update(cgrp, prog, attr->attach_type, + attr->attach_flags & BPF_F_ALLOW_OVERRIDE); + if (ret) + bpf_prog_put(prog); cgroup_put(cgrp); - return 0; + return ret; } #define BPF_PROG_DETACH_LAST_FIELD attach_type @@ -967,6 +974,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) static int bpf_prog_detach(const union bpf_attr *attr) { 
struct cgroup *cgrp; + int ret; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -982,7 +990,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) if (IS_ERR(cgrp)) return PTR_ERR(cgrp); - cgroup_bpf_update(cgrp, NULL, attr->attach_type); + ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false); cgroup_put(cgrp); break; @@ -990,7 +998,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) return -EINVAL; } - return 0; + return ret; } #endif /* CONFIG_CGROUP_BPF */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 688dd02af985..53bbca7c4859 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -6498,15 +6498,16 @@ static __init int cgroup_namespaces_init(void) subsys_initcall(cgroup_namespaces_init); #ifdef CONFIG_CGROUP_BPF -void cgroup_bpf_update(struct cgroup *cgrp, - struct bpf_prog *prog, - enum bpf_attach_type type) +int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, bool overridable) { struct cgroup *parent = cgroup_parent(cgrp); + int ret; mutex_lock(&cgroup_mutex); - __cgroup_bpf_update(cgrp, parent, prog, type); + ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable); mutex_unlock(&cgroup_mutex); + return ret; } #endif /* CONFIG_CGROUP_BPF */ diff --git a/samples/bpf/test_cgrp2_attach.c b/samples/bpf/test_cgrp2_attach.c index 504058631ffc..4bfcaf93fcf3 100644 --- a/samples/bpf/test_cgrp2_attach.c +++ b/samples/bpf/test_cgrp2_attach.c @@ -104,7 +104,7 @@ static int attach_filter(int cg_fd, int type, int verdict) return EXIT_FAILURE; } - ret = bpf_prog_attach(prog_fd, cg_fd, type); + ret = bpf_prog_attach(prog_fd, cg_fd, type, 0); if (ret < 0) { printf("Failed to attach prog to cgroup: '%s'\n", strerror(errno)); diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c index 6e69be37f87f..3049b1f26267 100644 --- a/samples/bpf/test_cgrp2_attach2.c +++ b/samples/bpf/test_cgrp2_attach2.c @@ -79,11 +79,12 @@ int main(int argc, char **argv) if (join_cgroup(FOO)) goto err; - if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS)) { + if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, 1)) { log_err("Attaching prog to /foo"); goto err; } + printf("Attached DROP prog. This ping in cgroup /foo should fail...\n"); assert(system(PING_CMD) != 0); /* Create cgroup /foo/bar, get fd, and join it */ @@ -94,24 +95,27 @@ int main(int argc, char **argv) if (join_cgroup(BAR)) goto err; + printf("Attached DROP prog. This ping in cgroup /foo/bar should fail...\n"); assert(system(PING_CMD) != 0); - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS)) { + if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { log_err("Attaching prog to /foo/bar"); goto err; } + printf("Attached PASS prog. 
This ping in cgroup /foo/bar should pass...\n"); assert(system(PING_CMD) == 0); - if (bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS)) { log_err("Detaching program from /foo/bar"); goto err; } + printf("Detached PASS from /foo/bar while DROP is attached to /foo.\n" + "This ping in cgroup /foo/bar should fail...\n"); assert(system(PING_CMD) != 0); - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS)) { + if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { log_err("Attaching prog to /foo/bar"); goto err; } @@ -121,8 +125,60 @@ int main(int argc, char **argv) goto err; } + printf("Attached PASS from /foo/bar and detached DROP from /foo.\n" + "This ping in cgroup /foo/bar should pass...\n"); assert(system(PING_CMD) == 0); + if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { + log_err("Attaching prog to /foo/bar"); + goto err; + } + + if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 0)) { + errno = 0; + log_err("Unexpected success attaching prog to /foo/bar"); + goto err; + } + + if (bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS)) { + log_err("Detaching program from /foo/bar"); + goto err; + } + + if (!bpf_prog_detach(foo, BPF_CGROUP_INET_EGRESS)) { + errno = 0; + log_err("Unexpected success in double detach from /foo"); + goto err; + } + + if (bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, 0)) { + log_err("Attaching non-overridable prog to /foo"); + goto err; + } + + if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 0)) { + errno = 0; + log_err("Unexpected success attaching non-overridable prog to /foo/bar"); + goto err; + } + + if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { + errno = 0; + log_err("Unexpected success attaching overridable prog to /foo/bar"); + goto err; + } + + if (!bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, 1)) { + errno = 0; + log_err("Unexpected success attaching overridable prog to /foo"); + goto err; + } + + if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, 0)) { + log_err("Attaching different non-overridable prog to /foo"); + goto err; + } + goto out; err: @@ -132,5 +188,9 @@ out: close(foo); close(bar); cleanup_cgroup_environment(); + if (!rc) + printf("PASS\n"); + else + printf("FAIL\n"); return rc; } diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c index 0791b949cbe4..c3cfb23e23b5 100644 --- a/samples/bpf/test_cgrp2_sock.c +++ b/samples/bpf/test_cgrp2_sock.c @@ -75,7 +75,7 @@ int main(int argc, char **argv) return EXIT_FAILURE; } - ret = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_SOCK_CREATE); + ret = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_SOCK_CREATE, 0); if (ret < 0) { printf("Failed to attach prog to cgroup: '%s'\n", strerror(errno)); diff --git a/samples/bpf/test_cgrp2_sock2.c b/samples/bpf/test_cgrp2_sock2.c index 455ef0d06e93..db036077b644 100644 --- a/samples/bpf/test_cgrp2_sock2.c +++ b/samples/bpf/test_cgrp2_sock2.c @@ -55,7 +55,7 @@ int main(int argc, char **argv) } ret = bpf_prog_attach(prog_fd[filter_id], cg_fd, - BPF_CGROUP_INET_SOCK_CREATE); + BPF_CGROUP_INET_SOCK_CREATE, 0); if (ret < 0) { printf("Failed to attach prog to cgroup: '%s'\n", strerror(errno)); diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 3ddb58a36d3c..ae752fa4eaa7 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -168,7 +168,8 @@ int bpf_obj_get(const char *pathname) return sys_bpf(BPF_OBJ_GET, &attr, sizeof(attr)); } -int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type) +int 
bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, + unsigned int flags) { union bpf_attr attr; @@ -176,6 +177,7 @@ int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type) attr.target_fd = target_fd; attr.attach_bpf_fd = prog_fd; attr.attach_type = type; + attr.attach_flags = flags; return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)); } diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index a2f9853dd882..4ac6c4b84100 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -41,7 +41,8 @@ int bpf_map_delete_elem(int fd, void *key); int bpf_map_get_next_key(int fd, void *key, void *next_key); int bpf_obj_pin(int fd, const char *pathname); int bpf_obj_get(const char *pathname); -int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type); +int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type, + unsigned int flags); int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); -- cgit v1.2.3 From 202461e2f3c15dbfb05825d29ace0d20cdf55fa4 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 13 Feb 2017 03:31:55 +0100 Subject: tick/broadcast: Prevent deadlock on tick_broadcast_lock tick_broadcast_lock is taken from interrupt context, but the following call chain takes the lock without disabling interrupts: [ 12.703736] _raw_spin_lock+0x3b/0x50 [ 12.703738] tick_broadcast_control+0x5a/0x1a0 [ 12.703742] intel_idle_cpu_online+0x22/0x100 [ 12.703744] cpuhp_invoke_callback+0x245/0x9d0 [ 12.703752] cpuhp_thread_fun+0x52/0x110 [ 12.703754] smpboot_thread_fn+0x276/0x320 So the following deadlock can happen: lock(tick_broadcast_lock); lock(tick_broadcast_lock); intel_idle_cpu_online() is the only place which violates the calling convention of tick_broadcast_control(). This was caused by the removal of the smp function call in course of the cpu hotplug rework. Instead of slapping local_irq_disable/enable() at the call site, we can relax the calling convention and handle it in the core code, which makes the whole machinery more robust. Fixes: 29d7bbada98e ("intel_idle: Remove superfluous SMP fuction call") Reported-by: Gabriel C Signed-off-by: Mike Galbraith Cc: Ruslan Ruslichenko Cc: Jiri Slaby Cc: Greg KH Cc: Borislav Petkov Cc: lwn@lwn.net Cc: Andrew Morton Cc: Linus Torvalds Cc: Anna-Maria Gleixner Cc: Sebastian Siewior Cc: stable Link: http://lkml.kernel.org/r/1486953115.5912.4.camel@gmx.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 3109204c87cc..17ac99b60ee5 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -347,17 +347,16 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) * * Called when the system enters a state where affected tick devices * might stop. Note: TICK_BROADCAST_FORCE cannot be undone. - * - * Called with interrupts disabled, so clockevents_lock is not - * required here because the local clock event device cannot go away - * under us. */ void tick_broadcast_control(enum tick_broadcast_mode mode) { struct clock_event_device *bc, *dev; struct tick_device *td; int cpu, bc_stopped; + unsigned long flags; + /* Protects also the local clockevent device. 
*/ + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); td = this_cpu_ptr(&tick_cpu_device); dev = td->evtdev; @@ -365,12 +364,11 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) * Is the device not affected by the powerstate ? */ if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP)) - return; + goto out; if (!tick_device_is_functional(dev)) - return; + goto out; - raw_spin_lock(&tick_broadcast_lock); cpu = smp_processor_id(); bc = tick_broadcast_device.evtdev; bc_stopped = cpumask_empty(tick_broadcast_mask); @@ -420,7 +418,8 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) tick_broadcast_setup_oneshot(bc); } } - raw_spin_unlock(&tick_broadcast_lock); +out: + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } EXPORT_SYMBOL_GPL(tick_broadcast_control); -- cgit v1.2.3 From 25f71d1c3e98ef0e52371746220d66458eac75bc Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Fri, 30 Dec 2016 16:17:55 +0800 Subject: futex: Move futex_init() to core_initcall The UEVENT user mode helper is enabled before the initcalls are executed and is available when the root filesystem has been mounted. The user mode helper is triggered by device init calls and the executable might use the futex syscall. futex_init() is marked __initcall which maps to device_initcall, but there is no guarantee that futex_init() is invoked _before_ the first device init call which triggers the UEVENT user mode helper. If the user mode helper uses the futex syscall before futex_init() then the syscall crashes with a NULL pointer dereference because the futex subsystem has not been initialized yet. Move futex_init() to core_initcall so futexes are initialized before the root filesystem is mounted and the usermode helper becomes available. [ tglx: Rewrote changelog ] Signed-off-by: Yang Yang Cc: jiang.biao2@zte.com.cn Cc: jiang.zhengxiong@zte.com.cn Cc: zhong.weidong@zte.com.cn Cc: deng.huali@zte.com.cn Cc: Peter Zijlstra Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1483085875-6130-1-git-send-email-yang.yang29@zte.com.cn Signed-off-by: Thomas Gleixner --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 0842c8ca534b..cdf365036141 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3323,4 +3323,4 @@ static int __init futex_init(void) return 0; } -__initcall(futex_init); +core_initcall(futex_init); -- cgit v1.2.3 From ca86cad7380e373fa17bc0ee8aff121380323e69 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Sat, 4 Feb 2017 13:10:38 -0500 Subject: audit: log module name on init_module This adds a new auxiliary record, KERN_MODULE, to the SYSCALL event. We get finit_module for free since it made most sense to hook this into load_module().
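For illustration only (the module name and timestamp below are made up; the name= field is what this patch adds, the rest is standard audit record framing), the new auxiliary record looks something like:

  type=KERN_MODULE audit(1480621249.833:55): name="dummy"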
https://github.com/linux-audit/audit-kernel/issues/7 https://github.com/linux-audit/audit-kernel/wiki/RFE-Module-Load-Record-Format Signed-off-by: Richard Guy Briggs Acked-by: Jessica Yu [PM: corrected links in the commit description] Signed-off-by: Paul Moore --- include/linux/audit.h | 12 ++++++++++++ include/uapi/linux/audit.h | 1 + kernel/audit.h | 3 +++ kernel/auditsc.c | 14 ++++++++++++++ kernel/module.c | 5 ++++- 5 files changed, 34 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 2be99b276d29..aba3a2684300 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -360,6 +360,7 @@ extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *old); extern void __audit_log_capset(const struct cred *new, const struct cred *old); extern void __audit_mmap_fd(int fd, int flags); +extern void __audit_log_kern_module(char *name); static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp) { @@ -450,6 +451,12 @@ static inline void audit_mmap_fd(int fd, int flags) __audit_mmap_fd(fd, flags); } +static inline void audit_log_kern_module(char *name) +{ + if (!audit_dummy_context()) + __audit_log_kern_module(name); +} + extern int audit_n_rules; extern int audit_signals; #else /* CONFIG_AUDITSYSCALL */ @@ -561,6 +568,11 @@ static inline void audit_log_capset(const struct cred *new, { } static inline void audit_mmap_fd(int fd, int flags) { } + +static inline void audit_log_kern_module(char *name) +{ +} + static inline void audit_ptrace(struct task_struct *t) { } #define audit_n_rules 0 diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 3f24110ae63c..3c02bb2ff779 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -111,6 +111,7 @@ #define AUDIT_PROCTITLE 1327 /* Proctitle emit event */ #define AUDIT_FEATURE_CHANGE 1328 /* audit log listing feature changes */ #define AUDIT_REPLACE 1329 /* Replace auditd if this packet unanswerd */ +#define AUDIT_KERN_MODULE 1330 /* Kernel Module events */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ diff --git a/kernel/audit.h b/kernel/audit.h index 431444c3708b..144b7ebd2deb 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -199,6 +199,9 @@ struct audit_context { struct { int argc; } execve; + struct { + char *name; + } module; }; int fds[2]; struct audit_proctitle proctitle; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index bb5f504592c6..bde3aac4deed 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1268,6 +1268,11 @@ static void show_special(struct audit_context *context, int *call_panic) case AUDIT_EXECVE: { audit_log_execve_info(context, &ab); break; } + case AUDIT_KERN_MODULE: + audit_log_format(ab, "name="); + audit_log_untrustedstring(ab, context->module.name); + kfree(context->module.name); + break; } audit_log_end(ab); } @@ -2368,6 +2373,15 @@ void __audit_mmap_fd(int fd, int flags) context->type = AUDIT_MMAP; } +void __audit_log_kern_module(char *name) +{ + struct audit_context *context = current->audit_context; + + context->module.name = kmalloc(strlen(name) + 1, GFP_KERNEL); + strcpy(context->module.name, name); + context->type = AUDIT_KERN_MODULE; +} + static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; diff --git a/kernel/module.c b/kernel/module.c index 529efae9f481..5432dbedf8cf 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include 
"module-internal.h" @@ -3593,6 +3594,8 @@ static int load_module(struct load_info *info, const char __user *uargs, goto free_copy; } + audit_log_kern_module(mod->name); + /* Reserve our place in the list. */ err = add_unformed_module(mod); if (err) @@ -3681,7 +3684,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mod->name, after_dashes); } - /* Link in to syfs. */ + /* Link in to sysfs. */ err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); if (err < 0) goto coming_cleanup; -- cgit v1.2.3 From fe8e52b9b9100c486051aaf5208dbf4072bb87b1 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 13 Feb 2017 16:21:25 -0500 Subject: audit: remove unnecessary curly braces from switch/case statements Signed-off-by: Paul Moore --- kernel/auditsc.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index bde3aac4deed..4db32e8669f8 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1221,7 +1221,7 @@ static void show_special(struct audit_context *context, int *call_panic) context->ipc.perm_mode); } break; } - case AUDIT_MQ_OPEN: { + case AUDIT_MQ_OPEN: audit_log_format(ab, "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld " "mq_msgsize=%ld mq_curmsgs=%ld", @@ -1230,8 +1230,8 @@ static void show_special(struct audit_context *context, int *call_panic) context->mq_open.attr.mq_maxmsg, context->mq_open.attr.mq_msgsize, context->mq_open.attr.mq_curmsgs); - break; } - case AUDIT_MQ_SENDRECV: { + break; + case AUDIT_MQ_SENDRECV: audit_log_format(ab, "mqdes=%d msg_len=%zd msg_prio=%u " "abs_timeout_sec=%ld abs_timeout_nsec=%ld", @@ -1240,12 +1240,12 @@ static void show_special(struct audit_context *context, int *call_panic) context->mq_sendrecv.msg_prio, context->mq_sendrecv.abs_timeout.tv_sec, context->mq_sendrecv.abs_timeout.tv_nsec); - break; } - case AUDIT_MQ_NOTIFY: { + break; + case AUDIT_MQ_NOTIFY: audit_log_format(ab, "mqdes=%d sigev_signo=%d", context->mq_notify.mqdes, context->mq_notify.sigev_signo); - break; } + break; case AUDIT_MQ_GETSETATTR: { struct mq_attr *attr = &context->mq_getsetattr.mqstat; audit_log_format(ab, @@ -1255,19 +1255,19 @@ static void show_special(struct audit_context *context, int *call_panic) attr->mq_flags, attr->mq_maxmsg, attr->mq_msgsize, attr->mq_curmsgs); break; } - case AUDIT_CAPSET: { + case AUDIT_CAPSET: audit_log_format(ab, "pid=%d", context->capset.pid); audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable); audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); - break; } - case AUDIT_MMAP: { + break; + case AUDIT_MMAP: audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, context->mmap.flags); - break; } - case AUDIT_EXECVE: { + break; + case AUDIT_EXECVE: audit_log_execve_info(context, &ab); - break; } + break; case AUDIT_KERN_MODULE: audit_log_format(ab, "name="); audit_log_untrustedstring(ab, context->module.name); -- cgit v1.2.3 From 7e57fbb2a341b5d44d30e71a6d782c5e6dbc429c Mon Sep 17 00:00:00 2001 From: Alexander Alemayhu Date: Tue, 14 Feb 2017 00:02:35 +0100 Subject: bpf: reduce compiler warnings by adding fallthrough comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the following warnings: kernel/bpf/verifier.c: In function ‘may_access_direct_pkt_data’: kernel/bpf/verifier.c:702:6: warning: this statement may fall through [-Wimplicit-fallthrough=] if (t == BPF_WRITE) ^ 
kernel/bpf/verifier.c:704:2: note: here case BPF_PROG_TYPE_SCHED_CLS: ^~~~ kernel/bpf/verifier.c: In function ‘reg_set_min_max_inv’: kernel/bpf/verifier.c:2057:23: warning: this statement may fall through [-Wimplicit-fallthrough=] true_reg->min_value = 0; ~~~~~~~~~~~~~~~~~~~~^~~ kernel/bpf/verifier.c:2058:2: note: here case BPF_JSGT: ^~~~ kernel/bpf/verifier.c:2068:23: warning: this statement may fall through [-Wimplicit-fallthrough=] true_reg->min_value = 0; ~~~~~~~~~~~~~~~~~~~~^~~ kernel/bpf/verifier.c:2069:2: note: here case BPF_JSGE: ^~~~ kernel/bpf/verifier.c: In function ‘reg_set_min_max’: kernel/bpf/verifier.c:2009:24: warning: this statement may fall through [-Wimplicit-fallthrough=] false_reg->min_value = 0; ~~~~~~~~~~~~~~~~~~~~~^~~ kernel/bpf/verifier.c:2010:2: note: here case BPF_JSGT: ^~~~ kernel/bpf/verifier.c:2019:24: warning: this statement may fall through [-Wimplicit-fallthrough=] false_reg->min_value = 0; ~~~~~~~~~~~~~~~~~~~~~^~~ kernel/bpf/verifier.c:2020:2: note: here case BPF_JSGE: ^~~~ Reported-by: David Binderman Signed-off-by: Alexander Alemayhu Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1a754e5d2695..d2bded2b250c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -701,6 +701,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, /* dst_input() and dst_output() can't write for now */ if (t == BPF_WRITE) return false; + /* fallthrough */ case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: @@ -2007,6 +2008,7 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, case BPF_JGT: /* Unsigned comparison, the minimum value is 0. */ false_reg->min_value = 0; + /* fallthrough */ case BPF_JSGT: /* If this is false then we know the maximum val is val, * otherwise we know the min val is val+1. @@ -2017,6 +2019,7 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, case BPF_JGE: /* Unsigned comparison, the minimum value is 0. */ false_reg->min_value = 0; + /* fallthrough */ case BPF_JSGE: /* If this is false then we know the maximum value is val - 1, * otherwise we know the mimimum value is val. @@ -2055,6 +2058,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, case BPF_JGT: /* Unsigned comparison, the minimum value is 0. */ true_reg->min_value = 0; + /* fallthrough */ case BPF_JSGT: /* * If this is false, then the val is <= the register, if it is @@ -2066,6 +2070,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, case BPF_JGE: /* Unsigned comparison, the minimum value is 0. */ true_reg->min_value = 0; + /* fallthrough */ case BPF_JSGE: /* If this is false then constant < register, if it is true then * the register < constant. -- cgit v1.2.3 From f222449c9dfad7c9bb8cb53e64c5c407b172ebbc Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 15 Feb 2017 13:43:32 +0900 Subject: timekeeping: Use deferred printk() in debug code We cannot do printk() from tk_debug_account_sleep_time(), because tk_debug_account_sleep_time() is called under tk_core seq lock. The reason why printk() is unsafe there is that console_sem may invoke scheduler (up()->wake_up_process()->activate_task()), which, in turn, can return back to timekeeping code, for instance, via get_time()->ktime_get(), deadlocking the system on tk_core seq lock. 
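In simplified form, the hazard is (a sketch; the lockdep report below shows the actual call chains):

	/*
	 * timekeeping_resume()
	 *   write_seqcount_begin(&tk_core.seq)
	 *   __timekeeping_inject_sleeptime()
	 *     tk_debug_account_sleep_time()
	 *       printk()                    <- takes console_sem
	 *         up(console_sem)
	 *           wake_up_process()       <- enters the scheduler
	 *             ...
	 *               ktime_get()         <- reader spins on tk_core.seq
	 */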
[ 48.950592] ====================================================== [ 48.950622] [ INFO: possible circular locking dependency detected ] [ 48.950622] 4.10.0-rc7-next-20170213+ #101 Not tainted [ 48.950622] ------------------------------------------------------- [ 48.950622] kworker/0:0/3 is trying to acquire lock: [ 48.950653] (tk_core){----..}, at: [] retrigger_next_event+0x4c/0x90 [ 48.950683] but task is already holding lock: [ 48.950683] (hrtimer_bases.lock){-.-...}, at: [] retrigger_next_event+0x38/0x90 [ 48.950714] which lock already depends on the new lock. [ 48.950714] the existing dependency chain (in reverse order) is: [ 48.950714] -> #5 (hrtimer_bases.lock){-.-...}: [ 48.950744] _raw_spin_lock_irqsave+0x50/0x64 [ 48.950775] lock_hrtimer_base+0x28/0x58 [ 48.950775] hrtimer_start_range_ns+0x20/0x5c8 [ 48.950775] __enqueue_rt_entity+0x320/0x360 [ 48.950805] enqueue_rt_entity+0x2c/0x44 [ 48.950805] enqueue_task_rt+0x24/0x94 [ 48.950836] ttwu_do_activate+0x54/0xc0 [ 48.950836] try_to_wake_up+0x248/0x5c8 [ 48.950836] __setup_irq+0x420/0x5f0 [ 48.950836] request_threaded_irq+0xdc/0x184 [ 48.950866] devm_request_threaded_irq+0x58/0xa4 [ 48.950866] omap_i2c_probe+0x530/0x6a0 [ 48.950897] platform_drv_probe+0x50/0xb0 [ 48.950897] driver_probe_device+0x1f8/0x2cc [ 48.950897] __driver_attach+0xc0/0xc4 [ 48.950927] bus_for_each_dev+0x6c/0xa0 [ 48.950927] bus_add_driver+0x100/0x210 [ 48.950927] driver_register+0x78/0xf4 [ 48.950958] do_one_initcall+0x3c/0x16c [ 48.950958] kernel_init_freeable+0x20c/0x2d8 [ 48.950958] kernel_init+0x8/0x110 [ 48.950988] ret_from_fork+0x14/0x24 [ 48.950988] -> #4 (&rt_b->rt_runtime_lock){-.-...}: [ 48.951019] _raw_spin_lock+0x40/0x50 [ 48.951019] rq_offline_rt+0x9c/0x2bc [ 48.951019] set_rq_offline.part.2+0x2c/0x58 [ 48.951049] rq_attach_root+0x134/0x144 [ 48.951049] cpu_attach_domain+0x18c/0x6f4 [ 48.951049] build_sched_domains+0xba4/0xd80 [ 48.951080] sched_init_smp+0x68/0x10c [ 48.951080] kernel_init_freeable+0x160/0x2d8 [ 48.951080] kernel_init+0x8/0x110 [ 48.951080] ret_from_fork+0x14/0x24 [ 48.951110] -> #3 (&rq->lock){-.-.-.}: [ 48.951110] _raw_spin_lock+0x40/0x50 [ 48.951141] task_fork_fair+0x30/0x124 [ 48.951141] sched_fork+0x194/0x2e0 [ 48.951141] copy_process.part.5+0x448/0x1a20 [ 48.951171] _do_fork+0x98/0x7e8 [ 48.951171] kernel_thread+0x2c/0x34 [ 48.951171] rest_init+0x1c/0x18c [ 48.951202] start_kernel+0x35c/0x3d4 [ 48.951202] 0x8000807c [ 48.951202] -> #2 (&p->pi_lock){-.-.-.}: [ 48.951232] _raw_spin_lock_irqsave+0x50/0x64 [ 48.951232] try_to_wake_up+0x30/0x5c8 [ 48.951232] up+0x4c/0x60 [ 48.951263] __up_console_sem+0x2c/0x58 [ 48.951263] console_unlock+0x3b4/0x650 [ 48.951263] vprintk_emit+0x270/0x474 [ 48.951293] vprintk_default+0x20/0x28 [ 48.951293] printk+0x20/0x30 [ 48.951324] kauditd_hold_skb+0x94/0xb8 [ 48.951324] kauditd_thread+0x1a4/0x56c [ 48.951324] kthread+0x104/0x148 [ 48.951354] ret_from_fork+0x14/0x24 [ 48.951354] -> #1 ((console_sem).lock){-.....}: [ 48.951385] _raw_spin_lock_irqsave+0x50/0x64 [ 48.951385] down_trylock+0xc/0x2c [ 48.951385] __down_trylock_console_sem+0x24/0x80 [ 48.951385] console_trylock+0x10/0x8c [ 48.951416] vprintk_emit+0x264/0x474 [ 48.951416] vprintk_default+0x20/0x28 [ 48.951416] printk+0x20/0x30 [ 48.951446] tk_debug_account_sleep_time+0x5c/0x70 [ 48.951446] __timekeeping_inject_sleeptime.constprop.3+0x170/0x1a0 [ 48.951446] timekeeping_resume+0x218/0x23c [ 48.951477] syscore_resume+0x94/0x42c [ 48.951477] suspend_enter+0x554/0x9b4 [ 48.951477] suspend_devices_and_enter+0xd8/0x4b4 [ 48.951507] 
enter_state+0x934/0xbd4 [ 48.951507] pm_suspend+0x14/0x70 [ 48.951507] state_store+0x68/0xc8 [ 48.951538] kernfs_fop_write+0xf4/0x1f8 [ 48.951538] __vfs_write+0x1c/0x114 [ 48.951538] vfs_write+0xa0/0x168 [ 48.951568] SyS_write+0x3c/0x90 [ 48.951568] __sys_trace_return+0x0/0x10 [ 48.951568] -> #0 (tk_core){----..}: [ 48.951599] lock_acquire+0xe0/0x294 [ 48.951599] ktime_get_update_offsets_now+0x5c/0x1d4 [ 48.951629] retrigger_next_event+0x4c/0x90 [ 48.951629] on_each_cpu+0x40/0x7c [ 48.951629] clock_was_set_work+0x14/0x20 [ 48.951660] process_one_work+0x2b4/0x808 [ 48.951660] worker_thread+0x3c/0x550 [ 48.951660] kthread+0x104/0x148 [ 48.951690] ret_from_fork+0x14/0x24 [ 48.951690] other info that might help us debug this: [ 48.951690] Chain exists of: tk_core --> &rt_b->rt_runtime_lock --> hrtimer_bases.lock [ 48.951721] Possible unsafe locking scenario: [ 48.951721] CPU0 CPU1 [ 48.951721] ---- ---- [ 48.951721] lock(hrtimer_bases.lock); [ 48.951751] lock(&rt_b->rt_runtime_lock); [ 48.951751] lock(hrtimer_bases.lock); [ 48.951751] lock(tk_core); [ 48.951782] *** DEADLOCK *** [ 48.951782] 3 locks held by kworker/0:0/3: [ 48.951782] #0: ("events"){.+.+.+}, at: [] process_one_work+0x1f8/0x808 [ 48.951812] #1: (hrtimer_work){+.+...}, at: [] process_one_work+0x1f8/0x808 [ 48.951843] #2: (hrtimer_bases.lock){-.-...}, at: [] retrigger_next_event+0x38/0x90 [ 48.951843] stack backtrace: [ 48.951873] CPU: 0 PID: 3 Comm: kworker/0:0 Not tainted 4.10.0-rc7-next-20170213+ [ 48.951904] Workqueue: events clock_was_set_work [ 48.951904] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 48.951934] [] (show_stack) from [] (dump_stack+0xac/0xe0) [ 48.951934] [] (dump_stack) from [] (print_circular_bug+0x1d0/0x308) [ 48.951965] [] (print_circular_bug) from [] (validate_chain+0xf50/0x1324) [ 48.951965] [] (validate_chain) from [] (__lock_acquire+0x468/0x7e8) [ 48.951995] [] (__lock_acquire) from [] (lock_acquire+0xe0/0x294) [ 48.951995] [] (lock_acquire) from [] (ktime_get_update_offsets_now+0x5c/0x1d4) [ 48.952026] [] (ktime_get_update_offsets_now) from [] (retrigger_next_event+0x4c/0x90) [ 48.952026] [] (retrigger_next_event) from [] (on_each_cpu+0x40/0x7c) [ 48.952056] [] (on_each_cpu) from [] (clock_was_set_work+0x14/0x20) [ 48.952056] [] (clock_was_set_work) from [] (process_one_work+0x2b4/0x808) [ 48.952087] [] (process_one_work) from [] (worker_thread+0x3c/0x550) [ 48.952087] [] (worker_thread) from [] (kthread+0x104/0x148) [ 48.952087] [] (kthread) from [] (ret_from_fork+0x14/0x24) Replace printk() with printk_deferred(), which does not call into the scheduler. Fixes: 0bf43f15db85 ("timekeeping: Prints the amounts of time spent during suspend") Reported-and-tested-by: Tony Lindgren Signed-off-by: Sergey Senozhatsky Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Peter Zijlstra Cc: "Rafael J . 
Wysocki" Cc: Steven Rostedt Cc: John Stultz Cc: "[4.9+]" Link: http://lkml.kernel.org/r/20170215044332.30449-1-sergey.senozhatsky@gmail.com Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping_debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index ca9fb800336b..38bc4d2208e8 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -75,7 +75,7 @@ void tk_debug_account_sleep_time(struct timespec64 *t) int bin = min(fls(t->tv_sec), NUM_BINS-1); sleep_time_bin[bin]++; - pr_info("Suspended for %lld.%03lu seconds\n", (s64)t->tv_sec, - t->tv_nsec / NSEC_PER_MSEC); + printk_deferred(KERN_INFO "Suspended for %lld.%03lu seconds\n", + (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC); } -- cgit v1.2.3 From 1f9b3546cf4c273b6d809003244d05cf0460a5e9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 9 Feb 2017 17:53:50 -0500 Subject: tracing: Have traceprobe_probes_write() not access userspace unnecessarily The code in traceprobe_probes_write() reads up to 4096 bytes from userpace for each line. If userspace passes in several lines to execute, the code will do a large read for each line, even though, it is highly likely that the first read from userspace received all of the lines at once. I changed the logic to do a single read from userspace, and to only read from userspace again if not all of the read from userspace made it in. I tested this by adding printk()s and writing files that would test -1, ==, and +1 the buffer size, to make sure that there's no overflows and that if a single line is written with +1 the buffer size, that it fails properly. Link: http://lkml.kernel.org/r/20170209180458.5c829ab2@gandalf.local.home Acked-by: Masami Hiramatsu Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 48 ++++++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 8c0553d9afd3..2a06f1fa7001 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -647,7 +647,7 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos, int (*createfn)(int, char **)) { - char *kbuf, *tmp; + char *kbuf, *buf, *tmp; int ret = 0; size_t done = 0; size_t size; @@ -667,27 +667,37 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, goto out; } kbuf[size] = '\0'; - tmp = strchr(kbuf, '\n'); + buf = kbuf; + do { + tmp = strchr(buf, '\n'); + if (tmp) { + *tmp = '\0'; + size = tmp - buf + 1; + } else { + size = strlen(buf); + if (done + size < count) { + if (buf != kbuf) + break; + pr_warn("Line length is too long: Should be less than %d\n", + WRITE_BUFSIZE); + ret = -EINVAL; + goto out; + } + } + done += size; - if (tmp) { - *tmp = '\0'; - size = tmp - kbuf + 1; - } else if (done + size < count) { - pr_warn("Line length is too long: Should be less than %d\n", - WRITE_BUFSIZE); - ret = -EINVAL; - goto out; - } - done += size; - /* Remove comments */ - tmp = strchr(kbuf, '#'); + /* Remove comments */ + tmp = strchr(buf, '#'); - if (tmp) - *tmp = '\0'; + if (tmp) + *tmp = '\0'; - ret = traceprobe_command(kbuf, createfn); - if (ret) - goto out; + ret = traceprobe_command(buf, createfn); + if (ret) + goto out; + buf += size; + + } while (done < count); } ret = done; -- cgit v1.2.3 From 
8e0f11429aec1119b17cbfcfa2d3194c8e864a26 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Mon, 13 Feb 2017 12:25:17 -0500 Subject: tracing/hwlat: Update old comment about migration The ftrace hwlat tracer does support a cpumask. Link: http://lkml.kernel.org/r/20170213122517.6e211955@redhat.com Signed-off-by: Luiz Capitulino Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 775569ec50d0..bf209d2657ab 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -322,10 +322,7 @@ static void move_to_next_cpu(void) * need to ensure nothing else might be running (and thus preempting). * Obviously this should never be used in production environments. * - * Currently this runs on which ever CPU it was scheduled on, but most - * real-world hardware latency situations occur across several CPUs, - * but we might later generalize this if we find there are any actualy - * systems with alternate SMI delivery or other hardware latencies. + * Executes one loop iteration on each CPU in the tracing_cpumask sysfs file. */ static int kthread_fn(void *data) { -- cgit v1.2.3 From 7257634135c247de235f3cdfdaa22f9eb5f054e4 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 7 Feb 2017 20:21:28 +0900 Subject: tracing/probe: Show subsystem name in messages Show "trace_probe:", "trace_kprobe:" and "trace_uprobe:" headers for each warning/error/info message. This will help people to notice that kprobe/uprobe events caused those messages. Link: http://lkml.kernel.org/r/148646647813.24658.16705315294927615333.stgit@devbox Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 1 + kernel/trace/trace_probe.c | 1 + kernel/trace/trace_uprobe.c | 1 + 3 files changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index a133ecd741e4..5c9bd0550542 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -16,6 +16,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define pr_fmt(fmt) "trace_kprobe: " fmt #include #include diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 2a06f1fa7001..11b63328d6fb 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -21,6 +21,7 @@ * Copyright (C) IBM Corporation, 2010-2011 * Author: Srikar Dronamraju */ +#define pr_fmt(fmt) "trace_probe: " fmt #include "trace_probe.h" diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 4f2ba2bb11e0..f4379e772171 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -17,6 +17,7 @@ * Copyright (C) IBM Corporation, 2010-2012 * Author: Srikar Dronamraju */ +#define pr_fmt(fmt) "trace_uprobe: " fmt #include #include -- cgit v1.2.3 From 3821fd35b58dba449bd894014fbf4e1c43c9e951 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 3 Feb 2017 15:42:24 -0500 Subject: jump_label: Reduce the size of struct static_key The static_key->next field goes mostly unused. The field is used for associating module uses with a static key. Most uses of struct static_key define a static key in the core kernel and make use of it entirely within the core kernel, or define the static key in a module and make use of it only from within that module.
In fact, of the ~3,000 static keys defined, I found only about 5 or so that did not fit this pattern. Thus, we can remove the static_key->next field entirely and overload the static_key->entries field. That is, when all the static_key uses are contained within the same module, static_key->entries continues to point to those uses. However, if the static_key uses are not contained within the module where the static_key is defined, then we allocate a struct static_key_mod, store a pointer to the uses within that struct static_key_mod, and have the static key point at the static_key_mod. This does incur some extra memory usage when a static_key is used in a module that does not define it, but since there are only a handful of such cases there is a net savings. In order to identify if the static_key->entries pointer contains a struct static_key_mod or a struct jump_entry pointer, bit 1 of static_key->entries is set to 1 if it points to a struct static_key_mod and is 0 if it points to a struct jump_entry. We were already using bit 0 in a similar way to store the initial value of the static_key. This does mean that allocations of struct static_key_mod and that the struct jump_entry tables need to be at least 4-byte aligned in memory. As far as I can tell all arches meet this criteria. For my .config, the patch increased the text by 778 bytes, but reduced the data + bss size by 14912, for a net savings of 14,134 bytes. text data bss dec hex filename 8092427 5016512 790528 13899467 d416cb vmlinux.pre 8093205 5001600 790528 13885333 d3df95 vmlinux.post Link: http://lkml.kernel.org/r/1486154544-4321-1-git-send-email-jbaron@akamai.com Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Joe Perches Signed-off-by: Jason Baron Signed-off-by: Steven Rostedt (VMware) --- Documentation/static-keys.txt | 4 +- include/linux/jump_label.h | 23 ++++--- kernel/jump_label.c | 153 +++++++++++++++++++++++++++++++++++------- 3 files changed, 145 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/Documentation/static-keys.txt b/Documentation/static-keys.txt index ea8d7b4e53f0..32a25fad0c1b 100644 --- a/Documentation/static-keys.txt +++ b/Documentation/static-keys.txt @@ -155,7 +155,9 @@ or: There are a few functions and macros that architectures must implement in order to take advantage of this optimization. If there is no architecture support, we -simply fall back to a traditional, load, test, and jump sequence. +simply fall back to a traditional, load, test, and jump sequence. Also, the +struct jump_entry table must be at least 4-byte aligned because the +static_key->entry field makes use of the two least significant bits. 
* select HAVE_ARCH_JUMP_LABEL, see: arch/x86/Kconfig diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index a0547c571800..680c98b2f41c 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -89,11 +89,17 @@ extern bool static_key_initialized; struct static_key { atomic_t enabled; -/* Set lsb bit to 1 if branch is default true, 0 ot */ - struct jump_entry *entries; -#ifdef CONFIG_MODULES - struct static_key_mod *next; -#endif +/* + * bit 0 => 1 if key is initially true + * 0 if initially false + * bit 1 => 1 if points to struct static_key_mod + * 0 if points to struct jump_entry + */ + union { + unsigned long type; + struct jump_entry *entries; + struct static_key_mod *next; + }; }; #else @@ -118,9 +124,10 @@ struct module; #ifdef HAVE_JUMP_LABEL -#define JUMP_TYPE_FALSE 0UL -#define JUMP_TYPE_TRUE 1UL -#define JUMP_TYPE_MASK 1UL +#define JUMP_TYPE_FALSE 0UL +#define JUMP_TYPE_TRUE 1UL +#define JUMP_TYPE_LINKED 2UL +#define JUMP_TYPE_MASK 3UL static __always_inline bool static_key_false(struct static_key *key) { diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 93ad6c1fb9b6..953411f5ba7f 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -229,12 +229,28 @@ void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry static inline struct jump_entry *static_key_entries(struct static_key *key) { - return (struct jump_entry *)((unsigned long)key->entries & ~JUMP_TYPE_MASK); + WARN_ON_ONCE(key->type & JUMP_TYPE_LINKED); + return (struct jump_entry *)(key->type & ~JUMP_TYPE_MASK); } static inline bool static_key_type(struct static_key *key) { - return (unsigned long)key->entries & JUMP_TYPE_MASK; + return key->type & JUMP_TYPE_TRUE; +} + +static inline bool static_key_linked(struct static_key *key) +{ + return key->type & JUMP_TYPE_LINKED; +} + +static inline void static_key_clear_linked(struct static_key *key) +{ + key->type &= ~JUMP_TYPE_LINKED; +} + +static inline void static_key_set_linked(struct static_key *key) +{ + key->type |= JUMP_TYPE_LINKED; } static inline struct static_key *jump_entry_key(struct jump_entry *entry) @@ -247,6 +263,26 @@ static bool jump_entry_branch(struct jump_entry *entry) return (unsigned long)entry->key & 1UL; } +/*** + * A 'struct static_key' uses a union such that it either points directly + * to a table of 'struct jump_entry' or to a linked list of modules which in + * turn point to 'struct jump_entry' tables. + * + * The two lower bits of the pointer are used to keep track of which pointer + * type is in use and to store the initial branch direction, we use an access + * function which preserves these bits. + */ +static void static_key_set_entries(struct static_key *key, + struct jump_entry *entries) +{ + unsigned long type; + + WARN_ON_ONCE((unsigned long)entries & JUMP_TYPE_MASK); + type = key->type & JUMP_TYPE_MASK; + key->entries = entries; + key->type |= type; +} + static enum jump_label_type jump_label_type(struct jump_entry *entry) { struct static_key *key = jump_entry_key(entry); @@ -306,13 +342,7 @@ void __init jump_label_init(void) continue; key = iterk; - /* - * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. 
- */ - *((unsigned long *)&key->entries) += (unsigned long)iter; -#ifdef CONFIG_MODULES - key->next = NULL; -#endif + static_key_set_entries(key, iter); } static_key_initialized = true; jump_label_unlock(); @@ -336,6 +366,29 @@ struct static_key_mod { struct module *mod; }; +static inline struct static_key_mod *static_key_mod(struct static_key *key) +{ + WARN_ON_ONCE(!(key->type & JUMP_TYPE_LINKED)); + return (struct static_key_mod *)(key->type & ~JUMP_TYPE_MASK); +} + +/*** + * key->type and key->next are the same via union. + * This sets key->next and preserves the type bits. + * + * See additional comments above static_key_set_entries(). + */ +static void static_key_set_mod(struct static_key *key, + struct static_key_mod *mod) +{ + unsigned long type; + + WARN_ON_ONCE((unsigned long)mod & JUMP_TYPE_MASK); + type = key->type & JUMP_TYPE_MASK; + key->next = mod; + key->type |= type; +} + static int __jump_label_mod_text_reserved(void *start, void *end) { struct module *mod; @@ -358,11 +411,23 @@ static void __jump_label_mod_update(struct static_key *key) { struct static_key_mod *mod; - for (mod = key->next; mod; mod = mod->next) { - struct module *m = mod->mod; + for (mod = static_key_mod(key); mod; mod = mod->next) { + struct jump_entry *stop; + struct module *m; + + /* + * NULL if the static_key is defined in a module + * that does not use it + */ + if (!mod->entries) + continue; - __jump_label_update(key, mod->entries, - m->jump_entries + m->num_jump_entries); + m = mod->mod; + if (!m) + stop = __stop___jump_table; + else + stop = m->jump_entries + m->num_jump_entries; + __jump_label_update(key, mod->entries, stop); } } @@ -397,7 +462,7 @@ static int jump_label_add_module(struct module *mod) struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; struct static_key *key = NULL; - struct static_key_mod *jlm; + struct static_key_mod *jlm, *jlm2; /* if the module doesn't have jump label entries, just return */ if (iter_start == iter_stop) @@ -414,20 +479,32 @@ static int jump_label_add_module(struct module *mod) key = iterk; if (within_module(iter->key, mod)) { - /* - * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. 
- */ - *((unsigned long *)&key->entries) += (unsigned long)iter; - key->next = NULL; + static_key_set_entries(key, iter); continue; } jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); if (!jlm) return -ENOMEM; + if (!static_key_linked(key)) { + jlm2 = kzalloc(sizeof(struct static_key_mod), + GFP_KERNEL); + if (!jlm2) { + kfree(jlm); + return -ENOMEM; + } + preempt_disable(); + jlm2->mod = __module_address((unsigned long)key); + preempt_enable(); + jlm2->entries = static_key_entries(key); + jlm2->next = NULL; + static_key_set_mod(key, jlm2); + static_key_set_linked(key); + } jlm->mod = mod; jlm->entries = iter; - jlm->next = key->next; - key->next = jlm; + jlm->next = static_key_mod(key); + static_key_set_mod(key, jlm); + static_key_set_linked(key); /* Only update if we've changed from our initial state */ if (jump_label_type(iter) != jump_label_init_type(iter)) @@ -454,16 +531,34 @@ static void jump_label_del_module(struct module *mod) if (within_module(iter->key, mod)) continue; + /* No memory during module load */ + if (WARN_ON(!static_key_linked(key))) + continue; + prev = &key->next; - jlm = key->next; + jlm = static_key_mod(key); while (jlm && jlm->mod != mod) { prev = &jlm->next; jlm = jlm->next; } - if (jlm) { + /* No memory during module load */ + if (WARN_ON(!jlm)) + continue; + + if (prev == &key->next) + static_key_set_mod(key, jlm->next); + else *prev = jlm->next; + + kfree(jlm); + + jlm = static_key_mod(key); + /* if only one entry is left, fold it back into the static_key */ + if (jlm->next == NULL) { + static_key_set_entries(key, jlm->entries); + static_key_clear_linked(key); kfree(jlm); } } @@ -492,8 +587,10 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, case MODULE_STATE_COMING: jump_label_lock(); ret = jump_label_add_module(mod); - if (ret) + if (ret) { + WARN(1, "Failed to allocate memory: jump_label may not work properly.\n"); jump_label_del_module(mod); + } jump_label_unlock(); break; case MODULE_STATE_GOING: @@ -554,11 +651,14 @@ int jump_label_text_reserved(void *start, void *end) static void jump_label_update(struct static_key *key) { struct jump_entry *stop = __stop___jump_table; - struct jump_entry *entry = static_key_entries(key); + struct jump_entry *entry; #ifdef CONFIG_MODULES struct module *mod; - __jump_label_mod_update(key); + if (static_key_linked(key)) { + __jump_label_mod_update(key); + return; + } preempt_disable(); mod = __module_address((unsigned long)key); @@ -566,6 +666,7 @@ static void jump_label_update(struct static_key *key) stop = mod->jump_entries + mod->num_jump_entries; preempt_enable(); #endif + entry = static_key_entries(key); /* if there are no users, entry can be NULL */ if (entry) __jump_label_update(key, entry, stop); -- cgit v1.2.3 From eb583cd484f9a76c3716c0539bd06340943eeff9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 23 Jan 2017 13:24:45 +0100 Subject: tracing: Use modern function declaration We get a lot of harmless warnings about this header file at W=1 level because of an unusual function declaration: kernel/trace/trace.h:766:1: error: 'inline' is not at beginning of declaration [-Werror=old-style-declaration] This moves the inline keyword to where it normally belongs, avoiding the warning.
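[ ed: A minimal before/after sketch of the declaration-order fix, using a toy function for illustration; the function names here are hypothetical:

	/* before: 'inline' not at the beginning, warns at W=1
	 * with -Wold-style-declaration */
	static bool __always_inline count_is_zero_old(int count) { return !count; }

	/* after: the inline qualifier precedes the return type, no warning */
	static __always_inline bool count_is_zero_new(int count) { return !count; }

The hunk below makes exactly this change to ftrace_hash_empty(). ]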
Link: http://lkml.kernel.org/r/20170123122521.3389010-1-arnd@arndb.de Fixes: 4046bf023b06 ("ftrace: Expose ftrace_hash_empty and ftrace_lookup_ip") Signed-off-by: Arnd Bergmann Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d2d068b36341..ae1cce91fead 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -763,7 +763,7 @@ struct ftrace_hash { struct ftrace_func_entry * ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip); -static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash) +static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) { return !hash || !hash->count; } -- cgit v1.2.3 From 8f0994bb8cbde5452e58ce0cacdbf6cb58079d01 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 12 Jan 2017 13:55:02 +0000 Subject: tracing: Fix return value check in trace_benchmark_reg() In case of error, the function kthread_run() returns ERR_PTR() and never returns NULL. The NULL test in the return value check should be replaced with IS_ERR(). Link: http://lkml.kernel.org/r/20170112135502.28556-1-weiyj.lk@gmail.com Cc: stable@vger.kernel.org Fixes: 81dc9f0e ("tracing: Add tracepoint benchmark tracepoint") Signed-off-by: Wei Yongjun Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_benchmark.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index e3b488825ae3..e49fbe901cfc 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -175,9 +175,9 @@ int trace_benchmark_reg(void) bm_event_thread = kthread_run(benchmark_event_kthread, NULL, "event_benchmark"); - if (!bm_event_thread) { + if (IS_ERR(bm_event_thread)) { pr_warning("trace benchmark failed to create kernel thread\n"); - return -ENOMEM; + return PTR_ERR(bm_event_thread); } return 0; -- cgit v1.2.3 From bef5da60595f5706232e2502876f8545743b9b41 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 10 Feb 2017 22:21:55 +0900 Subject: tracing/probes: Fix a warning message to show correct maximum length Since tracing/*probe_events will accept a probe definition up to 4096 - 2 ('\n' and '\0') bytes, it must show 4094 instead of 4096 in the warning message. Note that there is one possible case that exceeds 4094: if the user prepares a 4096-byte null-terminated string and the write() syscall is issued with count == 4095, it can be accepted. However, if the user then puts a '\n' after that, it must be rejected. So IMHO, the warning message should indicate the shorter limit, since it is safer.
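[ ed: Spelling out the limit arithmetic, assuming WRITE_BUFSIZE is 4096 as the message implies: the parse buffer must hold the probe definition plus a trailing '\n' plus the '\0' terminator, so the longest definition that is always accepted is

	WRITE_BUFSIZE - 2 = 4096 - 2 = 4094 bytes

which is exactly the value the corrected pr_warn() below now reports. ]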
Link: http://lkml.kernel.org/r/148673290462.2579.7966778294009665632.stgit@devbox Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 11b63328d6fb..52478f033f88 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -679,8 +679,9 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, if (done + size < count) { if (buf != kbuf) break; + /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */ pr_warn("Line length is too long: Should be less than %d\n", - WRITE_BUFSIZE); + WRITE_BUFSIZE - 2); ret = -EINVAL; goto out; } -- cgit v1.2.3 From 5d4bac9a5f4ef24b2482529bda6661a58e5b5b65 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Thu, 16 Feb 2017 12:24:09 +0800 Subject: genirq: Clarify logic calculating bogus irqreturn_t values Although irqreturn_t is an enum, we treat it (and its enumeration constants) as a bitmask. However, bad_action_ret() uses a less-than operator to determine whether an irqreturn_t falls within allowable bit values, which means we need to know the signedness of the enum type to read the logic, and that signedness is implementation-defined. This change explicitly uses an unsigned type for the comparison. We do this instead of changing to a bitwise test, as the latter would compile to more instructions in this hot path. It looks like we get the correct behaviour currently (bad_action_ret(-1) returns 1), so this is purely a readability fix. Signed-off-by: Jeremy Kerr Link: http://lkml.kernel.org/r/1487219049-4061-1-git-send-email-jk@ozlabs.org Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 5707f97a3e6a..061ba7eed4ed 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -175,7 +175,9 @@ out: static inline int bad_action_ret(irqreturn_t action_ret) { - if (likely(action_ret <= (IRQ_HANDLED | IRQ_WAKE_THREAD))) + unsigned int r = action_ret; + + if (likely(r <= (IRQ_HANDLED | IRQ_WAKE_THREAD))) return 0; return 1; } -- cgit v1.2.3 From 558e8e27e73f53f8a512485be538b07115fe5f3c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 16 Feb 2017 12:19:18 -0800 Subject: Revert "nohz: Fix collision between tick and other hrtimers" This reverts commit 24b91e360ef521a2808771633d76ebc68bd5604b and commit 7bdb59f1ad47 ("tick/nohz: Fix possible missing clock reprog after tick soft restart") that depends on it. Pavel reports that it causes occasional boot hangs for him that seem to depend on just how the machine was booted. In particular, his machine hangs at around the PCI fixups of the EHCI USB host controller, but only hangs from cold boot, not from a warm boot. Thomas Gleixner suspects it's a CPU hotplug interaction, particularly since Pavel also saw suspend/resume issues that seem to be related. We're reverting for now while trying to figure out the root cause.
Reported-bisected-and-tested-by: Pavel Machek Acked-by: Frederic Weisbecker Cc: Wanpeng Li Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: stable@kernel.org # reverted commits were marked for stable Signed-off-by: Linus Torvalds --- kernel/time/tick-sched.c | 14 ++------------ kernel/time/tick-sched.h | 2 -- 2 files changed, 2 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index fc6f740d0277..2c115fdab397 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -725,11 +725,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, */ if (delta == 0) { tick_nohz_restart(ts, now); - /* - * Make sure next tick stop doesn't get fooled by past - * clock deadline - */ - ts->next_tick = 0; goto out; } } @@ -772,7 +767,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, tick = expires; /* Skip reprogram of event if its not changed */ - if (ts->tick_stopped && (expires == ts->next_tick)) + if (ts->tick_stopped && (expires == dev->next_event)) goto out; /* @@ -792,8 +787,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, trace_tick_stop(1, TICK_DEP_MASK_NONE); } - ts->next_tick = tick; - /* * If the expiration time == KTIME_MAX, then we simply stop * the tick timer. @@ -809,10 +802,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, else tick_program_event(tick, 1); out: - /* - * Update the estimated sleep length until the next timer - * (not only the tick). - */ + /* Update the estimated sleep length */ ts->sleep_length = ktime_sub(dev->next_event, now); return tick; } diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 075444e3d48e..bf38226e5c17 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -27,7 +27,6 @@ enum tick_nohz_mode { * timer is modified for nohz sleeps. This is necessary * to resume the tick timer operation in the timeline * when the CPU returns from nohz sleep. - * @next_tick: Next tick to be fired when in dynticks mode. * @tick_stopped: Indicator that the idle tick has been stopped * @idle_jiffies: jiffies at the entry to idle for idle time accounting * @idle_calls: Total number of idle calls @@ -45,7 +44,6 @@ struct tick_sched { unsigned long check_clocks; enum tick_nohz_mode nohz_mode; ktime_t last_tick; - ktime_t next_tick; int inidle; int tick_stopped; unsigned long idle_jiffies; -- cgit v1.2.3 From 67d04bb2bcbd3e99f4c4daa58599c90a83ad314a Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 16 Feb 2017 20:10:58 -0800 Subject: tracing: Remove outdated ring buffer comment The comment about ring buffer's organization is outdated and the code sits elsewhere, remove the comment. Link: http://lkml.kernel.org/r/20170217041058.23904-1-joelaf@google.com Cc: Ingo Molnar Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4589b67168fc..54e3b8711aca 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -260,16 +260,8 @@ unsigned long long ns2usecs(u64 nsec) TRACE_ITER_EVENT_FORK /* - * The global_trace is the descriptor that holds the tracing - * buffers for the live tracing. For each CPU, it contains - * a link list of pages that will store trace entries. 
The - * page descriptor of the pages in the memory is used to hold - * the link list by linking the lru item in the page descriptor - * to each of the pages in the buffer per CPU. - * - * For each active CPU there is a data field that holds the - * pages for the buffer for that CPU. Each CPU has the same number - * of pages allocated for its buffer. + * The global_trace is the descriptor that holds the top-level tracing + * buffers for the live tracing. */ static struct trace_array global_trace = { .trace_flags = TRACE_DEFAULT_FLAGS, -- cgit v1.2.3 From c78f8bdfa11fcceb9723c61212e4bd8f76c87f9e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 16 Feb 2017 22:24:48 +0100 Subject: bpf: mark all registered map/prog types as __ro_after_init All map types and prog types are registered to the BPF core through bpf_register_map_type() and bpf_register_prog_type() during init and remain unchanged thereafter. As by design we don't (and never will) have any pluggable code that can register to that at any later point in time, lets mark all the existing bpf_{map,prog}_type_list objects in the tree as __ro_after_init, so they can be moved to read-only section from then onwards. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 10 +++++----- kernel/bpf/hashtab.c | 8 ++++---- kernel/bpf/lpm_trie.c | 2 +- kernel/bpf/stackmap.c | 2 +- kernel/trace/bpf_trace.c | 6 +++--- net/core/filter.c | 18 +++++++++--------- 6 files changed, 23 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 3d55d95dcf49..6b6f41f0b211 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -269,7 +269,7 @@ static const struct bpf_map_ops array_ops = { .map_delete_elem = array_map_delete_elem, }; -static struct bpf_map_type_list array_type __read_mostly = { +static struct bpf_map_type_list array_type __ro_after_init = { .ops = &array_ops, .type = BPF_MAP_TYPE_ARRAY, }; @@ -283,7 +283,7 @@ static const struct bpf_map_ops percpu_array_ops = { .map_delete_elem = array_map_delete_elem, }; -static struct bpf_map_type_list percpu_array_type __read_mostly = { +static struct bpf_map_type_list percpu_array_type __ro_after_init = { .ops = &percpu_array_ops, .type = BPF_MAP_TYPE_PERCPU_ARRAY, }; @@ -409,7 +409,7 @@ static const struct bpf_map_ops prog_array_ops = { .map_fd_put_ptr = prog_fd_array_put_ptr, }; -static struct bpf_map_type_list prog_array_type __read_mostly = { +static struct bpf_map_type_list prog_array_type __ro_after_init = { .ops = &prog_array_ops, .type = BPF_MAP_TYPE_PROG_ARRAY, }; @@ -522,7 +522,7 @@ static const struct bpf_map_ops perf_event_array_ops = { .map_release = perf_event_fd_array_release, }; -static struct bpf_map_type_list perf_event_array_type __read_mostly = { +static struct bpf_map_type_list perf_event_array_type __ro_after_init = { .ops = &perf_event_array_ops, .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, }; @@ -564,7 +564,7 @@ static const struct bpf_map_ops cgroup_array_ops = { .map_fd_put_ptr = cgroup_fd_array_put_ptr, }; -static struct bpf_map_type_list cgroup_array_type __read_mostly = { +static struct bpf_map_type_list cgroup_array_type __ro_after_init = { .ops = &cgroup_array_ops, .type = BPF_MAP_TYPE_CGROUP_ARRAY, }; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index a753bbe7df0a..3ea87fb19a94 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1023,7 +1023,7 @@ static const struct bpf_map_ops htab_ops = { .map_delete_elem = 
htab_map_delete_elem, }; -static struct bpf_map_type_list htab_type __read_mostly = { +static struct bpf_map_type_list htab_type __ro_after_init = { .ops = &htab_ops, .type = BPF_MAP_TYPE_HASH, }; @@ -1037,7 +1037,7 @@ static const struct bpf_map_ops htab_lru_ops = { .map_delete_elem = htab_lru_map_delete_elem, }; -static struct bpf_map_type_list htab_lru_type __read_mostly = { +static struct bpf_map_type_list htab_lru_type __ro_after_init = { .ops = &htab_lru_ops, .type = BPF_MAP_TYPE_LRU_HASH, }; @@ -1124,7 +1124,7 @@ static const struct bpf_map_ops htab_percpu_ops = { .map_delete_elem = htab_map_delete_elem, }; -static struct bpf_map_type_list htab_percpu_type __read_mostly = { +static struct bpf_map_type_list htab_percpu_type __ro_after_init = { .ops = &htab_percpu_ops, .type = BPF_MAP_TYPE_PERCPU_HASH, }; @@ -1138,7 +1138,7 @@ static const struct bpf_map_ops htab_lru_percpu_ops = { .map_delete_elem = htab_lru_map_delete_elem, }; -static struct bpf_map_type_list htab_lru_percpu_type __read_mostly = { +static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = { .ops = &htab_lru_percpu_ops, .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, }; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index e0f6a0bd279b..8bfe0afaee10 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -508,7 +508,7 @@ static const struct bpf_map_ops trie_ops = { .map_delete_elem = trie_delete_elem, }; -static struct bpf_map_type_list trie_type __read_mostly = { +static struct bpf_map_type_list trie_type __ro_after_init = { .ops = &trie_ops, .type = BPF_MAP_TYPE_LPM_TRIE, }; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index be8519148c25..22aa45cd0324 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -273,7 +273,7 @@ static const struct bpf_map_ops stack_map_ops = { .map_delete_elem = stack_map_delete_elem, }; -static struct bpf_map_type_list stack_map_type __read_mostly = { +static struct bpf_map_type_list stack_map_type __ro_after_init = { .ops = &stack_map_ops, .type = BPF_MAP_TYPE_STACK_TRACE, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 424daa4586d1..cee9802cf3e0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -506,7 +506,7 @@ static const struct bpf_verifier_ops kprobe_prog_ops = { .is_valid_access = kprobe_prog_is_valid_access, }; -static struct bpf_prog_type_list kprobe_tl = { +static struct bpf_prog_type_list kprobe_tl __ro_after_init = { .ops = &kprobe_prog_ops, .type = BPF_PROG_TYPE_KPROBE, }; @@ -589,7 +589,7 @@ static const struct bpf_verifier_ops tracepoint_prog_ops = { .is_valid_access = tp_prog_is_valid_access, }; -static struct bpf_prog_type_list tracepoint_tl = { +static struct bpf_prog_type_list tracepoint_tl __ro_after_init = { .ops = &tracepoint_prog_ops, .type = BPF_PROG_TYPE_TRACEPOINT, }; @@ -648,7 +648,7 @@ static const struct bpf_verifier_ops perf_event_prog_ops = { .convert_ctx_access = pe_prog_convert_ctx_access, }; -static struct bpf_prog_type_list perf_event_tl = { +static struct bpf_prog_type_list perf_event_tl __ro_after_init = { .ops = &perf_event_prog_ops, .type = BPF_PROG_TYPE_PERF_EVENT, }; diff --git a/net/core/filter.c b/net/core/filter.c index 0b753cbb2536..e466e0040137 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3296,47 +3296,47 @@ static const struct bpf_verifier_ops cg_sock_ops = { .convert_ctx_access = sock_filter_convert_ctx_access, }; -static struct bpf_prog_type_list sk_filter_type __read_mostly = { +static struct bpf_prog_type_list sk_filter_type 
__ro_after_init = { .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, }; -static struct bpf_prog_type_list sched_cls_type __read_mostly = { +static struct bpf_prog_type_list sched_cls_type __ro_after_init = { .ops = &tc_cls_act_ops, .type = BPF_PROG_TYPE_SCHED_CLS, }; -static struct bpf_prog_type_list sched_act_type __read_mostly = { +static struct bpf_prog_type_list sched_act_type __ro_after_init = { .ops = &tc_cls_act_ops, .type = BPF_PROG_TYPE_SCHED_ACT, }; -static struct bpf_prog_type_list xdp_type __read_mostly = { +static struct bpf_prog_type_list xdp_type __ro_after_init = { .ops = &xdp_ops, .type = BPF_PROG_TYPE_XDP, }; -static struct bpf_prog_type_list cg_skb_type __read_mostly = { +static struct bpf_prog_type_list cg_skb_type __ro_after_init = { .ops = &cg_skb_ops, .type = BPF_PROG_TYPE_CGROUP_SKB, }; -static struct bpf_prog_type_list lwt_in_type __read_mostly = { +static struct bpf_prog_type_list lwt_in_type __ro_after_init = { .ops = &lwt_inout_ops, .type = BPF_PROG_TYPE_LWT_IN, }; -static struct bpf_prog_type_list lwt_out_type __read_mostly = { +static struct bpf_prog_type_list lwt_out_type __ro_after_init = { .ops = &lwt_inout_ops, .type = BPF_PROG_TYPE_LWT_OUT, }; -static struct bpf_prog_type_list lwt_xmit_type __read_mostly = { +static struct bpf_prog_type_list lwt_xmit_type __ro_after_init = { .ops = &lwt_xmit_ops, .type = BPF_PROG_TYPE_LWT_XMIT, }; -static struct bpf_prog_type_list cg_sock_type __read_mostly = { +static struct bpf_prog_type_list cg_sock_type __ro_after_init = { .ops = &cg_sock_ops, .type = BPF_PROG_TYPE_CGROUP_SOCK }; -- cgit v1.2.3 From 9383191da4e40360a5d880fbe6bb03911c61621b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 16 Feb 2017 22:24:49 +0100 Subject: bpf: remove stubs for cBPF from arch code Remove the dummy bpf_jit_compile() stubs for eBPF JITs and make that a single __weak function in the core that can be overridden similarly to the eBPF one. Also remove stale pr_err() mentions of bpf_jit_compile. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/arm64/net/bpf_jit_comp.c | 5 ----- arch/powerpc/net/bpf_jit_comp64.c | 2 -- arch/s390/net/bpf_jit_comp.c | 8 -------- arch/x86/net/bpf_jit_comp.c | 8 ++------ include/linux/filter.h | 6 +----- kernel/bpf/core.c | 12 +++++++++++- 6 files changed, 14 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index b2fc97a2c56c..c444408d5a8c 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -813,11 +813,6 @@ static inline void bpf_flush_icache(void *start, void *end) flush_icache_range((unsigned long)start, (unsigned long)end); } -void bpf_jit_compile(struct bpf_prog *prog) -{ - /* Nothing to do here. We support Internal BPF. 
*/ -} - struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { struct bpf_prog *tmp, *orig_prog = prog; diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 73a5cf18fd84..f9ebd02260da 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -961,8 +961,6 @@ common_load: return 0; } -void bpf_jit_compile(struct bpf_prog *fp) { } - struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) { u32 proglen; diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 167b31b186c1..6454efd22e63 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1262,14 +1262,6 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp) return 0; } -/* - * Classic BPF function stub. BPF programs will be converted into - * eBPF and then bpf_int_jit_compile() will be called. - */ -void bpf_jit_compile(struct bpf_prog *fp) -{ -} - /* * Compile eBPF program "fp" */ diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index bb660e53cbd6..26123d0ae13a 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1067,13 +1067,13 @@ common_load: ilen = prog - temp; if (ilen > BPF_MAX_INSN_SIZE) { - pr_err("bpf_jit_compile fatal insn size error\n"); + pr_err("bpf_jit: fatal insn size error\n"); return -EFAULT; } if (image) { if (unlikely(proglen + ilen > oldproglen)) { - pr_err("bpf_jit_compile fatal error\n"); + pr_err("bpf_jit: fatal error\n"); return -EFAULT; } memcpy(image + proglen, temp, ilen); @@ -1085,10 +1085,6 @@ common_load: return proglen; } -void bpf_jit_compile(struct bpf_prog *prog) -{ -} - struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { struct bpf_binary_header *header = NULL; diff --git a/include/linux/filter.h b/include/linux/filter.h index e4eb2546339a..c7a70e0cc3a0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -607,6 +607,7 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); +void bpf_jit_compile(struct bpf_prog *prog); bool bpf_helper_changes_pkt_data(void *func); struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, @@ -625,7 +626,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, bpf_jit_fill_hole_t bpf_fill_ill_insns); void bpf_jit_binary_free(struct bpf_binary_header *hdr); -void bpf_jit_compile(struct bpf_prog *fp); void bpf_jit_free(struct bpf_prog *fp); struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp); @@ -669,10 +669,6 @@ static inline bool bpf_jit_blinding_enabled(void) return true; } #else -static inline void bpf_jit_compile(struct bpf_prog *fp) -{ -} - static inline void bpf_jit_free(struct bpf_prog *fp) { bpf_prog_unlock_free(fp); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index fddd76b1b627..2831ba1e71c1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1154,12 +1154,22 @@ const struct bpf_func_proto bpf_tail_call_proto = { .arg3_type = ARG_ANYTHING, }; -/* For classic BPF JITs that don't implement bpf_int_jit_compile(). */ +/* Stub for JITs that only support cBPF. eBPF programs are interpreted. + * It is encouraged to implement bpf_int_jit_compile() instead, so that + * eBPF and implicitly also cBPF can get JITed! + */ struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog) { return prog; } +/* Stub for JITs that support eBPF. 
All cBPF code gets transformed into + * eBPF by the kernel and is later compiled by bpf_int_jit_compile(). + */ +void __weak bpf_jit_compile(struct bpf_prog *prog) +{ +} + bool __weak bpf_helper_changes_pkt_data(void *func) { return false; -- cgit v1.2.3 From 74451e66d516c55e309e8d89a4a1e7596e46aacd Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 16 Feb 2017 22:24:50 +0100 Subject: bpf: make jited programs visible in traces A long-standing issue with JITed programs is that stack traces from function tracing check whether a given address is kernel code through {__,}kernel_text_address(), which checks for code in the core kernel, modules and dynamically allocated ftrace trampolines. But what is still missing is BPF JITed programs (interpreted programs are not an issue as __bpf_prog_run() will be attributed to them), thus when a stack trace is triggered, the code walking the stack won't see any of the JITed ones. The same goes for address correlation done from user space via reading /proc/kallsyms. This is read by tools like perf, but the latter is also useful for permanent live tracing with eBPF itself in combination with stack maps when other eBPF types are part of the callchain. See the offwaketime example on dumping a stack from a map. This work tries to tackle that issue by making the addresses and symbols known to the kernel. The lookup from *kernel_text_address() is implemented through a latched RB tree that can be read under RCU in the fast path, and that is also shared for symbol/size/offset lookup of a given address in kallsyms. The slow-path iteration through all symbols in the seq file is done via an RCU list, which holds a tiny fraction of all exported ksyms, usually below 0.1 percent. Function symbols are exported as bpf_prog_<tag>, in order to aid debugging and attribution. This facility is currently enabled root-only when bpf_jit_kallsyms is set to 1, and disabled if hardening is active in any mode. The rationale behind this is that a lot of systems still ship with world read permissions on kallsyms, so addresses should not suddenly get exposed for them. If that situation gets much better in the future, we always have the option to change the default on this. Likewise, unprivileged programs are not allowed to add entries there either, but that is less of a concern as most such program types relevant in this context are for root only anyway. If enabled, call graphs and stack traces will then show a correct attribution; one example is illustrated below, where the trace is now visible in tooling such as perf script --kallsyms=/proc/kallsyms and friends.
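[ ed: A sketch of how the exported symbol names are formed, mirroring the bpf_get_prog_name() helper added further down in this patch: "bpf_prog_" followed by the hex-encoded 8-byte program tag:

	char sym[KSYM_NAME_LEN];
	char *p;

	p = sym + snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
	p = bin2hex(p, prog->tag, sizeof(prog->tag));	/* 8 bytes -> 16 hex chars */
	*p = '\0';	/* e.g. "bpf_prog_33c45a467c9e061a" */

Names of this shape appear in the "After" trace below. ]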
Before: 7fff8166889d bpf_clone_redirect+0x80007f0020ed (/lib/modules/4.9.0-rc8+/build/vmlinux) f5d80 __sendmsg_nocancel+0xffff006451f1a007 (/usr/lib64/libc-2.18.so) After: 7fff816688b7 bpf_clone_redirect+0x80007f002107 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fffa0575728 bpf_prog_33c45a467c9e061a+0x8000600020fb (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fffa07ef1fc cls_bpf_classify+0x8000600020dc (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff81678b68 tc_classify+0x80007f002078 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff8164d40b __netif_receive_skb_core+0x80007f0025fb (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff8164d718 __netif_receive_skb+0x80007f002018 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff8164e565 process_backlog+0x80007f002095 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff8164dc71 net_rx_action+0x80007f002231 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff81767461 __softirqentry_text_start+0x80007f0020d1 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff817658ac do_softirq_own_stack+0x80007f00201c (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff810a2c20 do_softirq+0x80007f002050 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff810a2cb5 __local_bh_enable_ip+0x80007f002085 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff8168d452 ip_finish_output2+0x80007f002152 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff8168ea3d ip_finish_output+0x80007f00217d (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff8168f2af ip_output+0x80007f00203f (/lib/modules/4.9.0-rc8+/build/vmlinux) [...] 7fff81005854 do_syscall_64+0x80007f002054 (/lib/modules/4.9.0-rc8+/build/vmlinux) 7fff817649eb return_from_SYSCALL_64+0x80007f002000 (/lib/modules/4.9.0-rc8+/build/vmlinux) f5d80 __sendmsg_nocancel+0xffff01c484812007 (/usr/lib64/libc-2.18.so) Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: linux-kernel@vger.kernel.org Signed-off-by: David S. Miller --- Documentation/sysctl/net.txt | 12 ++ arch/arm64/net/bpf_jit_comp.c | 15 --- arch/powerpc/net/bpf_jit_comp64.c | 1 + arch/s390/net/bpf_jit_comp.c | 18 --- arch/x86/net/bpf_jit_comp.c | 15 --- include/linux/bpf.h | 4 + include/linux/filter.h | 112 ++++++++++++++++++- kernel/bpf/core.c | 223 ++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 2 + kernel/extable.c | 9 +- kernel/kallsyms.c | 61 +++++++++-- net/Kconfig | 3 +- net/core/sysctl_net_core.c | 7 ++ 13 files changed, 419 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index b80fbd4e5575..2ebabc93014a 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -54,6 +54,18 @@ Values : 1 - enable JIT hardening for unprivileged users only 2 - enable JIT hardening for all users +bpf_jit_kallsyms +---------------- + +When Berkeley Packet Filter Just in Time compiler is enabled, then compiled +images are unknown addresses to the kernel, meaning they neither show up in +traces nor in /proc/kallsyms. This enables export of these addresses, which +can be used for debugging/tracing. If bpf_jit_harden is enabled, this feature +is disabled. 
+Values : + 0 - disable JIT kallsyms export (default value) + 1 - enable JIT kallsyms export for privileged users only + dev_weight -------------- diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index c444408d5a8c..05d12104d270 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -910,18 +910,3 @@ out: tmp : orig_prog); return prog; } - -void bpf_jit_free(struct bpf_prog *prog) -{ - unsigned long addr = (unsigned long)prog->bpf_func & PAGE_MASK; - struct bpf_binary_header *header = (void *)addr; - - if (!prog->jited) - goto free_filter; - - set_memory_rw(addr, header->pages); - bpf_jit_binary_free(header); - -free_filter: - bpf_prog_unlock_free(prog); -} diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index f9ebd02260da..c34166ef76fc 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -1064,6 +1064,7 @@ out: return fp; } +/* Overriding bpf_jit_free() as we don't set images read-only. */ void bpf_jit_free(struct bpf_prog *fp) { unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 6454efd22e63..f1d0e62ec1dd 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1339,21 +1339,3 @@ out: tmp : orig_fp); return fp; } - -/* - * Free eBPF program - */ -void bpf_jit_free(struct bpf_prog *fp) -{ - unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; - struct bpf_binary_header *header = (void *)addr; - - if (!fp->jited) - goto free_filter; - - set_memory_rw(addr, header->pages); - bpf_jit_binary_free(header); - -free_filter: - bpf_prog_unlock_free(fp); -} diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 26123d0ae13a..18a62e208826 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1180,18 +1180,3 @@ out: tmp : orig_prog); return prog; } - -void bpf_jit_free(struct bpf_prog *fp) -{ - unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; - struct bpf_binary_header *header = (void *)addr; - - if (!fp->jited) - goto free_filter; - - set_memory_rw(addr, header->pages); - bpf_jit_binary_free(header); - -free_filter: - bpf_prog_unlock_free(fp); -} diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 57d60dc5b600..909fc033173a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -8,10 +8,12 @@ #define _LINUX_BPF_H 1 #include + #include #include #include #include +#include struct perf_event; struct bpf_map; @@ -177,6 +179,8 @@ struct bpf_prog_aux { atomic_t refcnt; u32 used_map_cnt; u32 max_ctx_offset; + struct latch_tree_node ksym_tnode; + struct list_head ksym_lnode; const struct bpf_verifier_ops *ops; struct bpf_map **used_maps; struct bpf_prog *prog; diff --git a/include/linux/filter.h b/include/linux/filter.h index c7a70e0cc3a0..0c1cc9143cb2 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -54,6 +54,12 @@ struct bpf_prog_aux; #define BPF_REG_AX MAX_BPF_REG #define MAX_BPF_JIT_REG (MAX_BPF_REG + 1) +/* As per nm, we expose JITed images as text (code) section for + * kallsyms. That way, tools like perf can find it to match + * addresses. + */ +#define BPF_SYM_ELF_TYPE 't' + /* BPF program can access up to 512 bytes of stack space. 
*/ #define MAX_BPF_STACK 512 @@ -555,6 +561,11 @@ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) { set_memory_rw((unsigned long)fp, fp->pages); } + +static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr) +{ + set_memory_rw((unsigned long)hdr, hdr->pages); +} #else static inline void bpf_prog_lock_ro(struct bpf_prog *fp) { @@ -563,8 +574,21 @@ static inline void bpf_prog_lock_ro(struct bpf_prog *fp) static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) { } + +static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr) +{ +} #endif /* CONFIG_DEBUG_SET_MODULE_RONX */ +static inline struct bpf_binary_header * +bpf_jit_binary_hdr(const struct bpf_prog *fp) +{ + unsigned long real_start = (unsigned long)fp->bpf_func; + unsigned long addr = real_start & PAGE_MASK; + + return (void *)addr; +} + int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); static inline int sk_filter(struct sock *sk, struct sk_buff *skb) { @@ -617,6 +641,7 @@ void bpf_warn_invalid_xdp_action(u32 act); #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; extern int bpf_jit_harden; +extern int bpf_jit_kallsyms; typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); @@ -651,6 +676,11 @@ static inline bool bpf_jit_is_ebpf(void) # endif } +static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) +{ + return fp->jited && bpf_jit_is_ebpf(); +} + static inline bool bpf_jit_blinding_enabled(void) { /* These are the prerequisites, should someone ever have the @@ -668,11 +698,91 @@ static inline bool bpf_jit_blinding_enabled(void) return true; } -#else + +static inline bool bpf_jit_kallsyms_enabled(void) +{ + /* There are a couple of corner cases where kallsyms should + * not be enabled f.e. on hardening. 
+ */ + if (bpf_jit_harden) + return false; + if (!bpf_jit_kallsyms) + return false; + if (bpf_jit_kallsyms == 1) + return true; + + return false; +} + +const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char *sym); +bool is_bpf_text_address(unsigned long addr); +int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *sym); + +static inline const char * +bpf_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char **modname, char *sym) +{ + const char *ret = __bpf_address_lookup(addr, size, off, sym); + + if (ret && modname) + *modname = NULL; + return ret; +} + +void bpf_prog_kallsyms_add(struct bpf_prog *fp); +void bpf_prog_kallsyms_del(struct bpf_prog *fp); + +#else /* CONFIG_BPF_JIT */ + +static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) +{ + return false; +} + static inline void bpf_jit_free(struct bpf_prog *fp) { bpf_prog_unlock_free(fp); } + +static inline bool bpf_jit_kallsyms_enabled(void) +{ + return false; +} + +static inline const char * +__bpf_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char *sym) +{ + return NULL; +} + +static inline bool is_bpf_text_address(unsigned long addr) +{ + return false; +} + +static inline int bpf_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *sym) +{ + return -ERANGE; +} + +static inline const char * +bpf_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char **modname, char *sym) +{ + return NULL; +} + +static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp) +{ +} + +static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp) +{ +} #endif /* CONFIG_BPF_JIT */ #define BPF_ANC BIT(15) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2831ba1e71c1..f45827e205d3 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -28,6 +28,9 @@ #include #include #include +#include +#include +#include #include @@ -95,6 +98,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) fp->aux = aux; fp->aux->prog = fp; + INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode); + return fp; } EXPORT_SYMBOL_GPL(bpf_prog_alloc); @@ -290,6 +295,206 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, } #ifdef CONFIG_BPF_JIT +static __always_inline void +bpf_get_prog_addr_region(const struct bpf_prog *prog, + unsigned long *symbol_start, + unsigned long *symbol_end) +{ + const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog); + unsigned long addr = (unsigned long)hdr; + + WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog)); + + *symbol_start = addr; + *symbol_end = addr + hdr->pages * PAGE_SIZE; +} + +static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) +{ + BUILD_BUG_ON(sizeof("bpf_prog_") + + sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN); + + sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_"); + sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); + *sym = 0; +} + +static __always_inline unsigned long +bpf_get_prog_addr_start(struct latch_tree_node *n) +{ + unsigned long symbol_start, symbol_end; + const struct bpf_prog_aux *aux; + + aux = container_of(n, struct bpf_prog_aux, ksym_tnode); + bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); + + return symbol_start; +} + +static __always_inline bool bpf_tree_less(struct latch_tree_node *a, + struct latch_tree_node *b) +{ + return bpf_get_prog_addr_start(a) < bpf_get_prog_addr_start(b); +} + +static __always_inline int bpf_tree_comp(void *key, struct 
latch_tree_node *n) +{ + unsigned long val = (unsigned long)key; + unsigned long symbol_start, symbol_end; + const struct bpf_prog_aux *aux; + + aux = container_of(n, struct bpf_prog_aux, ksym_tnode); + bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); + + if (val < symbol_start) + return -1; + if (val >= symbol_end) + return 1; + + return 0; +} + +static const struct latch_tree_ops bpf_tree_ops = { + .less = bpf_tree_less, + .comp = bpf_tree_comp, +}; + +static DEFINE_SPINLOCK(bpf_lock); +static LIST_HEAD(bpf_kallsyms); +static struct latch_tree_root bpf_tree __cacheline_aligned; + +int bpf_jit_kallsyms __read_mostly; + +static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux) +{ + WARN_ON_ONCE(!list_empty(&aux->ksym_lnode)); + list_add_tail_rcu(&aux->ksym_lnode, &bpf_kallsyms); + latch_tree_insert(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); +} + +static void bpf_prog_ksym_node_del(struct bpf_prog_aux *aux) +{ + if (list_empty(&aux->ksym_lnode)) + return; + + latch_tree_erase(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); + list_del_rcu(&aux->ksym_lnode); +} + +static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) +{ + return fp->jited && !bpf_prog_was_classic(fp); +} + +static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) +{ + return list_empty(&fp->aux->ksym_lnode) || + fp->aux->ksym_lnode.prev == LIST_POISON2; +} + +void bpf_prog_kallsyms_add(struct bpf_prog *fp) +{ + unsigned long flags; + + if (!bpf_prog_kallsyms_candidate(fp) || + !capable(CAP_SYS_ADMIN)) + return; + + spin_lock_irqsave(&bpf_lock, flags); + bpf_prog_ksym_node_add(fp->aux); + spin_unlock_irqrestore(&bpf_lock, flags); +} + +void bpf_prog_kallsyms_del(struct bpf_prog *fp) +{ + unsigned long flags; + + if (!bpf_prog_kallsyms_candidate(fp)) + return; + + spin_lock_irqsave(&bpf_lock, flags); + bpf_prog_ksym_node_del(fp->aux); + spin_unlock_irqrestore(&bpf_lock, flags); +} + +static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr) +{ + struct latch_tree_node *n; + + if (!bpf_jit_kallsyms_enabled()) + return NULL; + + n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops); + return n ? 
+ container_of(n, struct bpf_prog_aux, ksym_tnode)->prog : + NULL; +} + +const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char *sym) +{ + unsigned long symbol_start, symbol_end; + struct bpf_prog *prog; + char *ret = NULL; + + rcu_read_lock(); + prog = bpf_prog_kallsyms_find(addr); + if (prog) { + bpf_get_prog_addr_region(prog, &symbol_start, &symbol_end); + bpf_get_prog_name(prog, sym); + + ret = sym; + if (size) + *size = symbol_end - symbol_start; + if (off) + *off = addr - symbol_start; + } + rcu_read_unlock(); + + return ret; +} + +bool is_bpf_text_address(unsigned long addr) +{ + bool ret; + + rcu_read_lock(); + ret = bpf_prog_kallsyms_find(addr) != NULL; + rcu_read_unlock(); + + return ret; +} + +int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *sym) +{ + unsigned long symbol_start, symbol_end; + struct bpf_prog_aux *aux; + unsigned int it = 0; + int ret = -ERANGE; + + if (!bpf_jit_kallsyms_enabled()) + return ret; + + rcu_read_lock(); + list_for_each_entry_rcu(aux, &bpf_kallsyms, ksym_lnode) { + if (it++ != symnum) + continue; + + bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); + bpf_get_prog_name(aux->prog, sym); + + *value = symbol_start; + *type = BPF_SYM_ELF_TYPE; + + ret = 0; + break; + } + rcu_read_unlock(); + + return ret; +} + struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, @@ -326,6 +531,24 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr) module_memfree(hdr); } +/* This symbol is only overridden by archs that have different + * requirements than the usual eBPF JITs, f.e. when they only + * implement cBPF JIT, do not set images read-only, etc. + */ +void __weak bpf_jit_free(struct bpf_prog *fp) +{ + if (fp->jited) { + struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); + + bpf_jit_binary_unlock_ro(hdr); + bpf_jit_binary_free(hdr); + + WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); + } + + bpf_prog_unlock_free(fp); +} + int bpf_jit_harden __read_mostly; static int bpf_jit_blind_insn(const struct bpf_insn *from, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f74ca17af64a..461eb1e66a0f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -707,6 +707,7 @@ void bpf_prog_put(struct bpf_prog *prog) { if (atomic_dec_and_test(&prog->aux->refcnt)) { trace_bpf_prog_put_rcu(prog); + bpf_prog_kallsyms_del(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } } @@ -903,6 +904,7 @@ static int bpf_prog_load(union bpf_attr *attr) /* failed to allocate fd */ goto free_used_maps; + bpf_prog_kallsyms_add(prog); trace_bpf_prog_load(prog, err); return err; diff --git a/kernel/extable.c b/kernel/extable.c index e3beec4a2339..bd82117ad424 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -104,6 +105,8 @@ int __kernel_text_address(unsigned long addr) return 1; if (is_ftrace_trampoline(addr)) return 1; + if (is_bpf_text_address(addr)) + return 1; /* * There might be init symbols in saved stacktraces. 
* Give those symbols a chance to be printed in @@ -123,7 +126,11 @@ int kernel_text_address(unsigned long addr) return 1; if (is_module_text_address(addr)) return 1; - return is_ftrace_trampoline(addr); + if (is_ftrace_trampoline(addr)) + return 1; + if (is_bpf_text_address(addr)) + return 1; + return 0; } /* diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index fafd1a3ef0da..6a3b249a2ae1 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -300,10 +301,11 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) { char namebuf[KSYM_NAME_LEN]; + if (is_ksym_addr(addr)) return !!get_symbol_pos(addr, symbolsize, offset); - - return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf); + return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) || + !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); } /* @@ -318,6 +320,8 @@ const char *kallsyms_lookup(unsigned long addr, unsigned long *offset, char **modname, char *namebuf) { + const char *ret; + namebuf[KSYM_NAME_LEN - 1] = 0; namebuf[0] = 0; @@ -333,9 +337,13 @@ const char *kallsyms_lookup(unsigned long addr, return namebuf; } - /* See if it's in a module. */ - return module_address_lookup(addr, symbolsize, offset, modname, - namebuf); + /* See if it's in a module or a BPF JITed image. */ + ret = module_address_lookup(addr, symbolsize, offset, + modname, namebuf); + if (!ret) + ret = bpf_address_lookup(addr, symbolsize, + offset, modname, namebuf); + return ret; } int lookup_symbol_name(unsigned long addr, char *symname) @@ -471,6 +479,7 @@ EXPORT_SYMBOL(__print_symbol); /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ struct kallsym_iter { loff_t pos; + loff_t pos_mod_end; unsigned long value; unsigned int nameoff; /* If iterating in core kernel symbols. */ char type; @@ -481,13 +490,27 @@ struct kallsym_iter { static int get_ksymbol_mod(struct kallsym_iter *iter) { - if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value, - &iter->type, iter->name, iter->module_name, - &iter->exported) < 0) + int ret = module_get_kallsym(iter->pos - kallsyms_num_syms, + &iter->value, &iter->type, + iter->name, iter->module_name, + &iter->exported); + if (ret < 0) { + iter->pos_mod_end = iter->pos; return 0; + } + return 1; } +static int get_ksymbol_bpf(struct kallsym_iter *iter) +{ + iter->module_name[0] = '\0'; + iter->exported = 0; + return bpf_get_kallsym(iter->pos - iter->pos_mod_end, + &iter->value, &iter->type, + iter->name) < 0 ? 0 : 1; +} + /* Returns space to next name. */ static unsigned long get_ksymbol_core(struct kallsym_iter *iter) { @@ -508,16 +531,30 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) iter->name[0] = '\0'; iter->nameoff = get_symbol_offset(new_pos); iter->pos = new_pos; + if (new_pos == 0) + iter->pos_mod_end = 0; +} + +static int update_iter_mod(struct kallsym_iter *iter, loff_t pos) +{ + iter->pos = pos; + + if (iter->pos_mod_end > 0 && + iter->pos_mod_end < iter->pos) + return get_ksymbol_bpf(iter); + + if (!get_ksymbol_mod(iter)) + return get_ksymbol_bpf(iter); + + return 1; } /* Returns false if pos at or past end of file. */ static int update_iter(struct kallsym_iter *iter, loff_t pos) { /* Module symbols can be accessed randomly. 
*/ - if (pos >= kallsyms_num_syms) { - iter->pos = pos; - return get_ksymbol_mod(iter); - } + if (pos >= kallsyms_num_syms) + return update_iter_mod(iter, pos); /* If we're not on the desired position, reset to new position. */ if (pos != iter->pos) diff --git a/net/Kconfig b/net/Kconfig index f19c0c3b9589..102f781a0131 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -297,7 +297,8 @@ config BPF_JIT Note, admin should enable this feature changing: /proc/sys/net/core/bpf_jit_enable - /proc/sys/net/core/bpf_jit_harden (optional) + /proc/sys/net/core/bpf_jit_harden (optional) + /proc/sys/net/core/bpf_jit_kallsyms (optional) config NET_FLOW_LIMIT bool diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index eaa72eb0399c..4ead336e14ea 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -334,6 +334,13 @@ static struct ctl_table net_core_table[] = { .mode = 0600, .proc_handler = proc_dointvec, }, + { + .procname = "bpf_jit_kallsyms", + .data = &bpf_jit_kallsyms, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec, + }, # endif #endif { -- cgit v1.2.3 From 8cff66791a6bfc8fb98f93e4e7c13fd06afecf7a Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 17 Feb 2017 14:18:44 +0100 Subject: PM / sleep: Fix test_suspend after sleep state rework When passing "test_suspend=mem" to the kernel, it reports: PM: can't test 'mem' suspend state and the suspend test is not run. Commit 406e79385f3223d8 ("PM / sleep: System sleep state selection interface rework") changed pm_labels[] from a contiguous NULL-terminated array to a sparse array (with the first element unpopulated), breaking the assumptions of the iterator in setup_test_suspend(). Iterate from PM_SUSPEND_MIN to PM_SUSPEND_MAX - 1 to fix this. Fixes: 406e79385f3223d8 (PM / sleep: System sleep state selection interface rework) Signed-off-by: Geert Uytterhoeven Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index bdff5ed57f10..5db217051232 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -166,7 +166,7 @@ static int __init setup_test_suspend(char *value) return 0; } - for (i = 0; pm_labels[i]; i++) + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) if (!strcmp(pm_labels[i], suspend_type)) { test_state_label = pm_labels[i]; return 0; -- cgit v1.2.3 From 336a9cde10d641e70bac67d90ae91b3190c3edca Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 15 Jan 2016 17:41:09 +0000 Subject: hrtimer: Catch invalid clockids again commit 82e88ff1ea94 ("hrtimer: Revert CLOCK_MONOTONIC_RAW support") unfortunately removed a sanity check in the hrtimer code which was part of that MONOTONIC_RAW patch series. It would have caught the bogus usage of CLOCK_MONOTONIC_RAW in the wireless code. So bring it back. It is way too easy to take any random clockid and feed it to the hrtimer subsystem. At best, it gets mapped to a monotonic base, but it would be better to just catch illegal values as early as possible. Detect invalid clockids, map them to CLOCK_MONOTONIC and emit a warning.
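[ ed: The fix below relies on GNU C's designated range initializer to pre-fill the whole lookup table with an out-of-range sentinel that the valid entries then override; a minimal sketch of the pattern with hypothetical names:

	#define NR_IDS		16
	#define BASE_INVALID	-1	/* sentinel: no valid mapping */

	static const int id_to_base[NR_IDS] = {
		[0 ... NR_IDS - 1] = BASE_INVALID,	/* catch everything first */
		[3] = 0,				/* then override known ids */
		[5] = 1,
	};

A lookup can then test for the sentinel, warn and fall back, instead of silently indexing a zero-filled slot that aliases a real clock base. ]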
[ tglx: Replaced the BUG by a WARN and gracefully map to CLOCK_MONOTONIC ] Signed-off-by: Marc Zyngier Cc: Tomasz Nowicki Cc: Christoffer Dall Link: http://lkml.kernel.org/r/1452879670-16133-3-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index edabde646e58..8e11d8d9f419 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -94,17 +94,15 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { + /* Make sure we catch unsupported clockids */ + [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, + [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, [CLOCK_TAI] = HRTIMER_BASE_TAI, }; -static inline int hrtimer_clockid_to_base(clockid_t clock_id) -{ - return hrtimer_clock_to_base_table[clock_id]; -} - /* * Functions and macros which are different for UP/SMP systems are kept in a * single place @@ -1081,6 +1079,18 @@ u64 hrtimer_get_next_event(void) } #endif +static inline int hrtimer_clockid_to_base(clockid_t clock_id) +{ + if (likely(clock_id < MAX_CLOCKS)) { + int base = hrtimer_clock_to_base_table[clock_id]; + + if (likely(base != HRTIMER_MAX_CLOCK_BASES)) + return base; + } + WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); + return HRTIMER_BASE_MONOTONIC; +} + static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { -- cgit v1.2.3 From fc98c3c8c9dcafd67adcce69e6ce3191d5306c9c Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Sat, 18 Feb 2017 03:42:54 -0800 Subject: printk: use rcuidle console tracepoint Use rcuidle console tracepoint because, apparently, it may be issued from an idle CPU: hw-breakpoint: Failed to enable monitor mode on CPU 0. hw-breakpoint: CPU 0 failed to disable vector catch =============================== [ ERR: suspicious RCU usage. ] 4.10.0-rc8-next-20170215+ #119 Not tainted ------------------------------- ./include/trace/events/printk.h:32 suspicious rcu_dereference_check() usage! other info that might help us debug this: RCU used illegally from idle CPU! rcu_scheduler_active = 2, debug_locks = 0 RCU used illegally from extended quiescent state! 2 locks held by swapper/0/0: #0: (cpu_pm_notifier_lock){......}, at: [] cpu_pm_exit+0x10/0x54 #1: (console_lock){+.+.+.}, at: [] vprintk_emit+0x264/0x474 stack backtrace: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.10.0-rc8-next-20170215+ #119 Hardware name: Generic OMAP4 (Flattened Device Tree) console_unlock vprintk_emit vprintk_default printk reset_ctrl_regs dbg_cpu_pm_notify notifier_call_chain cpu_pm_exit omap_enter_idle_coupled cpuidle_enter_state cpuidle_enter_state_coupled do_idle cpu_startup_entry start_kernel This RCU warning, however, is suppressed by lockdep_off() in printk(). lockdep_off() increments the ->lockdep_recursion counter and thus disables RCU_LOCKDEP_WARN() and debug_lockdep_rcu_enabled(), which want lockdep to be enabled "current->lockdep_recursion == 0". Link: http://lkml.kernel.org/r/20170217015932.11898-1-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Reported-by: Tony Lindgren Tested-by: Tony Lindgren Acked-by: Paul E. 
McKenney Acked-by: Steven Rostedt (VMware) Cc: Petr Mladek Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Lindgren Cc: Russell King Cc: [3.4+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 8b2696420abb..4ba3d34938c0 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1516,7 +1516,7 @@ static void call_console_drivers(int level, { struct console *con; - trace_console(text, len); + trace_console_rcuidle(text, len); if (!console_drivers) return; -- cgit v1.2.3 From 95cb64c1fe61e70685a95f6260c8e9cd219fe08c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 18 Feb 2017 15:26:45 +0100 Subject: fork: Fix task_struct alignment A stupid bug wrecked the alignment of task_struct, causing WARN()s in the x86 FPU code on some platforms. Reported-by: Fengguang Wu Tested-by: Fengguang Wu Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: e274795ea7b7 ("locking/mutex: Fix mutex handoff") Link: http://lkml.kernel.org/r/20170218142645.GH6500@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a90510d0bbf8..ea33f8adc032 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -434,7 +434,7 @@ void __init fork_init(void) #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN 0 #endif - int align = min_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN); + int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN); /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", -- cgit v1.2.3 From fd4a61e08aa79f2b7835b25c6f94f27bd2d65990 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 21 Feb 2017 09:29:01 -0800 Subject: sched/core: Fix the paravirt build on arm and arm64 Commit 004172bdad64 ("sched/core: Remove unnecessary #include headers") removed the inclusion of asm/paravirt.h, which is used to get declarations of paravirt_steal_rq_enabled and paravirt_steal_clock. It is implicitly included on x86 but not on arm and arm64, breaking the build if paravirtualization is used. Since things from that header are used directly, fix the build by putting the direct inclusion back. Signed-off-by: Mark Brown Signed-off-by: Linus Torvalds --- kernel/sched/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 34e2291a9a6c..e1ae6ac15eac 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -23,6 +23,9 @@ #include #include +#ifdef CONFIG_PARAVIRT +#include +#endif #include "sched.h" #include "../workqueue_internal.h" -- cgit v1.2.3 From a5544880aff90baf1bd4443ac7ff65182213ffcd Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 10 Feb 2017 14:06:22 -0800 Subject: module: fix memory leak on early load_module() failures While looking for possible early module loading failures, I was able to reproduce a memory leak with kmemleak. There are a few rare ways to trigger a failure: o we've run into a failure while processing kernel parameters (parse_args() returns an error) o mod_sysfs_setup() fails o we're a live patch module and copy_module_elf() fails Chances of running into this issue are really low.
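Before the kmemleak report below, a hedged sketch of the pairing the fix restores; demo_load_module() and do_parse_module_params() are invented stand-ins for the real load_module() flow, while destroy_params() and mod_sysfs_setup() are the actual kernel interfaces:

static int demo_load_module(struct module *mod, struct load_info *info)
{
	int err;

	/* charp parameters set at load time are kstrdup()ed ... */
	err = do_parse_module_params(mod);	/* stand-in for the parse_args() call */
	if (err)
		goto free_params;

	err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
	if (err)
		goto free_params;

	return 0;

free_params:
	/* ... so every failure path after parsing must free them. */
	destroy_params(mod->kp, mod->num_kp);
	return err;
}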
kmemleak splat: unreferenced object 0xffff9f2c4ada1b00 (size 32): comm "kworker/u16:4", pid 82, jiffies 4294897636 (age 681.816s) hex dump (first 32 bytes): 6d 65 6d 73 74 69 63 6b 30 00 00 00 00 00 00 00 memstick0....... 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x4a/0xa0 [] __kmalloc_track_caller+0x126/0x230 [] kstrdup+0x31/0x60 [] kstrdup_const+0x24/0x30 [] kvasprintf_const+0x7a/0x90 [] kobject_set_name_vargs+0x21/0x90 [] dev_set_name+0x47/0x50 [] memstick_check+0x95/0x33c [memstick] [] process_one_work+0x1f3/0x4b0 [] worker_thread+0x48/0x4e0 [] kthread+0xc9/0xe0 [] ret_from_fork+0x1f/0x40 [] 0xffffffffffffffff Cc: stable # v2.6.30 Fixes: e180a6b7759a ("param: fix charp parameters set via sysfs") Reviewed-by: Miroslav Benes Reviewed-by: Aaron Tomlin Reviewed-by: Rusty Russell Acked-by: Kees Cook Signed-off-by: Luis R. Rodriguez Signed-off-by: Jessica Yu --- kernel/module.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 32d0d32abbf6..14da88b5d0fb 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3724,6 +3724,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mod_sysfs_teardown(mod); coming_cleanup: mod->state = MODULE_STATE_GOING; + destroy_params(mod->kp, mod->num_kp); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); klp_module_going(mod); -- cgit v1.2.3 From d7276e321ff8a53106a59c85ca46d03e34288893 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 7 Feb 2017 15:18:51 -0800 Subject: seccomp: Only dump core when single-threaded The SECCOMP_RET_KILL filter return code has always killed the current thread, not the entire process. Changing this as a side-effect of dumping core isn't a safe thing to do (a few test suites have already flagged this behavioral change). Instead, restore the RET_KILL semantics, but still dump core when a RET_KILL delivers SIGSYS to a single-threaded process. Fixes: b25e67161c29 ("seccomp: dump core when using SECCOMP_RET_KILL") Signed-off-by: Kees Cook Acked-by: Andrei Vagin Signed-off-by: James Morris --- kernel/seccomp.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index f8f88ebcb3ba..e15185c28de5 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -643,11 +643,14 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, default: { siginfo_t info; audit_seccomp(this_syscall, SIGSYS, action); - /* Show the original registers in the dump. */ - syscall_rollback(current, task_pt_regs(current)); - /* Trigger a manual coredump since do_exit skips it. */ - seccomp_init_siginfo(&info, this_syscall, data); - do_coredump(&info); + /* Dump core only if this is the last remaining thread. */ + if (get_nr_threads(current) == 1) { + /* Show the original registers in the dump. */ + syscall_rollback(current, task_pt_regs(current)); + /* Trigger a manual coredump since do_exit skips it. */ + seccomp_init_siginfo(&info, this_syscall, data); + do_coredump(&info); + } do_exit(SIGSYS); } } -- cgit v1.2.3 From d3213e8fd4b0f18dfd438268ff480406ba743abb Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 22 Feb 2017 15:39:47 -0800 Subject: tracing: add __print_flags_u64() Patch series "DAX tracepoints, mm argument simplification", v4. This contains both my DAX tracepoint code and Dave Jiang's MM argument simplifications. 
Dave's code was written with my tracepoint code as a baseline, so it seemed simplest to keep them together in a single series. This patch (of 7): Add __print_flags_u64() and the helper trace_print_flags_seq_u64() in the same spirit as __print_symbolic_u64() and trace_print_symbols_seq_u64(). These functions allow us to print symbols associated with flags that are 64 bits wide even on 32 bit machines. These will be used by the DAX code so that we can print the flags set in a pfn_t such as PFN_SG_CHAIN, PFN_SG_LAST, PFN_DEV and PFN_MAP. Without this new function I was getting errors like the following when compiling for i386: include/linux/pfn_t.h:13:22: warning: large integer implicitly truncated to unsigned type [-Woverflow] #define PFN_SG_CHAIN (1ULL << (BITS_PER_LONG_LONG - 1)) ^ Link: http://lkml.kernel.org/r/1484085142-2297-2-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Steven Rostedt Cc: Dave Chinner Cc: Dave Jiang Cc: Jan Kara Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/trace_events.h | 4 ++++ include/trace/trace_events.h | 11 +++++++++++ kernel/trace/trace_output.c | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 0f165507495c..0af63c4381b9 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -23,6 +23,10 @@ const char *trace_print_symbols_seq(struct trace_seq *p, unsigned long val, const struct trace_print_flags *symbol_array); #if BITS_PER_LONG == 32 +const char *trace_print_flags_seq_u64(struct trace_seq *p, const char *delim, + unsigned long long flags, + const struct trace_print_flags_u64 *flag_array); + const char *trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, const struct trace_print_flags_u64 diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 5c06f4af8323..00f643164ca2 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -283,8 +283,16 @@ TRACE_MAKE_SYSTEM_STR(); trace_print_symbols_seq(p, value, symbols); \ }) +#undef __print_flags_u64 #undef __print_symbolic_u64 #if BITS_PER_LONG == 32 +#define __print_flags_u64(flag, delim, flag_array...) \ + ({ \ + static const struct trace_print_flags_u64 __flags[] = \ + { flag_array, { -1, NULL } }; \ + trace_print_flags_seq_u64(p, delim, flag, __flags); \ + }) + #define __print_symbolic_u64(value, symbol_array...) \ ({ \ static const struct trace_print_flags_u64 symbols[] = \ @@ -292,6 +300,9 @@ TRACE_MAKE_SYSTEM_STR(); trace_print_symbols_seq_u64(p, value, symbols); \ }) #else +#define __print_flags_u64(flag, delim, flag_array...) \ + __print_flags(flag, delim, flag_array) + #define __print_symbolic_u64(value, symbol_array...) 
\ __print_symbolic(value, symbol_array) #endif diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index aea6a1218c7d..070866c32eb9 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -123,6 +123,44 @@ trace_print_symbols_seq(struct trace_seq *p, unsigned long val, EXPORT_SYMBOL(trace_print_symbols_seq); #if BITS_PER_LONG == 32 +const char * +trace_print_flags_seq_u64(struct trace_seq *p, const char *delim, + unsigned long long flags, + const struct trace_print_flags_u64 *flag_array) +{ + unsigned long long mask; + const char *str; + const char *ret = trace_seq_buffer_ptr(p); + int i, first = 1; + + for (i = 0; flag_array[i].name && flags; i++) { + + mask = flag_array[i].mask; + if ((flags & mask) != mask) + continue; + + str = flag_array[i].name; + flags &= ~mask; + if (!first && delim) + trace_seq_puts(p, delim); + else + first = 0; + trace_seq_puts(p, str); + } + + /* check for left over flags */ + if (flags) { + if (!first && delim) + trace_seq_puts(p, delim); + trace_seq_printf(p, "0x%llx", flags); + } + + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(trace_print_flags_seq_u64); + const char * trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, const struct trace_print_flags_u64 *symbol_array) -- cgit v1.2.3 From 8dcde9def5a15df54fa4ca41f7750d80fa741d61 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Wed, 22 Feb 2017 15:40:56 -0800 Subject: kernel/watchdog.c: do not hardcode CPU 0 as the initial thread When CONFIG_BOOTPARAM_HOTPLUG_CPU0 is enabled, the socket containing the boot cpu can be replaced. During the hot add event, the message NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter. is output implying that the NMI watchdog was disabled at some point. This is not the case and the message has caused confusion for users of systems that support the removal of the boot cpu socket. The watchdog code is coded to assume that cpu 0 is always the first cpu to initialize the watchdog, and the last to stop its watchdog thread. That is not the case for initializing if cpu 0 has been removed and added. The removal case has never been correct because the smpboot code will remove the watchdog threads starting with the lowest cpu number. This patch adds watchdog_cpus to track the number of cpus with active NMI watchdog threads so that the first and last thread can be used to set and clear the value of firstcpu_err. firstcpu_err is set when the first watchdog thread is enabled, and cleared when the last watchdog thread is disabled. Link: http://lkml.kernel.org/r/1480425321-32296-1-git-send-email-prarit@redhat.com Signed-off-by: Prarit Bhargava Acked-by: Don Zickus Cc: Borislav Petkov Cc: Tejun Heo Cc: Hidehiro Kawai Cc: Thomas Gleixner Cc: Andi Kleen Cc: Joshua Hunt Cc: Ingo Molnar Cc: Babu Moger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog_hld.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 12b8dd640786..b5de262a9eb9 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -137,12 +137,14 @@ static void watchdog_overflow_callback(struct perf_event *event, * Reduce the watchdog noise by only printing messages * that are different from what cpu0 displayed. 
*/ -static unsigned long cpu0_err; +static unsigned long firstcpu_err; +static atomic_t watchdog_cpus; int watchdog_nmi_enable(unsigned int cpu) { struct perf_event_attr *wd_attr; struct perf_event *event = per_cpu(watchdog_ev, cpu); + int firstcpu = 0; /* nothing to do if the hard lockup detector is disabled */ if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) @@ -156,19 +158,22 @@ int watchdog_nmi_enable(unsigned int cpu) if (event != NULL) goto out_enable; + if (atomic_inc_return(&watchdog_cpus) == 1) + firstcpu = 1; + wd_attr = &wd_hw_attr; wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); - /* save cpu0 error for future comparision */ - if (cpu == 0 && IS_ERR(event)) - cpu0_err = PTR_ERR(event); + /* save the first cpu's error for future comparision */ + if (firstcpu && IS_ERR(event)) + firstcpu_err = PTR_ERR(event); if (!IS_ERR(event)) { - /* only print for cpu0 or different than cpu0 */ - if (cpu == 0 || cpu0_err) + /* only print for the first cpu initialized */ + if (firstcpu || firstcpu_err) pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); goto out_save; } @@ -186,7 +191,7 @@ int watchdog_nmi_enable(unsigned int cpu) smp_mb__after_atomic(); /* skip displaying the same error again */ - if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) + if (!firstcpu && (PTR_ERR(event) == firstcpu_err)) return PTR_ERR(event); /* vary the KERN level based on the returned errno */ @@ -222,9 +227,9 @@ void watchdog_nmi_disable(unsigned int cpu) /* should be in cleanup, but blocks oprofile */ perf_event_release_kernel(event); - } - if (cpu == 0) { + /* watchdog_nmi_enable() expects this to be zero initially. */ - cpu0_err = 0; + if (atomic_dec_and_test(&watchdog_cpus)) + firstcpu_err = 0; } } -- cgit v1.2.3 From 893e26e61d04eac974ded0c11e1647b335c8cb7b Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 22 Feb 2017 15:42:27 -0800 Subject: userfaultfd: non-cooperative: Add fork() event When the mm with uffd-ed vmas fork()-s the respective vmas notify their uffds with the event which contains a descriptor with new uffd. This new descriptor can then be used to get events from the child and populate its mm with data. Note, that there can be different uffd-s controlling different vmas within one mm, so first we should collect all those uffds (and ctx-s) in a list and then notify them all one by one but only once per fork(). The context is created at fork() time but the descriptor, file struct and anon inode object is created at event read time. So some trickery is added to the userfaultfd_ctx_read() to handle the ctx queues' locking vs file creation. Another thing worth noticing is that the task that fork()-s waits for the uffd event to get processed WITHOUT the mmap sem. [aarcange@redhat.com: build warning fix] Link: http://lkml.kernel.org/r/20161216144821.5183-10-aarcange@redhat.com Link: http://lkml.kernel.org/r/20161216144821.5183-9-aarcange@redhat.com Signed-off-by: Pavel Emelyanov Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 148 ++++++++++++++++++++++++++++++++++++++- include/linux/userfaultfd_k.h | 13 ++++ include/uapi/linux/userfaultfd.h | 15 ++-- kernel/fork.c | 10 ++- 4 files changed, 170 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 87d31921b66c..6046e0b552b2 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -64,6 +64,12 @@ struct userfaultfd_ctx { struct mm_struct *mm; }; +struct userfaultfd_fork_ctx { + struct userfaultfd_ctx *orig; + struct userfaultfd_ctx *new; + struct list_head list; +}; + struct userfaultfd_wait_queue { struct uffd_msg msg; wait_queue_t wq; @@ -465,9 +471,8 @@ out: return ret; } -static int __maybe_unused userfaultfd_event_wait_completion( - struct userfaultfd_ctx *ctx, - struct userfaultfd_wait_queue *ewq) +static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, + struct userfaultfd_wait_queue *ewq) { int ret = 0; @@ -518,6 +523,79 @@ static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx, __remove_wait_queue(&ctx->event_wqh, &ewq->wq); } +int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) +{ + struct userfaultfd_ctx *ctx = NULL, *octx; + struct userfaultfd_fork_ctx *fctx; + + octx = vma->vm_userfaultfd_ctx.ctx; + if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) { + vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING); + return 0; + } + + list_for_each_entry(fctx, fcs, list) + if (fctx->orig == octx) { + ctx = fctx->new; + break; + } + + if (!ctx) { + fctx = kmalloc(sizeof(*fctx), GFP_KERNEL); + if (!fctx) + return -ENOMEM; + + ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); + if (!ctx) { + kfree(fctx); + return -ENOMEM; + } + + atomic_set(&ctx->refcount, 1); + ctx->flags = octx->flags; + ctx->state = UFFD_STATE_RUNNING; + ctx->features = octx->features; + ctx->released = false; + ctx->mm = vma->vm_mm; + atomic_inc(&ctx->mm->mm_users); + + userfaultfd_ctx_get(octx); + fctx->orig = octx; + fctx->new = ctx; + list_add_tail(&fctx->list, fcs); + } + + vma->vm_userfaultfd_ctx.ctx = ctx; + return 0; +} + +static int dup_fctx(struct userfaultfd_fork_ctx *fctx) +{ + struct userfaultfd_ctx *ctx = fctx->orig; + struct userfaultfd_wait_queue ewq; + + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_FORK; + ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new; + + return userfaultfd_event_wait_completion(ctx, &ewq); +} + +void dup_userfaultfd_complete(struct list_head *fcs) +{ + int ret = 0; + struct userfaultfd_fork_ctx *fctx, *n; + + list_for_each_entry_safe(fctx, n, fcs, list) { + if (!ret) + ret = dup_fctx(fctx); + list_del(&fctx->list); + kfree(fctx); + } +} + static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; @@ -653,12 +731,49 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) } } +static const struct file_operations userfaultfd_fops; + +static int resolve_userfault_fork(struct userfaultfd_ctx *ctx, + struct userfaultfd_ctx *new, + struct uffd_msg *msg) +{ + int fd; + struct file *file; + unsigned int flags = new->flags & UFFD_SHARED_FCNTL_FLAGS; + + fd = get_unused_fd_flags(flags); + if (fd < 0) + return fd; + + file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, new, + O_RDWR | flags); + if (IS_ERR(file)) { + put_unused_fd(fd); + return 
PTR_ERR(file); + } + + fd_install(fd, file); + msg->arg.reserved.reserved1 = 0; + msg->arg.fork.ufd = fd; + + return 0; +} + static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, struct uffd_msg *msg) { ssize_t ret; DECLARE_WAITQUEUE(wait, current); struct userfaultfd_wait_queue *uwq; + /* + * Handling fork event requires sleeping operations, so + * we drop the event_wqh lock, then do these ops, then + * lock it back and wake up the waiter. While the lock is + * dropped the ewq may go away so we keep track of it + * carefully. + */ + LIST_HEAD(fork_event); + struct userfaultfd_ctx *fork_nctx = NULL; /* always take the fd_wqh lock before the fault_pending_wqh lock */ spin_lock(&ctx->fd_wqh.lock); @@ -716,6 +831,16 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, if (uwq) { *msg = uwq->msg; + if (uwq->msg.event == UFFD_EVENT_FORK) { + fork_nctx = (struct userfaultfd_ctx *) + (unsigned long) + uwq->msg.arg.reserved.reserved1; + list_move(&uwq->wq.task_list, &fork_event); + spin_unlock(&ctx->event_wqh.lock); + ret = 0; + break; + } + userfaultfd_event_complete(ctx, uwq); spin_unlock(&ctx->event_wqh.lock); ret = 0; @@ -739,6 +864,23 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, __set_current_state(TASK_RUNNING); spin_unlock(&ctx->fd_wqh.lock); + if (!ret && msg->event == UFFD_EVENT_FORK) { + ret = resolve_userfault_fork(ctx, fork_nctx, msg); + + if (!ret) { + spin_lock(&ctx->event_wqh.lock); + if (!list_empty(&fork_event)) { + uwq = list_first_entry(&fork_event, + typeof(*uwq), + wq.task_list); + list_del(&uwq->wq.task_list); + __add_wait_queue(&ctx->event_wqh, &uwq->wq); + userfaultfd_event_complete(ctx, uwq); + } + spin_unlock(&ctx->event_wqh.lock); + } + } + return ret; } diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 11b92b047a1e..79002bca1f43 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -52,6 +52,9 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP); } +extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); +extern void dup_userfaultfd_complete(struct list_head *); + #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -76,6 +79,16 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) return false; } +static inline int dup_userfaultfd(struct vm_area_struct *vma, + struct list_head *l) +{ + return 0; +} + +static inline void dup_userfaultfd_complete(struct list_head *l) +{ +} + #endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 94046b8aa6ad..c8953c84fdcc 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -18,12 +18,7 @@ * means the userland is reading). */ #define UFFD_API ((__u64)0xAA) -/* - * After implementing the respective features it will become: - * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \ - * UFFD_FEATURE_EVENT_FORK) - */ -#define UFFD_API_FEATURES (0) +#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -77,6 +72,10 @@ struct uffd_msg { __u64 address; } pagefault; + struct { + __u32 ufd; + } fork; + struct { /* unused reserved fields */ __u64 reserved1; @@ -90,9 +89,7 @@ struct uffd_msg { * Start at 0x12 and not at 0 to be more strict against bugs. 
*/ #define UFFD_EVENT_PAGEFAULT 0x12 -#if 0 /* not available yet */ #define UFFD_EVENT_FORK 0x13 -#endif /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -111,10 +108,8 @@ struct uffdio_api { * are to be considered implicitly always enabled in all kernels as * long as the uffdio_api.api requested matches UFFD_API. */ -#if 0 /* not available yet */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) -#endif __u64 features; __u64 ioctls; diff --git a/kernel/fork.c b/kernel/fork.c index ff82e24573b6..d12fcc4db8a3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include @@ -561,6 +562,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; + LIST_HEAD(uf); uprobe_start_dup_mmap(); if (down_write_killable(&oldmm->mmap_sem)) { @@ -617,12 +619,13 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (retval) goto fail_nomem_policy; tmp->vm_mm = mm; + retval = dup_userfaultfd(tmp, &uf); + if (retval) + goto fail_nomem_anon_vma_fork; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= - ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP); + tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); tmp->vm_next = tmp->vm_prev = NULL; - tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; file = tmp->vm_file; if (file) { struct inode *inode = file_inode(file); @@ -678,6 +681,7 @@ out: up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); + dup_userfaultfd_complete(&uf); fail_uprobe_end: uprobe_end_dup_mmap(); return retval; -- cgit v1.2.3 From bc1750f366902449f36f15f4a692a495fe6bcdfe Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 23 Feb 2017 00:20:53 +0000 Subject: bpf: fix spelling mistake: "proccessed" -> "processed" trivial fix to spelling mistake in verbose log message Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d2bded2b250c..3fc6e39b223e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2776,7 +2776,7 @@ static int do_check(struct bpf_verifier_env *env) class = BPF_CLASS(insn->code); if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { - verbose("BPF program is too large. Proccessed %d insn\n", + verbose("BPF program is too large. Processed %d insn\n", insn_processed); return -E2BIG; } -- cgit v1.2.3 From 7db60d05e5ccc0a473fa2275f90f2fca0002ab21 Mon Sep 17 00:00:00 2001 From: Vijay Kumar Date: Wed, 1 Feb 2017 11:34:39 -0800 Subject: sparc64: Send break twice from console to return to boot prom Now we can also jump to boot prom from sunhv console by sending break twice on console for both running and panicked kernel cases. Signed-off-by: Vijay Kumar Signed-off-by: David S. 
Miller --- drivers/tty/serial/sunhv.c | 6 +++++- kernel/panic.c | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/drivers/tty/serial/sunhv.c b/drivers/tty/serial/sunhv.c index c5ebdc8bed6a..46e46894e918 100644 --- a/drivers/tty/serial/sunhv.c +++ b/drivers/tty/serial/sunhv.c @@ -116,7 +116,7 @@ static int receive_chars_getchar(struct uart_port *port) static int receive_chars_read(struct uart_port *port) { - int saw_console_brk = 0; + static int saw_console_brk; int limit = 10000; while (limit-- > 0) { @@ -128,6 +128,9 @@ static int receive_chars_read(struct uart_port *port) bytes_read = 0; if (stat == CON_BREAK) { + if (saw_console_brk) + sun_do_break(); + if (uart_handle_break(port)) continue; saw_console_brk = 1; @@ -151,6 +154,7 @@ static int receive_chars_read(struct uart_port *port) if (port->sysrq != 0 && *con_read_page) { for (i = 0; i < bytes_read; i++) uart_handle_sysrq_char(port, con_read_page[i]); + saw_console_brk = 0; } if (port->state == NULL) diff --git a/kernel/panic.c b/kernel/panic.c index b95959733ce0..3ec16e603e88 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -273,7 +273,8 @@ void panic(const char *fmt, ...) extern int stop_a_enabled; /* Make sure the user can actually press Stop-A (L1-A) */ stop_a_enabled = 1; - pr_emerg("Press Stop-A (L1-A) to return to the boot prom\n"); + pr_emerg("Press Stop-A (L1-A) from sun keyboard or send break\n" + "twice on console to return to the boot prom\n"); } #endif #if defined(CONFIG_S390) -- cgit v1.2.3 From 279b5165ffadf57e2596e0ad438cb9b69b76f320 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Feb 2017 10:28:37 +0100 Subject: perf/core: Remove confusing comment and move put_ctx() Since commit: 321027c1fe77 ("perf/core: Fix concurrent sys_perf_event_open() vs. 'move_group' race") ... the code looks like (assuming move_group==1): gctx = __perf_event_ctx_lock_double(group_leader, ctx); perf_remove_from_context(group_leader, 0); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { perf_remove_from_context(sibling, 0); put_ctx(gctx); } /* ... */ /* misleading comment about how this is the last reference */ put_ctx(gctx); perf_event_ctx_unlock(group_leader, gctx); What that 'last' put_ctx() does is drop @group_leader's reference on gctx after having dropped all its potential sibling references. But the thing is that __perf_event_ctx_lock_double() returns with a reference _and_ a held lock, and perf_event_ctx_unlock() unlocks that lock and drops that reference. Therefore that put_ctx() cannot be the 'last' of anything, nor is there an unbalance in puts. To reduce confusion, remove the comment and place the put_ctx() next to the remove_from_context() call. Reported-by: Ben Hutchings Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 77a932b54a64..94d7b9aae925 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9955,6 +9955,7 @@ SYSCALL_DEFINE5(perf_event_open, * of swizzling perf_event::ctx. 
*/ perf_remove_from_context(group_leader, 0); + put_ctx(gctx); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { @@ -9993,13 +9994,6 @@ SYSCALL_DEFINE5(perf_event_open, perf_event__state_init(group_leader); perf_install_in_context(ctx, group_leader, group_leader->cpu); get_ctx(ctx); - - /* - * Now that all events are installed in @ctx, nothing - * references @gctx anymore, so drop the last reference we have - * on it. - */ - put_ctx(gctx); } /* -- cgit v1.2.3 From 7bbba0eb1af34694868d028b80475981f90e6bee Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 15 Feb 2017 16:12:20 +0100 Subject: perf/core: Fix perf_event_enable_on_exec() timekeeping (again) Commit: 7fce250915ef ("perf: Fix scaling vs. perf_event_enable_on_exec()") disabled the ctx-time a priori, so that all events get enabled and scheduled at the same point in time. There is one hole in that patch: when no events get enabled, nothing re-enables the ctx-time. Reported-by: Ravi Bangoria Reported-by: Anton Blanchard Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: 7fce250915ef ("perf: Fix scaling vs. perf_event_enable_on_exec()") Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 94d7b9aae925..d4e3f8d8238b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3522,6 +3522,8 @@ static void perf_event_enable_on_exec(int ctxn) if (enabled) { clone_ctx = unclone_ctx(ctx); ctx_resched(cpuctx, ctx, event_type); + } else { + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); } perf_ctx_unlock(cpuctx, ctx); -- cgit v1.2.3 From 1572e45a924f254d9570093abde46430c3172e3d Mon Sep 17 00:00:00 2001 From: Tan Xiaojun Date: Thu, 23 Feb 2017 14:04:39 +0800 Subject: perf/core: Fix the perf_cpu_time_max_percent check Use "proc_dointvec_minmax" instead of "proc_dointvec" to check the input value from user-space. Otherwise a very large value can be set, and some variables, such as "sysctl_perf_event_sample_rate", will overflow and cause a lot of unexpected problems. Signed-off-by: Tan Xiaojun Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1487829879-56237-1-git-send-email-tanxiaojun@huawei.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d4e3f8d8238b..c1c1cdf0b811 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -455,7 +455,7 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec(table, write, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; -- cgit v1.2.3 From 8cb68b343a66cf19834472012590490d34d31703 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Feb 2017 16:55:06 +0100 Subject: sched/core: Fix update_rq_clock() splat on hotplug (and suspend/resume) The hotplug code still triggers the warning about using a stale rq->clock value.
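(For reference, a condensed model of the machinery behind this warning, adapted from kernel/sched/sched.h of this era; RQCF_ACT_SKIP and RQCF_UPDATED are bit values in rq->clock_update_flags, snapshotted and restored by the rq_pin_lock() family:)

static inline void assert_clock_updated(struct rq *rq)
{
	/*
	 * Neither RQCF_ACT_SKIP nor RQCF_UPDATED is set: the caller is
	 * consuming rq->clock without a prior update_rq_clock() in
	 * this lock section.
	 */
	SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP);
}

static inline u64 rq_clock(struct rq *rq)
{
	lockdep_assert_held(&rq->lock);
	assert_clock_updated(rq);
	return rq->clock;
}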
Fix things up to actually run update_rq_clock() in a place where we record the 'UPDATED' flag, and then modify the annotation to retain this flag over the rq->lock fiddling that happens as a result of actually migrating all the tasks elsewhere. Reported-by: Linus Torvalds Tested-by: Mike Galbraith Tested-by: Sachin Sant Tested-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Matt Fleming Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Ross Zwisler Cc: Thomas Gleixner Fixes: 4d25b35ea372 ("sched/fair: Restore previous rq_flags when migrating tasks in hotplug") Link: http://lkml.kernel.org/r/20170202155506.GX6515@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 34e2291a9a6c..2d6e828a0471 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5557,7 +5557,7 @@ static void migrate_tasks(struct rq *dead_rq) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; - struct rq_flags rf, old_rf; + struct rq_flags rf; int dest_cpu; /* @@ -5576,7 +5576,9 @@ static void migrate_tasks(struct rq *dead_rq) * class method both need to have an up-to-date * value of rq->clock[_task] */ + rq_pin_lock(rq, &rf); update_rq_clock(rq); + rq_unpin_lock(rq, &rf); for (;;) { /* @@ -5589,7 +5591,7 @@ static void migrate_tasks(struct rq *dead_rq) /* * pick_next_task() assumes pinned rq->lock: */ - rq_pin_lock(rq, &rf); + rq_repin_lock(rq, &rf); next = pick_next_task(rq, &fake_task, &rf); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); @@ -5618,13 +5620,6 @@ static void migrate_tasks(struct rq *dead_rq) continue; } - /* - * __migrate_task() may return with a different - * rq->lock held and a new cookie in 'rf', but we need - * to preserve rf::clock_update_flags for 'dead_rq'. - */ - old_rf = rf; - /* Find suitable destination for @next, with force if needed. */ dest_cpu = select_fallback_rq(dead_rq->cpu, next); @@ -5633,7 +5628,6 @@ static void migrate_tasks(struct rq *dead_rq) raw_spin_unlock(&rq->lock); rq = dead_rq; raw_spin_lock(&rq->lock); - rf = old_rf; } raw_spin_unlock(&next->pi_lock); } -- cgit v1.2.3 From a499c3ead88ccf147fc50689e85a530ad923ce36 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 21 Feb 2017 23:52:55 -0800 Subject: sched/fair: Update rq clock before changing a task's CPU affinity This is triggered during boot when CONFIG_SCHED_DEBUG is enabled: ------------[ cut here ]------------ WARNING: CPU: 6 PID: 81 at kernel/sched/sched.h:812 set_next_entity+0x11d/0x380 rq->clock_update_flags < RQCF_ACT_SKIP CPU: 6 PID: 81 Comm: torture_shuffle Not tainted 4.10.0+ #1 Hardware name: LENOVO ThinkCentre M8500t-N000/SHARKBAY, BIOS FBKTC1AUS 02/16/2016 Call Trace: dump_stack+0x85/0xc2 __warn+0xcb/0xf0 warn_slowpath_fmt+0x5f/0x80 set_next_entity+0x11d/0x380 set_curr_task_fair+0x2b/0x60 do_set_cpus_allowed+0x139/0x180 __set_cpus_allowed_ptr+0x113/0x260 set_cpus_allowed_ptr+0x10/0x20 torture_shuffle+0xfd/0x180 kthread+0x10f/0x150 ? torture_shutdown_init+0x60/0x60 ? kthread_create_on_node+0x60/0x60 ret_from_fork+0x31/0x40 ---[ end trace dd94d92344cea9c6 ]--- The task is running && !queued, so there is no rq clock update before calling set_curr_task(). This patch fixes it by updating the rq clock after taking rq->lock/pi_lock, just as the other dequeue + put_prev + enqueue + set_curr paths do.
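A minimal sketch of the ordering rule this fix (and the hotplug fix above) enforces; task_rq_lock()/task_rq_unlock() are the real helpers, and the comment in the middle stands in for whatever scheduling state is being changed:

struct rq_flags rf;
struct rq *rq;

rq = task_rq_lock(p, &rf);	/* takes p->pi_lock, then rq->lock */
update_rq_clock(rq);		/* refresh the clock before touching state */

/* ... dequeue_task() / put_prev_task() / enqueue_task() / set_curr_task() ... */

task_rq_unlock(rq, p, &rf);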
Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Matt Fleming Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1487749975-5994-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d6e828a0471..cc1e3e0d29b0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1087,6 +1087,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, int ret = 0; rq = task_rq_lock(p, &rf); + update_rq_clock(rq); if (p->flags & PF_KTHREAD) { /* -- cgit v1.2.3 From 96b777452d8881480fd5be50112f791c17db4b6b Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 8 Feb 2017 14:27:27 +0300 Subject: sched/cgroup: Move sched_online_group() back into css_online() to fix crash Commit: 2f5177f0fd7e ("sched/cgroup: Fix/cleanup cgroup teardown/init") .. moved sched_online_group() from css_online() to css_alloc(). It exposes half-baked task group into global lists before initializing generic cgroup stuff. LTP testcase (third in cgroup_regression_test) written for testing similar race in kernels 2.6.26-2.6.28 easily triggers this oops: BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 IP: kernfs_path_from_node_locked+0x260/0x320 CPU: 1 PID: 30346 Comm: cat Not tainted 4.10.0-rc5-test #4 Call Trace: ? kernfs_path_from_node+0x4f/0x60 kernfs_path_from_node+0x3e/0x60 print_rt_rq+0x44/0x2b0 print_rt_stats+0x7a/0xd0 print_cpu+0x2fc/0xe80 ? __might_sleep+0x4a/0x80 sched_debug_show+0x17/0x30 seq_read+0xf2/0x3b0 proc_reg_read+0x42/0x70 __vfs_read+0x28/0x130 ? security_file_permission+0x9b/0xc0 ? rw_verify_area+0x4e/0xb0 vfs_read+0xa5/0x170 SyS_read+0x46/0xa0 entry_SYSCALL_64_fastpath+0x1e/0xad Here the task group is already linked into the global RCU-protected 'task_groups' list, but the css->cgroup pointer is still NULL. This patch reverts this chunk and moves online back to css_online(). 
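To illustrate the contract, a hypothetical controller would split its setup as below (the demo_* names are invented; css_alloc and css_online are the real cgroup_subsys hooks):

struct demo_group {
	struct cgroup_subsys_state css;
};

static struct cgroup_subsys_state *
demo_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct demo_group *dg = kzalloc(sizeof(*dg), GFP_KERNEL);

	/* Allocate only: css->cgroup is not wired up yet, so the new
	 * group must not appear on any globally visible list here.
	 */
	return dg ? &dg->css : ERR_PTR(-ENOMEM);
}

static int demo_css_online(struct cgroup_subsys_state *css)
{
	/* Generic cgroup initialization is complete; it is now safe
	 * to publish the group to RCU-protected global lists.
	 */
	demo_publish_group(css);	/* invented helper */
	return 0;
}

struct cgroup_subsys demo_cgrp_subsys = {
	.css_alloc	= demo_css_alloc,
	.css_online	= demo_css_online,
};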
Signed-off-by: Konstantin Khlebnikov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Tejun Heo Cc: Thomas Gleixner Fixes: 2f5177f0fd7e ("sched/cgroup: Fix/cleanup cgroup teardown/init") Link: http://lkml.kernel.org/r/148655324740.424917.5302984537258726349.stgit@buzz Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cc1e3e0d29b0..e01bd807f0af 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6811,11 +6811,20 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); - sched_online_group(tg, parent); - return &tg->css; } +/* Expose task group only after completing cgroup initialization */ +static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + struct task_group *parent = css_tg(css->parent); + + if (parent) + sched_online_group(tg, parent); + return 0; +} + static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); @@ -7221,6 +7230,7 @@ static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, + .css_online = cpu_cgroup_css_online, .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .fork = cpu_cgroup_fork, -- cgit v1.2.3 From b5d24fda9c3dce51fcb4eee459550a458eaaf1e2 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 24 Feb 2017 14:55:45 -0800 Subject: mm, devm_memremap_pages: hold device_hotplug lock over mem_hotplug_{begin, done} The mem_hotplug_{begin,done} lock coordinates with {get,put}_online_mems() to hold off "readers" of the current state of memory from new hotplug actions. mem_hotplug_begin() expects exclusive access, via the device_hotplug lock, to set mem_hotplug.active_writer. Calling mem_hotplug_begin() without locking device_hotplug can lead to corrupting mem_hotplug.refcount and missed wakeups / soft lockups. 
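The required nesting is therefore the following (a minimal sketch of the rule the diff below applies to both the setup and release paths; all four calls are the real primitives):

lock_device_hotplug();	/* exclusive access for mem_hotplug.active_writer */
mem_hotplug_begin();

/* ... arch_add_memory() or arch_remove_memory() ... */

mem_hotplug_done();
unlock_device_hotplug();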
[dan.j.williams@intel.com: v2] Link: http://lkml.kernel.org/r/148728203365.38457.17804568297887708345.stgit@dwillia2-desk3.amr.corp.intel.com Link: http://lkml.kernel.org/r/148693885680.16345.17802627926777862337.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: f931ab479dd2 ("mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}") Signed-off-by: Dan Williams Reported-by: Ben Hutchings Cc: Michal Hocko Cc: Toshi Kani Cc: Vlastimil Babka Cc: Logan Gunthorpe Cc: Masayoshi Mizuma Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 9ecedc28b928..06123234f118 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -246,9 +246,13 @@ static void devm_memremap_pages_release(struct device *dev, void *data) /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(resource_size(res), SECTION_SIZE); + + lock_device_hotplug(); mem_hotplug_begin(); arch_remove_memory(align_start, align_size); mem_hotplug_done(); + unlock_device_hotplug(); + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); pgmap_radix_release(res); dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, @@ -360,9 +364,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (error) goto err_pfn_remap; + lock_device_hotplug(); mem_hotplug_begin(); error = arch_add_memory(nid, align_start, align_size, true); mem_hotplug_done(); + unlock_device_hotplug(); if (error) goto err_add_memory; -- cgit v1.2.3 From 11bac80004499ea59f361ef2a5516c84b6eab675 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 24 Feb 2017 14:56:41 -0800 Subject: mm, fs: reduce fault, page_mkwrite, and pfn_mkwrite to take only vmf ->fault(), ->page_mkwrite(), and ->pfn_mkwrite() calls do not need to take a vma and vmf parameter when the vma already resides in vmf. Remove the vma parameter to simplify things. [arnd@arndb.de: fix ARM build] Link: http://lkml.kernel.org/r/20170125223558.1451224-1-arnd@arndb.de Link: http://lkml.kernel.org/r/148521301778.19116.10840599906674778980.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Dave Jiang Signed-off-by: Arnd Bergmann Reviewed-by: Ross Zwisler Cc: Theodore Ts'o Cc: Darrick J. 
Wong Cc: Matthew Wilcox Cc: Dave Hansen Cc: Christoph Hellwig Cc: Jan Kara Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kvm/book3s_64_vio.c | 4 +-- arch/powerpc/platforms/cell/spufs/file.c | 39 ++++++++++++------------ drivers/android/binder.c | 2 +- drivers/char/agp/alpha-agp.c | 5 ++- drivers/char/mspec.c | 6 ++-- drivers/dax/dax.c | 12 ++++---- drivers/gpu/drm/armada/armada_gem.c | 9 +++--- drivers/gpu/drm/drm_vm.c | 32 ++++++++++--------- drivers/gpu/drm/etnaviv/etnaviv_drv.h | 2 +- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 3 +- drivers/gpu/drm/exynos/exynos_drm_gem.c | 3 +- drivers/gpu/drm/exynos/exynos_drm_gem.h | 2 +- drivers/gpu/drm/gma500/framebuffer.c | 3 +- drivers/gpu/drm/gma500/gem.c | 3 +- drivers/gpu/drm/gma500/psb_drv.h | 2 +- drivers/gpu/drm/i915/i915_drv.h | 2 +- drivers/gpu/drm/i915/i915_gem.c | 4 +-- drivers/gpu/drm/msm/msm_drv.h | 2 +- drivers/gpu/drm/msm/msm_gem.c | 3 +- drivers/gpu/drm/omapdrm/omap_drv.h | 2 +- drivers/gpu/drm/omapdrm/omap_gem.c | 4 +-- drivers/gpu/drm/qxl/qxl_ttm.c | 6 ++-- drivers/gpu/drm/radeon/radeon_ttm.c | 6 ++-- drivers/gpu/drm/tegra/gem.c | 3 +- drivers/gpu/drm/ttm/ttm_bo_vm.c | 10 +++--- drivers/gpu/drm/udl/udl_drv.h | 2 +- drivers/gpu/drm/udl/udl_gem.c | 3 +- drivers/gpu/drm/vgem/vgem_drv.c | 3 +- drivers/gpu/drm/virtio/virtgpu_ttm.c | 7 ++--- drivers/hsi/clients/cmt_speech.c | 4 +-- drivers/hwtracing/intel_th/msu.c | 6 ++-- drivers/infiniband/hw/hfi1/file_ops.c | 4 +-- drivers/infiniband/hw/qib/qib_file_ops.c | 2 +- drivers/media/v4l2-core/videobuf-dma-sg.c | 3 +- drivers/misc/cxl/context.c | 3 +- drivers/misc/sgi-gru/grumain.c | 3 +- drivers/misc/sgi-gru/grutables.h | 2 +- drivers/scsi/cxlflash/superpipe.c | 6 ++-- drivers/scsi/sg.c | 3 +- drivers/staging/android/ion/ion.c | 6 ++-- drivers/staging/lustre/lustre/llite/llite_mmap.c | 7 +++-- drivers/staging/lustre/lustre/llite/vvp_io.c | 2 +- drivers/target/target_core_user.c | 6 ++-- drivers/uio/uio.c | 6 ++-- drivers/usb/mon/mon_bin.c | 4 +-- drivers/video/fbdev/core/fb_defio.c | 16 +++++----- drivers/xen/privcmd.c | 4 +-- fs/9p/vfs_file.c | 4 +-- fs/btrfs/ctree.h | 2 +- fs/btrfs/inode.c | 6 ++-- fs/ceph/addr.c | 8 +++-- fs/cifs/file.c | 2 +- fs/dax.c | 15 ++++----- fs/ext2/file.c | 17 +++++------ fs/ext4/ext4.h | 4 +-- fs/ext4/file.c | 17 +++++------ fs/ext4/inode.c | 9 +++--- fs/f2fs/file.c | 7 ++--- fs/fuse/file.c | 6 ++-- fs/gfs2/file.c | 8 ++--- fs/iomap.c | 5 ++- fs/kernfs/file.c | 13 ++++---- fs/ncpfs/mmap.c | 7 ++--- fs/nfs/file.c | 4 +-- fs/nilfs2/file.c | 3 +- fs/ocfs2/mmap.c | 15 ++++----- fs/proc/vmcore.c | 4 +-- fs/ubifs/file.c | 5 ++- fs/xfs/xfs_file.c | 25 +++++++-------- include/linux/dax.h | 5 ++- include/linux/iomap.h | 3 +- include/linux/mm.h | 10 +++--- ipc/shm.c | 6 ++-- kernel/events/core.c | 6 ++-- kernel/relay.c | 4 +-- mm/filemap.c | 19 ++++++------ mm/hugetlb.c | 2 +- mm/memory.c | 6 ++-- mm/mmap.c | 9 +++--- mm/nommu.c | 2 +- mm/shmem.c | 3 +- security/selinux/selinuxfs.c | 5 ++- sound/core/pcm_native.c | 15 ++++----- sound/usb/usx2y/us122l.c | 5 ++- sound/usb/usx2y/usX2Yhwdep.c | 7 ++--- sound/usb/usx2y/usx2yhwdeppcm.c | 5 ++- virt/kvm/kvm_main.c | 4 +-- 87 files changed, 283 insertions(+), 290 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 491c5d8120f7..ab9d14c0e460 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -102,9 +102,9 @@ static void release_spapr_tce_table(struct rcu_head *head) kfree(stt); } 
-static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int kvm_spapr_tce_fault(struct vm_fault *vmf) { - struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data; + struct kvmppc_spapr_tce_table *stt = vmf->vma->vm_file->private_data; struct page *page; if (vmf->pgoff >= kvmppc_tce_pages(stt->size)) diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index a35e2c29d7ee..e5ec1368f0cd 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -233,8 +233,9 @@ spufs_mem_write(struct file *file, const char __user *buffer, } static int -spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +spufs_mem_mmap_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct spu_context *ctx = vma->vm_file->private_data; unsigned long pfn, offset; @@ -311,12 +312,11 @@ static const struct file_operations spufs_mem_fops = { .mmap = spufs_mem_mmap, }; -static int spufs_ps_fault(struct vm_area_struct *vma, - struct vm_fault *vmf, +static int spufs_ps_fault(struct vm_fault *vmf, unsigned long ps_offs, unsigned long ps_size) { - struct spu_context *ctx = vma->vm_file->private_data; + struct spu_context *ctx = vmf->vma->vm_file->private_data; unsigned long area, offset = vmf->pgoff << PAGE_SHIFT; int ret = 0; @@ -354,7 +354,7 @@ static int spufs_ps_fault(struct vm_area_struct *vma, down_read(¤t->mm->mmap_sem); } else { area = ctx->spu->problem_phys + ps_offs; - vm_insert_pfn(vma, vmf->address, (area + offset) >> PAGE_SHIFT); + vm_insert_pfn(vmf->vma, vmf->address, (area + offset) >> PAGE_SHIFT); spu_context_trace(spufs_ps_fault__insert, ctx, ctx->spu); } @@ -367,10 +367,9 @@ refault: } #if SPUFS_MMAP_4K -static int spufs_cntl_mmap_fault(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int spufs_cntl_mmap_fault(struct vm_fault *vmf) { - return spufs_ps_fault(vma, vmf, 0x4000, SPUFS_CNTL_MAP_SIZE); + return spufs_ps_fault(vmf, 0x4000, SPUFS_CNTL_MAP_SIZE); } static const struct vm_operations_struct spufs_cntl_mmap_vmops = { @@ -1067,15 +1066,15 @@ static ssize_t spufs_signal1_write(struct file *file, const char __user *buf, } static int -spufs_signal1_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +spufs_signal1_mmap_fault(struct vm_fault *vmf) { #if SPUFS_SIGNAL_MAP_SIZE == 0x1000 - return spufs_ps_fault(vma, vmf, 0x14000, SPUFS_SIGNAL_MAP_SIZE); + return spufs_ps_fault(vmf, 0x14000, SPUFS_SIGNAL_MAP_SIZE); #elif SPUFS_SIGNAL_MAP_SIZE == 0x10000 /* For 64k pages, both signal1 and signal2 can be used to mmap the whole * signal 1 and 2 area */ - return spufs_ps_fault(vma, vmf, 0x10000, SPUFS_SIGNAL_MAP_SIZE); + return spufs_ps_fault(vmf, 0x10000, SPUFS_SIGNAL_MAP_SIZE); #else #error unsupported page size #endif @@ -1205,15 +1204,15 @@ static ssize_t spufs_signal2_write(struct file *file, const char __user *buf, #if SPUFS_MMAP_4K static int -spufs_signal2_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +spufs_signal2_mmap_fault(struct vm_fault *vmf) { #if SPUFS_SIGNAL_MAP_SIZE == 0x1000 - return spufs_ps_fault(vma, vmf, 0x1c000, SPUFS_SIGNAL_MAP_SIZE); + return spufs_ps_fault(vmf, 0x1c000, SPUFS_SIGNAL_MAP_SIZE); #elif SPUFS_SIGNAL_MAP_SIZE == 0x10000 /* For 64k pages, both signal1 and signal2 can be used to mmap the whole * signal 1 and 2 area */ - return spufs_ps_fault(vma, vmf, 0x10000, SPUFS_SIGNAL_MAP_SIZE); + return spufs_ps_fault(vmf, 0x10000, SPUFS_SIGNAL_MAP_SIZE); #else #error unsupported page size 
#endif @@ -1334,9 +1333,9 @@ DEFINE_SPUFS_ATTRIBUTE(spufs_signal2_type, spufs_signal2_type_get, #if SPUFS_MMAP_4K static int -spufs_mss_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +spufs_mss_mmap_fault(struct vm_fault *vmf) { - return spufs_ps_fault(vma, vmf, 0x0000, SPUFS_MSS_MAP_SIZE); + return spufs_ps_fault(vmf, 0x0000, SPUFS_MSS_MAP_SIZE); } static const struct vm_operations_struct spufs_mss_mmap_vmops = { @@ -1396,9 +1395,9 @@ static const struct file_operations spufs_mss_fops = { }; static int -spufs_psmap_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +spufs_psmap_mmap_fault(struct vm_fault *vmf) { - return spufs_ps_fault(vma, vmf, 0x0000, SPUFS_PS_MAP_SIZE); + return spufs_ps_fault(vmf, 0x0000, SPUFS_PS_MAP_SIZE); } static const struct vm_operations_struct spufs_psmap_mmap_vmops = { @@ -1456,9 +1455,9 @@ static const struct file_operations spufs_psmap_fops = { #if SPUFS_MMAP_4K static int -spufs_mfc_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +spufs_mfc_mmap_fault(struct vm_fault *vmf) { - return spufs_ps_fault(vma, vmf, 0x3000, SPUFS_MFC_MAP_SIZE); + return spufs_ps_fault(vmf, 0x3000, SPUFS_MFC_MAP_SIZE); } static const struct vm_operations_struct spufs_mfc_mmap_vmops = { diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 15b263a420e8..2bbcdc6fdfee 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -3342,7 +3342,7 @@ static void binder_vma_close(struct vm_area_struct *vma) binder_defer_work(proc, BINDER_DEFERRED_PUT_FILES); } -static int binder_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int binder_vm_fault(struct vm_fault *vmf) { return VM_FAULT_SIGBUS; } diff --git a/drivers/char/agp/alpha-agp.c b/drivers/char/agp/alpha-agp.c index 737187865269..53fe633df1e8 100644 --- a/drivers/char/agp/alpha-agp.c +++ b/drivers/char/agp/alpha-agp.c @@ -11,15 +11,14 @@ #include "agp.h" -static int alpha_core_agp_vm_fault(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int alpha_core_agp_vm_fault(struct vm_fault *vmf) { alpha_agp_info *agp = agp_bridge->dev_private_data; dma_addr_t dma_addr; unsigned long pa; struct page *page; - dma_addr = vmf->address - vma->vm_start + agp->aperture.bus_base; + dma_addr = vmf->address - vmf->vma->vm_start + agp->aperture.bus_base; pa = agp->ops->translate(agp, dma_addr); if (pa == (unsigned long)-EINVAL) diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c index a697ca0cab1e..a9c2fa3c81e5 100644 --- a/drivers/char/mspec.c +++ b/drivers/char/mspec.c @@ -191,12 +191,12 @@ mspec_close(struct vm_area_struct *vma) * Creates a mspec page and maps it to user space. */ static int -mspec_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +mspec_fault(struct vm_fault *vmf) { unsigned long paddr, maddr; unsigned long pfn; pgoff_t index = vmf->pgoff; - struct vma_data *vdata = vma->vm_private_data; + struct vma_data *vdata = vmf->vma->vm_private_data; maddr = (volatile unsigned long) vdata->maddr[index]; if (maddr == 0) { @@ -227,7 +227,7 @@ mspec_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * be because another thread has installed the pte first, so it * is no problem. 
*/ - vm_insert_pfn(vma, vmf->address, pfn); + vm_insert_pfn(vmf->vma, vmf->address, pfn); return VM_FAULT_NOPAGE; } diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 18e9875f6277..0261f332bf3e 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -419,8 +419,7 @@ static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff, return -1; } -static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, - struct vm_fault *vmf) +static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) { struct device *dev = &dax_dev->dev; struct dax_region *dax_region; @@ -428,7 +427,7 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, phys_addr_t phys; pfn_t pfn; - if (check_vma(dax_dev, vma, __func__)) + if (check_vma(dax_dev, vmf->vma, __func__)) return VM_FAULT_SIGBUS; dax_region = dax_dev->region; @@ -446,7 +445,7 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); - rc = vm_insert_mixed(vma, vmf->address, pfn); + rc = vm_insert_mixed(vmf->vma, vmf->address, pfn); if (rc == -ENOMEM) return VM_FAULT_OOM; @@ -456,8 +455,9 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } -static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int dax_dev_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; int rc; struct file *filp = vma->vm_file; struct dax_dev *dax_dev = filp->private_data; @@ -466,7 +466,7 @@ static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf) current->comm, (vmf->flags & FAULT_FLAG_WRITE) ? "write" : "read", vma->vm_start, vma->vm_end); rcu_read_lock(); - rc = __dax_dev_fault(dax_dev, vma, vmf); + rc = __dax_dev_fault(dax_dev, vmf); rcu_read_unlock(); return rc; diff --git a/drivers/gpu/drm/armada/armada_gem.c b/drivers/gpu/drm/armada/armada_gem.c index 560d416deab2..1597458d884e 100644 --- a/drivers/gpu/drm/armada/armada_gem.c +++ b/drivers/gpu/drm/armada/armada_gem.c @@ -14,14 +14,15 @@ #include <drm/armada_drm.h> #include "armada_ioctlP.h" -static int armada_gem_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int armada_gem_vm_fault(struct vm_fault *vmf) { - struct armada_gem_object *obj = drm_to_armada_gem(vma->vm_private_data); + struct drm_gem_object *gobj = vmf->vma->vm_private_data; + struct armada_gem_object *obj = drm_to_armada_gem(gobj); unsigned long pfn = obj->phys_addr >> PAGE_SHIFT; int ret; - pfn += (vmf->address - vma->vm_start) >> PAGE_SHIFT; - ret = vm_insert_pfn(vma, vmf->address, pfn); + pfn += (vmf->address - vmf->vma->vm_start) >> PAGE_SHIFT; + ret = vm_insert_pfn(vmf->vma, vmf->address, pfn); switch (ret) { case 0: diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c index bd311c77c254..bae6e26038ee 100644 --- a/drivers/gpu/drm/drm_vm.c +++ b/drivers/gpu/drm/drm_vm.c @@ -96,8 +96,9 @@ static pgprot_t drm_dma_prot(uint32_t map_type, struct vm_area_struct *vma) * map, get the page, increment the use count and return it.
*/ #if IS_ENABLED(CONFIG_AGP) -static int drm_do_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_do_vm_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_file *priv = vma->vm_file->private_data; struct drm_device *dev = priv->minor->dev; struct drm_local_map *map = NULL; @@ -168,7 +169,7 @@ vm_fault_error: return VM_FAULT_SIGBUS; /* Disallow mremap */ } #else -static int drm_do_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_do_vm_fault(struct vm_fault *vmf) { return VM_FAULT_SIGBUS; } @@ -184,8 +185,9 @@ static int drm_do_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * Get the mapping, find the real physical page to map, get the page, and * return it. */ -static int drm_do_vm_shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_do_vm_shm_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_local_map *map = vma->vm_private_data; unsigned long offset; unsigned long i; @@ -280,14 +282,14 @@ static void drm_vm_shm_close(struct vm_area_struct *vma) /** * \c fault method for DMA virtual memory. * - * \param vma virtual memory area. * \param address access address. * \return pointer to the page structure. * * Determine the page number from the page offset and get it from drm_device_dma::pagelist. */ -static int drm_do_vm_dma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_do_vm_dma_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_file *priv = vma->vm_file->private_data; struct drm_device *dev = priv->minor->dev; struct drm_device_dma *dma = dev->dma; @@ -315,14 +317,14 @@ static int drm_do_vm_dma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) /** * \c fault method for scatter-gather virtual memory. * - * \param vma virtual memory area. * \param address access address. * \return pointer to the page structure. * * Determine the map offset from the page offset and get it from drm_sg_mem::pagelist. 
*/ -static int drm_do_vm_sg_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_do_vm_sg_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_local_map *map = vma->vm_private_data; struct drm_file *priv = vma->vm_file->private_data; struct drm_device *dev = priv->minor->dev; @@ -347,24 +349,24 @@ static int drm_do_vm_sg_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return 0; } -static int drm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_vm_fault(struct vm_fault *vmf) { - return drm_do_vm_fault(vma, vmf); + return drm_do_vm_fault(vmf); } -static int drm_vm_shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_vm_shm_fault(struct vm_fault *vmf) { - return drm_do_vm_shm_fault(vma, vmf); + return drm_do_vm_shm_fault(vmf); } -static int drm_vm_dma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_vm_dma_fault(struct vm_fault *vmf) { - return drm_do_vm_dma_fault(vma, vmf); + return drm_do_vm_dma_fault(vmf); } -static int drm_vm_sg_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_vm_sg_fault(struct vm_fault *vmf) { - return drm_do_vm_sg_fault(vma, vmf); + return drm_do_vm_sg_fault(vmf); } /** AGP virtual memory operations */ diff --git a/drivers/gpu/drm/etnaviv/etnaviv_drv.h b/drivers/gpu/drm/etnaviv/etnaviv_drv.h index c255eda40526..e41f38667c1c 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_drv.h +++ b/drivers/gpu/drm/etnaviv/etnaviv_drv.h @@ -73,7 +73,7 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data, struct drm_file *file); int etnaviv_gem_mmap(struct file *filp, struct vm_area_struct *vma); -int etnaviv_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf); +int etnaviv_gem_fault(struct vm_fault *vmf); int etnaviv_gem_mmap_offset(struct drm_gem_object *obj, u64 *offset); struct sg_table *etnaviv_gem_prime_get_sg_table(struct drm_gem_object *obj); void *etnaviv_gem_prime_vmap(struct drm_gem_object *obj); diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index aa6e35ddc87f..e78f1406885d 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -175,8 +175,9 @@ int etnaviv_gem_mmap(struct file *filp, struct vm_area_struct *vma) return obj->ops->mmap(obj, vma); } -int etnaviv_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int etnaviv_gem_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_gem_object *obj = vma->vm_private_data; struct etnaviv_gem_object *etnaviv_obj = to_etnaviv_bo(obj); struct page **pages, *page; diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c index 57b81460fec8..4c28f7ffcc4d 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c @@ -447,8 +447,9 @@ int exynos_drm_gem_dumb_map_offset(struct drm_file *file_priv, return ret; } -int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int exynos_drm_gem_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_gem_object *obj = vma->vm_private_data; struct exynos_drm_gem *exynos_gem = to_exynos_gem(obj); unsigned long pfn; diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.h b/drivers/gpu/drm/exynos/exynos_drm_gem.h index df7c543d6558..85457255fcd1 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.h +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.h @@ -116,7 +116,7 @@ int 
exynos_drm_gem_dumb_map_offset(struct drm_file *file_priv, uint64_t *offset); /* page fault handler and mmap fault address(virtual) to physical memory. */ -int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf); +int exynos_drm_gem_fault(struct vm_fault *vmf); /* set vm_flags and we can change the vm attribute to other one at here. */ int exynos_drm_gem_mmap(struct file *filp, struct vm_area_struct *vma); diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c index da42d2e1d397..ffe6b4ffa1a8 100644 --- a/drivers/gpu/drm/gma500/framebuffer.c +++ b/drivers/gpu/drm/gma500/framebuffer.c @@ -111,8 +111,9 @@ static int psbfb_pan(struct fb_var_screeninfo *var, struct fb_info *info) return 0; } -static int psbfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int psbfb_vm_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct psb_framebuffer *psbfb = vma->vm_private_data; struct drm_device *dev = psbfb->base.dev; struct drm_psb_private *dev_priv = dev->dev_private; diff --git a/drivers/gpu/drm/gma500/gem.c b/drivers/gpu/drm/gma500/gem.c index 527c62917660..7da061aab729 100644 --- a/drivers/gpu/drm/gma500/gem.c +++ b/drivers/gpu/drm/gma500/gem.c @@ -164,8 +164,9 @@ int psb_gem_dumb_create(struct drm_file *file, struct drm_device *dev, * vma->vm_private_data points to the GEM object that is backing this * mapping. */ -int psb_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int psb_gem_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_gem_object *obj; struct gtt_range *r; int ret; diff --git a/drivers/gpu/drm/gma500/psb_drv.h b/drivers/gpu/drm/gma500/psb_drv.h index 05d7aaf47eea..83e22fd4cfc0 100644 --- a/drivers/gpu/drm/gma500/psb_drv.h +++ b/drivers/gpu/drm/gma500/psb_drv.h @@ -752,7 +752,7 @@ extern int psb_gem_dumb_create(struct drm_file *file, struct drm_device *dev, struct drm_mode_create_dumb *args); extern int psb_gem_dumb_map_gtt(struct drm_file *file, struct drm_device *dev, uint32_t handle, uint64_t *offset); -extern int psb_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf); +extern int psb_gem_fault(struct vm_fault *vmf); /* psb_device.c */ extern const struct psb_ops psb_chip_ops; diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index bcc81912b5e5..0a4b42d31391 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3352,7 +3352,7 @@ int __must_check i915_gem_wait_for_idle(struct drm_i915_private *dev_priv, unsigned int flags); int __must_check i915_gem_suspend(struct drm_i915_private *dev_priv); void i915_gem_resume(struct drm_i915_private *dev_priv); -int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf); +int i915_gem_fault(struct vm_fault *vmf); int i915_gem_object_wait(struct drm_i915_gem_object *obj, unsigned int flags, long timeout, diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 88f3628b4e29..6908123162d1 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1772,7 +1772,6 @@ compute_partial_view(struct drm_i915_gem_object *obj, /** * i915_gem_fault - fault a page into the GTT - * @area: CPU VMA in question * @vmf: fault info * * The fault handler is set up by drm_gem_mmap() when a object is GTT mapped @@ -1789,9 +1788,10 @@ compute_partial_view(struct drm_i915_gem_object *obj, * The current feature set supported by i915_gem_fault() and thus GTT mmaps * is exposed via 
I915_PARAM_MMAP_GTT_VERSION (see i915_gem_mmap_gtt_version). */ -int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf) +int i915_gem_fault(struct vm_fault *vmf) { #define MIN_CHUNK_PAGES ((1 << 20) >> PAGE_SHIFT) /* 1 MiB */ + struct vm_area_struct *area = vmf->vma; struct drm_i915_gem_object *obj = to_intel_bo(area->vm_private_data); struct drm_device *dev = obj->base.dev; struct drm_i915_private *dev_priv = to_i915(dev); diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index cdd7b2f8e977..c3b14876edaa 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -206,7 +206,7 @@ void msm_gem_shrinker_cleanup(struct drm_device *dev); int msm_gem_mmap_obj(struct drm_gem_object *obj, struct vm_area_struct *vma); int msm_gem_mmap(struct file *filp, struct vm_area_struct *vma); -int msm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf); +int msm_gem_fault(struct vm_fault *vmf); uint64_t msm_gem_mmap_offset(struct drm_gem_object *obj); int msm_gem_get_iova_locked(struct drm_gem_object *obj, int id, uint64_t *iova); diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index e140b05af134..59811f29607d 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -191,8 +191,9 @@ int msm_gem_mmap(struct file *filp, struct vm_area_struct *vma) return msm_gem_mmap_obj(vma->vm_private_data, vma); } -int msm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int msm_gem_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_gem_object *obj = vma->vm_private_data; struct drm_device *dev = obj->dev; struct msm_drm_private *priv = dev->dev_private; diff --git a/drivers/gpu/drm/omapdrm/omap_drv.h b/drivers/gpu/drm/omapdrm/omap_drv.h index 36d93ce84a29..65977982f15f 100644 --- a/drivers/gpu/drm/omapdrm/omap_drv.h +++ b/drivers/gpu/drm/omapdrm/omap_drv.h @@ -188,7 +188,7 @@ int omap_gem_dumb_create(struct drm_file *file, struct drm_device *dev, int omap_gem_mmap(struct file *filp, struct vm_area_struct *vma); int omap_gem_mmap_obj(struct drm_gem_object *obj, struct vm_area_struct *vma); -int omap_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf); +int omap_gem_fault(struct vm_fault *vmf); int omap_gem_op_start(struct drm_gem_object *obj, enum omap_gem_op op); int omap_gem_op_finish(struct drm_gem_object *obj, enum omap_gem_op op); int omap_gem_op_sync(struct drm_gem_object *obj, enum omap_gem_op op); diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c index 74a9968df421..5d5a9f517c30 100644 --- a/drivers/gpu/drm/omapdrm/omap_gem.c +++ b/drivers/gpu/drm/omapdrm/omap_gem.c @@ -518,7 +518,6 @@ static int fault_2d(struct drm_gem_object *obj, /** * omap_gem_fault - pagefault handler for GEM objects - * @vma: the VMA of the GEM object * @vmf: fault detail * * Invoked when a fault occurs on an mmap of a GEM managed area. GEM @@ -529,8 +528,9 @@ static int fault_2d(struct drm_gem_object *obj, * vma->vm_private_data points to the GEM object that is backing this * mapping. 
*/ -int omap_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int omap_gem_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_gem_object *obj = vma->vm_private_data; struct omap_gem_object *omap_obj = to_omap_bo(obj); struct drm_device *dev = obj->dev; diff --git a/drivers/gpu/drm/qxl/qxl_ttm.c b/drivers/gpu/drm/qxl/qxl_ttm.c index 4e1a40389964..7d1cab57c89e 100644 --- a/drivers/gpu/drm/qxl/qxl_ttm.c +++ b/drivers/gpu/drm/qxl/qxl_ttm.c @@ -105,15 +105,15 @@ static void qxl_ttm_global_fini(struct qxl_device *qdev) static struct vm_operations_struct qxl_ttm_vm_ops; static const struct vm_operations_struct *ttm_vm_ops; -static int qxl_ttm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int qxl_ttm_fault(struct vm_fault *vmf) { struct ttm_buffer_object *bo; int r; - bo = (struct ttm_buffer_object *)vma->vm_private_data; + bo = (struct ttm_buffer_object *)vmf->vma->vm_private_data; if (bo == NULL) return VM_FAULT_NOPAGE; - r = ttm_vm_ops->fault(vma, vmf); + r = ttm_vm_ops->fault(vmf); return r; } diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index 7a10b3852970..684f1703aa5c 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -979,19 +979,19 @@ void radeon_ttm_set_active_vram_size(struct radeon_device *rdev, u64 size) static struct vm_operations_struct radeon_ttm_vm_ops; static const struct vm_operations_struct *ttm_vm_ops = NULL; -static int radeon_ttm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int radeon_ttm_fault(struct vm_fault *vmf) { struct ttm_buffer_object *bo; struct radeon_device *rdev; int r; - bo = (struct ttm_buffer_object *)vma->vm_private_data; + bo = (struct ttm_buffer_object *)vmf->vma->vm_private_data; if (bo == NULL) { return VM_FAULT_NOPAGE; } rdev = radeon_get_rdev(bo->bdev); down_read(&rdev->pm.mclk_lock); - r = ttm_vm_ops->fault(vma, vmf); + r = ttm_vm_ops->fault(vmf); up_read(&rdev->pm.mclk_lock); return r; } diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c index b523a5d4a38c..17e62ecb5d4d 100644 --- a/drivers/gpu/drm/tegra/gem.c +++ b/drivers/gpu/drm/tegra/gem.c @@ -441,8 +441,9 @@ int tegra_bo_dumb_map_offset(struct drm_file *file, struct drm_device *drm, return 0; } -static int tegra_bo_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int tegra_bo_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_gem_object *gem = vma->vm_private_data; struct tegra_bo *bo = to_tegra_bo(gem); struct page *page; diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c index 88169141bef5..35ffb3754feb 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c @@ -43,7 +43,6 @@ #define TTM_BO_VM_NUM_PREFAULT 16 static int ttm_bo_vm_fault_idle(struct ttm_buffer_object *bo, - struct vm_area_struct *vma, struct vm_fault *vmf) { int ret = 0; @@ -67,7 +66,7 @@ static int ttm_bo_vm_fault_idle(struct ttm_buffer_object *bo, goto out_unlock; ttm_bo_reference(bo); - up_read(&vma->vm_mm->mmap_sem); + up_read(&vmf->vma->vm_mm->mmap_sem); (void) dma_fence_wait(bo->moving, true); ttm_bo_unreserve(bo); ttm_bo_unref(&bo); @@ -92,8 +91,9 @@ out_unlock: return ret; } -static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ttm_bo_vm_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct ttm_buffer_object *bo = (struct ttm_buffer_object *) vma->vm_private_data; struct 
ttm_bo_device *bdev = bo->bdev; @@ -124,7 +124,7 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (vmf->flags & FAULT_FLAG_ALLOW_RETRY) { if (!(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { ttm_bo_reference(bo); - up_read(&vma->vm_mm->mmap_sem); + up_read(&vmf->vma->vm_mm->mmap_sem); (void) ttm_bo_wait_unreserved(bo); ttm_bo_unref(&bo); } @@ -168,7 +168,7 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * Wait for buffer data in transit, due to a pipelined * move. */ - ret = ttm_bo_vm_fault_idle(bo, vma, vmf); + ret = ttm_bo_vm_fault_idle(bo, vmf); if (unlikely(ret != 0)) { retval = ret; diff --git a/drivers/gpu/drm/udl/udl_drv.h b/drivers/gpu/drm/udl/udl_drv.h index 6c4286e57362..2a75ab80527a 100644 --- a/drivers/gpu/drm/udl/udl_drv.h +++ b/drivers/gpu/drm/udl/udl_drv.h @@ -134,7 +134,7 @@ void udl_gem_put_pages(struct udl_gem_object *obj); int udl_gem_vmap(struct udl_gem_object *obj); void udl_gem_vunmap(struct udl_gem_object *obj); int udl_drm_gem_mmap(struct file *filp, struct vm_area_struct *vma); -int udl_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf); +int udl_gem_fault(struct vm_fault *vmf); int udl_handle_damage(struct udl_framebuffer *fb, int x, int y, int width, int height); diff --git a/drivers/gpu/drm/udl/udl_gem.c b/drivers/gpu/drm/udl/udl_gem.c index 3c0c4bd3f750..775c50e4f02c 100644 --- a/drivers/gpu/drm/udl/udl_gem.c +++ b/drivers/gpu/drm/udl/udl_gem.c @@ -100,8 +100,9 @@ int udl_drm_gem_mmap(struct file *filp, struct vm_area_struct *vma) return ret; } -int udl_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int udl_gem_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct udl_gem_object *obj = to_udl_bo(vma->vm_private_data); struct page *page; unsigned int page_offset; diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c index 477e07f0ecb6..7ccbb03e98de 100644 --- a/drivers/gpu/drm/vgem/vgem_drv.c +++ b/drivers/gpu/drm/vgem/vgem_drv.c @@ -50,8 +50,9 @@ static void vgem_gem_free_object(struct drm_gem_object *obj) kfree(vgem_obj); } -static int vgem_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int vgem_gem_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_vgem_gem_object *obj = vma->vm_private_data; /* We don't use vmf->pgoff since that has the fake offset */ unsigned long vaddr = vmf->address; diff --git a/drivers/gpu/drm/virtio/virtgpu_ttm.c b/drivers/gpu/drm/virtio/virtgpu_ttm.c index 9cc7079f7aca..70ec8ca8d9b1 100644 --- a/drivers/gpu/drm/virtio/virtgpu_ttm.c +++ b/drivers/gpu/drm/virtio/virtgpu_ttm.c @@ -114,18 +114,17 @@ static void virtio_gpu_ttm_global_fini(struct virtio_gpu_device *vgdev) static struct vm_operations_struct virtio_gpu_ttm_vm_ops; static const struct vm_operations_struct *ttm_vm_ops; -static int virtio_gpu_ttm_fault(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int virtio_gpu_ttm_fault(struct vm_fault *vmf) { struct ttm_buffer_object *bo; struct virtio_gpu_device *vgdev; int r; - bo = (struct ttm_buffer_object *)vma->vm_private_data; + bo = (struct ttm_buffer_object *)vmf->vma->vm_private_data; if (bo == NULL) return VM_FAULT_NOPAGE; vgdev = virtio_gpu_get_vgdev(bo->bdev); - r = ttm_vm_ops->fault(vma, vmf); + r = ttm_vm_ops->fault(vmf); return r; } #endif diff --git a/drivers/hsi/clients/cmt_speech.c b/drivers/hsi/clients/cmt_speech.c index 3deef6cc7d7c..7175e6bedf21 100644 --- a/drivers/hsi/clients/cmt_speech.c +++ 
b/drivers/hsi/clients/cmt_speech.c @@ -1098,9 +1098,9 @@ static void cs_hsi_stop(struct cs_hsi_iface *hi) kfree(hi); } -static int cs_char_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int cs_char_vma_fault(struct vm_fault *vmf) { - struct cs_char *csdata = vma->vm_private_data; + struct cs_char *csdata = vmf->vma->vm_private_data; struct page *page; page = virt_to_page(csdata->mmap_base); diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c index e8d55a153a65..e88afe1a435c 100644 --- a/drivers/hwtracing/intel_th/msu.c +++ b/drivers/hwtracing/intel_th/msu.c @@ -1188,9 +1188,9 @@ static void msc_mmap_close(struct vm_area_struct *vma) mutex_unlock(&msc->buf_mutex); } -static int msc_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int msc_mmap_fault(struct vm_fault *vmf) { - struct msc_iter *iter = vma->vm_file->private_data; + struct msc_iter *iter = vmf->vma->vm_file->private_data; struct msc *msc = iter->msc; vmf->page = msc_buffer_get_page(msc, vmf->pgoff); @@ -1198,7 +1198,7 @@ static int msc_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return VM_FAULT_SIGBUS; get_page(vmf->page); - vmf->page->mapping = vma->vm_file->f_mapping; + vmf->page->mapping = vmf->vma->vm_file->f_mapping; vmf->page->index = vmf->pgoff; return 0; diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index bd786b7bd30b..f46033984d07 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -92,7 +92,7 @@ static unsigned int poll_next(struct file *, struct poll_table_struct *); static int user_event_ack(struct hfi1_ctxtdata *, int, unsigned long); static int set_ctxt_pkey(struct hfi1_ctxtdata *, unsigned, u16); static int manage_rcvq(struct hfi1_ctxtdata *, unsigned, int); -static int vma_fault(struct vm_area_struct *, struct vm_fault *); +static int vma_fault(struct vm_fault *); static long hfi1_file_ioctl(struct file *fp, unsigned int cmd, unsigned long arg); @@ -695,7 +695,7 @@ done: * Local (non-chip) user memory is not mapped right away but as it is * accessed by the user-level code. */ -static int vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int vma_fault(struct vm_fault *vmf) { struct page *page; diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 2d1eacf1dfed..9396c1807cc3 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -893,7 +893,7 @@ bail: /* * qib_file_vma_fault - handle a VMA page fault. */ -static int qib_file_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int qib_file_vma_fault(struct vm_fault *vmf) { struct page *page; diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index ba63ca57ed7e..36bd904946bd 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -434,8 +434,9 @@ static void videobuf_vm_close(struct vm_area_struct *vma) * now ...). Bounce buffers don't work very well for the data rates * video capture has. 
*/ -static int videobuf_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int videobuf_vm_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct page *page; dprintk(3, "fault: fault @ %08lx [vma %08lx-%08lx]\n", diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index 3907387b6d15..062bf6ca2625 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c @@ -121,8 +121,9 @@ void cxl_context_set_mapping(struct cxl_context *ctx, mutex_unlock(&ctx->mapping_lock); } -static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int cxl_mmap_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct cxl_context *ctx = vma->vm_file->private_data; u64 area, offset; diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c index af2e077da4b8..3641f1334cf0 100644 --- a/drivers/misc/sgi-gru/grumain.c +++ b/drivers/misc/sgi-gru/grumain.c @@ -926,8 +926,9 @@ again: * * Note: gru segments alway mmaped on GRU_GSEG_PAGESIZE boundaries. */ -int gru_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int gru_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct gru_thread_state *gts; unsigned long paddr, vaddr; unsigned long expires; diff --git a/drivers/misc/sgi-gru/grutables.h b/drivers/misc/sgi-gru/grutables.h index 5c3ce2459675..b5e308b50ed1 100644 --- a/drivers/misc/sgi-gru/grutables.h +++ b/drivers/misc/sgi-gru/grutables.h @@ -665,7 +665,7 @@ extern unsigned long gru_reserve_cb_resources(struct gru_state *gru, int cbr_au_count, char *cbmap); extern unsigned long gru_reserve_ds_resources(struct gru_state *gru, int dsr_au_count, char *dsmap); -extern int gru_fault(struct vm_area_struct *, struct vm_fault *vmf); +extern int gru_fault(struct vm_fault *vmf); extern struct gru_mm_struct *gru_register_mmu_notifier(void); extern void gru_drop_mmu_notifier(struct gru_mm_struct *gms); diff --git a/drivers/scsi/cxlflash/superpipe.c b/drivers/scsi/cxlflash/superpipe.c index 90869cee2b20..ef5bf55f08a4 100644 --- a/drivers/scsi/cxlflash/superpipe.c +++ b/drivers/scsi/cxlflash/superpipe.c @@ -1053,7 +1053,6 @@ out: /** * cxlflash_mmap_fault() - mmap fault handler for adapter file descriptor - * @vma: VM area associated with mapping. * @vmf: VM fault associated with current fault. 
* * To support error notification via MMIO, faults are 'caught' by this routine @@ -1067,8 +1066,9 @@ out: * * Return: 0 on success, VM_FAULT_SIGBUS on failure */ -static int cxlflash_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int cxlflash_mmap_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct file *file = vma->vm_file; struct cxl_context *ctx = cxl_fops_get_context(file); struct cxlflash_cfg *cfg = container_of(file->f_op, struct cxlflash_cfg, @@ -1097,7 +1097,7 @@ static int cxlflash_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (likely(!ctxi->err_recovery_active)) { vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - rc = ctxi->cxl_mmap_vmops->fault(vma, vmf); + rc = ctxi->cxl_mmap_vmops->fault(vmf); } else { dev_dbg(dev, "%s: err recovery active, use err_page\n", __func__); diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index e831e01f9fa6..29b86505f796 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1185,8 +1185,9 @@ sg_fasync(int fd, struct file *filp, int mode) } static int -sg_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +sg_vma_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; Sg_fd *sfp; unsigned long offset, len, sa; Sg_scatter_hold *rsv_schp; diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index 969600779e44..2c3ffbcbd621 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -870,9 +870,9 @@ static void ion_buffer_sync_for_device(struct ion_buffer *buffer, mutex_unlock(&buffer->lock); } -static int ion_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ion_vm_fault(struct vm_fault *vmf) { - struct ion_buffer *buffer = vma->vm_private_data; + struct ion_buffer *buffer = vmf->vma->vm_private_data; unsigned long pfn; int ret; @@ -881,7 +881,7 @@ static int ion_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) BUG_ON(!buffer->pages || !buffer->pages[vmf->pgoff]); pfn = page_to_pfn(ion_buffer_page(buffer->pages[vmf->pgoff])); - ret = vm_insert_pfn(vma, vmf->address, pfn); + ret = vm_insert_pfn(vmf->vma, vmf->address, pfn); mutex_unlock(&buffer->lock); if (ret) return VM_FAULT_ERROR; diff --git a/drivers/staging/lustre/lustre/llite/llite_mmap.c b/drivers/staging/lustre/lustre/llite/llite_mmap.c index 9afa6bec3e6f..896196c74cd2 100644 --- a/drivers/staging/lustre/lustre/llite/llite_mmap.c +++ b/drivers/staging/lustre/lustre/llite/llite_mmap.c @@ -321,7 +321,7 @@ out: return fault_ret; } -static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ll_fault(struct vm_fault *vmf) { int count = 0; bool printed = false; @@ -335,7 +335,7 @@ static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM)); restart: - result = ll_fault0(vma, vmf); + result = ll_fault0(vmf->vma, vmf); LASSERT(!(result & VM_FAULT_LOCKED)); if (result == 0) { struct page *vmpage = vmf->page; @@ -362,8 +362,9 @@ restart: return result; } -static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ll_page_mkwrite(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; int count = 0; bool printed = false; bool retry; diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c index 3e9cf710501b..4c57755e06e7 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_io.c +++ b/drivers/staging/lustre/lustre/llite/vvp_io.c @@ 
-1014,7 +1014,7 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) { struct vm_fault *vmf = cfio->ft_vmf; - cfio->ft_flags = filemap_fault(cfio->ft_vma, vmf); + cfio->ft_flags = filemap_fault(vmf); cfio->ft_flags_valid = 1; if (vmf->page) { diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 8041710b6972..5c1cb2df3a54 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -783,15 +783,15 @@ static int tcmu_find_mem_index(struct vm_area_struct *vma) return -1; } -static int tcmu_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int tcmu_vma_fault(struct vm_fault *vmf) { - struct tcmu_dev *udev = vma->vm_private_data; + struct tcmu_dev *udev = vmf->vma->vm_private_data; struct uio_info *info = &udev->uio_info; struct page *page; unsigned long offset; void *addr; - int mi = tcmu_find_mem_index(vma); + int mi = tcmu_find_mem_index(vmf->vma); if (mi < 0) return VM_FAULT_SIGBUS; diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index fba021f5736a..31d95dc9c202 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -597,14 +597,14 @@ static int uio_find_mem_index(struct vm_area_struct *vma) return -1; } -static int uio_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int uio_vma_fault(struct vm_fault *vmf) { - struct uio_device *idev = vma->vm_private_data; + struct uio_device *idev = vmf->vma->vm_private_data; struct page *page; unsigned long offset; void *addr; - int mi = uio_find_mem_index(vma); + int mi = uio_find_mem_index(vmf->vma); if (mi < 0) return VM_FAULT_SIGBUS; diff --git a/drivers/usb/mon/mon_bin.c b/drivers/usb/mon/mon_bin.c index 91c22276c03b..9fb8b1e6ecc2 100644 --- a/drivers/usb/mon/mon_bin.c +++ b/drivers/usb/mon/mon_bin.c @@ -1223,9 +1223,9 @@ static void mon_bin_vma_close(struct vm_area_struct *vma) /* * Map ring pages to user space. 
*/ -static int mon_bin_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int mon_bin_vma_fault(struct vm_fault *vmf) { - struct mon_reader_bin *rp = vma->vm_private_data; + struct mon_reader_bin *rp = vmf->vma->vm_private_data; unsigned long offset, chunk_idx; struct page *pageptr; diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c index 74b5bcac8bf2..37f69c061210 100644 --- a/drivers/video/fbdev/core/fb_defio.c +++ b/drivers/video/fbdev/core/fb_defio.c @@ -37,12 +37,11 @@ static struct page *fb_deferred_io_page(struct fb_info *info, unsigned long offs } /* this is to find and return the vmalloc-ed fb pages */ -static int fb_deferred_io_fault(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int fb_deferred_io_fault(struct vm_fault *vmf) { unsigned long offset; struct page *page; - struct fb_info *info = vma->vm_private_data; + struct fb_info *info = vmf->vma->vm_private_data; offset = vmf->pgoff << PAGE_SHIFT; if (offset >= info->fix.smem_len) @@ -54,8 +53,8 @@ static int fb_deferred_io_fault(struct vm_area_struct *vma, get_page(page); - if (vma->vm_file) - page->mapping = vma->vm_file->f_mapping; + if (vmf->vma->vm_file) + page->mapping = vmf->vma->vm_file->f_mapping; else printk(KERN_ERR "no mapping available\n"); @@ -91,11 +90,10 @@ int fb_deferred_io_fsync(struct file *file, loff_t start, loff_t end, int datasy EXPORT_SYMBOL_GPL(fb_deferred_io_fsync); /* vm_ops->page_mkwrite handler */ -static int fb_deferred_io_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int fb_deferred_io_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct fb_info *info = vma->vm_private_data; + struct fb_info *info = vmf->vma->vm_private_data; struct fb_deferred_io *fbdefio = info->fbdefio; struct page *cur; @@ -105,7 +103,7 @@ static int fb_deferred_io_mkwrite(struct vm_area_struct *vma, deferred framebuffer IO. 
then if userspace touches a page again, we repeat the same scheme */ - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); /* protect against the workqueue changing the page list */ mutex_lock(&fbdefio->lock); diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 2077a3ac7c0c..7a92a5e1d40c 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -804,10 +804,10 @@ static void privcmd_close(struct vm_area_struct *vma) kfree(pages); } -static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int privcmd_fault(struct vm_fault *vmf) { printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", - vma, vma->vm_start, vma->vm_end, + vmf->vma, vmf->vma->vm_start, vmf->vma->vm_end, vmf->pgoff, (void *)vmf->address); return VM_FAULT_SIGBUS; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 6a0f3fa85ef7..3de3b4a89d89 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -534,11 +534,11 @@ v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma) } static int -v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +v9fs_vm_page_mkwrite(struct vm_fault *vmf) { struct v9fs_inode *v9inode; struct page *page = vmf->page; - struct file *filp = vma->vm_file; + struct file *filp = vmf->vma->vm_file; struct inode *inode = file_inode(filp); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6a823719b6c5..adf16307842a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3147,7 +3147,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); +int btrfs_page_mkwrite(struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1e861a063721..ea1d500cfba6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8964,10 +8964,10 @@ again: * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. 
*/ -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +int btrfs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; @@ -9000,7 +9000,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = btrfs_delalloc_reserve_space(inode, page_start, reserved_space); if (!ret) { - ret = file_update_time(vma->vm_file); + ret = file_update_time(vmf->vma->vm_file); reserved = 1; } if (ret) { diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e4b066cd912a..09860c0ec7c1 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1386,8 +1386,9 @@ static void ceph_restore_sigs(sigset_t *oldset) /* * vm ops */ -static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ceph_filemap_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = vma->vm_file->private_data; @@ -1416,7 +1417,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || ci->i_inline_version == CEPH_INLINE_NONE) { current->journal_info = vma->vm_file; - ret = filemap_fault(vma, vmf); + ret = filemap_fault(vmf); current->journal_info = NULL; } else ret = -EAGAIN; @@ -1477,8 +1478,9 @@ out_restore: /* * Reuse write_begin here for simplicity. */ -static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ceph_page_mkwrite(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = vma->vm_file->private_data; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 98dc842e7245..aa3debbba826 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3282,7 +3282,7 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset) * sure that it doesn't change while being written back. */ static int -cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +cifs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; diff --git a/fs/dax.c b/fs/dax.c index 3f1181563fb1..f955c0df33bb 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -925,12 +925,11 @@ static int dax_insert_mapping(struct address_space *mapping, /** * dax_pfn_mkwrite - handle first write to DAX page - * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault */ -int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +int dax_pfn_mkwrite(struct vm_fault *vmf) { - struct file *file = vma->vm_file; + struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; void *entry, **slot; pgoff_t index = vmf->pgoff; @@ -1121,7 +1120,6 @@ static int dax_fault_return(int error) /** * dax_iomap_fault - handle a page fault on a DAX file - * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @ops: iomap ops passed from the file system * @@ -1129,10 +1127,9 @@ static int dax_fault_return(int error) * or mkwrite handler for DAX files. Assumes the caller has done all the * necessary locking for the page fault to proceed successfully. 
*/ -int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - const struct iomap_ops *ops) +int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops) { - struct address_space *mapping = vma->vm_file->f_mapping; + struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; @@ -1205,11 +1202,11 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, case IOMAP_MAPPED: if (iomap.flags & IOMAP_F_NEW) { count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; } error = dax_insert_mapping(mapping, iomap.bdev, sector, - PAGE_SIZE, &entry, vma, vmf); + PAGE_SIZE, &entry, vmf->vma, vmf); /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error == -EBUSY) error = 0; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index b0f241528a30..0bf0d971205a 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -87,19 +87,19 @@ out_unlock: * The default page_lock and i_size verification done by non-DAX fault paths * is sufficient because ext2 doesn't support hole punching. */ -static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ext2_dax_fault(struct vm_fault *vmf) { - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct ext2_inode_info *ei = EXT2_I(inode); int ret; if (vmf->flags & FAULT_FLAG_WRITE) { sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); } down_read(&ei->dax_sem); - ret = dax_iomap_fault(vma, vmf, &ext2_iomap_ops); + ret = dax_iomap_fault(vmf, &ext2_iomap_ops); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) @@ -107,16 +107,15 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return ret; } -static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int ext2_dax_pfn_mkwrite(struct vm_fault *vmf) { - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct ext2_inode_info *ei = EXT2_I(inode); loff_t size; int ret; sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); down_read(&ei->dax_sem); /* check that the faulting page hasn't raced with truncate */ @@ -124,7 +123,7 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; else - ret = dax_pfn_mkwrite(vma, vmf); + ret = dax_pfn_mkwrite(vmf); up_read(&ei->dax_sem); sb_end_pagefault(inode->i_sb); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cee23b684f47..2fd17e8e4984 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2483,8 +2483,8 @@ extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); -extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); -extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf); +extern int ext4_page_mkwrite(struct vm_fault *vmf); +extern int ext4_filemap_fault(struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); extern void ext4_da_update_reserve_space(struct 
inode *inode, diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 13021a054fc0..21e1f17fe36d 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -253,19 +253,19 @@ out: } #ifdef CONFIG_FS_DAX -static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ext4_dax_fault(struct vm_fault *vmf) { int result; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; bool write = vmf->flags & FAULT_FLAG_WRITE; if (write) { sb_start_pagefault(sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); } down_read(&EXT4_I(inode)->i_mmap_sem); - result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops); + result = dax_iomap_fault(vmf, &ext4_iomap_ops); up_read(&EXT4_I(inode)->i_mmap_sem); if (write) sb_end_pagefault(sb); @@ -303,22 +303,21 @@ ext4_dax_pmd_fault(struct vm_fault *vmf) * wp_pfn_shared() fails. Thus fault gets retried and things work out as * desired. */ -static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf) { - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; loff_t size; int ret; sb_start_pagefault(sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); down_read(&EXT4_I(inode)->i_mmap_sem); size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; else - ret = dax_pfn_mkwrite(vma, vmf); + ret = dax_pfn_mkwrite(vmf); up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 75212a6e69f8..41d8e53e5a7f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5821,8 +5821,9 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) return !buffer_mapped(bh); } -int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +int ext4_page_mkwrite(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct page *page = vmf->page; loff_t size; unsigned long len; @@ -5912,13 +5913,13 @@ out: return ret; } -int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int ext4_filemap_fault(struct vm_fault *vmf) { - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); int err; down_read(&EXT4_I(inode)->i_mmap_sem); - err = filemap_fault(vma, vmf); + err = filemap_fault(vmf); up_read(&EXT4_I(inode)->i_mmap_sem); return err; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 49f10dce817d..1edc86e874e3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -32,11 +32,10 @@ #include "trace.h" #include <trace/events/f2fs.h> -static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int f2fs_vm_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; int err; @@ -58,7 +57,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, f2fs_balance_fs(sbi, dn.node_changed); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || page_offset(page) > i_size_read(inode) || diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 2401c5dabb2a..e80bfd06daf5 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2043,12 +2043,12
@@ static void fuse_vma_close(struct vm_area_struct *vma) * - sync(2) * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER */ -static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +static int fuse_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); lock_page(page); if (page->mapping != inode->i_mapping) { unlock_page(page); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 016c11eaca7c..6fe2a59c6a9a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -379,10 +379,10 @@ static int gfs2_allocate_page_backing(struct page *page) * blocks allocated on disk to back that page. */ -static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +static int gfs2_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_alloc_parms ap = { .aflags = 0, }; @@ -399,7 +399,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret) goto out; - gfs2_size_hint(vma->vm_file, pos, PAGE_SIZE); + gfs2_size_hint(vmf->vma->vm_file, pos, PAGE_SIZE); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); @@ -407,7 +407,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) goto out_uninit; /* Update file times before taking page lock */ - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); diff --git a/fs/iomap.c b/fs/iomap.c index d89f70bbb952..d209f42cdcb8 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -445,11 +445,10 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, return length; } -int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, - const struct iomap_ops *ops) +int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); unsigned long length; loff_t offset, size; ssize_t ret; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 78219d5644e9..4f0535890b30 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -348,9 +348,9 @@ static void kernfs_vma_open(struct vm_area_struct *vma) kernfs_put_active(of->kn); } -static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int kernfs_vma_fault(struct vm_fault *vmf) { - struct file *file = vma->vm_file; + struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); int ret; @@ -362,16 +362,15 @@ static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ret = VM_FAULT_SIGBUS; if (of->vm_ops->fault) - ret = of->vm_ops->fault(vma, vmf); + ret = of->vm_ops->fault(vmf); kernfs_put_active(of->kn); return ret; } -static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int kernfs_vma_page_mkwrite(struct vm_fault *vmf) { - struct file *file = vma->vm_file; + struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); int ret; @@ -383,7 +382,7 @@ static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma, ret = 0; if 
(of->vm_ops->page_mkwrite) - ret = of->vm_ops->page_mkwrite(vma, vmf); + ret = of->vm_ops->page_mkwrite(vmf); else file_update_time(file); diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index 39f57bef8531..0c3905e0542e 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -27,10 +27,9 @@ * XXX: how are we excluding truncate/invalidate here? Maybe need to lock * page? */ -static int ncp_file_mmap_fault(struct vm_area_struct *area, - struct vm_fault *vmf) +static int ncp_file_mmap_fault(struct vm_fault *vmf) { - struct inode *inode = file_inode(area->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); char *pg_addr; unsigned int already_read; unsigned int count; @@ -90,7 +89,7 @@ static int ncp_file_mmap_fault(struct vm_area_struct *area, * -- nyc */ count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(area->vm_mm, PGMAJFAULT); + mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); return VM_FAULT_MAJOR; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 26dbe8b0c10d..668213984d68 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -528,10 +528,10 @@ const struct address_space_operations nfs_file_aops = { * writable, implying that someone is about to modify the page through a * shared-writable mapping */ -static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +static int nfs_vm_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct file *filp = vma->vm_file; + struct file *filp = vmf->vma->vm_file; struct inode *inode = file_inode(filp); unsigned pagelen; int ret = VM_FAULT_NOPAGE; diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 547381f3ce13..c5fa3dee72fc 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -51,8 +51,9 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) return err; } -static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +static int nilfs_page_mkwrite(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct page *page = vmf->page; struct inode *inode = file_inode(vma->vm_file); struct nilfs_transaction_info ti; diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 429088786e93..098f5c712569 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -44,17 +44,18 @@ #include "ocfs2_trace.h" -static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) +static int ocfs2_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; sigset_t oldset; int ret; ocfs2_block_signals(&oldset); - ret = filemap_fault(area, vmf); + ret = filemap_fault(vmf); ocfs2_unblock_signals(&oldset); - trace_ocfs2_fault(OCFS2_I(area->vm_file->f_mapping->host)->ip_blkno, - area, vmf->page, vmf->pgoff); + trace_ocfs2_fault(OCFS2_I(vma->vm_file->f_mapping->host)->ip_blkno, + vma, vmf->page, vmf->pgoff); return ret; } @@ -127,10 +128,10 @@ out: return ret; } -static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ocfs2_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct buffer_head *di_bh = NULL; sigset_t oldset; int ret; @@ -160,7 +161,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) */ down_write(&OCFS2_I(inode)->ip_alloc_sem); - ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page); + ret = __ocfs2_page_mkwrite(vmf->vma->vm_file, di_bh, page); up_write(&OCFS2_I(inode)->ip_alloc_sem); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 
5105b1599981..f52d8e857ff7 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -265,10 +265,10 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, * On s390 the fault handler is used for memory regions that can't be mapped * directly with remap_pfn_range(). */ -static int mmap_vmcore_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int mmap_vmcore_fault(struct vm_fault *vmf) { #ifdef CONFIG_S390 - struct address_space *mapping = vma->vm_file->f_mapping; + struct address_space *mapping = vmf->vma->vm_file->f_mapping; pgoff_t index = vmf->pgoff; struct page *page; loff_t offset; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index b0d783774c96..d9ae86f96df7 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1506,11 +1506,10 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) * mmap()d file has taken write protection fault and is being made writable. * UBIFS must ensure page is budgeted for. */ -static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int ubifs_vm_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct ubifs_info *c = inode->i_sb->s_fs_info; struct timespec now = ubifs_current_time(inode); struct ubifs_budget_req req = { .new_page = 1 }; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 022014016d80..9cc10136ba0b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1379,22 +1379,21 @@ xfs_file_llseek( */ STATIC int xfs_filemap_page_mkwrite( - struct vm_area_struct *vma, struct vm_fault *vmf) { - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); int ret; trace_xfs_filemap_page_mkwrite(XFS_I(inode)); sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, &xfs_iomap_ops); } else { - ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops); + ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); ret = block_page_mkwrite_return(ret); } @@ -1406,23 +1405,22 @@ xfs_filemap_page_mkwrite( STATIC int xfs_filemap_fault( - struct vm_area_struct *vma, struct vm_fault *vmf) { - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); int ret; trace_xfs_filemap_fault(XFS_I(inode)); /* DAX can shortcut the normal fault path on write faults! 
*/ if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode)) - return xfs_filemap_page_mkwrite(vma, vmf); + return xfs_filemap_page_mkwrite(vmf); xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) - ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, &xfs_iomap_ops); else - ret = filemap_fault(vma, vmf); + ret = filemap_fault(vmf); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); return ret; @@ -1471,11 +1469,10 @@ xfs_filemap_pmd_fault( */ static int xfs_filemap_pfn_mkwrite( - struct vm_area_struct *vma, struct vm_fault *vmf) { - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); int ret = VM_FAULT_NOPAGE; loff_t size; @@ -1483,7 +1480,7 @@ xfs_filemap_pfn_mkwrite( trace_xfs_filemap_pfn_mkwrite(ip); sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); /* check if the faulting page hasn't raced with truncate */ xfs_ilock(ip, XFS_MMAPLOCK_SHARED); @@ -1491,7 +1488,7 @@ xfs_filemap_pfn_mkwrite( if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; else if (IS_DAX(inode)) - ret = dax_pfn_mkwrite(vma, vmf); + ret = dax_pfn_mkwrite(vmf); xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); sb_end_pagefault(inode->i_sb); return ret; diff --git a/include/linux/dax.h b/include/linux/dax.h index 1e77ff5818f1..eeb02421c848 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -38,8 +38,7 @@ static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags) ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); -int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - const struct iomap_ops *ops); +int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, @@ -83,7 +82,7 @@ static inline int dax_iomap_pmd_fault(struct vm_fault *vmf, return VM_FAULT_FALLBACK; } #endif -int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); +int dax_pfn_mkwrite(struct vm_fault *vmf); static inline bool vma_is_dax(struct vm_area_struct *vma) { diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 891459caa278..7291810067eb 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -79,8 +79,7 @@ int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); -int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, - const struct iomap_ops *ops); +int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops); int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, loff_t start, loff_t len, const struct iomap_ops *ops); diff --git a/include/linux/mm.h b/include/linux/mm.h index 574bc157a27c..3dd80ba6568a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -350,17 +350,17 @@ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); void (*close)(struct vm_area_struct * area); int (*mremap)(struct vm_area_struct * area); - int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); + int (*fault)(struct vm_fault *vmf); int (*pmd_fault)(struct vm_fault *vmf); void (*map_pages)(struct vm_fault *vmf, pgoff_t 
start_pgoff, pgoff_t end_pgoff); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ - int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); + int (*page_mkwrite)(struct vm_fault *vmf); /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ - int (*pfn_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); + int (*pfn_mkwrite)(struct vm_fault *vmf); /* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs that can switch between memory and hardware @@ -2124,10 +2124,10 @@ extern void truncate_inode_pages_range(struct address_space *, extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ -extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); +extern int filemap_fault(struct vm_fault *vmf); extern void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); -extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); +extern int filemap_page_mkwrite(struct vm_fault *vmf); /* mm/page-writeback.c */ int write_one_page(struct page *page, int wait); diff --git a/ipc/shm.c b/ipc/shm.c index 81203e8ba013..7f6537b84ef5 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -374,12 +374,12 @@ void exit_shm(struct task_struct *task) up_write(&shm_ids(ns).rwsem); } -static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int shm_fault(struct vm_fault *vmf) { - struct file *file = vma->vm_file; + struct file *file = vmf->vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); - return sfd->vm_ops->fault(vma, vmf); + return sfd->vm_ops->fault(vmf); } #ifdef CONFIG_NUMA diff --git a/kernel/events/core.c b/kernel/events/core.c index 77a932b54a64..b2eb3542e829 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4925,9 +4925,9 @@ unlock: rcu_read_unlock(); } -static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int perf_mmap_fault(struct vm_fault *vmf) { - struct perf_event *event = vma->vm_file->private_data; + struct perf_event *event = vmf->vma->vm_file->private_data; struct ring_buffer *rb; int ret = VM_FAULT_SIGBUS; @@ -4950,7 +4950,7 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) goto unlock; get_page(vmf->page); - vmf->page->mapping = vma->vm_file->f_mapping; + vmf->page->mapping = vmf->vma->vm_file->f_mapping; vmf->page->index = vmf->pgoff; ret = 0; diff --git a/kernel/relay.c b/kernel/relay.c index 8f18d314a96a..8f8dc91db680 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -39,10 +39,10 @@ static void relay_file_mmap_close(struct vm_area_struct *vma) /* * fault() vm_op implementation for relay file mapping. 
*/ -static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int relay_buf_fault(struct vm_fault *vmf) { struct page *page; - struct rchan_buf *buf = vma->vm_private_data; + struct rchan_buf *buf = vmf->vma->vm_private_data; pgoff_t pgoff = vmf->pgoff; if (!buf) diff --git a/mm/filemap.c b/mm/filemap.c index 416d563468a3..2ba46f410c7c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2169,7 +2169,6 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, /** * filemap_fault - read in file data for page fault handling - * @vma: vma in which the fault was taken * @vmf: struct vm_fault containing details of the fault * * filemap_fault() is invoked via the vma operations vector for a @@ -2191,10 +2190,10 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, * * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. */ -int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int filemap_fault(struct vm_fault *vmf) { int error; - struct file *file = vma->vm_file; + struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; @@ -2216,12 +2215,12 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * We found the page, so try async readahead before * waiting for the lock. */ - do_async_mmap_readahead(vma, ra, file, page, offset); + do_async_mmap_readahead(vmf->vma, ra, file, page, offset); } else if (!page) { /* No page in the page cache at all */ - do_sync_mmap_readahead(vma, ra, file, offset); + do_sync_mmap_readahead(vmf->vma, ra, file, offset); count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; retry_find: page = find_get_page(mapping, offset); @@ -2229,7 +2228,7 @@ retry_find: goto no_cached_page; } - if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { + if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) { put_page(page); return ret | VM_FAULT_RETRY; } @@ -2396,14 +2395,14 @@ next: } EXPORT_SYMBOL(filemap_map_pages); -int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +int filemap_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); int ret = VM_FAULT_LOCKED; sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); lock_page(page); if (page->mapping != inode->i_mapping) { unlock_page(page); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 30e7709a5121..167fd0722c15 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3142,7 +3142,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get * this far. 
*/ -static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int hugetlb_vm_op_fault(struct vm_fault *vmf) { BUG(); return 0; diff --git a/mm/memory.c b/mm/memory.c index 7663068a33c6..cf97d88158cd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2035,7 +2035,7 @@ static int do_page_mkwrite(struct vm_fault *vmf) vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - ret = vmf->vma->vm_ops->page_mkwrite(vmf->vma, vmf); + ret = vmf->vma->vm_ops->page_mkwrite(vmf); /* Restore original flags so that caller is not surprised */ vmf->flags = old_flags; if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) @@ -2307,7 +2307,7 @@ static int wp_pfn_shared(struct vm_fault *vmf) pte_unmap_unlock(vmf->pte, vmf->ptl); vmf->flags |= FAULT_FLAG_MKWRITE; - ret = vma->vm_ops->pfn_mkwrite(vma, vmf); + ret = vma->vm_ops->pfn_mkwrite(vmf); if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) return ret; return finish_mkwrite_fault(vmf); @@ -2861,7 +2861,7 @@ static int __do_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; int ret; - ret = vma->vm_ops->fault(vma, vmf); + ret = vma->vm_ops->fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | VM_FAULT_DONE_COW))) return ret; diff --git a/mm/mmap.c b/mm/mmap.c index b729084eea90..1cd70010edf0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3125,8 +3125,7 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) mm->data_vm += npages; } -static int special_mapping_fault(struct vm_area_struct *vma, - struct vm_fault *vmf); +static int special_mapping_fault(struct vm_fault *vmf); /* * Having a close hook prevents vma merging regardless of flags. @@ -3161,9 +3160,9 @@ static const struct vm_operations_struct legacy_special_mapping_vmops = { .fault = special_mapping_fault, }; -static int special_mapping_fault(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int special_mapping_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; pgoff_t pgoff; struct page **pages; @@ -3173,7 +3172,7 @@ static int special_mapping_fault(struct vm_area_struct *vma, struct vm_special_mapping *sm = vma->vm_private_data; if (sm->fault) - return sm->fault(sm, vma, vmf); + return sm->fault(sm, vmf->vma, vmf); pages = sm->pages; } diff --git a/mm/nommu.c b/mm/nommu.c index bc964c26be8c..62600ba0d1d8 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1794,7 +1794,7 @@ void unmap_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL(unmap_mapping_range); -int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int filemap_fault(struct vm_fault *vmf) { BUG(); return 0; diff --git a/mm/shmem.c b/mm/shmem.c index 9c6d22ff44e2..f7f2330c6cb6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1908,8 +1908,9 @@ static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync return ret; } -static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int shmem_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); enum sgp_type sgp; diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index c354807381c1..c9e8a9898ce4 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -424,10 +424,9 @@ out: return ret; } -static int sel_mmap_policy_fault(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int sel_mmap_policy_fault(struct vm_fault *vmf) { - struct policy_load_memory *plm = 
vma->vm_file->private_data; + struct policy_load_memory *plm = vmf->vma->vm_file->private_data; unsigned long offset; struct page *page; diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 9d33c1e85c79..aec9c92250fd 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -3245,10 +3245,9 @@ static unsigned int snd_pcm_capture_poll(struct file *file, poll_table * wait) /* * mmap status record */ -static int snd_pcm_mmap_status_fault(struct vm_area_struct *area, - struct vm_fault *vmf) +static int snd_pcm_mmap_status_fault(struct vm_fault *vmf) { - struct snd_pcm_substream *substream = area->vm_private_data; + struct snd_pcm_substream *substream = vmf->vma->vm_private_data; struct snd_pcm_runtime *runtime; if (substream == NULL) @@ -3282,10 +3281,9 @@ static int snd_pcm_mmap_status(struct snd_pcm_substream *substream, struct file /* * mmap control record */ -static int snd_pcm_mmap_control_fault(struct vm_area_struct *area, - struct vm_fault *vmf) +static int snd_pcm_mmap_control_fault(struct vm_fault *vmf) { - struct snd_pcm_substream *substream = area->vm_private_data; + struct snd_pcm_substream *substream = vmf->vma->vm_private_data; struct snd_pcm_runtime *runtime; if (substream == NULL) @@ -3341,10 +3339,9 @@ snd_pcm_default_page_ops(struct snd_pcm_substream *substream, unsigned long ofs) /* * fault callback for mmapping a RAM page */ -static int snd_pcm_mmap_data_fault(struct vm_area_struct *area, - struct vm_fault *vmf) +static int snd_pcm_mmap_data_fault(struct vm_fault *vmf) { - struct snd_pcm_substream *substream = area->vm_private_data; + struct snd_pcm_substream *substream = vmf->vma->vm_private_data; struct snd_pcm_runtime *runtime; unsigned long offset; struct page * page; diff --git a/sound/usb/usx2y/us122l.c b/sound/usb/usx2y/us122l.c index cf5dc33f4a6d..cf45bf1f7ee0 100644 --- a/sound/usb/usx2y/us122l.c +++ b/sound/usb/usx2y/us122l.c @@ -137,13 +137,12 @@ static void usb_stream_hwdep_vm_open(struct vm_area_struct *area) snd_printdd(KERN_DEBUG "%i\n", atomic_read(&us122l->mmap_count)); } -static int usb_stream_hwdep_vm_fault(struct vm_area_struct *area, - struct vm_fault *vmf) +static int usb_stream_hwdep_vm_fault(struct vm_fault *vmf) { unsigned long offset; struct page *page; void *vaddr; - struct us122l *us122l = area->vm_private_data; + struct us122l *us122l = vmf->vma->vm_private_data; struct usb_stream *s; mutex_lock(&us122l->mutex); diff --git a/sound/usb/usx2y/usX2Yhwdep.c b/sound/usb/usx2y/usX2Yhwdep.c index 0b34dbc8f302..605e1047c01d 100644 --- a/sound/usb/usx2y/usX2Yhwdep.c +++ b/sound/usb/usx2y/usX2Yhwdep.c @@ -31,19 +31,18 @@ #include "usbusx2y.h" #include "usX2Yhwdep.h" -static int snd_us428ctls_vm_fault(struct vm_area_struct *area, - struct vm_fault *vmf) +static int snd_us428ctls_vm_fault(struct vm_fault *vmf) { unsigned long offset; struct page * page; void *vaddr; snd_printdd("ENTER, start %lXh, pgoff %ld\n", - area->vm_start, + vmf->vma->vm_start, vmf->pgoff); offset = vmf->pgoff << PAGE_SHIFT; - vaddr = (char*)((struct usX2Ydev *)area->vm_private_data)->us428ctls_sharedmem + offset; + vaddr = (char *)((struct usX2Ydev *)vmf->vma->vm_private_data)->us428ctls_sharedmem + offset; page = virt_to_page(vaddr); get_page(page); vmf->page = page; diff --git a/sound/usb/usx2y/usx2yhwdeppcm.c b/sound/usb/usx2y/usx2yhwdeppcm.c index 90766a92e7fd..f95164b91152 100644 --- a/sound/usb/usx2y/usx2yhwdeppcm.c +++ b/sound/usb/usx2y/usx2yhwdeppcm.c @@ -652,14 +652,13 @@ static void snd_usX2Y_hwdep_pcm_vm_close(struct vm_area_struct *area) } 
-static int snd_usX2Y_hwdep_pcm_vm_fault(struct vm_area_struct *area, - struct vm_fault *vmf) +static int snd_usX2Y_hwdep_pcm_vm_fault(struct vm_fault *vmf) { unsigned long offset; void *vaddr; offset = vmf->pgoff << PAGE_SHIFT; - vaddr = (char*)((struct usX2Ydev *)area->vm_private_data)->hwdep_pcm_shm + offset; + vaddr = (char *)((struct usX2Ydev *)vmf->vma->vm_private_data)->hwdep_pcm_shm + offset; vmf->page = virt_to_page(vaddr); get_page(vmf->page); return 0; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index cc4d6e0dd2a2..5b0dd4a9b2cb 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2350,9 +2350,9 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) } EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); -static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int kvm_vcpu_fault(struct vm_fault *vmf) { - struct kvm_vcpu *vcpu = vma->vm_file->private_data; + struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data; struct page *page; if (vmf->pgoff == 0) -- cgit v1.2.3 From c8394812e56fbc334d815226268cea69b447d461 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 24 Feb 2017 14:57:42 -0800 Subject: uprobes: split THPs before trying to replace them Patch series "Fix few rmap-related THP bugs", v3. The patchset fixes handling PTE-mapped THPs in page_referenced() and page_idle_clear_pte_refs(). To achieve that I've introduced a new helper -- page_vma_mapped_walk() -- which replaces all page_check_address{,_transhuge}() and covers all THP cases. Patchset overview: - First patch fixes one uprobe bug (unrelated to the rest of the patchset, just spotted it at the same time); - Patches 2-5 fix handling PTE-mapped THPs in page_referenced(), page_idle_clear_pte_refs() and rmap core; - Patches 6-12 convert all page_check_address{,_transhuge}() users (plus remove_migration_pte()) to page_vma_mapped_walk() and drop unused helpers. I think the fixes are not critical enough for stable@ as they don't lead to crashes or hangs, only suboptimal behaviour. This patch (of 12): For THPs page_check_address() always fails. This leads to an endless loop in uprobe_write_opcode(). Testcase with huge-tmpfs (uprobes cannot probe anonymous memory):

mount -t debugfs none /sys/kernel/debug
mount -t tmpfs -o huge=always none /mnt
gcc -Wall -O2 -o /mnt/test -x c - < /sys/kernel/debug/tracing/uprobe_events
echo 1 > /sys/kernel/debug/tracing/events/uprobes/enable
/mnt/test

Let's split THPs before trying to replace them. Link: http://lkml.kernel.org/r/20170129173858.45174-2-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Rik van Riel Acked-by: Johannes Weiner Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Hillf Danton Cc: Srikar Dronamraju Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d416f3baf392..1e65c79e52a6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -300,8 +300,8 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, retry: /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, - &vma, NULL); + ret = get_user_pages_remote(NULL, mm, vaddr, 1, + FOLL_FORCE | FOLL_SPLIT, &old_page, &vma, NULL); if (ret <= 0) return ret; -- cgit v1.2.3 From 14fa2daa15887f9246cfedc345e83e8d24cb9058 Mon Sep 17 00:00:00 2001 From: "Kirill A.
Shutemov" Date: Fri, 24 Feb 2017 14:58:07 -0800 Subject: mm, uprobes: convert __replace_page() to use page_vma_mapped_walk() For consistency, it worth converting all page_check_address() to page_vma_mapped_walk(), so we could drop the former. Link: http://lkml.kernel.org/r/20170129173858.45174-10-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Reviewed-by: Srikar Dronamraju Cc: Andrea Arcangeli Cc: Hillf Danton Cc: Hugh Dickins Cc: Johannes Weiner Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1e65c79e52a6..18c6b23edd3c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -153,14 +153,19 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) { struct mm_struct *mm = vma->vm_mm; - spinlock_t *ptl; - pte_t *ptep; + struct page_vma_mapped_walk pvmw = { + .page = old_page, + .vma = vma, + .address = addr, + }; int err; /* For mmu_notifiers */ const unsigned long mmun_start = addr; const unsigned long mmun_end = addr + PAGE_SIZE; struct mem_cgroup *memcg; + VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); + err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg, false); if (err) @@ -171,11 +176,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); err = -EAGAIN; - ptep = page_check_address(old_page, mm, addr, &ptl, 0); - if (!ptep) { + if (!page_vma_mapped_walk(&pvmw)) { mem_cgroup_cancel_charge(new_page, memcg, false); goto unlock; } + VM_BUG_ON_PAGE(addr != pvmw.address, old_page); get_page(new_page); page_add_new_anon_rmap(new_page, vma, addr, false); @@ -187,14 +192,15 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, inc_mm_counter(mm, MM_ANONPAGES); } - flush_cache_page(vma, addr, pte_pfn(*ptep)); - ptep_clear_flush_notify(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, mk_pte(new_page, vma->vm_page_prot)); + flush_cache_page(vma, addr, pte_pfn(*pvmw.pte)); + ptep_clear_flush_notify(vma, addr, pvmw.pte); + set_pte_at_notify(mm, addr, pvmw.pte, + mk_pte(new_page, vma->vm_page_prot)); page_remove_rmap(old_page, false); if (!page_mapped(old_page)) try_to_free_swap(old_page); - pte_unmap_unlock(ptep, ptl); + page_vma_mapped_walk_done(&pvmw); if (vma->vm_flags & VM_LOCKED) munlock_vma_page(old_page); -- cgit v1.2.3 From ca49ca7114553587736fe78319e22f073b631380 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 24 Feb 2017 14:58:25 -0800 Subject: userfaultfd: non-cooperative: add event for exit() notification Allow userfaultfd monitor track termination of the processes that have memory backed by the uffd. [rppt@linux.vnet.ibm.com: add comment] Link: http://lkml.kernel.org/r/20170202135448.GB19804@rapoport-lnxLink: http://lkml.kernel.org/r/1485542673-24387-4-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Acked-by: Hillf Danton Cc: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 28 ++++++++++++++++++++++++++++ include/linux/userfaultfd_k.h | 7 +++++++ include/uapi/linux/userfaultfd.h | 5 ++++- kernel/exit.c | 2 ++ 4 files changed, 41 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 4c78458ea78d..b676575f2268 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -774,6 +774,34 @@ void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf) } } +void userfaultfd_exit(struct mm_struct *mm) +{ + struct vm_area_struct *vma = mm->mmap; + + /* + * We can do the vma walk without locking because the caller + * (exit_mm) knows it now has exclusive access + */ + while (vma) { + struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; + + if (ctx && (ctx->features & UFFD_FEATURE_EVENT_EXIT)) { + struct userfaultfd_wait_queue ewq; + + userfaultfd_ctx_get(ctx); + + msg_init(&ewq.msg); + ewq.msg.event = UFFD_EVENT_EXIT; + + userfaultfd_event_wait_completion(ctx, &ewq); + + ctx->features &= ~UFFD_FEATURE_EVENT_EXIT; + } + + vma = vma->vm_next; + } +} + static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index a40be5d0661b..0468548acebf 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -72,6 +72,8 @@ extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, extern void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf); +extern void userfaultfd_exit(struct mm_struct *mm); + #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -136,6 +138,11 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf) { } + +static inline void userfaultfd_exit(struct mm_struct *mm) +{ +} + #endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 3b059530dac9..c055947c5c98 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -18,7 +18,8 @@ * means the userland is reading). 
*/ #define UFFD_API ((__u64)0xAA) -#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ +#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_EXIT | \ + UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ UFFD_FEATURE_EVENT_REMOVE | \ UFFD_FEATURE_EVENT_UNMAP | \ @@ -112,6 +113,7 @@ struct uffd_msg { #define UFFD_EVENT_REMAP 0x14 #define UFFD_EVENT_REMOVE 0x15 #define UFFD_EVENT_UNMAP 0x16 +#define UFFD_EVENT_EXIT 0x17 /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -161,6 +163,7 @@ struct uffdio_api { #define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) #define UFFD_FEATURE_MISSING_SHMEM (1<<5) #define UFFD_FEATURE_EVENT_UNMAP (1<<6) +#define UFFD_FEATURE_EVENT_EXIT (1<<7) __u64 features; __u64 ioctls; diff --git a/kernel/exit.c b/kernel/exit.c index 9960accbf2ab..90b09ca35c84 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -547,6 +548,7 @@ static void exit_mm(void) enter_lazy_tlb(mm, current); task_unlock(current); mm_update_next_owner(mm); + userfaultfd_exit(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) exit_oom_victim(); -- cgit v1.2.3 From 3e6daded1f51b79ff851b6c1e4b192f47ea3d063 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 24 Feb 2017 15:00:44 -0800 Subject: kernel/notifier.c: simplify expression NOTIFY_STOP_MASK (0x8000) has only one bit set and there is no need to compare output of "ret & NOTIFY_STOP_MASK" to NOTIFY_STOP_MASK. We just need to make sure the output is non-zero, that's it. Link: http://lkml.kernel.org/r/88ee58264a2bfab1c97ffc8ac753e25f55f57c10.1483593065.git.viresh.kumar@linaro.org Signed-off-by: Viresh Kumar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/notifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/notifier.c b/kernel/notifier.c index fd2c9acbcc19..6196af8a8223 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -95,7 +95,7 @@ static int notifier_call_chain(struct notifier_block **nl, if (nr_calls) (*nr_calls)++; - if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) + if (ret & NOTIFY_STOP_MASK) break; nb = next_nb; nr_to_call--; -- cgit v1.2.3 From 738bc38d49e017fe7acb3596712518e22c225816 Mon Sep 17 00:00:00 2001 From: Bhumika Goyal Date: Fri, 24 Feb 2017 15:00:46 -0800 Subject: kernel/ksysfs.c: add __ro_after_init to bin_attribute structure The object notes_attr of type bin_attribute is not modified after getting initialized by ksysfs_init. Apart from initialization in ksysfs_init, it is also passed as an argument to the function sysfs_create_bin_file, but this argument is of type const. Therefore, add __ro_after_init to its declaration. Link: http://lkml.kernel.org/r/1486839969-16891-1-git-send-email-bhumirks@gmail.com Signed-off-by: Bhumika Goyal Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ksysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index ee1bc1bb8feb..0999679d6f26 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -195,7 +195,7 @@ static ssize_t notes_read(struct file *filp, struct kobject *kobj, return count; } -static struct bin_attribute notes_attr = { +static struct bin_attribute notes_attr __ro_after_init = { .attr = { .name = "notes", .mode = S_IRUGO, -- cgit v1.2.3 From 81d45bdf89135cd26dc7535c14a45db6cdd647fa Mon Sep 17 00:00:00 2001 From: "Rafael J.
Wysocki" Date: Fri, 24 Feb 2017 00:25:28 +0100 Subject: PM / hibernate: Untangle power_down() The power_down() routine in the core hibernation code is not exactly straightforward (to put it lightly), so clean it up to make it avoid invoking itself recursively, among other things. Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b26dbc48c75b..f251b4d32913 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -608,6 +608,22 @@ static void power_down(void) { #ifdef CONFIG_SUSPEND int error; + + if (hibernation_mode == HIBERNATION_SUSPEND) { + error = suspend_devices_and_enter(PM_SUSPEND_MEM); + if (error) { + hibernation_mode = hibernation_ops ? + HIBERNATION_PLATFORM : + HIBERNATION_SHUTDOWN; + } else { + /* Restore swap signature. */ + error = swsusp_unmark(); + if (error) + pr_err("PM: Swap will be unusable! Try swapon -a.\n"); + + return; + } + } #endif switch (hibernation_mode) { @@ -620,25 +636,6 @@ static void power_down(void) if (pm_power_off) kernel_power_off(); break; -#ifdef CONFIG_SUSPEND - case HIBERNATION_SUSPEND: - error = suspend_devices_and_enter(PM_SUSPEND_MEM); - if (error) { - if (hibernation_ops) - hibernation_mode = HIBERNATION_PLATFORM; - else - hibernation_mode = HIBERNATION_SHUTDOWN; - power_down(); - } - /* - * Restore swap signature. - */ - error = swsusp_unmark(); - if (error) - printk(KERN_ERR "PM: Swap will be unusable! " - "Try swapon -a.\n"); - return; -#endif } kernel_halt(); /* -- cgit v1.2.3 From 2872de1382a7c888fa69532eda25aa7342dfe748 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 24 Feb 2017 00:26:15 +0100 Subject: PM / hibernate: Define pr_fmt() and use pr_*() instead of printk() Define a pr_fmt() for hibernate.c and convert all of the explicit printk() calls into corresponding pr_*() so that they use the pr_fmt() format. Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 60 +++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index f251b4d32913..8951d0d04810 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -10,6 +10,8 @@ * This file is released under the GPLv2. 
*/ +#define pr_fmt(fmt) "PM: " fmt + #include #include #include @@ -104,7 +106,7 @@ EXPORT_SYMBOL(system_entering_hibernation); #ifdef CONFIG_PM_DEBUG static void hibernation_debug_sleep(void) { - printk(KERN_INFO "hibernation debug: Waiting for 5 seconds.\n"); + pr_info("hibernation debug: Waiting for 5 seconds.\n"); mdelay(5000); } @@ -250,10 +252,9 @@ void swsusp_show_speed(ktime_t start, ktime_t stop, centisecs = 1; /* avoid div-by-zero */ k = nr_pages * (PAGE_SIZE / 1024); kps = (k * 100) / centisecs; - printk(KERN_INFO "PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n", - msg, k, - centisecs / 100, centisecs % 100, - kps / 1000, (kps % 1000) / 10); + pr_info("%s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n", + msg, k, centisecs / 100, centisecs % 100, kps / 1000, + (kps % 1000) / 10); } /** @@ -271,8 +272,7 @@ static int create_image(int platform_mode) error = dpm_suspend_end(PMSG_FREEZE); if (error) { - printk(KERN_ERR "PM: Some devices failed to power down, " - "aborting hibernation\n"); + pr_err("Some devices failed to power down, aborting hibernation\n"); return error; } @@ -288,8 +288,7 @@ static int create_image(int platform_mode) error = syscore_suspend(); if (error) { - printk(KERN_ERR "PM: Some system devices failed to power down, " - "aborting hibernation\n"); + pr_err("Some system devices failed to power down, aborting hibernation\n"); goto Enable_irqs; } @@ -304,8 +303,8 @@ static int create_image(int platform_mode) restore_processor_state(); trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false); if (error) - printk(KERN_ERR "PM: Error %d creating hibernation image\n", - error); + pr_err("Error %d creating hibernation image\n", error); + if (!in_suspend) { events_check_enabled = false; clear_free_pages(); @@ -432,8 +431,7 @@ static int resume_target_kernel(bool platform_mode) error = dpm_suspend_end(PMSG_QUIESCE); if (error) { - printk(KERN_ERR "PM: Some devices failed to power down, " - "aborting resume\n"); + pr_err("Some devices failed to power down, aborting resume\n"); return error; } @@ -619,7 +617,7 @@ static void power_down(void) /* Restore swap signature. */ error = swsusp_unmark(); if (error) - pr_err("PM: Swap will be unusable! Try swapon -a.\n"); + pr_err("Swap will be unusable! Try swapon -a.\n"); return; } @@ -642,7 +640,7 @@ static void power_down(void) * Valid image is on the disk, if we continue we risk serious data * corruption after resume. */ - printk(KERN_CRIT "PM: Please power down manually\n"); + pr_crit("Power down manually\n"); while (1) cpu_relax(); } @@ -652,7 +650,7 @@ static int load_image_and_restore(void) int error; unsigned int flags; - pr_debug("PM: Loading hibernation image.\n"); + pr_debug("Loading hibernation image.\n"); lock_device_hotplug(); error = create_basic_memory_bitmaps(); @@ -664,7 +662,7 @@ static int load_image_and_restore(void) if (!error) hibernation_restore(flags & SF_PLATFORM_MODE); - printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); + pr_err("Failed to load hibernation image, recovering.\n"); swsusp_free(); free_basic_memory_bitmaps(); Unlock: @@ -682,7 +680,7 @@ int hibernate(void) bool snapshot_test = false; if (!hibernation_available()) { - pr_debug("PM: Hibernation not available.\n"); + pr_debug("Hibernation not available.\n"); return -EPERM; } @@ -700,9 +698,9 @@ int hibernate(void) goto Exit; } - printk(KERN_INFO "PM: Syncing filesystems ... "); + pr_info("Syncing filesystems ... 
\n"); sys_sync(); - printk("done.\n"); + pr_info("done.\n"); error = freeze_processes(); if (error) @@ -728,7 +726,7 @@ int hibernate(void) else flags |= SF_CRC32_MODE; - pr_debug("PM: writing image.\n"); + pr_debug("Writing image.\n"); error = swsusp_write(flags); swsusp_free(); if (!error) { @@ -740,7 +738,7 @@ int hibernate(void) in_suspend = 0; pm_restore_gfp_mask(); } else { - pr_debug("PM: Image restored successfully.\n"); + pr_debug("Image restored successfully.\n"); } Free_bitmaps: @@ -748,7 +746,7 @@ int hibernate(void) Thaw: unlock_device_hotplug(); if (snapshot_test) { - pr_debug("PM: Checking hibernation image\n"); + pr_debug("Checking hibernation image\n"); error = swsusp_check(); if (!error) error = load_image_and_restore(); @@ -812,10 +810,10 @@ static int software_resume(void) goto Unlock; } - pr_debug("PM: Checking hibernation image partition %s\n", resume_file); + pr_debug("Checking hibernation image partition %s\n", resume_file); if (resume_delay) { - printk(KERN_INFO "Waiting %dsec before reading resume device...\n", + pr_info("Waiting %dsec before reading resume device ...\n", resume_delay); ssleep(resume_delay); } @@ -854,10 +852,10 @@ static int software_resume(void) } Check_image: - pr_debug("PM: Hibernation image partition %d:%d present\n", + pr_debug("Hibernation image partition %d:%d present\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); - pr_debug("PM: Looking for hibernation image.\n"); + pr_debug("Looking for hibernation image.\n"); error = swsusp_check(); if (error) goto Unlock; @@ -876,7 +874,7 @@ static int software_resume(void) goto Close_Finish; } - pr_debug("PM: Preparing processes for restore.\n"); + pr_debug("Preparing processes for restore.\n"); error = freeze_processes(); if (error) goto Close_Finish; @@ -889,7 +887,7 @@ static int software_resume(void) /* For success case, the suspend path will release the lock */ Unlock: mutex_unlock(&pm_mutex); - pr_debug("PM: Hibernation image not present or could not be loaded.\n"); + pr_debug("Hibernation image not present or could not be loaded.\n"); return error; Close_Finish: swsusp_close(FMODE_READ); @@ -1013,7 +1011,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, error = -EINVAL; if (!error) - pr_debug("PM: Hibernation mode set to '%s'\n", + pr_debug("Hibernation mode set to '%s'\n", hibernation_modes[mode]); unlock_system_sleep(); return error ? error : n; @@ -1049,7 +1047,7 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, lock_system_sleep(); swsusp_resume_device = res; unlock_system_sleep(); - printk(KERN_INFO "PM: Starting manual resume from disk\n"); + pr_info("Starting manual resume from disk\n"); noresume = 0; software_resume(); return n; -- cgit v1.2.3 From 441398d378f29a5ad6d0fcda07918e54e4961800 Mon Sep 17 00:00:00 2001 From: Stas Sergeev Date: Mon, 27 Feb 2017 14:27:25 -0800 Subject: sigaltstack: support SS_AUTODISARM for CONFIG_COMPAT Currently SS_AUTODISARM is not supported in compatibility mode, but does not return -EINVAL either. This makes dosemu built with -m32 on x86_64 to crash. Also the kernel's sigaltstack selftest fails if compiled with -m32. This patch adds the needed support. 
Link: http://lkml.kernel.org/r/20170205101213.8163-2-stsp@list.ru Signed-off-by: Stas Sergeev Cc: Milosz Tanski Cc: Andy Lutomirski Cc: Al Viro Cc: Arnd Bergmann Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Nicolas Pitre Cc: Waiman Long Cc: Dave Hansen Cc: Dmitry Safonov Cc: Wang Xiaoqiang Cc: Oleg Nesterov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compat.h | 4 +++- kernel/signal.c | 11 +++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/compat.h b/include/linux/compat.h index 9e40be522793..aef47be2a5c1 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -711,8 +711,10 @@ int __compat_save_altstack(compat_stack_t __user *, unsigned long); compat_stack_t __user *__uss = uss; \ struct task_struct *t = current; \ put_user_ex(ptr_to_compat((void __user *)t->sas_ss_sp), &__uss->ss_sp); \ - put_user_ex(sas_ss_flags(sp), &__uss->ss_flags); \ + put_user_ex(t->sas_ss_flags, &__uss->ss_flags); \ put_user_ex(t->sas_ss_size, &__uss->ss_size); \ + if (t->sas_ss_flags & SS_AUTODISARM) \ + sas_ss_reset(t); \ } while (0); asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, diff --git a/kernel/signal.c b/kernel/signal.c index 13f9def8b24a..214a8feeb771 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3239,10 +3239,17 @@ int compat_restore_altstack(const compat_stack_t __user *uss) int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) { + int err; struct task_struct *t = current; - return __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) | - __put_user(sas_ss_flags(sp), &uss->ss_flags) | + err = __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), + &uss->ss_sp) | + __put_user(t->sas_ss_flags, &uss->ss_flags) | __put_user(t->sas_ss_size, &uss->ss_size); + if (err) + return err; + if (t->sas_ss_flags & SS_AUTODISARM) + sas_ss_reset(t); + return 0; } #endif -- cgit v1.2.3 From 2d75cb59a5c6ade417d3c8b7f3654408ca6a71d5 Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Mon, 27 Feb 2017 14:28:03 -0800 Subject: config: android-recommended: disable aio support The aio interface adds substantial attack surface for a feature that's not being exposed by Android at all. It's unlikely that anyone is using the kernel feature directly either. This feature is rarely used even on servers. The glibc POSIX aio calls really use thread pools. The lack of widespread usage also means this is relatively poorly audited/tested. The kernel's aio rarely provides performance benefits over using a thread pool and is quite incomplete in terms of system call coverage along with having edge cases where blocking can occur. Part of the performance issue is the fact that it only supports direct io, not buffered io. The existing API is considered fundamentally flawed and it's unlikely it will be expanded, but rather replaced: https://marc.info/?l=linux-aio&m=145255815216051&w=2 Since ext4 encryption means no direct io support, kernel aio isn't even going to work properly on Android devices using file-based encryption. 
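To make the distinction above concrete, here is a minimal sketch (not from the patch) of the POSIX AIO interface most programs actually use. glibc services these calls from its own thread pool, so they keep working with CONFIG_AIO disabled; only the io_setup()/io_submit()/io_getevents() syscalls go away. The file path is just a placeholder; link with -lrt:

#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	static char buf[4096];
	struct aiocb cb;
	int fd = open("/etc/hostname", O_RDONLY);	/* placeholder file */

	if (fd < 0)
		return 1;

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = sizeof(buf);

	aio_read(&cb);			/* handed to a glibc worker thread */
	while (aio_error(&cb) == EINPROGRESS) {
		const struct aiocb *const list[] = { &cb };

		aio_suspend(list, 1, NULL);	/* wait for completion */
	}
	printf("read %zd bytes\n", aio_return(&cb));
	close(fd);
	return 0;
}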
Reviewed-at: https://android-review.googlesource.com/#/c/292158/ Link: http://lkml.kernel.org/r/1481113148-29204-1-git-send-email-amit.pundir@linaro.org Signed-off-by: Daniel Micay Signed-off-by: Amit Pundir Cc: Rob Herring Cc: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/configs/android-recommended.config | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index 99127edc5204..28ee064b6744 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -1,4 +1,5 @@ # KEEP ALPHABETICALLY SORTED +# CONFIG_AIO is not set # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_LEGACY_PTYS is not set -- cgit v1.2.3 From 0d31a194d517ce77a1c3a06cd8708184bb6e7862 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 27 Feb 2017 14:28:06 -0800 Subject: config: android-base: enable hardened usercopy and kernel ASLR Enable CONFIG_HARDENED_USERCOPY and CONFIG_RANDOMIZE_BASE in Android base config fragment. Reviewed at https://android-review.googlesource.com/#/c/283659/ Reviewed at https://android-review.googlesource.com/#/c/278133/ Link: http://lkml.kernel.org/r/1481113148-29204-2-git-send-email-amit.pundir@linaro.org Signed-off-by: Amit Pundir Cc: Rob Herring Cc: John Stultz Cc: Daniel Micay Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/configs/android-base.config | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index 1a8f34f63601..26a06e09a5bd 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -21,6 +21,7 @@ CONFIG_CP15_BARRIER_EMULATION=y CONFIG_DEFAULT_SECURITY_SELINUX=y CONFIG_EMBEDDED=y CONFIG_FB=y +CONFIG_HARDENED_USERCOPY=y CONFIG_HIGH_RES_TIMERS=y CONFIG_INET6_AH=y CONFIG_INET6_ESP=y @@ -129,6 +130,7 @@ CONFIG_PPP_DEFLATE=y CONFIG_PPP_MPPE=y CONFIG_PREEMPT=y CONFIG_QUOTA=y +CONFIG_RANDOMIZE_BASE=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y CONFIG_SECCOMP=y -- cgit v1.2.3 From 9332ef9dbd172d4ab0a0141df7cb21c696a5ce96 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 27 Feb 2017 14:28:47 -0800 Subject: scripts/spelling.txt: add "an user" pattern and fix typo instances Fix typos and add the following to the scripts/spelling.txt: an user||a user an userspace||a userspace I also added "userspace" to the list since it is a common word in Linux. I found some instances of "an userfaultfd", but I did not add it to the list. I felt it is endless to find words that start with "user" such as "userland" etc., so I had to draw a line somewhere.
Link: http://lkml.kernel.org/r/1481573103-11329-4-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/ras.rst | 2 +- Documentation/devicetree/bindings/opp/opp.txt | 2 +- Documentation/filesystems/quota.txt | 2 +- Documentation/kselftest.txt | 4 ++-- Documentation/media/dvb-drivers/ci.rst | 2 +- Documentation/networking/cdc_mbim.txt | 4 ++-- Documentation/vm/userfaultfd.txt | 2 +- arch/Kconfig | 2 +- arch/powerpc/xmon/ppc-opc.c | 2 +- arch/x86/kvm/mmu.c | 2 +- drivers/media/dvb-core/dvb_ringbuffer.h | 4 ++-- drivers/scsi/lpfc/lpfc_attr.c | 2 +- fs/userfaultfd.c | 6 +++--- include/net/mac80211.h | 2 +- kernel/irq/manage.c | 2 +- net/bluetooth/hci_sock.c | 6 +++--- net/netfilter/nfnetlink_cthelper.c | 2 +- scripts/spelling.txt | 2 ++ tools/perf/Documentation/tips.txt | 2 +- 19 files changed, 27 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/ras.rst b/Documentation/admin-guide/ras.rst index 9939348bd4a3..1b90c6f00a92 100644 --- a/Documentation/admin-guide/ras.rst +++ b/Documentation/admin-guide/ras.rst @@ -81,7 +81,7 @@ That defines some categories of errors: still run, eventually replacing the affected hardware by a hot spare, if available. - Also, when an error happens on an userspace process, it is also possible to + Also, when an error happens on a userspace process, it is also possible to kill such process and let userspace restart it. The mechanism for handling non-fatal errors is usually complex and may diff --git a/Documentation/devicetree/bindings/opp/opp.txt b/Documentation/devicetree/bindings/opp/opp.txt index 9f5ca4457b5f..ecdcfb790704 100644 --- a/Documentation/devicetree/bindings/opp/opp.txt +++ b/Documentation/devicetree/bindings/opp/opp.txt @@ -136,7 +136,7 @@ Optional properties: larger OPP table, based on what version of the hardware we are running on. We still can't have multiple nodes with the same opp-hz value in OPP table. - It's an user defined array containing a hierarchy of hardware version numbers, + It's a user defined array containing a hierarchy of hardware version numbers, supported by the OPP. For example: a platform with hierarchy of three levels of versions (A, B and C), this field should be like , where X corresponds to Version hierarchy A, Y corresponds to version hierarchy B and Z diff --git a/Documentation/filesystems/quota.txt b/Documentation/filesystems/quota.txt index 29fc01552646..32874b06ebe9 100644 --- a/Documentation/filesystems/quota.txt +++ b/Documentation/filesystems/quota.txt @@ -6,7 +6,7 @@ Quota subsystem allows system administrator to set limits on used space and number of used inodes (inode is a filesystem structure which is associated with each file or directory) for users and/or groups. For both used space and number of used inodes there are actually two limits. The first one is called softlimit -and the second one hardlimit. An user can never exceed a hardlimit for any +and the second one hardlimit. A user can never exceed a hardlimit for any resource (unless he has CAP_SYS_RESOURCE capability). User is allowed to exceed softlimit but only for limited period of time. This period is called "grace period" or "grace time". 
When grace time is over, user is not able to allocate diff --git a/Documentation/kselftest.txt b/Documentation/kselftest.txt index d431dc82c228..5bd590335839 100644 --- a/Documentation/kselftest.txt +++ b/Documentation/kselftest.txt @@ -59,14 +59,14 @@ Install selftests ================= You can use kselftest_install.sh tool installs selftests in default -location which is tools/testing/selftests/kselftest or an user specified +location which is tools/testing/selftests/kselftest or a user specified location. To install selftests in default location: $ cd tools/testing/selftests $ ./kselftest_install.sh -To install selftests in an user specified location: +To install selftests in a user specified location: $ cd tools/testing/selftests $ ./kselftest_install.sh install_dir diff --git a/Documentation/media/dvb-drivers/ci.rst b/Documentation/media/dvb-drivers/ci.rst index 8124bf5ce5ef..69b07e9d1816 100644 --- a/Documentation/media/dvb-drivers/ci.rst +++ b/Documentation/media/dvb-drivers/ci.rst @@ -20,7 +20,7 @@ existing low level CI API. ca_zap ~~~~~~ -An userspace application, like ``ca_zap`` is required to handle encrypted +A userspace application, like ``ca_zap`` is required to handle encrypted MPEG-TS streams. The ``ca_zap`` userland application is in charge of sending the diff --git a/Documentation/networking/cdc_mbim.txt b/Documentation/networking/cdc_mbim.txt index a15ea602aa52..b9482ca10254 100644 --- a/Documentation/networking/cdc_mbim.txt +++ b/Documentation/networking/cdc_mbim.txt @@ -38,7 +38,7 @@ Basic usage =========== MBIM functions are inactive when unmanaged. The cdc_mbim driver only -provides an userspace interface to the MBIM control channel, and will +provides a userspace interface to the MBIM control channel, and will not participate in the management of the function. This implies that a userspace MBIM management application always is required to enable a MBIM function. @@ -200,7 +200,7 @@ structure described in section 10.5.29 of [1]. The DSS VLAN subdevices are used as a practical interface between the shared MBIM data channel and a MBIM DSS aware userspace application. It is not intended to be presented as-is to an end user. The -assumption is that an userspace application initiating a DSS session +assumption is that a userspace application initiating a DSS session also takes care of the necessary framing of the DSS data, presenting the stream to the end user in an appropriate way for the stream type. diff --git a/Documentation/vm/userfaultfd.txt b/Documentation/vm/userfaultfd.txt index fe51a5aa8963..0e5543a920e5 100644 --- a/Documentation/vm/userfaultfd.txt +++ b/Documentation/vm/userfaultfd.txt @@ -149,7 +149,7 @@ migration thread in the QEMU running in the destination node will receive the page that triggered the userfault and it'll map it as usual with the UFFDIO_COPY|ZEROPAGE (without actually knowing if it was spontaneously sent by the source or if it was an urgent page -requested through an userfault). +requested through a userfault). By the time the userfaults start, the QEMU in the destination node doesn't need to keep any per-page state bitmap relative to the live diff --git a/arch/Kconfig b/arch/Kconfig index d0012add6b19..cd211a14a88f 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -29,7 +29,7 @@ config OPROFILE_EVENT_MULTIPLEX The number of hardware counters is limited. The multiplexing feature enables OProfile to gather more events than counters are provided by the hardware. This is realized by switching - between events at an user specified time interval. 
+ between events at a user specified time interval. If unsure, say N. diff --git a/arch/powerpc/xmon/ppc-opc.c b/arch/powerpc/xmon/ppc-opc.c index 6845e91ba04a..954dbf8222d7 100644 --- a/arch/powerpc/xmon/ppc-opc.c +++ b/arch/powerpc/xmon/ppc-opc.c @@ -1587,7 +1587,7 @@ extract_tbr (unsigned long insn, #define CTX(op, xop) (OP (op) | (((unsigned long)(xop)) & 0x7)) #define CTX_MASK CTX(0x3f, 0x7) -/* An User Context form instruction. */ +/* A User Context form instruction. */ #define UCTX(op, xop) (OP (op) | (((unsigned long)(xop)) & 0x1f)) #define UCTX_MASK UCTX(0x3f, 0x1f) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2fd7586aad4d..1cda35277278 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4102,7 +4102,7 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, * as a SMAP violation if all of the following * conditions are ture: * - X86_CR4_SMAP is set in CR4 - * - An user page is accessed + * - A user page is accessed * - Page fault in kernel mode * - if CPL = 3 or X86_EFLAGS_AC is clear * diff --git a/drivers/media/dvb-core/dvb_ringbuffer.h b/drivers/media/dvb-core/dvb_ringbuffer.h index bbe94873d44d..8ed6bcc3a56e 100644 --- a/drivers/media/dvb-core/dvb_ringbuffer.h +++ b/drivers/media/dvb-core/dvb_ringbuffer.h @@ -136,7 +136,7 @@ extern void dvb_ringbuffer_flush_spinlock_wakeup(struct dvb_ringbuffer *rbuf); } /** - * dvb_ringbuffer_read_user - Reads a buffer into an user pointer + * dvb_ringbuffer_read_user - Reads a buffer into a user pointer * * @rbuf: pointer to struct dvb_ringbuffer * @buf: pointer to the buffer where the data will be stored @@ -193,7 +193,7 @@ extern ssize_t dvb_ringbuffer_write(struct dvb_ringbuffer *rbuf, const u8 *buf, size_t len); /** - * dvb_ringbuffer_write_user - Writes a buffer received via an user pointer + * dvb_ringbuffer_write_user - Writes a buffer received via a user pointer * * @rbuf: pointer to struct dvb_ringbuffer * @buf: pointer to the buffer where the data will be read diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index 50cf402dea29..03cb05abc821 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -3329,7 +3329,7 @@ static DEVICE_ATTR(lpfc_static_vport, S_IRUGO, * @buf: Data buffer. * @count: Size of the data buffer. * - * This function get called when an user write to the lpfc_stat_data_ctrl + * This function get called when a user write to the lpfc_stat_data_ctrl * sysfs file. This function parse the command written to the sysfs file * and take appropriate action. These commands are used for controlling * driver statistical data collection. diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 625b7285a37b..e6e0a619cb3a 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1807,17 +1807,17 @@ static void init_once_userfaultfd_ctx(void *mem) } /** - * userfaultfd_file_create - Creates an userfaultfd file pointer. + * userfaultfd_file_create - Creates a userfaultfd file pointer. * @flags: Flags for the userfaultfd file. * - * This function creates an userfaultfd file pointer, w/out installing + * This function creates a userfaultfd file pointer, w/out installing * it into the fd table. This is useful when the userfaultfd file is * used during the initialization of data structures that require * extra setup after the userfaultfd creation. So the userfaultfd * creation is split into the file pointer creation phase, and the * file descriptor installation phase. 
In this way races with * userspace closing the newly installed file descriptor can be - * avoided. Returns an userfaultfd file pointer, or a proper error + * avoided. Returns a userfaultfd file pointer, or a proper error * pointer. */ static struct file *userfaultfd_file_create(int flags) diff --git a/include/net/mac80211.h b/include/net/mac80211.h index b9a08cd1d97d..a3bab3c5ecfb 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3392,7 +3392,7 @@ enum ieee80211_reconfig_type { * since there won't be any time to beacon before the switch anyway. * @pre_channel_switch: This is an optional callback that is called * before a channel switch procedure is started (ie. when a STA - * gets a CSA or an userspace initiated channel-switch), allowing + * gets a CSA or a userspace initiated channel-switch), allowing * the driver to prepare for the channel switch. * @post_channel_switch: This is an optional callback that is called * after a channel switch procedure is completed, allowing the diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6b669593e7eb..944d068b6c48 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -353,7 +353,7 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask) return 0; /* - * Preserve the managed affinity setting and an userspace affinity + * Preserve the managed affinity setting and a userspace affinity * setup, but make sure that one of the targets is online. */ if (irqd_affinity_is_managed(&desc->irq_data) || diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 48f9471e7c85..f64d6566021f 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -851,7 +851,7 @@ static int hci_sock_release(struct socket *sock) if (hdev) { if (hci_pi(sk)->channel == HCI_CHANNEL_USER) { - /* When releasing an user channel exclusive access, + /* When releasing a user channel exclusive access, * call hci_dev_do_close directly instead of calling * hci_dev_close to ensure the exclusive access will * be released and the controller brought back down. @@ -1172,7 +1172,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, /* In case the transport is already up and * running, clear the error here. * - * This can happen when opening an user + * This can happen when opening a user * channel and HCI_AUTO_OFF grace period * is still active. */ @@ -1190,7 +1190,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, if (!hci_sock_gen_cookie(sk)) { /* In the case when a cookie has already been assigned, * this socket will transition from a raw socket into - * an user channel socket. For a clean transition, send + * a user channel socket. For a clean transition, send * the close notification first. */ skb = create_monitor_ctrl_close(sk); diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 3b79f34b5095..de8782345c86 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -48,7 +48,7 @@ nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff, if (helper == NULL) return NF_DROP; - /* This is an user-space helper not yet configured, skip. */ + /* This is a user-space helper not yet configured, skip. 
*/ if ((helper->flags & (NF_CT_HELPER_F_USERSPACE | NF_CT_HELPER_F_CONFIGURED)) == NF_CT_HELPER_F_USERSPACE) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index 372840a672a4..13794532c3fa 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -86,6 +86,8 @@ alue||value ambigious||ambiguous amoung||among amout||amount +an user||a user +an userspace||a userspace analysator||analyzer ang||and anniversery||anniversary diff --git a/tools/perf/Documentation/tips.txt b/tools/perf/Documentation/tips.txt index 8a6479c0eac9..170b0289a7bc 100644 --- a/tools/perf/Documentation/tips.txt +++ b/tools/perf/Documentation/tips.txt @@ -22,7 +22,7 @@ If you have debuginfo enabled, try: perf report -s sym,srcline For memory address profiling, try: perf mem record / perf mem report For tracepoint events, try: perf report -s trace_fields To record callchains for each sample: perf record -g -To record every process run by an user: perf record -u +To record every process run by a user: perf record -u Skip collecing build-id when recording: perf record -B To change sampling frequency to 100 Hz: perf record -F 100 See assembly instructions with percentage: perf annotate -- cgit v1.2.3 From b564d62e67560423965c43d60249a09d8e70a27a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 27 Feb 2017 14:29:06 -0800 Subject: scripts/spelling.txt: add "varible" pattern and fix typo instances Fix typos and add the following to the scripts/spelling.txt: varible||variable While we are here, tidy up the comment blocks that fit in a single line for drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c and net/sctp/transport.c. Link: http://lkml.kernel.org/r/1481573103-11329-11-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/scsi/ChangeLog.megaraid_sas | 2 +- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 4 +--- drivers/net/wimax/i2400m/usb-fw.c | 2 +- kernel/torture.c | 2 +- net/sctp/transport.c | 4 +--- scripts/spelling.txt | 1 + 6 files changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/Documentation/scsi/ChangeLog.megaraid_sas b/Documentation/scsi/ChangeLog.megaraid_sas index 00ffdf187f0b..234ddabb23ef 100644 --- a/Documentation/scsi/ChangeLog.megaraid_sas +++ b/Documentation/scsi/ChangeLog.megaraid_sas @@ -549,7 +549,7 @@ ii. Reduced by 1 max cmds sent to FW from Driver to make the reply_q_sz same 3 Older Version : 00.00.03.02 i. Send stop adapter to FW & Dump pending FW cmds before declaring adapter dead. - New varible added to set dbg level. + New variable added to set dbg level. ii. Disable interrupt made as fn pointer as they are different for 1068 / 1078 iii. Frame count optimization. 
Main frame can contain 2 SGE for 64 bit SGLs and 3 SGE for 32 bit SGL diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index cbbf8648307a..78460c52b7c4 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -847,9 +847,7 @@ static void i40e_free_vf_res(struct i40e_vf *vf) wr32(hw, reg_idx, reg); i40e_flush(hw); } - /* reset some of the state varibles keeping - * track of the resources - */ + /* reset some of the state variables keeping track of the resources */ vf->num_queue_pairs = 0; vf->vf_states = 0; clear_bit(I40E_VF_STAT_INIT, &vf->vf_states); diff --git a/drivers/net/wimax/i2400m/usb-fw.c b/drivers/net/wimax/i2400m/usb-fw.c index e74664b84925..502c346aa790 100644 --- a/drivers/net/wimax/i2400m/usb-fw.c +++ b/drivers/net/wimax/i2400m/usb-fw.c @@ -237,7 +237,7 @@ void __i2400mu_bm_notif_cb(struct urb *urb) * * @i2400m: device descriptor * @urb: urb to use - * @completion: completion varible to complete when done + * @completion: completion variable to complete when done * * Data is always read to i2400m->bm_ack_buf */ diff --git a/kernel/torture.c b/kernel/torture.c index 0d887eb62856..01a99976f072 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -311,7 +311,7 @@ EXPORT_SYMBOL_GPL(torture_random); /* * Variables for shuffling. The idea is to ensure that each CPU stays * idle for an extended period to test interactions with dyntick idle, - * as well as interactions with any per-CPU varibles. + * as well as interactions with any per-CPU variables. */ struct shuffle_task { struct list_head st_l; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 5b63ceb3bf37..3379668af368 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -643,9 +643,7 @@ void sctp_transport_reset(struct sctp_transport *t) t->srtt = 0; t->rttvar = 0; - /* Reset these additional varibles so that we have a clean - * slate. - */ + /* Reset these additional variables so that we have a clean slate. */ t->partial_bytes_acked = 0; t->flight_size = 0; t->error_count = 0; diff --git a/scripts/spelling.txt b/scripts/spelling.txt index d22f0b8a9368..f445fea22a6f 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -1092,6 +1092,7 @@ vaid||valid vaild||valid valide||valid variantions||variations +varible||variable varient||variant vaule||value verbse||verbose -- cgit v1.2.3 From 5b5e0928f742cfa853b2411400a1b19fa379d758 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 27 Feb 2017 14:30:02 -0800 Subject: lib/vsprintf.c: remove %Z support Now that %z is standardised in C99 there is no reason to support %Z. Unlike %L it doesn't even make format strings smaller. Use BUILD_BUG_ON in a couple of ATM drivers. In case anyone didn't notice, lib/vsprintf.o is about half the size of SLUB, which is, in my opinion, quite an achievement. Hopefully this patch inspires someone else to trim vsprintf.c more.
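As a side note for anyone converting their own format strings, the surviving C99 spelling looks like this in a minimal userspace sketch (ISO C11 because of the _Static_assert; none of it is kernel code and the values are purely illustrative):

	#include <stdio.h>
	#include <stddef.h>

	/* Compile-time check in the spirit of the kernel's BUILD_BUG_ON(),
	 * which the ATM drivers now use instead of printing a size at
	 * runtime. This particular assertion is only a demo. */
	_Static_assert(sizeof(size_t) >= sizeof(int), "unexpected size_t width");

	int main(void)
	{
		size_t n = sizeof(long double);

		/* C99 length modifier 'z': the argument has the width of
		 * size_t. Only the lowercase form is standard; the
		 * kernel-private "%Zu" spelling is what this patch removes. */
		printf("n = %zu bytes (0x%zx)\n", n, n);
		return 0;
	}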
Link: http://lkml.kernel.org/r/20170103230126.GA30170@avx2 Signed-off-by: Alexey Dobriyan Cc: Andy Shevchenko Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/bsg.c | 6 +-- drivers/atm/ambassador.c | 7 +-- drivers/atm/eni.c | 6 +-- drivers/atm/firestream.c | 12 ++--- drivers/atm/horizon.c | 7 +-- drivers/atm/lanai.c | 16 +++--- drivers/char/pcmcia/cm4000_cs.c | 2 +- drivers/char/pcmcia/cm4040_cs.c | 2 +- drivers/gpu/drm/mga/mga_drv.h | 4 +- drivers/ide/ide-acpi.c | 2 +- drivers/ide/ide-tape.c | 4 +- drivers/input/touchscreen/cyttsp4_core.c | 62 +++++++++++----------- drivers/media/dvb-frontends/helene.c | 2 +- drivers/media/dvb-frontends/or51132.c | 2 +- drivers/media/dvb-frontends/tda10048.c | 2 +- drivers/media/pci/saa7164/saa7164-fw.c | 2 +- drivers/media/tuners/xc5000.c | 2 +- drivers/media/usb/dvb-usb/dib0700_devices.c | 4 +- drivers/misc/vmw_vmci/vmci_context.c | 2 +- drivers/net/arcnet/arcnet.c | 2 +- drivers/net/ethernet/cadence/macb.c | 2 +- drivers/net/gtp.c | 2 +- drivers/net/usb/rndis_host.c | 2 +- drivers/net/usb/sierra_net.c | 2 +- .../broadcom/brcm80211/brcmfmac/cfg80211.c | 2 +- drivers/net/wireless/intel/iwlegacy/4965-mac.c | 30 +++++------ drivers/net/wireless/intel/iwlwifi/iwl-drv.c | 16 +++--- drivers/parport/ieee1284_ops.c | 2 +- drivers/scsi/osd/osd_initiator.c | 2 +- drivers/scsi/osst.c | 6 +-- drivers/scsi/qla2xxx/qla_init.c | 10 ++-- drivers/tty/n_hdlc.c | 2 +- drivers/usb/gadget/legacy/inode.c | 2 +- drivers/usb/host/ehci-hcd.c | 2 +- drivers/usb/host/fotg210-hcd.c | 2 +- drivers/usb/host/ohci-hcd.c | 2 +- drivers/usb/misc/adutux.c | 6 +-- drivers/usb/misc/legousbtower.c | 2 +- drivers/usb/misc/uss720.c | 8 +-- drivers/video/fbdev/metronomefb.c | 2 +- fs/afs/dir.c | 14 ++--- fs/ncpfs/sock.c | 4 +- fs/nfs/blocklayout/blocklayout.c | 2 +- fs/nfs/filelayout/filelayout.c | 4 +- fs/nfs/flexfilelayout/flexfilelayout.c | 4 +- fs/nfs/objlayout/objlayout.c | 2 +- fs/nfsd/nfscache.c | 2 +- kernel/relay.c | 2 +- lib/vsprintf.c | 8 +-- mm/dmapool.c | 2 +- net/appletalk/ddp.c | 4 +- net/atm/mpc.c | 2 +- net/bridge/netfilter/ebt_among.c | 2 +- net/ieee802154/socket.c | 4 +- net/ipv4/fib_trie.c | 2 +- net/ipv4/ipmr.c | 2 +- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 2 +- net/ipv4/netfilter/nf_log_arp.c | 2 +- net/ipv6/netfilter/nf_log_ipv6.c | 2 +- net/irda/irnet/irnet_ppp.c | 12 ++--- net/l2tp/l2tp_core.c | 4 +- net/netfilter/ipvs/ip_vs_conn.c | 2 +- net/netfilter/ipvs/ip_vs_dh.c | 4 +- net/netfilter/ipvs/ip_vs_lblc.c | 4 +- net/netfilter/ipvs/ip_vs_lblcr.c | 4 +- net/netfilter/ipvs/ip_vs_sh.c | 4 +- net/netfilter/ipvs/ip_vs_sync.c | 2 +- net/netfilter/nf_conntrack_ftp.c | 2 +- net/sctp/output.c | 2 +- net/sunrpc/auth_gss/auth_gss.c | 2 +- net/sunrpc/svc.c | 2 +- net/sunrpc/svcsock.c | 4 +- net/sunrpc/xprtsock.c | 8 +-- security/selinux/ss/ebitmap.c | 2 +- security/selinux/ss/policydb.c | 2 +- sound/pci/korg1212/korg1212.c | 4 +- sound/pci/pcxhr/pcxhr_hwdep.c | 2 +- sound/pcmcia/vx/vxp_ops.c | 2 +- sound/soc/soc-core.c | 2 +- 79 files changed, 187 insertions(+), 201 deletions(-) (limited to 'kernel') diff --git a/block/bsg.c b/block/bsg.c index a9a8b8e0446f..74835dbf0c47 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -573,7 +573,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) int ret; ssize_t bytes_read; - dprintk("%s: read %Zd bytes\n", bd->name, count); + dprintk("%s: read %zd bytes\n", bd->name, count); bsg_set_block(bd, file); @@ -648,7 +648,7 @@ bsg_write(struct file *file, const char __user *buf, 
size_t count, loff_t *ppos) ssize_t bytes_written; int ret; - dprintk("%s: write %Zd bytes\n", bd->name, count); + dprintk("%s: write %zd bytes\n", bd->name, count); if (unlikely(segment_eq(get_fs(), KERNEL_DS))) return -EINVAL; @@ -667,7 +667,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) if (!bytes_written || err_block_err(ret)) bytes_written = ret; - dprintk("%s: returning %Zd\n", bd->name, bytes_written); + dprintk("%s: returning %zd\n", bd->name, bytes_written); return bytes_written; } diff --git a/drivers/atm/ambassador.c b/drivers/atm/ambassador.c index f1a9198dfe5a..4a610795b585 100644 --- a/drivers/atm/ambassador.c +++ b/drivers/atm/ambassador.c @@ -2394,12 +2394,7 @@ static int __init amb_module_init (void) { PRINTD (DBG_FLOW|DBG_INIT, "init_module"); - // sanity check - cast needed as printk does not support %Zu - if (sizeof(amb_mem) != 4*16 + 4*12) { - PRINTK (KERN_ERR, "Fix amb_mem (is %lu words).", - (unsigned long) sizeof(amb_mem)); - return -ENOMEM; - } + BUILD_BUG_ON(sizeof(amb_mem) != 4*16 + 4*12); show_version(); diff --git a/drivers/atm/eni.c b/drivers/atm/eni.c index 623359e407aa..b042ec458544 100644 --- a/drivers/atm/eni.c +++ b/drivers/atm/eni.c @@ -2326,11 +2326,7 @@ static int __init eni_init(void) { struct sk_buff *skb; /* dummy for sizeof */ - if (sizeof(skb->cb) < sizeof(struct eni_skb_prv)) { - printk(KERN_ERR "eni_detect: skb->cb is too small (%Zd < %Zd)\n", - sizeof(skb->cb),sizeof(struct eni_skb_prv)); - return -EIO; - } + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct eni_skb_prv)); return pci_register_driver(&eni_driver); } diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c index 80c2ddcfa92c..22dcab952a24 100644 --- a/drivers/atm/firestream.c +++ b/drivers/atm/firestream.c @@ -895,7 +895,7 @@ static int fs_open(struct atm_vcc *atm_vcc) /* XXX handle qos parameters (rate limiting) ? 
*/ vcc = kmalloc(sizeof(struct fs_vcc), GFP_KERNEL); - fs_dprintk (FS_DEBUG_ALLOC, "Alloc VCC: %p(%Zd)\n", vcc, sizeof(struct fs_vcc)); + fs_dprintk (FS_DEBUG_ALLOC, "Alloc VCC: %p(%zd)\n", vcc, sizeof(struct fs_vcc)); if (!vcc) { clear_bit(ATM_VF_ADDR, &atm_vcc->flags); return -ENOMEM; @@ -946,7 +946,7 @@ static int fs_open(struct atm_vcc *atm_vcc) if (DO_DIRECTION (txtp)) { tc = kmalloc (sizeof (struct fs_transmit_config), GFP_KERNEL); - fs_dprintk (FS_DEBUG_ALLOC, "Alloc tc: %p(%Zd)\n", + fs_dprintk (FS_DEBUG_ALLOC, "Alloc tc: %p(%zd)\n", tc, sizeof (struct fs_transmit_config)); if (!tc) { fs_dprintk (FS_DEBUG_OPEN, "fs: can't alloc transmit_config.\n"); @@ -1185,7 +1185,7 @@ static int fs_send (struct atm_vcc *atm_vcc, struct sk_buff *skb) vcc->last_skb = skb; td = kmalloc (sizeof (struct FS_BPENTRY), GFP_ATOMIC); - fs_dprintk (FS_DEBUG_ALLOC, "Alloc transd: %p(%Zd)\n", td, sizeof (struct FS_BPENTRY)); + fs_dprintk (FS_DEBUG_ALLOC, "Alloc transd: %p(%zd)\n", td, sizeof (struct FS_BPENTRY)); if (!td) { /* Oops out of mem */ return -ENOMEM; @@ -1492,7 +1492,7 @@ static void top_off_fp (struct fs_dev *dev, struct freepool *fp, fs_dprintk (FS_DEBUG_ALLOC, "Alloc rec-skb: %p(%d)\n", skb, fp->bufsize); if (!skb) break; ne = kmalloc (sizeof (struct FS_BPENTRY), gfp_flags); - fs_dprintk (FS_DEBUG_ALLOC, "Alloc rec-d: %p(%Zd)\n", ne, sizeof (struct FS_BPENTRY)); + fs_dprintk (FS_DEBUG_ALLOC, "Alloc rec-d: %p(%zd)\n", ne, sizeof (struct FS_BPENTRY)); if (!ne) { fs_dprintk (FS_DEBUG_ALLOC, "Free rec-skb: %p\n", skb); dev_kfree_skb_any (skb); @@ -1803,7 +1803,7 @@ static int fs_init(struct fs_dev *dev) } dev->atm_vccs = kcalloc (dev->nchannels, sizeof (struct atm_vcc *), GFP_KERNEL); - fs_dprintk (FS_DEBUG_ALLOC, "Alloc atmvccs: %p(%Zd)\n", + fs_dprintk (FS_DEBUG_ALLOC, "Alloc atmvccs: %p(%zd)\n", dev->atm_vccs, dev->nchannels * sizeof (struct atm_vcc *)); if (!dev->atm_vccs) { @@ -1911,7 +1911,7 @@ static int firestream_init_one(struct pci_dev *pci_dev, goto err_out; fs_dev = kzalloc (sizeof (struct fs_dev), GFP_KERNEL); - fs_dprintk (FS_DEBUG_ALLOC, "Alloc fs-dev: %p(%Zd)\n", + fs_dprintk (FS_DEBUG_ALLOC, "Alloc fs-dev: %p(%zd)\n", fs_dev, sizeof (struct fs_dev)); if (!fs_dev) goto err_out; diff --git a/drivers/atm/horizon.c b/drivers/atm/horizon.c index 584aa881882b..2bf1ef1c3c78 100644 --- a/drivers/atm/horizon.c +++ b/drivers/atm/horizon.c @@ -2884,12 +2884,7 @@ static struct pci_driver hrz_driver = { /********** module entry **********/ static int __init hrz_module_init (void) { - // sanity check - cast is needed since printk does not support %Zu - if (sizeof(struct MEMMAP) != 128*1024/4) { - PRINTK (KERN_ERR, "Fix struct MEMMAP (is %lu fakewords).", - (unsigned long) sizeof(struct MEMMAP)); - return -ENOMEM; - } + BUILD_BUG_ON(sizeof(struct MEMMAP) != 128*1024/4); show_version(); diff --git a/drivers/atm/lanai.c b/drivers/atm/lanai.c index 445505d9ea07..1a9bc51284b0 100644 --- a/drivers/atm/lanai.c +++ b/drivers/atm/lanai.c @@ -1389,7 +1389,7 @@ static void vcc_rx_aal5(struct lanai_vcc *lvcc, int endptr) if (n < 0) n += lanai_buf_size(&lvcc->rx.buf); APRINTK(n >= 0 && n < lanai_buf_size(&lvcc->rx.buf) && !(n & 15), - "vcc_rx_aal5: n out of range (%d/%Zu)\n", + "vcc_rx_aal5: n out of range (%d/%zu)\n", n, lanai_buf_size(&lvcc->rx.buf)); /* Recover the second-to-last word to get true pdu length */ if ((x = &end[-2]) < lvcc->rx.buf.start) @@ -1493,9 +1493,9 @@ static int lanai_get_sized_buffer(struct lanai_dev *lanai, return -ENOMEM; if (unlikely(lanai_buf_size(buf) < size)) 
printk(KERN_WARNING DEV_LABEL "(itf %d): wanted %d bytes " - "for %s buffer, got only %Zu\n", lanai->number, size, + "for %s buffer, got only %zu\n", lanai->number, size, name, lanai_buf_size(buf)); - DPRINTK("Allocated %Zu byte %s buffer\n", lanai_buf_size(buf), name); + DPRINTK("Allocated %zu byte %s buffer\n", lanai_buf_size(buf), name); return 0; } @@ -1586,7 +1586,7 @@ static int service_buffer_allocate(struct lanai_dev *lanai) lanai->pci); if (unlikely(lanai->service.start == NULL)) return -ENOMEM; - DPRINTK("allocated service buffer at 0x%08lX, size %Zu(%d)\n", + DPRINTK("allocated service buffer at 0x%08lX, size %zu(%d)\n", (unsigned long) lanai->service.start, lanai_buf_size(&lanai->service), lanai_buf_size_cardorder(&lanai->service)); @@ -2467,8 +2467,8 @@ static int lanai_proc_read(struct atm_dev *atmdev, loff_t *pos, char *page) (lanai->status & STATUS_LED) ? 1 : 0, (lanai->status & STATUS_GPIN) ? 1 : 0); if (left-- == 0) - return sprintf(page, "global buffer sizes: service=%Zu, " - "aal0_rx=%Zu\n", lanai_buf_size(&lanai->service), + return sprintf(page, "global buffer sizes: service=%zu, " + "aal0_rx=%zu\n", lanai_buf_size(&lanai->service), lanai->naal0 ? lanai_buf_size(&lanai->aal0buf) : 0); if (left-- == 0) { get_statistics(lanai); @@ -2513,7 +2513,7 @@ static int lanai_proc_read(struct atm_dev *atmdev, loff_t *pos, char *page) left += sprintf(&page[left], ",\n rx_AAL=%d", lvcc->rx.atmvcc->qos.aal == ATM_AAL5 ? 5 : 0); if (lvcc->rx.atmvcc->qos.aal == ATM_AAL5) - left += sprintf(&page[left], ", rx_buf_size=%Zu, " + left += sprintf(&page[left], ", rx_buf_size=%zu, " "rx_bad_len=%u,\n rx_service_trash=%u, " "rx_service_stream=%u, rx_bad_crc=%u", lanai_buf_size(&lvcc->rx.buf), @@ -2524,7 +2524,7 @@ static int lanai_proc_read(struct atm_dev *atmdev, loff_t *pos, char *page) } if (lvcc->tx.atmvcc != NULL) left += sprintf(&page[left], ",\n tx_AAL=%d, " - "tx_buf_size=%Zu, tx_qos=%cBR, tx_backlogged=%c", + "tx_buf_size=%zu, tx_qos=%cBR, tx_backlogged=%c", lvcc->tx.atmvcc->qos.aal == ATM_AAL5 ? 5 : 0, lanai_buf_size(&lvcc->tx.buf), lvcc->tx.atmvcc == lanai->cbrvcc ? 'C' : 'U', diff --git a/drivers/char/pcmcia/cm4000_cs.c b/drivers/char/pcmcia/cm4000_cs.c index 4eb609787ad6..cd53771b9ae7 100644 --- a/drivers/char/pcmcia/cm4000_cs.c +++ b/drivers/char/pcmcia/cm4000_cs.c @@ -1037,7 +1037,7 @@ release_io: clear_bit(LOCK_IO, &dev->flags); wake_up_interruptible(&dev->ioq); - DEBUGP(2, dev, "<- cmm_read returns: rc = %Zi\n", + DEBUGP(2, dev, "<- cmm_read returns: rc = %zi\n", (rc < 0 ? rc : count)); return rc < 0 ? 
rc : count; } diff --git a/drivers/char/pcmcia/cm4040_cs.c b/drivers/char/pcmcia/cm4040_cs.c index d7123259143e..d4dbd8d8e524 100644 --- a/drivers/char/pcmcia/cm4040_cs.c +++ b/drivers/char/pcmcia/cm4040_cs.c @@ -331,7 +331,7 @@ static ssize_t cm4040_write(struct file *filp, const char __user *buf, } if ((count < 5) || (count > READ_WRITE_BUFFER_SIZE)) { - DEBUGP(2, dev, "<- cm4040_write buffersize=%Zd < 5\n", count); + DEBUGP(2, dev, "<- cm4040_write buffersize=%zd < 5\n", count); return -EIO; } diff --git a/drivers/gpu/drm/mga/mga_drv.h b/drivers/gpu/drm/mga/mga_drv.h index d5ce829b3199..45cf363d25ad 100644 --- a/drivers/gpu/drm/mga/mga_drv.h +++ b/drivers/gpu/drm/mga/mga_drv.h @@ -266,7 +266,7 @@ do { \ do { \ if (MGA_VERBOSE) { \ DRM_INFO("BEGIN_DMA(%d)\n", (n)); \ - DRM_INFO(" space=0x%x req=0x%Zx\n", \ + DRM_INFO(" space=0x%x req=0x%zx\n", \ dev_priv->prim.space, (n) * DMA_BLOCK_SIZE); \ } \ prim = dev_priv->prim.start; \ @@ -313,7 +313,7 @@ do { \ #define DMA_WRITE(offset, val) \ do { \ if (MGA_VERBOSE) \ - DRM_INFO(" DMA_WRITE( 0x%08x ) at 0x%04Zx\n", \ + DRM_INFO(" DMA_WRITE( 0x%08x ) at 0x%04zx\n", \ (u32)(val), write + (offset) * sizeof(u32)); \ *(volatile u32 *)(prim + write + (offset) * sizeof(u32)) = val; \ } while (0) diff --git a/drivers/ide/ide-acpi.c b/drivers/ide/ide-acpi.c index b6940992a6ff..968038482d2f 100644 --- a/drivers/ide/ide-acpi.c +++ b/drivers/ide/ide-acpi.c @@ -447,7 +447,7 @@ void ide_acpi_get_timing(ide_hwif_t *hwif) memcpy(&hwif->acpidata->gtm, out_obj->buffer.pointer, sizeof(struct GTM_buffer)); - DEBPRINT("_GTM info: ptr: 0x%p, len: 0x%x, exp.len: 0x%Zx\n", + DEBPRINT("_GTM info: ptr: 0x%p, len: 0x%x, exp.len: 0x%zx\n", out_obj->buffer.pointer, out_obj->buffer.length, sizeof(struct GTM_buffer)); diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 3c1b7974d66d..d8a552b47718 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -1136,7 +1136,7 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf, ssize_t ret = 0; int rc; - ide_debug_log(IDE_DBG_FUNC, "count %Zd", count); + ide_debug_log(IDE_DBG_FUNC, "count %zd", count); if (tape->chrdev_dir != IDETAPE_DIR_READ) { if (test_bit(ilog2(IDE_AFLAG_DETECT_BS), &drive->atapi_flags)) @@ -1195,7 +1195,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, if (tape->write_prot) return -EACCES; - ide_debug_log(IDE_DBG_FUNC, "count %Zd", count); + ide_debug_log(IDE_DBG_FUNC, "count %zd", count); /* Initialize write operation */ rc = idetape_init_rw(drive, IDETAPE_DIR_WRITE); diff --git a/drivers/input/touchscreen/cyttsp4_core.c b/drivers/input/touchscreen/cyttsp4_core.c index 44deca88c579..beaf61ce775b 100644 --- a/drivers/input/touchscreen/cyttsp4_core.c +++ b/drivers/input/touchscreen/cyttsp4_core.c @@ -202,7 +202,7 @@ static int cyttsp4_si_get_cydata(struct cyttsp4 *cd) int rc; si->si_ofs.cydata_size = si->si_ofs.test_ofs - si->si_ofs.cydata_ofs; - dev_dbg(cd->dev, "%s: cydata size: %Zd\n", __func__, + dev_dbg(cd->dev, "%s: cydata size: %zd\n", __func__, si->si_ofs.cydata_size); p = krealloc(si->si_ptrs.cydata, si->si_ofs.cydata_size, GFP_KERNEL); @@ -430,13 +430,13 @@ static int cyttsp4_si_get_opcfg_data(struct cyttsp4 *cd) for (abs = 0; abs < CY_TCH_NUM_ABS; abs++) { dev_dbg(cd->dev, "%s: tch_rec_%s\n", __func__, cyttsp4_tch_abs_string[abs]); - dev_dbg(cd->dev, "%s: ofs =%2Zd\n", __func__, + dev_dbg(cd->dev, "%s: ofs =%2zd\n", __func__, si->si_ofs.tch_abs[abs].ofs); - dev_dbg(cd->dev, "%s: siz =%2Zd\n", __func__, + dev_dbg(cd->dev, 
"%s: siz =%2zd\n", __func__, si->si_ofs.tch_abs[abs].size); - dev_dbg(cd->dev, "%s: max =%2Zd\n", __func__, + dev_dbg(cd->dev, "%s: max =%2zd\n", __func__, si->si_ofs.tch_abs[abs].max); - dev_dbg(cd->dev, "%s: bofs=%2Zd\n", __func__, + dev_dbg(cd->dev, "%s: bofs=%2zd\n", __func__, si->si_ofs.tch_abs[abs].bofs); } @@ -586,62 +586,62 @@ static int cyttsp4_si_get_op_data_ptrs(struct cyttsp4 *cd) static void cyttsp4_si_put_log_data(struct cyttsp4 *cd) { struct cyttsp4_sysinfo *si = &cd->sysinfo; - dev_dbg(cd->dev, "%s: cydata_ofs =%4Zd siz=%4Zd\n", __func__, + dev_dbg(cd->dev, "%s: cydata_ofs =%4zd siz=%4zd\n", __func__, si->si_ofs.cydata_ofs, si->si_ofs.cydata_size); - dev_dbg(cd->dev, "%s: test_ofs =%4Zd siz=%4Zd\n", __func__, + dev_dbg(cd->dev, "%s: test_ofs =%4zd siz=%4zd\n", __func__, si->si_ofs.test_ofs, si->si_ofs.test_size); - dev_dbg(cd->dev, "%s: pcfg_ofs =%4Zd siz=%4Zd\n", __func__, + dev_dbg(cd->dev, "%s: pcfg_ofs =%4zd siz=%4zd\n", __func__, si->si_ofs.pcfg_ofs, si->si_ofs.pcfg_size); - dev_dbg(cd->dev, "%s: opcfg_ofs =%4Zd siz=%4Zd\n", __func__, + dev_dbg(cd->dev, "%s: opcfg_ofs =%4zd siz=%4zd\n", __func__, si->si_ofs.opcfg_ofs, si->si_ofs.opcfg_size); - dev_dbg(cd->dev, "%s: ddata_ofs =%4Zd siz=%4Zd\n", __func__, + dev_dbg(cd->dev, "%s: ddata_ofs =%4zd siz=%4zd\n", __func__, si->si_ofs.ddata_ofs, si->si_ofs.ddata_size); - dev_dbg(cd->dev, "%s: mdata_ofs =%4Zd siz=%4Zd\n", __func__, + dev_dbg(cd->dev, "%s: mdata_ofs =%4zd siz=%4zd\n", __func__, si->si_ofs.mdata_ofs, si->si_ofs.mdata_size); - dev_dbg(cd->dev, "%s: cmd_ofs =%4Zd\n", __func__, + dev_dbg(cd->dev, "%s: cmd_ofs =%4zd\n", __func__, si->si_ofs.cmd_ofs); - dev_dbg(cd->dev, "%s: rep_ofs =%4Zd\n", __func__, + dev_dbg(cd->dev, "%s: rep_ofs =%4zd\n", __func__, si->si_ofs.rep_ofs); - dev_dbg(cd->dev, "%s: rep_sz =%4Zd\n", __func__, + dev_dbg(cd->dev, "%s: rep_sz =%4zd\n", __func__, si->si_ofs.rep_sz); - dev_dbg(cd->dev, "%s: num_btns =%4Zd\n", __func__, + dev_dbg(cd->dev, "%s: num_btns =%4zd\n", __func__, si->si_ofs.num_btns); - dev_dbg(cd->dev, "%s: num_btn_regs =%4Zd\n", __func__, + dev_dbg(cd->dev, "%s: num_btn_regs =%4zd\n", __func__, si->si_ofs.num_btn_regs); - dev_dbg(cd->dev, "%s: tt_stat_ofs =%4Zd\n", __func__, + dev_dbg(cd->dev, "%s: tt_stat_ofs =%4zd\n", __func__, si->si_ofs.tt_stat_ofs); - dev_dbg(cd->dev, "%s: tch_rec_size =%4Zd\n", __func__, + dev_dbg(cd->dev, "%s: tch_rec_size =%4zd\n", __func__, si->si_ofs.tch_rec_size); - dev_dbg(cd->dev, "%s: max_tchs =%4Zd\n", __func__, + dev_dbg(cd->dev, "%s: max_tchs =%4zd\n", __func__, si->si_ofs.max_tchs); - dev_dbg(cd->dev, "%s: mode_size =%4Zd\n", __func__, + dev_dbg(cd->dev, "%s: mode_size =%4zd\n", __func__, si->si_ofs.mode_size); - dev_dbg(cd->dev, "%s: data_size =%4Zd\n", __func__, + dev_dbg(cd->dev, "%s: data_size =%4zd\n", __func__, si->si_ofs.data_size); - dev_dbg(cd->dev, "%s: map_sz =%4Zd\n", __func__, + dev_dbg(cd->dev, "%s: map_sz =%4zd\n", __func__, si->si_ofs.map_sz); - dev_dbg(cd->dev, "%s: btn_rec_size =%2Zd\n", __func__, + dev_dbg(cd->dev, "%s: btn_rec_size =%2zd\n", __func__, si->si_ofs.btn_rec_size); - dev_dbg(cd->dev, "%s: btn_diff_ofs =%2Zd\n", __func__, + dev_dbg(cd->dev, "%s: btn_diff_ofs =%2zd\n", __func__, si->si_ofs.btn_diff_ofs); - dev_dbg(cd->dev, "%s: btn_diff_size =%2Zd\n", __func__, + dev_dbg(cd->dev, "%s: btn_diff_size =%2zd\n", __func__, si->si_ofs.btn_diff_size); - dev_dbg(cd->dev, "%s: max_x = 0x%04ZX (%Zd)\n", __func__, + dev_dbg(cd->dev, "%s: max_x = 0x%04zX (%zd)\n", __func__, si->si_ofs.max_x, si->si_ofs.max_x); - dev_dbg(cd->dev, 
"%s: x_origin = %Zd (%s)\n", __func__, + dev_dbg(cd->dev, "%s: x_origin = %zd (%s)\n", __func__, si->si_ofs.x_origin, si->si_ofs.x_origin == CY_NORMAL_ORIGIN ? "left corner" : "right corner"); - dev_dbg(cd->dev, "%s: max_y = 0x%04ZX (%Zd)\n", __func__, + dev_dbg(cd->dev, "%s: max_y = 0x%04zX (%zd)\n", __func__, si->si_ofs.max_y, si->si_ofs.max_y); - dev_dbg(cd->dev, "%s: y_origin = %Zd (%s)\n", __func__, + dev_dbg(cd->dev, "%s: y_origin = %zd (%s)\n", __func__, si->si_ofs.y_origin, si->si_ofs.y_origin == CY_NORMAL_ORIGIN ? "upper corner" : "lower corner"); - dev_dbg(cd->dev, "%s: max_p = 0x%04ZX (%Zd)\n", __func__, + dev_dbg(cd->dev, "%s: max_p = 0x%04zX (%zd)\n", __func__, si->si_ofs.max_p, si->si_ofs.max_p); dev_dbg(cd->dev, "%s: xy_mode=%p xy_data=%p\n", __func__, @@ -1000,7 +1000,7 @@ static int cyttsp4_xy_worker(struct cyttsp4 *cd) dev_dbg(dev, "%s: Large area detected\n", __func__); if (num_cur_tch > si->si_ofs.max_tchs) { - dev_err(dev, "%s: too many tch; set to max tch (n=%d c=%Zd)\n", + dev_err(dev, "%s: too many tch; set to max tch (n=%d c=%zd)\n", __func__, num_cur_tch, si->si_ofs.max_tchs); num_cur_tch = si->si_ofs.max_tchs; } diff --git a/drivers/media/dvb-frontends/helene.c b/drivers/media/dvb-frontends/helene.c index ef35c2b30ea3..4bf5a551ba40 100644 --- a/drivers/media/dvb-frontends/helene.c +++ b/drivers/media/dvb-frontends/helene.c @@ -309,7 +309,7 @@ static int helene_write_regs(struct helene_priv *priv, if (len + 1 > sizeof(buf)) { dev_warn(&priv->i2c->dev, - "wr reg=%04x: len=%d vs %Zu is too big!\n", + "wr reg=%04x: len=%d vs %zu is too big!\n", reg, len + 1, sizeof(buf)); return -E2BIG; } diff --git a/drivers/media/dvb-frontends/or51132.c b/drivers/media/dvb-frontends/or51132.c index 4b67d7e0116d..62aa00767015 100644 --- a/drivers/media/dvb-frontends/or51132.c +++ b/drivers/media/dvb-frontends/or51132.c @@ -133,7 +133,7 @@ static int or51132_load_firmware (struct dvb_frontend* fe, const struct firmware u32 firmwareAsize, firmwareBsize; int i,ret; - dprintk("Firmware is %Zd bytes\n",fw->size); + dprintk("Firmware is %zd bytes\n",fw->size); /* Get size of firmware A and B */ firmwareAsize = le32_to_cpu(*((__le32*)fw->data)); diff --git a/drivers/media/dvb-frontends/tda10048.c b/drivers/media/dvb-frontends/tda10048.c index 92ab34c3e0be..143b39b5f6c9 100644 --- a/drivers/media/dvb-frontends/tda10048.c +++ b/drivers/media/dvb-frontends/tda10048.c @@ -499,7 +499,7 @@ static int tda10048_firmware_upload(struct dvb_frontend *fe) __func__); return -EIO; } else { - printk(KERN_INFO "%s: firmware read %Zu bytes.\n", + printk(KERN_INFO "%s: firmware read %zu bytes.\n", __func__, fw->size); ret = 0; diff --git a/drivers/media/pci/saa7164/saa7164-fw.c b/drivers/media/pci/saa7164/saa7164-fw.c index 4ba5eade7ce2..ef4906406ebf 100644 --- a/drivers/media/pci/saa7164/saa7164-fw.c +++ b/drivers/media/pci/saa7164/saa7164-fw.c @@ -422,7 +422,7 @@ int saa7164_downloadfirmware(struct saa7164_dev *dev) return -ENOMEM; } - printk(KERN_INFO "%s() firmware read %Zu bytes.\n", + printk(KERN_INFO "%s() firmware read %zu bytes.\n", __func__, fw->size); if (fw->size != fwlength) { diff --git a/drivers/media/tuners/xc5000.c b/drivers/media/tuners/xc5000.c index 0345b274eccc..91947cf1950e 100644 --- a/drivers/media/tuners/xc5000.c +++ b/drivers/media/tuners/xc5000.c @@ -1144,7 +1144,7 @@ static int xc_load_fw_and_init_tuner(struct dvb_frontend *fe, int force) pr_err("xc5000: Upload failed. 
rc %d\n", ret); return ret; } - dprintk(1, "firmware read %Zu bytes.\n", fw->size); + dprintk(1, "firmware read %zu bytes.\n", fw->size); if (fw->size != desired_fw->size) { pr_err("xc5000: Firmware file with incorrect size\n"); diff --git a/drivers/media/usb/dvb-usb/dib0700_devices.c b/drivers/media/usb/dvb-usb/dib0700_devices.c index 81d7fd4f7776..85ab3fa48f9a 100644 --- a/drivers/media/usb/dvb-usb/dib0700_devices.c +++ b/drivers/media/usb/dvb-usb/dib0700_devices.c @@ -2414,7 +2414,7 @@ static int stk9090m_frontend_attach(struct dvb_usb_adapter *adap) deb_info("%s: Upload failed. (file not found?)\n", __func__); return -ENODEV; } else { - deb_info("%s: firmware read %Zu bytes.\n", __func__, state->frontend_firmware->size); + deb_info("%s: firmware read %zu bytes.\n", __func__, state->frontend_firmware->size); } stk9090m_config.microcode_B_fe_size = state->frontend_firmware->size; stk9090m_config.microcode_B_fe_buffer = state->frontend_firmware->data; @@ -2480,7 +2480,7 @@ static int nim9090md_frontend_attach(struct dvb_usb_adapter *adap) deb_info("%s: Upload failed. (file not found?)\n", __func__); return -EIO; } else { - deb_info("%s: firmware read %Zu bytes.\n", __func__, state->frontend_firmware->size); + deb_info("%s: firmware read %zu bytes.\n", __func__, state->frontend_firmware->size); } nim9090md_config[0].microcode_B_fe_size = state->frontend_firmware->size; nim9090md_config[0].microcode_B_fe_buffer = state->frontend_firmware->data; diff --git a/drivers/misc/vmw_vmci/vmci_context.c b/drivers/misc/vmw_vmci/vmci_context.c index f866a4baecb5..f35f0c8606b9 100644 --- a/drivers/misc/vmw_vmci/vmci_context.c +++ b/drivers/misc/vmw_vmci/vmci_context.c @@ -303,7 +303,7 @@ int vmci_ctx_enqueue_datagram(u32 cid, struct vmci_datagram *dg) vmci_dg_size = VMCI_DG_SIZE(dg); if (vmci_dg_size > VMCI_MAX_DG_SIZE) { - pr_devel("Datagram too large (bytes=%Zu)\n", vmci_dg_size); + pr_devel("Datagram too large (bytes=%zu)\n", vmci_dg_size); return VMCI_ERROR_INVALID_ARGS; } diff --git a/drivers/net/arcnet/arcnet.c b/drivers/net/arcnet/arcnet.c index 6ea963e3b89a..62ee439d5882 100644 --- a/drivers/net/arcnet/arcnet.c +++ b/drivers/net/arcnet/arcnet.c @@ -123,7 +123,7 @@ static int __init arcnet_init(void) arc_proto_map[count] = arc_proto_default; if (BUGLVL(D_DURING)) - pr_info("struct sizes: %Zd %Zd %Zd %Zd %Zd\n", + pr_info("struct sizes: %zd %zd %zd %zd %zd\n", sizeof(struct arc_hardware), sizeof(struct arc_rfc1201), sizeof(struct arc_rfc1051), diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c index 016d481c6476..30606b11b128 100644 --- a/drivers/net/ethernet/cadence/macb.c +++ b/drivers/net/ethernet/cadence/macb.c @@ -1622,7 +1622,7 @@ static void macb_init_rx_buffer_size(struct macb *bp, size_t size) } } - netdev_dbg(bp->dev, "mtu [%u] rx_buffer_size [%Zu]\n", + netdev_dbg(bp->dev, "mtu [%u] rx_buffer_size [%zu]\n", bp->dev->mtu, bp->rx_buffer_size); } diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index bda0c6413450..89698741682f 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -1330,7 +1330,7 @@ static int __init gtp_init(void) if (err < 0) goto unreg_genl_family; - pr_info("GTP module loaded (pdp ctx size %Zd bytes)\n", + pr_info("GTP module loaded (pdp ctx size %zd bytes)\n", sizeof(struct pdp_ctx)); return 0; diff --git a/drivers/net/usb/rndis_host.c b/drivers/net/usb/rndis_host.c index 4f4f71b2966b..c5b21138b7eb 100644 --- a/drivers/net/usb/rndis_host.c +++ b/drivers/net/usb/rndis_host.c @@ -383,7 +383,7 @@ generic_rndis_bind(struct usbnet 
*dev, struct usb_interface *intf, int flags) /* REVISIT: peripheral "alignment" request is ignored ... */ dev_dbg(&intf->dev, - "hard mtu %u (%u from dev), rx buflen %Zu, align %d\n", + "hard mtu %u (%u from dev), rx buflen %zu, align %d\n", dev->hard_mtu, tmp, dev->rx_urb_size, 1 << le32_to_cpu(u.init_c->packet_alignment)); diff --git a/drivers/net/usb/sierra_net.c b/drivers/net/usb/sierra_net.c index d9440bc022f2..ac69f28d92d2 100644 --- a/drivers/net/usb/sierra_net.c +++ b/drivers/net/usb/sierra_net.c @@ -379,7 +379,7 @@ static int sierra_net_parse_lsi(struct usbnet *dev, char *data, int datalen) u32 expected_length; if (datalen < sizeof(struct lsi_umts_single)) { - netdev_err(dev->net, "%s: Data length %d, exp >= %Zu\n", + netdev_err(dev->net, "%s: Data length %d, exp >= %zu\n", __func__, datalen, sizeof(struct lsi_umts_single)); return -1; } diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index 10098b7586f3..944b83cfc519 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -4874,7 +4874,7 @@ brcmf_cfg80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, kfree(af_params); } else { brcmf_dbg(TRACE, "Unhandled, fc=%04x!!\n", mgmt->frame_control); - brcmf_dbg_hex_dump(true, buf, len, "payload, len=%Zu\n", len); + brcmf_dbg_hex_dump(true, buf, len, "payload, len=%zu\n", len); } exit: diff --git a/drivers/net/wireless/intel/iwlegacy/4965-mac.c b/drivers/net/wireless/intel/iwlegacy/4965-mac.c index a91d170a614b..2781f5728d07 100644 --- a/drivers/net/wireless/intel/iwlegacy/4965-mac.c +++ b/drivers/net/wireless/intel/iwlegacy/4965-mac.c @@ -4855,39 +4855,39 @@ il4965_ucode_callback(const struct firmware *ucode_raw, void *context) */ D_INFO("f/w package hdr ucode version raw = 0x%x\n", il->ucode_ver); - D_INFO("f/w package hdr runtime inst size = %Zd\n", pieces.inst_size); - D_INFO("f/w package hdr runtime data size = %Zd\n", pieces.data_size); - D_INFO("f/w package hdr init inst size = %Zd\n", pieces.init_size); - D_INFO("f/w package hdr init data size = %Zd\n", pieces.init_data_size); - D_INFO("f/w package hdr boot inst size = %Zd\n", pieces.boot_size); + D_INFO("f/w package hdr runtime inst size = %zd\n", pieces.inst_size); + D_INFO("f/w package hdr runtime data size = %zd\n", pieces.data_size); + D_INFO("f/w package hdr init inst size = %zd\n", pieces.init_size); + D_INFO("f/w package hdr init data size = %zd\n", pieces.init_data_size); + D_INFO("f/w package hdr boot inst size = %zd\n", pieces.boot_size); /* Verify that uCode images will fit in card's SRAM */ if (pieces.inst_size > il->hw_params.max_inst_size) { - IL_ERR("uCode instr len %Zd too large to fit in\n", + IL_ERR("uCode instr len %zd too large to fit in\n", pieces.inst_size); goto try_again; } if (pieces.data_size > il->hw_params.max_data_size) { - IL_ERR("uCode data len %Zd too large to fit in\n", + IL_ERR("uCode data len %zd too large to fit in\n", pieces.data_size); goto try_again; } if (pieces.init_size > il->hw_params.max_inst_size) { - IL_ERR("uCode init instr len %Zd too large to fit in\n", + IL_ERR("uCode init instr len %zd too large to fit in\n", pieces.init_size); goto try_again; } if (pieces.init_data_size > il->hw_params.max_data_size) { - IL_ERR("uCode init data len %Zd too large to fit in\n", + IL_ERR("uCode init data len %zd too large to fit in\n", pieces.init_data_size); goto try_again; } if (pieces.boot_size > 
il->hw_params.max_bsm_size) { - IL_ERR("uCode boot instr len %Zd too large to fit in\n", + IL_ERR("uCode boot instr len %zd too large to fit in\n", pieces.boot_size); goto try_again; } @@ -4938,7 +4938,7 @@ il4965_ucode_callback(const struct firmware *ucode_raw, void *context) /* Copy images into buffers for card's bus-master reads ... */ /* Runtime instructions (first block of data in file) */ - D_INFO("Copying (but not loading) uCode instr len %Zd\n", + D_INFO("Copying (but not loading) uCode instr len %zd\n", pieces.inst_size); memcpy(il->ucode_code.v_addr, pieces.inst, pieces.inst_size); @@ -4949,28 +4949,28 @@ il4965_ucode_callback(const struct firmware *ucode_raw, void *context) * Runtime data * NOTE: Copy into backup buffer will be done in il_up() */ - D_INFO("Copying (but not loading) uCode data len %Zd\n", + D_INFO("Copying (but not loading) uCode data len %zd\n", pieces.data_size); memcpy(il->ucode_data.v_addr, pieces.data, pieces.data_size); memcpy(il->ucode_data_backup.v_addr, pieces.data, pieces.data_size); /* Initialization instructions */ if (pieces.init_size) { - D_INFO("Copying (but not loading) init instr len %Zd\n", + D_INFO("Copying (but not loading) init instr len %zd\n", pieces.init_size); memcpy(il->ucode_init.v_addr, pieces.init, pieces.init_size); } /* Initialization data */ if (pieces.init_data_size) { - D_INFO("Copying (but not loading) init data len %Zd\n", + D_INFO("Copying (but not loading) init data len %zd\n", pieces.init_data_size); memcpy(il->ucode_init_data.v_addr, pieces.init_data, pieces.init_data_size); } /* Bootstrap instructions */ - D_INFO("Copying (but not loading) boot instr len %Zd\n", + D_INFO("Copying (but not loading) boot instr len %zd\n", pieces.boot_size); memcpy(il->ucode_boot.v_addr, pieces.boot, pieces.boot_size); diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-drv.c b/drivers/net/wireless/intel/iwlwifi/iwl-drv.c index 0e0293d42b5d..be466a074c1d 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-drv.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-drv.c @@ -1141,21 +1141,21 @@ static int validate_sec_sizes(struct iwl_drv *drv, struct iwl_firmware_pieces *pieces, const struct iwl_cfg *cfg) { - IWL_DEBUG_INFO(drv, "f/w package hdr runtime inst size = %Zd\n", + IWL_DEBUG_INFO(drv, "f/w package hdr runtime inst size = %zd\n", get_sec_size(pieces, IWL_UCODE_REGULAR, IWL_UCODE_SECTION_INST)); - IWL_DEBUG_INFO(drv, "f/w package hdr runtime data size = %Zd\n", + IWL_DEBUG_INFO(drv, "f/w package hdr runtime data size = %zd\n", get_sec_size(pieces, IWL_UCODE_REGULAR, IWL_UCODE_SECTION_DATA)); - IWL_DEBUG_INFO(drv, "f/w package hdr init inst size = %Zd\n", + IWL_DEBUG_INFO(drv, "f/w package hdr init inst size = %zd\n", get_sec_size(pieces, IWL_UCODE_INIT, IWL_UCODE_SECTION_INST)); - IWL_DEBUG_INFO(drv, "f/w package hdr init data size = %Zd\n", + IWL_DEBUG_INFO(drv, "f/w package hdr init data size = %zd\n", get_sec_size(pieces, IWL_UCODE_INIT, IWL_UCODE_SECTION_DATA)); /* Verify that uCode images will fit in card's SRAM. 
*/ if (get_sec_size(pieces, IWL_UCODE_REGULAR, IWL_UCODE_SECTION_INST) > cfg->max_inst_size) { - IWL_ERR(drv, "uCode instr len %Zd too large to fit in\n", + IWL_ERR(drv, "uCode instr len %zd too large to fit in\n", get_sec_size(pieces, IWL_UCODE_REGULAR, IWL_UCODE_SECTION_INST)); return -1; @@ -1163,7 +1163,7 @@ static int validate_sec_sizes(struct iwl_drv *drv, if (get_sec_size(pieces, IWL_UCODE_REGULAR, IWL_UCODE_SECTION_DATA) > cfg->max_data_size) { - IWL_ERR(drv, "uCode data len %Zd too large to fit in\n", + IWL_ERR(drv, "uCode data len %zd too large to fit in\n", get_sec_size(pieces, IWL_UCODE_REGULAR, IWL_UCODE_SECTION_DATA)); return -1; @@ -1171,7 +1171,7 @@ static int validate_sec_sizes(struct iwl_drv *drv, if (get_sec_size(pieces, IWL_UCODE_INIT, IWL_UCODE_SECTION_INST) > cfg->max_inst_size) { - IWL_ERR(drv, "uCode init instr len %Zd too large to fit in\n", + IWL_ERR(drv, "uCode init instr len %zd too large to fit in\n", get_sec_size(pieces, IWL_UCODE_INIT, IWL_UCODE_SECTION_INST)); return -1; @@ -1179,7 +1179,7 @@ static int validate_sec_sizes(struct iwl_drv *drv, if (get_sec_size(pieces, IWL_UCODE_INIT, IWL_UCODE_SECTION_DATA) > cfg->max_data_size) { - IWL_ERR(drv, "uCode init data len %Zd too large to fit in\n", + IWL_ERR(drv, "uCode init data len %zd too large to fit in\n", get_sec_size(pieces, IWL_UCODE_REGULAR, IWL_UCODE_SECTION_DATA)); return -1; diff --git a/drivers/parport/ieee1284_ops.c b/drivers/parport/ieee1284_ops.c index c0e7d21c88c2..75071605d22f 100644 --- a/drivers/parport/ieee1284_ops.c +++ b/drivers/parport/ieee1284_ops.c @@ -307,7 +307,7 @@ size_t parport_ieee1284_read_byte (struct parport *port, if (parport_read_status (port) & PARPORT_STATUS_ERROR) { end_of_data: DPRINTK (KERN_DEBUG - "%s: No more byte data (%Zd bytes)\n", + "%s: No more byte data (%zd bytes)\n", port->name, count); /* Go to reverse idle phase. */ diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index 42d481d024ec..6903f03c88af 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -1290,7 +1290,7 @@ int osd_req_add_get_attr_list(struct osd_request *or, or->enc_get_attr.total_bytes = total_bytes; OSD_DEBUG( - "get_attr.total_bytes=%u(%u) enc_get_attr.total_bytes=%u(%Zu)\n", + "get_attr.total_bytes=%u(%u) enc_get_attr.total_bytes=%u(%zu)\n", or->get_attr.total_bytes, or->get_attr.total_bytes - _osd_req_sizeof_alist_header(or), or->enc_get_attr.total_bytes, diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c index 451de6c5e3c9..75ac662793a3 100644 --- a/drivers/scsi/osst.c +++ b/drivers/scsi/osst.c @@ -3435,7 +3435,7 @@ static ssize_t osst_write(struct file * filp, const char __user * buf, size_t co /* Write must be integral number of blocks */ if (STp->block_size != 0 && (count % STp->block_size) != 0) { - printk(KERN_ERR "%s:E: Write (%Zd bytes) not multiple of tape block size (%d%c).\n", + printk(KERN_ERR "%s:E: Write (%zd bytes) not multiple of tape block size (%d%c).\n", name, count, STp->block_size<1024? 
STp->block_size:STp->block_size/1024, STp->block_size<1024?'b':'k'); retval = (-EINVAL); @@ -3756,7 +3756,7 @@ static ssize_t osst_read(struct file * filp, char __user * buf, size_t count, lo if ((count % STp->block_size) != 0) { printk(KERN_WARNING - "%s:W: Read (%Zd bytes) not multiple of tape block size (%d%c).\n", name, count, + "%s:W: Read (%zd bytes) not multiple of tape block size (%d%c).\n", name, count, STp->block_size<1024?STp->block_size:STp->block_size/1024, STp->block_size<1024?'b':'k'); } @@ -3815,7 +3815,7 @@ static ssize_t osst_read(struct file * filp, char __user * buf, size_t count, lo if (transfer == 0) { printk(KERN_WARNING - "%s:W: Nothing can be transferred, requested %Zd, tape block size (%d%c).\n", + "%s:W: Nothing can be transferred, requested %zd, tape block size (%d%c).\n", name, count, STp->block_size < 1024? STp->block_size:STp->block_size/1024, STp->block_size<1024?'b':'k'); diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index 7b6317c8c2e9..265e1395bdb8 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -5669,7 +5669,7 @@ qla2x00_load_risc(scsi_qla_host_t *vha, uint32_t *srisc_addr) /* Validate firmware image by checking version. */ if (blob->fw->size < 8 * sizeof(uint16_t)) { ql_log(ql_log_fatal, vha, 0x0085, - "Unable to verify integrity of firmware image (%Zd).\n", + "Unable to verify integrity of firmware image (%zd).\n", blob->fw->size); goto fail_fw_integrity; } @@ -5697,7 +5697,7 @@ qla2x00_load_risc(scsi_qla_host_t *vha, uint32_t *srisc_addr) if (blob->fw->size < fwclen) { ql_log(ql_log_fatal, vha, 0x0088, "Unable to verify integrity of firmware image " - "(%Zd).\n", blob->fw->size); + "(%zd).\n", blob->fw->size); goto fail_fw_integrity; } @@ -5778,7 +5778,7 @@ qla24xx_load_risc_blob(scsi_qla_host_t *vha, uint32_t *srisc_addr) /* Validate firmware image by checking version. 
*/ if (blob->fw->size < 8 * sizeof(uint32_t)) { ql_log(ql_log_fatal, vha, 0x0093, - "Unable to verify integrity of firmware image (%Zd).\n", + "Unable to verify integrity of firmware image (%zd).\n", blob->fw->size); return QLA_FUNCTION_FAILED; } @@ -5789,7 +5789,7 @@ qla24xx_load_risc_blob(scsi_qla_host_t *vha, uint32_t *srisc_addr) (dcode[0] == 0 && dcode[1] == 0 && dcode[2] == 0 && dcode[3] == 0)) { ql_log(ql_log_fatal, vha, 0x0094, - "Unable to verify integrity of firmware image (%Zd).\n", + "Unable to verify integrity of firmware image (%zd).\n", blob->fw->size); ql_log(ql_log_fatal, vha, 0x0095, "Firmware data: %08x %08x %08x %08x.\n", @@ -5807,7 +5807,7 @@ qla24xx_load_risc_blob(scsi_qla_host_t *vha, uint32_t *srisc_addr) if (blob->fw->size < fwclen) { ql_log(ql_log_fatal, vha, 0x0096, "Unable to verify integrity of firmware image " - "(%Zd).\n", blob->fw->size); + "(%zd).\n", blob->fw->size); return QLA_FUNCTION_FAILED; } diff --git a/drivers/tty/n_hdlc.c b/drivers/tty/n_hdlc.c index eb278832f5ce..1bacbc3b19a0 100644 --- a/drivers/tty/n_hdlc.c +++ b/drivers/tty/n_hdlc.c @@ -667,7 +667,7 @@ static ssize_t n_hdlc_tty_write(struct tty_struct *tty, struct file *file, struct n_hdlc_buf *tbuf; if (debuglevel >= DEBUG_LEVEL_INFO) - printk("%s(%d)n_hdlc_tty_write() called count=%Zd\n", + printk("%s(%d)n_hdlc_tty_write() called count=%zd\n", __FILE__,__LINE__,count); /* Verify pointers */ diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index 6bde4396927c..a2615d64d07c 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -1848,7 +1848,7 @@ dev_config (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) fail: spin_unlock_irq (&dev->lock); - pr_debug ("%s: %s fail %Zd, %p\n", shortname, __func__, value, dev); + pr_debug ("%s: %s fail %zd, %p\n", shortname, __func__, value, dev); kfree (dev->buf); dev->buf = NULL; return value; diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c index 063064801ceb..ac2c4eab478d 100644 --- a/drivers/usb/host/ehci-hcd.c +++ b/drivers/usb/host/ehci-hcd.c @@ -1322,7 +1322,7 @@ static int __init ehci_hcd_init(void) printk(KERN_WARNING "Warning! ehci_hcd should always be loaded" " before uhci_hcd and ohci_hcd, not after\n"); - pr_debug("%s: block sizes: qh %Zd qtd %Zd itd %Zd sitd %Zd\n", + pr_debug("%s: block sizes: qh %zd qtd %zd itd %zd sitd %zd\n", hcd_name, sizeof(struct ehci_qh), sizeof(struct ehci_qtd), sizeof(struct ehci_itd), sizeof(struct ehci_sitd)); diff --git a/drivers/usb/host/fotg210-hcd.c b/drivers/usb/host/fotg210-hcd.c index 9d0b0518290a..1c5b34b74860 100644 --- a/drivers/usb/host/fotg210-hcd.c +++ b/drivers/usb/host/fotg210-hcd.c @@ -5697,7 +5697,7 @@ static int __init fotg210_hcd_init(void) test_bit(USB_OHCI_LOADED, &usb_hcds_loaded)) pr_warn("Warning! 
fotg210_hcd should always be loaded before uhci_hcd and ohci_hcd, not after\n"); - pr_debug("%s: block sizes: qh %Zd qtd %Zd itd %Zd\n", + pr_debug("%s: block sizes: qh %zd qtd %zd itd %zd\n", hcd_name, sizeof(struct fotg210_qh), sizeof(struct fotg210_qtd), sizeof(struct fotg210_itd)); diff --git a/drivers/usb/host/ohci-hcd.c b/drivers/usb/host/ohci-hcd.c index 8685cf3e6292..b6daf2e69989 100644 --- a/drivers/usb/host/ohci-hcd.c +++ b/drivers/usb/host/ohci-hcd.c @@ -1252,7 +1252,7 @@ static int __init ohci_hcd_mod_init(void) return -ENODEV; printk(KERN_INFO "%s: " DRIVER_DESC "\n", hcd_name); - pr_debug ("%s: block sizes: ed %Zd td %Zd\n", hcd_name, + pr_debug ("%s: block sizes: ed %zd td %zd\n", hcd_name, sizeof (struct ed), sizeof (struct td)); set_bit(USB_OHCI_LOADED, &usb_hcds_loaded); diff --git a/drivers/usb/misc/adutux.c b/drivers/usb/misc/adutux.c index a540e4f206c4..c5fa584d8f0a 100644 --- a/drivers/usb/misc/adutux.c +++ b/drivers/usb/misc/adutux.c @@ -563,20 +563,20 @@ static ssize_t adu_write(struct file *file, const __user char *buffer, } dev_dbg(&dev->udev->dev, - "%s : in progress, count = %Zd\n", + "%s : in progress, count = %zd\n", __func__, count); } else { spin_unlock_irqrestore(&dev->buflock, flags); set_current_state(TASK_RUNNING); remove_wait_queue(&dev->write_wait, &waita); - dev_dbg(&dev->udev->dev, "%s : sending, count = %Zd\n", + dev_dbg(&dev->udev->dev, "%s : sending, count = %zd\n", __func__, count); /* write the data into interrupt_out_buffer from userspace */ buffer_size = usb_endpoint_maxp(dev->interrupt_out_endpoint); bytes_to_write = count > buffer_size ? buffer_size : count; dev_dbg(&dev->udev->dev, - "%s : buffer_size = %Zd, count = %Zd, bytes_to_write = %Zd\n", + "%s : buffer_size = %zd, count = %zd, bytes_to_write = %zd\n", __func__, buffer_size, count, bytes_to_write); if (copy_from_user(dev->interrupt_out_buffer, buffer, bytes_to_write) != 0) { diff --git a/drivers/usb/misc/legousbtower.c b/drivers/usb/misc/legousbtower.c index b10e26c74a90..322a042d6e59 100644 --- a/drivers/usb/misc/legousbtower.c +++ b/drivers/usb/misc/legousbtower.c @@ -673,7 +673,7 @@ static ssize_t tower_write (struct file *file, const char __user *buffer, size_t /* write the data into interrupt_out_buffer from userspace */ bytes_to_write = min_t(int, count, write_buffer_size); - dev_dbg(&dev->udev->dev, "%s: count = %Zd, bytes_to_write = %Zd\n", + dev_dbg(&dev->udev->dev, "%s: count = %zd, bytes_to_write = %zd\n", __func__, count, bytes_to_write); if (copy_from_user (dev->interrupt_out_buffer, buffer, bytes_to_write)) { diff --git a/drivers/usb/misc/uss720.c b/drivers/usb/misc/uss720.c index 356d312add57..0a643fa74cab 100644 --- a/drivers/usb/misc/uss720.c +++ b/drivers/usb/misc/uss720.c @@ -526,7 +526,7 @@ static size_t parport_uss720_epp_write_data(struct parport *pp, const void *buf, return 0; i = usb_bulk_msg(usbdev, usb_sndbulkpipe(usbdev, 1), (void *)buf, length, &rlen, 20000); if (i) - printk(KERN_ERR "uss720: sendbulk ep 1 buf %p len %Zu rlen %u\n", buf, length, rlen); + printk(KERN_ERR "uss720: sendbulk ep 1 buf %p len %zu rlen %u\n", buf, length, rlen); change_mode(pp, ECR_PS2); return rlen; #endif @@ -587,7 +587,7 @@ static size_t parport_uss720_ecp_write_data(struct parport *pp, const void *buff return 0; i = usb_bulk_msg(usbdev, usb_sndbulkpipe(usbdev, 1), (void *)buffer, len, &rlen, 20000); if (i) - printk(KERN_ERR "uss720: sendbulk ep 1 buf %p len %Zu rlen %u\n", buffer, len, rlen); + printk(KERN_ERR "uss720: sendbulk ep 1 buf %p len %zu rlen %u\n", buffer, len, 
rlen); change_mode(pp, ECR_PS2); return rlen; } @@ -605,7 +605,7 @@ static size_t parport_uss720_ecp_read_data(struct parport *pp, void *buffer, siz return 0; i = usb_bulk_msg(usbdev, usb_rcvbulkpipe(usbdev, 2), buffer, len, &rlen, 20000); if (i) - printk(KERN_ERR "uss720: recvbulk ep 2 buf %p len %Zu rlen %u\n", buffer, len, rlen); + printk(KERN_ERR "uss720: recvbulk ep 2 buf %p len %zu rlen %u\n", buffer, len, rlen); change_mode(pp, ECR_PS2); return rlen; } @@ -638,7 +638,7 @@ static size_t parport_uss720_write_compat(struct parport *pp, const void *buffer return 0; i = usb_bulk_msg(usbdev, usb_sndbulkpipe(usbdev, 1), (void *)buffer, len, &rlen, 20000); if (i) - printk(KERN_ERR "uss720: sendbulk ep 1 buf %p len %Zu rlen %u\n", buffer, len, rlen); + printk(KERN_ERR "uss720: sendbulk ep 1 buf %p len %zu rlen %u\n", buffer, len, rlen); change_mode(pp, ECR_PS2); return rlen; } diff --git a/drivers/video/fbdev/metronomefb.c b/drivers/video/fbdev/metronomefb.c index abb6bbf226d5..9085e9525341 100644 --- a/drivers/video/fbdev/metronomefb.c +++ b/drivers/video/fbdev/metronomefb.c @@ -187,7 +187,7 @@ static int load_waveform(u8 *mem, size_t size, int m, int t, epd_frame_table[par->dt].wfm_size = user_wfm_size; if (size != epd_frame_table[par->dt].wfm_size) { - dev_err(dev, "Error: unexpected size %Zd != %d\n", size, + dev_err(dev, "Error: unexpected size %zd != %d\n", size, epd_frame_table[par->dt].wfm_size); return -EINVAL; } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 51a241e09fbb..949f960337f5 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -252,7 +252,7 @@ static int afs_dir_iterate_block(struct dir_context *ctx, /* skip entries marked unused in the bitmap */ if (!(block->pagehdr.bitmap[offset / 8] & (1 << (offset % 8)))) { - _debug("ENT[%Zu.%u]: unused", + _debug("ENT[%zu.%u]: unused", blkoff / sizeof(union afs_dir_block), offset); if (offset >= curr) ctx->pos = blkoff + @@ -266,7 +266,7 @@ static int afs_dir_iterate_block(struct dir_context *ctx, sizeof(*block) - offset * sizeof(union afs_dirent)); - _debug("ENT[%Zu.%u]: %s %Zu \"%s\"", + _debug("ENT[%zu.%u]: %s %zu \"%s\"", blkoff / sizeof(union afs_dir_block), offset, (offset < curr ? 
"skip" : "fill"), nlen, dire->u.name); @@ -274,23 +274,23 @@ static int afs_dir_iterate_block(struct dir_context *ctx, /* work out where the next possible entry is */ for (tmp = nlen; tmp > 15; tmp -= sizeof(union afs_dirent)) { if (next >= AFS_DIRENT_PER_BLOCK) { - _debug("ENT[%Zu.%u]:" + _debug("ENT[%zu.%u]:" " %u travelled beyond end dir block" - " (len %u/%Zu)", + " (len %u/%zu)", blkoff / sizeof(union afs_dir_block), offset, next, tmp, nlen); return -EIO; } if (!(block->pagehdr.bitmap[next / 8] & (1 << (next % 8)))) { - _debug("ENT[%Zu.%u]:" - " %u unmarked extension (len %u/%Zu)", + _debug("ENT[%zu.%u]:" + " %u unmarked extension (len %u/%zu)", blkoff / sizeof(union afs_dir_block), offset, next, tmp, nlen); return -EIO; } - _debug("ENT[%Zu.%u]: ext %u/%Zu", + _debug("ENT[%zu.%u]: ext %u/%zu", blkoff / sizeof(union afs_dir_block), next, tmp, nlen); next++; diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c index f32f272ee501..97b111d79489 100644 --- a/fs/ncpfs/sock.c +++ b/fs/ncpfs/sock.c @@ -525,7 +525,7 @@ static int do_tcp_rcv(struct ncp_server *server, void *buffer, size_t len) return result; } if (result > len) { - pr_err("tcp: bug in recvmsg (%u > %Zu)\n", result, len); + pr_err("tcp: bug in recvmsg (%u > %zu)\n", result, len); return -EIO; } return result; @@ -619,7 +619,7 @@ skipdata:; goto skipdata2; } if (datalen > req->datalen + 8) { - pr_err("tcp: Unexpected reply len %d (expected at most %Zd)\n", datalen, req->datalen + 8); + pr_err("tcp: Unexpected reply len %d (expected at most %zd)\n", datalen, req->datalen + 8); server->rcv.state = 3; goto skipdata; } diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 2905479f214a..0ca370d23ddb 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -381,7 +381,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync) struct blk_plug plug; int i; - dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); + dprintk("%s enter, %zu@%lld\n", __func__, count, offset); /* At this point, header->page_aray is a (sequential) list of nfs_pages. 
* We want to write each, and if there is an error set pnfs_error diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index a3fc48ba4931..18f98e08544d 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -482,7 +482,7 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr) u32 j, idx; struct nfs_fh *fh; - dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n", + dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n", __func__, hdr->inode->i_ino, hdr->args.pgbase, (size_t)hdr->args.count, offset); @@ -540,7 +540,7 @@ filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync) if (IS_ERR(ds_clnt)) return PNFS_NOT_ATTEMPTED; - dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n", + dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d\n", __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 0ca4af8cca5d..d6acc688df7e 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1751,7 +1751,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) int vers; struct nfs_fh *fh; - dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n", + dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n", __func__, hdr->inode->i_ino, hdr->args.pgbase, (size_t)hdr->args.count, offset); @@ -1828,7 +1828,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) vers = nfs4_ff_layout_ds_version(lseg, idx); - dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d vers %d\n", + dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n", __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers); diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 2a4cdce939a0..8f3d2acb81c3 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -291,7 +291,7 @@ objlayout_read_pagelist(struct nfs_pgio_header *hdr) &hdr->args.pgbase, hdr->args.offset, hdr->args.count); - dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n", + dprintk("%s: inode(%lx) offset 0x%llx count 0x%zx eof=%d\n", __func__, inode->i_ino, offset, count, hdr->res.eof); err = objio_read_pagelist(hdr); diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index d6b97b424ad1..96fd15979cbd 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -578,7 +578,7 @@ nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data) struct kvec *vec = &rqstp->rq_res.head[0]; if (vec->iov_len + data->iov_len > PAGE_SIZE) { - printk(KERN_WARNING "nfsd: cached reply too large (%Zd).\n", + printk(KERN_WARNING "nfsd: cached reply too large (%zd).\n", data->iov_len); return 0; } diff --git a/kernel/relay.c b/kernel/relay.c index 8f8dc91db680..0e413d9eec8a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -847,7 +847,7 @@ void relay_close(struct rchan *chan) if (chan->last_toobig) printk(KERN_WARNING "relay: one or more items not logged " - "[item size (%Zd) > sub-buffer size (%Zd)]\n", + "[item size (%zd) > sub-buffer size (%zd)]\n", chan->last_toobig, chan->subbuf_size); list_del(&chan->list); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 0967771d8f7f..e3bf4e0f10b5 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1739,6 +1739,7 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, * 'h', 'l', or 'L' for integer fields * 'z' support added 
23/7/1999 S.H. * 'z' changed to 'Z' --davidm 1/25/99 + * 'Z' changed to 'z' --adobriyan 2017-01-25 * 't' added for ptrdiff_t * * @fmt: the format string @@ -1838,7 +1839,7 @@ qualifier: /* get the conversion qualifier */ qualifier = 0; if (*fmt == 'h' || _tolower(*fmt) == 'l' || - _tolower(*fmt) == 'z' || *fmt == 't') { + *fmt == 'z' || *fmt == 't') { qualifier = *fmt++; if (unlikely(qualifier == *fmt)) { if (qualifier == 'l') { @@ -1907,7 +1908,7 @@ qualifier: else if (qualifier == 'l') { BUILD_BUG_ON(FORMAT_TYPE_ULONG + SIGN != FORMAT_TYPE_LONG); spec->type = FORMAT_TYPE_ULONG + (spec->flags & SIGN); - } else if (_tolower(qualifier) == 'z') { + } else if (qualifier == 'z') { spec->type = FORMAT_TYPE_SIZE_T; } else if (qualifier == 't') { spec->type = FORMAT_TYPE_PTRDIFF; @@ -2657,7 +2658,7 @@ int vsscanf(const char *buf, const char *fmt, va_list args) /* get conversion qualifier */ qualifier = -1; if (*fmt == 'h' || _tolower(*fmt) == 'l' || - _tolower(*fmt) == 'z') { + *fmt == 'z') { qualifier = *fmt++; if (unlikely(qualifier == *fmt)) { if (qualifier == 'h') { @@ -2851,7 +2852,6 @@ int vsscanf(const char *buf, const char *fmt, va_list args) else *va_arg(args, unsigned long long *) = val.u; break; - case 'Z': case 'z': *va_arg(args, size_t *) = val.u; break; diff --git a/mm/dmapool.c b/mm/dmapool.c index cef82b8a9291..4d90a64b2fdc 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -93,7 +93,7 @@ show_pools(struct device *dev, struct device_attribute *attr, char *buf) spin_unlock_irq(&pool->lock); /* per-pool info, no real statistics yet */ - temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n", + temp = scnprintf(next, size, "%-16s %4u %4zu %4zu %2u\n", pool->name, blocks, pages * (pool->allocation / pool->size), pool->size, pages); diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 10d2bdce686e..465cc24b41e5 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1656,7 +1656,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) ddp->deh_dport = usat->sat_port; ddp->deh_sport = at->src_port; - SOCK_DEBUG(sk, "SK %p: Copy user data (%Zd bytes).\n", sk, len); + SOCK_DEBUG(sk, "SK %p: Copy user data (%zd bytes).\n", sk, len); err = memcpy_from_msg(skb_put(skb, len), msg, len); if (err) { @@ -1720,7 +1720,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) */ aarp_send_ddp(dev, skb, &usat->sat_addr, NULL); } - SOCK_DEBUG(sk, "SK %p: Done write (%Zd).\n", sk, len); + SOCK_DEBUG(sk, "SK %p: Done write (%zd).\n", sk, len); out: release_sock(sk); diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 3b3b1a292ec8..a190800572bd 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -451,7 +451,7 @@ static void lane2_assoc_ind(struct net_device *dev, const u8 *mac_addr, return; } if (end_of_tlvs - tlvs != 0) - pr_info("(%s) ignoring %Zd bytes of trailing TLV garbage\n", + pr_info("(%s) ignoring %zd bytes of trailing TLV garbage\n", dev->name, end_of_tlvs - tlvs); } diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c index 9024283d2bca..279527f8b1fe 100644 --- a/net/bridge/netfilter/ebt_among.c +++ b/net/bridge/netfilter/ebt_among.c @@ -187,7 +187,7 @@ static int ebt_among_mt_check(const struct xt_mtchk_param *par) expected_length += ebt_mac_wormhash_size(wh_src); if (em->match_size != EBT_ALIGN(expected_length)) { - pr_info("wrong size: %d against expected %d, rounded to %Zd\n", + pr_info("wrong size: %d against expected %d, rounded to %zd\n", em->match_size, expected_length, 
EBT_ALIGN(expected_length)); return -EINVAL; diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index e0bd013a1e5e..eedba7670b51 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -279,7 +279,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) pr_debug("name = %s, mtu = %u\n", dev->name, mtu); if (size > mtu) { - pr_debug("size = %Zu, mtu = %u\n", size, mtu); + pr_debug("size = %zu, mtu = %u\n", size, mtu); err = -EMSGSIZE; goto out_dev; } @@ -645,7 +645,7 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) pr_debug("name = %s, mtu = %u\n", dev->name, mtu); if (size > mtu) { - pr_debug("size = %Zu, mtu = %u\n", size, mtu); + pr_debug("size = %zu, mtu = %u\n", size, mtu); err = -EMSGSIZE; goto out_dev; } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index d8cea210af0e..2f0d8233950f 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2388,7 +2388,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "Basic info: size of leaf:" - " %Zd bytes, size of tnode: %Zd bytes.\n", + " %zd bytes, size of tnode: %zd bytes.\n", LEAF_SIZE, TNODE_SIZE(0)); for (h = 0; h < FIB_TABLE_HASHSZ; h++) { diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index beacd028848c..c0317c940bcd 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2596,7 +2596,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v) const char *name = vif->dev ? vif->dev->name : "none"; seq_printf(seq, - "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", + "%2zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", vif - mrt->vif_table, name, vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out, diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index fcfd071f4705..bc1486f2c064 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -235,7 +235,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) } if ((unsigned int) *len < sizeof(struct sockaddr_in)) { - pr_debug("SO_ORIGINAL_DST: len %d not %Zu\n", + pr_debug("SO_ORIGINAL_DST: len %d not %zu\n", *len, sizeof(struct sockaddr_in)); return -EINVAL; } diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c index f6f713376e6e..2f3895ddc275 100644 --- a/net/ipv4/netfilter/nf_log_arp.c +++ b/net/ipv4/netfilter/nf_log_arp.c @@ -69,7 +69,7 @@ static void dump_arp_packet(struct nf_log_buf *m, ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp); if (ap == NULL) { - nf_log_buf_add(m, " INCOMPLETE [%Zu bytes]", + nf_log_buf_add(m, " INCOMPLETE [%zu bytes]", skb->len - sizeof(_arph)); return; } diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c index 055c51b80f5d..97c724224da7 100644 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -64,7 +64,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m, nf_log_buf_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr); /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ - nf_log_buf_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", + nf_log_buf_add(m, "LEN=%zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", ntohs(ih->payload_len) + sizeof(struct ipv6hdr), (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20, ih->hop_limit, diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c index 1215693fdd22..35dbf3dc3d28 100644 --- a/net/irda/irnet/irnet_ppp.c +++ b/net/irda/irnet/irnet_ppp.c @@ -51,7 
+51,7 @@ irnet_ctrl_write(irnet_socket * ap, char * next; /* Next command to process */ int length; /* Length of current command */ - DENTER(CTRL_TRACE, "(ap=0x%p, count=%Zd)\n", ap, count); + DENTER(CTRL_TRACE, "(ap=0x%p, count=%zd)\n", ap, count); /* Check for overflow... */ DABORT(count >= IRNET_MAX_COMMAND, -ENOMEM, @@ -66,7 +66,7 @@ irnet_ctrl_write(irnet_socket * ap, /* Safe terminate the string */ command[count] = '\0'; - DEBUG(CTRL_INFO, "Command line received is ``%s'' (%Zd).\n", + DEBUG(CTRL_INFO, "Command line received is ``%s'' (%zd).\n", command, count); /* Check every commands in the command line */ @@ -285,7 +285,7 @@ irnet_ctrl_read(irnet_socket * ap, char event[75]; ssize_t ret = 0; - DENTER(CTRL_TRACE, "(ap=0x%p, count=%Zd)\n", ap, count); + DENTER(CTRL_TRACE, "(ap=0x%p, count=%zd)\n", ap, count); #ifdef INITIAL_DISCOVERY /* Check if we have read the log */ @@ -328,7 +328,7 @@ irnet_ctrl_read(irnet_socket * ap, if(ret != 0) { /* No, return the error code */ - DEXIT(CTRL_TRACE, " - ret %Zd\n", ret); + DEXIT(CTRL_TRACE, " - ret %zd\n", ret); return ret; } @@ -568,7 +568,7 @@ dev_irnet_write(struct file * file, { irnet_socket * ap = file->private_data; - DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%Zd)\n", + DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%zd)\n", file, ap, count); DABORT(ap == NULL, -ENXIO, FS_ERROR, "ap is NULL !!!\n"); @@ -592,7 +592,7 @@ dev_irnet_read(struct file * file, { irnet_socket * ap = file->private_data; - DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%Zd)\n", + DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%zd)\n", file, ap, count); DABORT(ap == NULL, -ENXIO, FS_ERROR, "ap is NULL !!!\n"); diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index b58000efee73..8adab6335ced 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1058,10 +1058,10 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, /* Debug */ if (session->send_seq) - l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %Zd bytes, ns=%u\n", + l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %zd bytes, ns=%u\n", session->name, data_len, session->ns - 1); else - l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %Zd bytes\n", + l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %zd bytes\n", session->name, data_len); if (session->debug & L2TP_MSG_DATA) { diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 096a45103f14..e6a2753dff9e 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1429,7 +1429,7 @@ int __init ip_vs_conn_init(void) "(size=%d, memory=%ldKbytes)\n", ip_vs_conn_tab_size, (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024); - IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", + IP_VS_DBG(0, "Each connection entry needs %zd bytes at least\n", sizeof(struct ip_vs_conn)); for (idx = 0; idx < ip_vs_conn_tab_size; idx++) diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c index 6be5c538b71e..75f798f8e83b 100644 --- a/net/netfilter/ipvs/ip_vs_dh.c +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -163,7 +163,7 @@ static int ip_vs_dh_init_svc(struct ip_vs_service *svc) return -ENOMEM; svc->sched_data = s; - IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for " + IP_VS_DBG(6, "DH hash table (memory=%zdbytes) allocated for " "current service\n", sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); @@ -183,7 +183,7 @@ static void ip_vs_dh_done_svc(struct ip_vs_service *svc) /* release the table itself */ kfree_rcu(s, rcu_head); - IP_VS_DBG(6, "DH hash table 
(memory=%Zdbytes) released\n", + IP_VS_DBG(6, "DH hash table (memory=%zdbytes) released\n", sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); } diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index cccf4d637412..5824927cf8e0 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -356,7 +356,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) return -ENOMEM; svc->sched_data = tbl; - IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " + IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) allocated for " "current service\n", sizeof(*tbl)); /* @@ -393,7 +393,7 @@ static void ip_vs_lblc_done_svc(struct ip_vs_service *svc) /* release the table itself */ kfree_rcu(tbl, rcu_head); - IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", + IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) released\n", sizeof(*tbl)); } diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 796d70e47ddd..703f11877bee 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -519,7 +519,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) return -ENOMEM; svc->sched_data = tbl; - IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for " + IP_VS_DBG(6, "LBLCR hash table (memory=%zdbytes) allocated for " "current service\n", sizeof(*tbl)); /* @@ -556,7 +556,7 @@ static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc) /* release the table itself */ kfree_rcu(tbl, rcu_head); - IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", + IP_VS_DBG(6, "LBLCR hash table (memory=%zdbytes) released\n", sizeof(*tbl)); } diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index 1e373a5e44e3..16aaac6eedc9 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -239,7 +239,7 @@ static int ip_vs_sh_init_svc(struct ip_vs_service *svc) return -ENOMEM; svc->sched_data = s; - IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for " + IP_VS_DBG(6, "SH hash table (memory=%zdbytes) allocated for " "current service\n", sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); @@ -259,7 +259,7 @@ static void ip_vs_sh_done_svc(struct ip_vs_service *svc) /* release the table itself */ kfree_rcu(s, rcu_head); - IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n", + IP_VS_DBG(6, "SH hash table (memory=%zdbytes) released\n", sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 9350530c16c1..b03c28084f81 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1791,7 +1791,7 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, u16 mtu, min_mtu; IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); - IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", + IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n", sizeof(struct ip_vs_sync_conn_v0)); if (!ipvs->sync_state) { diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index e3ed20060878..4aecef4a89fb 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -300,7 +300,7 @@ static int find_pattern(const char *data, size_t dlen, { size_t i = plen; - pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen); + pr_debug("find_pattern `%s': dlen = %zu\n", pattern, dlen); if (dlen <= plen) { /* Short packet: try for partial? 
*/ diff --git a/net/sctp/output.c b/net/sctp/output.c index 85406d5f8f41..71ce6b945dcb 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -177,7 +177,7 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet, { sctp_xmit_t retval; - pr_debug("%s: packet:%p size:%Zu chunk:%p size:%d\n", __func__, + pr_debug("%s: packet:%p size:%zu chunk:%p size:%d\n", __func__, packet, packet->size, chunk, chunk->skb ? chunk->skb->len : -1); switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) { diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index cdeb1d814833..4f16953e4954 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -763,7 +763,7 @@ err_put_ctx: err: kfree(buf); out: - dprintk("RPC: %s returning %Zd\n", __func__, err); + dprintk("RPC: %s returning %zd\n", __func__, err); return err; } diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 75f290bddca1..2e22889a8837 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1260,7 +1260,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) return 0; err_short_len: - svc_printk(rqstp, "short len %Zd, dropping request\n", + svc_printk(rqstp, "short len %zd, dropping request\n", argv->iov_len); goto close; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index de066acdb34e..d227d97f7ad4 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -278,7 +278,7 @@ static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) rqstp->rq_respages[0], tailoff); out: - dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n", + dprintk("svc: socket %p sendto([%p %zu... ], %d) = %d (addr %s)\n", svsk, xdr->head[0].iov_base, xdr->head[0].iov_len, xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf))); @@ -346,7 +346,7 @@ static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, if (len == buflen) set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n", + dprintk("svc: socket %p recvfrom(%p, %zu) = %d\n", svsk, iov[0].iov_base, iov[0].iov_len, len); return len; } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index af392d9b9cec..956c7bce80d1 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1188,7 +1188,7 @@ static inline void xs_tcp_read_xid(struct sock_xprt *transport, struct xdr_skb_r char *p; len = sizeof(transport->tcp_xid) - transport->tcp_offset; - dprintk("RPC: reading XID (%Zu bytes)\n", len); + dprintk("RPC: reading XID (%zu bytes)\n", len); p = ((char *) &transport->tcp_xid) + transport->tcp_offset; used = xdr_skb_read_bits(desc, p, len); transport->tcp_offset += used; @@ -1219,7 +1219,7 @@ static inline void xs_tcp_read_calldir(struct sock_xprt *transport, */ offset = transport->tcp_offset - sizeof(transport->tcp_xid); len = sizeof(transport->tcp_calldir) - offset; - dprintk("RPC: reading CALL/REPLY flag (%Zu bytes)\n", len); + dprintk("RPC: reading CALL/REPLY flag (%zu bytes)\n", len); p = ((char *) &transport->tcp_calldir) + offset; used = xdr_skb_read_bits(desc, p, len); transport->tcp_offset += used; @@ -1310,7 +1310,7 @@ static inline void xs_tcp_read_common(struct rpc_xprt *xprt, return; } - dprintk("RPC: XID %08x read %Zd bytes\n", + dprintk("RPC: XID %08x read %zd bytes\n", ntohl(transport->tcp_xid), r); dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, " "tcp_reclen = %u\n", xprt, transport->tcp_copied, @@ -1456,7 +1456,7 @@ static inline void xs_tcp_read_discard(struct sock_xprt *transport, struct 
xdr_s desc->count -= len; desc->offset += len; transport->tcp_offset += len; - dprintk("RPC: discarded %Zu bytes\n", len); + dprintk("RPC: discarded %zu bytes\n", len); xs_tcp_check_fraghdr(transport); } diff --git a/security/selinux/ss/ebitmap.c b/security/selinux/ss/ebitmap.c index 7d10e5d418bb..9db4709a6877 100644 --- a/security/selinux/ss/ebitmap.c +++ b/security/selinux/ss/ebitmap.c @@ -360,7 +360,7 @@ int ebitmap_read(struct ebitmap *e, void *fp) if (mapunit != BITS_PER_U64) { printk(KERN_ERR "SELinux: ebitmap: map size %u does not " - "match my size %Zd (high bit was %d)\n", + "match my size %zd (high bit was %d)\n", mapunit, BITS_PER_U64, e->highbit); goto bad; } diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c index d719db4219cd..9c92f29a38ea 100644 --- a/security/selinux/ss/policydb.c +++ b/security/selinux/ss/policydb.c @@ -2266,7 +2266,7 @@ int policydb_read(struct policydb *p, void *fp) len = le32_to_cpu(buf[1]); if (len != strlen(POLICYDB_STRING)) { printk(KERN_ERR "SELinux: policydb string length %d does not " - "match expected length %Zu\n", + "match expected length %zu\n", len, strlen(POLICYDB_STRING)); goto bad; } diff --git a/sound/pci/korg1212/korg1212.c b/sound/pci/korg1212/korg1212.c index 565f7f55c3ca..1e25095fd144 100644 --- a/sound/pci/korg1212/korg1212.c +++ b/sound/pci/korg1212/korg1212.c @@ -2051,7 +2051,7 @@ static void snd_korg1212_proc_read(struct snd_info_entry *entry, snd_iprintf(buffer, korg1212->card->longname); snd_iprintf(buffer, " (index #%d)\n", korg1212->card->number + 1); snd_iprintf(buffer, "\nGeneral settings\n"); - snd_iprintf(buffer, " period size: %Zd bytes\n", K1212_PERIOD_BYTES); + snd_iprintf(buffer, " period size: %zd bytes\n", K1212_PERIOD_BYTES); snd_iprintf(buffer, " clock mode: %s\n", clockSourceName[korg1212->clkSrcRate] ); snd_iprintf(buffer, " left ADC Sens: %d\n", korg1212->leftADCInSens ); snd_iprintf(buffer, " right ADC Sens: %d\n", korg1212->rightADCInSens ); @@ -2276,7 +2276,7 @@ static int snd_korg1212_create(struct snd_card *card, struct pci_dev *pci, if (snd_dma_alloc_pages(SNDRV_DMA_TYPE_DEV, snd_dma_pci_data(pci), sizeof(struct KorgSharedBuffer), &korg1212->dma_shared) < 0) { - snd_printk(KERN_ERR "korg1212: can not allocate shared buffer memory (%Zd bytes)\n", sizeof(struct KorgSharedBuffer)); + snd_printk(KERN_ERR "korg1212: can not allocate shared buffer memory (%zd bytes)\n", sizeof(struct KorgSharedBuffer)); snd_korg1212_free(korg1212); return -ENOMEM; } diff --git a/sound/pci/pcxhr/pcxhr_hwdep.c b/sound/pci/pcxhr/pcxhr_hwdep.c index 80633055e17e..a99808ab01fe 100644 --- a/sound/pci/pcxhr/pcxhr_hwdep.c +++ b/sound/pci/pcxhr/pcxhr_hwdep.c @@ -292,7 +292,7 @@ static int pcxhr_dsp_load(struct pcxhr_mgr *mgr, int index, int err, card_index; dev_dbg(&mgr->pci->dev, - "loading dsp [%d] size = %Zd\n", index, dsp->size); + "loading dsp [%d] size = %zd\n", index, dsp->size); switch (index) { case PCXHR_FIRMWARE_XLX_INT_INDEX: diff --git a/sound/pcmcia/vx/vxp_ops.c b/sound/pcmcia/vx/vxp_ops.c index 56aa1ba73ccc..5f97791f00d7 100644 --- a/sound/pcmcia/vx/vxp_ops.c +++ b/sound/pcmcia/vx/vxp_ops.c @@ -201,7 +201,7 @@ static int vxp_load_xilinx_binary(struct vx_core *_chip, const struct firmware * c |= (int)vx_inb(chip, RXM) << 8; c |= vx_inb(chip, RXL); - snd_printdd(KERN_DEBUG "xilinx: dsp size received 0x%x, orig 0x%Zx\n", c, fw->size); + snd_printdd(KERN_DEBUG "xilinx: dsp size received 0x%x, orig 0x%zx\n", c, fw->size); vx_outb(chip, ICR, ICR_HF0); diff --git a/sound/soc/soc-core.c 
b/sound/soc/soc-core.c index a110d3987d4a..6dca408faae3 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -3041,7 +3041,7 @@ static int snd_soc_register_dais(struct snd_soc_component *component, unsigned int i; int ret; - dev_dbg(dev, "ASoC: dai register %s #%Zu\n", dev_name(dev), count); + dev_dbg(dev, "ASoC: dai register %s #%zu\n", dev_name(dev), count); component->dai_drv = dai_drv; -- cgit v1.2.3 From f1f1007644ffc8051a4c11427d58b1967ae7b75a Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Mon, 27 Feb 2017 14:30:07 -0800 Subject: mm: add new mmgrab() helper Apart from adding the helper function itself, the rest of the kernel is converted mechanically using: git grep -l 'atomic_inc.*mm_count' | xargs sed -i 's/atomic_inc(&\(.*\)->mm_count);/mmgrab\(\1\);/' git grep -l 'atomic_inc.*mm_count' | xargs sed -i 's/atomic_inc(&\(.*\)\.mm_count);/mmgrab\(\&\1\);/' This is needed for a later patch that hooks into the helper, but might be a worthwhile cleanup on its own. (Michal Hocko provided most of the kerneldoc comment.) Link: http://lkml.kernel.org/r/20161218123229.22952-1-vegard.nossum@oracle.com Signed-off-by: Vegard Nossum Acked-by: Michal Hocko Acked-by: Peter Zijlstra (Intel) Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/smp.c | 2 +- arch/arc/kernel/smp.c | 2 +- arch/arm/kernel/smp.c | 2 +- arch/arm64/kernel/smp.c | 2 +- arch/blackfin/mach-common/smp.c | 2 +- arch/hexagon/kernel/smp.c | 2 +- arch/ia64/kernel/setup.c | 2 +- arch/m32r/kernel/setup.c | 2 +- arch/metag/kernel/smp.c | 2 +- arch/mips/kernel/traps.c | 2 +- arch/mn10300/kernel/smp.c | 2 +- arch/parisc/kernel/smp.c | 2 +- arch/powerpc/kernel/smp.c | 2 +- arch/s390/kernel/processor.c | 2 +- arch/score/kernel/traps.c | 2 +- arch/sh/kernel/smp.c | 2 +- arch/sparc/kernel/leon_smp.c | 2 +- arch/sparc/kernel/smp_64.c | 2 +- arch/sparc/kernel/sun4d_smp.c | 2 +- arch/sparc/kernel/sun4m_smp.c | 2 +- arch/sparc/kernel/traps_32.c | 2 +- arch/sparc/kernel/traps_64.c | 2 +- arch/tile/kernel/smpboot.c | 2 +- arch/x86/kernel/cpu/common.c | 4 ++-- arch/xtensa/kernel/smp.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 2 +- drivers/gpu/drm/i915/i915_gem_userptr.c | 2 +- drivers/infiniband/hw/hfi1/file_ops.c | 2 +- fs/proc/base.c | 4 ++-- fs/userfaultfd.c | 2 +- include/linux/sched.h | 22 ++++++++++++++++++++++ kernel/exit.c | 2 +- kernel/futex.c | 2 +- kernel/sched/core.c | 4 ++-- mm/khugepaged.c | 2 +- mm/ksm.c | 2 +- mm/mmu_context.c | 2 +- mm/mmu_notifier.c | 2 +- mm/oom_kill.c | 4 ++-- virt/kvm/kvm_main.c | 2 +- 40 files changed, 65 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 46bf263c3153..acb4b146a607 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -144,7 +144,7 @@ smp_callin(void) alpha_mv.smp_callin(); /* All kernel threads share the same mm context. 
*/ - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); current->active_mm = &init_mm; /* inform the notifiers about the new cpu */ diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c index 2afbafadb6ab..695624181682 100644 --- a/arch/arc/kernel/smp.c +++ b/arch/arc/kernel/smp.c @@ -140,7 +140,7 @@ void start_kernel_secondary(void) setup_processor(); atomic_inc(&mm->mm_users); - atomic_inc(&mm->mm_count); + mmgrab(mm); current->active_mm = mm; cpumask_set_cpu(cpu, mm_cpumask(mm)); diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 7dd14e8395e6..c6514ce0fcbc 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -371,7 +371,7 @@ asmlinkage void secondary_start_kernel(void) * reference and switch to it. */ cpu = smp_processor_id(); - atomic_inc(&mm->mm_count); + mmgrab(mm); current->active_mm = mm; cpumask_set_cpu(cpu, mm_cpumask(mm)); diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index a8ec5da530af..827d52d78b67 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -222,7 +222,7 @@ asmlinkage void secondary_start_kernel(void) * All kernel threads share the same mm context; grab a * reference and switch to it. */ - atomic_inc(&mm->mm_count); + mmgrab(mm); current->active_mm = mm; /* diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c index 23c4ef5f8bdc..bc5617ef7128 100644 --- a/arch/blackfin/mach-common/smp.c +++ b/arch/blackfin/mach-common/smp.c @@ -308,7 +308,7 @@ void secondary_start_kernel(void) /* Attach the new idle task to the global mm. */ atomic_inc(&mm->mm_users); - atomic_inc(&mm->mm_count); + mmgrab(mm); current->active_mm = mm; preempt_disable(); diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c index 983bae7d2665..c02a6455839e 100644 --- a/arch/hexagon/kernel/smp.c +++ b/arch/hexagon/kernel/smp.c @@ -162,7 +162,7 @@ void start_secondary(void) ); /* Set the memory struct */ - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); current->active_mm = &init_mm; cpu = smp_processor_id(); diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index c483ece3eb84..d68322966f33 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -994,7 +994,7 @@ cpu_init (void) */ ia64_setreg(_IA64_REG_CR_DCR, ( IA64_DCR_DP | IA64_DCR_DK | IA64_DCR_DX | IA64_DCR_DR | IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC)); - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); current->active_mm = &init_mm; BUG_ON(current->mm); diff --git a/arch/m32r/kernel/setup.c b/arch/m32r/kernel/setup.c index 136c69f1fb8a..b18bc0bd6544 100644 --- a/arch/m32r/kernel/setup.c +++ b/arch/m32r/kernel/setup.c @@ -403,7 +403,7 @@ void __init cpu_init (void) printk(KERN_INFO "Initializing CPU#%d\n", cpu_id); /* Set up and load the per-CPU TSS and LDT */ - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); current->active_mm = &init_mm; if (current->mm) BUG(); diff --git a/arch/metag/kernel/smp.c b/arch/metag/kernel/smp.c index bad13232de51..af9cff547a19 100644 --- a/arch/metag/kernel/smp.c +++ b/arch/metag/kernel/smp.c @@ -345,7 +345,7 @@ asmlinkage void secondary_start_kernel(void) * reference and switch to it. 
*/ atomic_inc(&mm->mm_users); - atomic_inc(&mm->mm_count); + mmgrab(mm); current->active_mm = mm; cpumask_set_cpu(cpu, mm_cpumask(mm)); enter_lazy_tlb(mm, current); diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index cb479be31a50..49c6df20672a 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -2232,7 +2232,7 @@ void per_cpu_trap_init(bool is_boot_cpu) if (!cpu_data[cpu].asid_cache) cpu_data[cpu].asid_cache = asid_first_version(cpu); - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); current->active_mm = &init_mm; BUG_ON(current->mm); enter_lazy_tlb(&init_mm, current); diff --git a/arch/mn10300/kernel/smp.c b/arch/mn10300/kernel/smp.c index 426173c4b0b9..e65b5cc2fa67 100644 --- a/arch/mn10300/kernel/smp.c +++ b/arch/mn10300/kernel/smp.c @@ -589,7 +589,7 @@ static void __init smp_cpu_init(void) } printk(KERN_INFO "Initializing CPU#%d\n", cpu_id); - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); current->active_mm = &init_mm; BUG_ON(current->mm); diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 75dab2871346..67b452b41ff6 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -279,7 +279,7 @@ smp_cpu_init(int cpunum) set_cpu_online(cpunum, true); /* Initialise the idle task for this CPU */ - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); current->active_mm = &init_mm; BUG_ON(current->mm); enter_lazy_tlb(&init_mm, current); diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 893bd7f79be6..573fb3a461b5 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -707,7 +707,7 @@ void start_secondary(void *unused) unsigned int cpu = smp_processor_id(); int i, base; - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); current->active_mm = &init_mm; smp_store_cpu_info(cpu); diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index 21004aaac69b..bc2b60dcb178 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -73,7 +73,7 @@ void cpu_init(void) get_cpu_id(id); if (machine_has_cpu_mhz) update_cpu_mhz(NULL); - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); current->active_mm = &init_mm; BUG_ON(current->mm); enter_lazy_tlb(&init_mm, current); diff --git a/arch/score/kernel/traps.c b/arch/score/kernel/traps.c index 2b22bcf02c27..569ac02f68df 100644 --- a/arch/score/kernel/traps.c +++ b/arch/score/kernel/traps.c @@ -336,7 +336,7 @@ void __init trap_init(void) set_except_vector(18, handle_dbe); flush_icache_range(DEBUG_VECTOR_BASE_ADDR, IRQ_VECTOR_BASE_ADDR); - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); current->active_mm = &init_mm; cpu_cache_init(); } diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index 38e7860845db..ee379c699c08 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -178,7 +178,7 @@ asmlinkage void start_secondary(void) struct mm_struct *mm = &init_mm; enable_mmu(); - atomic_inc(&mm->mm_count); + mmgrab(mm); atomic_inc(&mm->mm_users); current->active_mm = mm; #ifdef CONFIG_MMU diff --git a/arch/sparc/kernel/leon_smp.c b/arch/sparc/kernel/leon_smp.c index 71e16f2241c2..b99d33797e1d 100644 --- a/arch/sparc/kernel/leon_smp.c +++ b/arch/sparc/kernel/leon_smp.c @@ -93,7 +93,7 @@ void leon_cpu_pre_online(void *arg) : "memory" /* paranoid */); /* Attach to the address space of init_task. 
*/ - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); current->active_mm = &init_mm; while (!cpumask_test_cpu(cpuid, &smp_commenced_mask)) diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 90a02cb64e20..8e3e13924594 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -122,7 +122,7 @@ void smp_callin(void) current_thread_info()->new_child = 0; /* Attach to the address space of init_task. */ - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); current->active_mm = &init_mm; /* inform the notifiers about the new cpu */ diff --git a/arch/sparc/kernel/sun4d_smp.c b/arch/sparc/kernel/sun4d_smp.c index 9d98e5002a09..7b55c50eabe5 100644 --- a/arch/sparc/kernel/sun4d_smp.c +++ b/arch/sparc/kernel/sun4d_smp.c @@ -93,7 +93,7 @@ void sun4d_cpu_pre_online(void *arg) show_leds(cpuid); /* Attach to the address space of init_task. */ - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); current->active_mm = &init_mm; local_ops->cache_all(); diff --git a/arch/sparc/kernel/sun4m_smp.c b/arch/sparc/kernel/sun4m_smp.c index 278c40abce82..633c4cf6fdb0 100644 --- a/arch/sparc/kernel/sun4m_smp.c +++ b/arch/sparc/kernel/sun4m_smp.c @@ -59,7 +59,7 @@ void sun4m_cpu_pre_online(void *arg) : "memory" /* paranoid */); /* Attach to the address space of init_task. */ - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); current->active_mm = &init_mm; while (!cpumask_test_cpu(cpuid, &smp_commenced_mask)) diff --git a/arch/sparc/kernel/traps_32.c b/arch/sparc/kernel/traps_32.c index 4f21df7d4f13..ecddac5a4c96 100644 --- a/arch/sparc/kernel/traps_32.c +++ b/arch/sparc/kernel/traps_32.c @@ -448,7 +448,7 @@ void trap_init(void) thread_info_offsets_are_bolixed_pete(); /* Attach to the address space of init_task. */ - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); current->active_mm = &init_mm; /* NOTE: Other cpus have this done as they are started diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index dfc97a47c9a0..e022d7b00390 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -2837,6 +2837,6 @@ void __init trap_init(void) /* Attach to the address space of init_task. On SMP we * do this in smp.c:smp_callin for other cpus. 
*/ - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); current->active_mm = &init_mm; } diff --git a/arch/tile/kernel/smpboot.c b/arch/tile/kernel/smpboot.c index 6c0abaacec33..53ce940a5016 100644 --- a/arch/tile/kernel/smpboot.c +++ b/arch/tile/kernel/smpboot.c @@ -160,7 +160,7 @@ static void start_secondary(void) __this_cpu_write(current_asid, min_asid); /* Set up this thread as another owner of the init_mm */ - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); current->active_mm = &init_mm; if (current->mm) BUG(); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f07005e6f461..c64ca5929cb5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1510,7 +1510,7 @@ void cpu_init(void) for (i = 0; i <= IO_BITMAP_LONGS; i++) t->io_bitmap[i] = ~0UL; - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); me->active_mm = &init_mm; BUG_ON(me->mm); enter_lazy_tlb(&init_mm, me); @@ -1561,7 +1561,7 @@ void cpu_init(void) /* * Set up and load the per-CPU TSS and LDT */ - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); curr->active_mm = &init_mm; BUG_ON(curr->mm); enter_lazy_tlb(&init_mm, curr); diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c index fc4ad21a5ed4..9bf5cea3bae4 100644 --- a/arch/xtensa/kernel/smp.c +++ b/arch/xtensa/kernel/smp.c @@ -136,7 +136,7 @@ void secondary_start_kernel(void) /* All kernel threads share the same mm context. */ atomic_inc(&mm->mm_users); - atomic_inc(&mm->mm_count); + mmgrab(mm); current->active_mm = mm; cpumask_set_cpu(cpu, mm_cpumask(mm)); enter_lazy_tlb(mm, current); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index ef7c8de7060e..ca5f2aa7232d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -262,7 +262,7 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, * and because the mmu_notifier_unregister function also drop * mm_count we need to take an extra count here. 
*/ - atomic_inc(&p->mm->mm_count); + mmgrab(p->mm); mmu_notifier_unregister_no_release(&p->mmu_notifier, p->mm); mmu_notifier_call_srcu(&p->rcu, &kfd_process_destroy_delayed); } diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 6a8fa085b74e..65802d93fdc1 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -334,7 +334,7 @@ i915_gem_userptr_init__mm_struct(struct drm_i915_gem_object *obj) mm->i915 = to_i915(obj->base.dev); mm->mm = current->mm; - atomic_inc(&current->mm->mm_count); + mmgrab(current->mm); mm->mn = NULL; diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index f46033984d07..3b19c16a9e45 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -185,7 +185,7 @@ static int hfi1_file_open(struct inode *inode, struct file *fp) if (fd) { fd->rec_cpu_num = -1; /* no cpu affinity by default */ fd->mm = current->mm; - atomic_inc(&fd->mm->mm_count); + mmgrab(fd->mm); fp->private_data = fd; } else { fp->private_data = NULL; diff --git a/fs/proc/base.c b/fs/proc/base.c index b8f06273353e..5d51a188871b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -766,7 +766,7 @@ struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) if (!IS_ERR_OR_NULL(mm)) { /* ensure this mm_struct can't be freed */ - atomic_inc(&mm->mm_count); + mmgrab(mm); /* but do not pin its memory */ mmput(mm); } @@ -1064,7 +1064,7 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) if (p) { if (atomic_read(&p->mm->mm_users) > 1) { mm = p->mm; - atomic_inc(&mm->mm_count); + mmgrab(mm); } task_unlock(p); } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index e6e0a619cb3a..3c421d06a18e 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1847,7 +1847,7 @@ static struct file *userfaultfd_file_create(int flags) ctx->released = false; ctx->mm = current->mm; /* prevent the mm struct to be freed */ - atomic_inc(&ctx->mm->mm_count); + mmgrab(ctx->mm); file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx, O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS)); diff --git a/include/linux/sched.h b/include/linux/sched.h index 451e241f32c5..7cfa5546c840 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2904,6 +2904,28 @@ static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig) */ extern struct mm_struct * mm_alloc(void); +/** + * mmgrab() - Pin a &struct mm_struct. + * @mm: The &struct mm_struct to pin. + * + * Make sure that @mm will not get freed even after the owning task + * exits. This doesn't guarantee that the associated address space + * will still exist later on and mmget_not_zero() has to be used before + * accessing it. + * + * This is a preferred way to pin @mm for a longer/unbounded amount + * of time. + * + * Use mmdrop() to release the reference acquired by mmgrab(). + * + * See also <Documentation/vm/active_mm.txt> for an in-depth explanation + * of &mm_struct.mm_count vs &mm_struct.mm_users.
+ */ +static inline void mmgrab(struct mm_struct *mm) +{ + atomic_inc(&mm->mm_count); +} + /* mmdrop drops the mm and the page tables */ extern void __mmdrop(struct mm_struct *); static inline void mmdrop(struct mm_struct *mm) diff --git a/kernel/exit.c b/kernel/exit.c index 90b09ca35c84..8a768a3672a5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -539,7 +539,7 @@ static void exit_mm(void) __set_current_state(TASK_RUNNING); down_read(&mm->mmap_sem); } - atomic_inc(&mm->mm_count); + mmgrab(mm); BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); diff --git a/kernel/futex.c b/kernel/futex.c index cdf365036141..b687cb22301c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -338,7 +338,7 @@ static inline bool should_fail_futex(bool fshared) static inline void futex_get_mm(union futex_key *key) { - atomic_inc(&key->private.mm->mm_count); + mmgrab(key->private.mm); /* * Ensure futex_get_mm() implies a full barrier such that * get_futex_key() implies a full barrier. This is relied upon diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e1ae6ac15eac..6ea1925ac5c0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2847,7 +2847,7 @@ context_switch(struct rq *rq, struct task_struct *prev, if (!mm) { next->active_mm = oldmm; - atomic_inc(&oldmm->mm_count); + mmgrab(oldmm); enter_lazy_tlb(oldmm, next); } else switch_mm_irqs_off(oldmm, mm, next); @@ -6098,7 +6098,7 @@ void __init sched_init(void) /* * The boot idle thread does lazy MMU switching as well: */ - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); enter_lazy_tlb(&init_mm, current); /* diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 77ae3239c3de..34bce5c308e3 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -420,7 +420,7 @@ int __khugepaged_enter(struct mm_struct *mm) list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); spin_unlock(&khugepaged_mm_lock); - atomic_inc(&mm->mm_count); + mmgrab(mm); if (wakeup) wake_up_interruptible(&khugepaged_wait); diff --git a/mm/ksm.c b/mm/ksm.c index cf211c01ceac..520e4c37fec7 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1854,7 +1854,7 @@ int __ksm_enter(struct mm_struct *mm) spin_unlock(&ksm_mmlist_lock); set_bit(MMF_VM_MERGEABLE, &mm->flags); - atomic_inc(&mm->mm_count); + mmgrab(mm); if (needs_wakeup) wake_up_interruptible(&ksm_thread_wait); diff --git a/mm/mmu_context.c b/mm/mmu_context.c index 6f4d27c5bb32..daf67bb02b4a 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -25,7 +25,7 @@ void use_mm(struct mm_struct *mm) task_lock(tsk); active_mm = tsk->active_mm; if (active_mm != mm) { - atomic_inc(&mm->mm_count); + mmgrab(mm); tsk->active_mm = mm; } tsk->mm = mm; diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index f4259e496f83..32bc9f2ff7eb 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -275,7 +275,7 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, mm->mmu_notifier_mm = mmu_notifier_mm; mmu_notifier_mm = NULL; } - atomic_inc(&mm->mm_count); + mmgrab(mm); /* * Serialize the update against mmu_notifier_unregister. A diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 578321f1c070..51c091849dcb 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -653,7 +653,7 @@ static void mark_oom_victim(struct task_struct *tsk) /* oom_mm is bound to the signal struct life time. 
*/ if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) - atomic_inc(&tsk->signal->oom_mm->mm_count); + mmgrab(tsk->signal->oom_mm); /* * Make sure that the task is woken up from uninterruptible sleep @@ -870,7 +870,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) /* Get a reference to safely compare mm after task_unlock(victim) */ mm = victim->mm; - atomic_inc(&mm->mm_count); + mmgrab(mm); /* * We should send SIGKILL before setting TIF_MEMDIE in order to prevent * the OOM victim from depleting the memory reserves from the user diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5b0dd4a9b2cb..35f71409d9ee 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -611,7 +611,7 @@ static struct kvm *kvm_create_vm(unsigned long type) return ERR_PTR(-ENOMEM); spin_lock_init(&kvm->mmu_lock); - atomic_inc(&current->mm->mm_count); + mmgrab(current->mm); kvm->mm = current->mm; kvm_eventfd_init(kvm); mutex_init(&kvm->lock); -- cgit v1.2.3 From 3fce371bfac2be0396ffc1e763600e6c6b1bb52a Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Mon, 27 Feb 2017 14:30:10 -0800 Subject: mm: add new mmget() helper Apart from adding the helper function itself, the rest of the kernel is converted mechanically using: git grep -l 'atomic_inc.*mm_users' | xargs sed -i 's/atomic_inc(&\(.*\)->mm_users);/mmget\(\1\);/' git grep -l 'atomic_inc.*mm_users' | xargs sed -i 's/atomic_inc(&\(.*\)\.mm_users);/mmget\(\&\1\);/' This is needed for a later patch that hooks into the helper, but might be a worthwhile cleanup on its own. (Michal Hocko provided most of the kerneldoc comment.) Link: http://lkml.kernel.org/r/20161218123229.22952-2-vegard.nossum@oracle.com Signed-off-by: Vegard Nossum Acked-by: Michal Hocko Acked-by: Peter Zijlstra (Intel) Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arc/kernel/smp.c | 2 +- arch/blackfin/mach-common/smp.c | 2 +- arch/frv/mm/mmu-context.c | 2 +- arch/metag/kernel/smp.c | 2 +- arch/sh/kernel/smp.c | 2 +- arch/xtensa/kernel/smp.c | 2 +- include/linux/sched.h | 21 +++++++++++++++++++++ kernel/fork.c | 4 ++-- mm/swapfile.c | 10 +++++----- virt/kvm/async_pf.c | 2 +- 10 files changed, 35 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c index 695624181682..b8e8d3944481 100644 --- a/arch/arc/kernel/smp.c +++ b/arch/arc/kernel/smp.c @@ -139,7 +139,7 @@ void start_kernel_secondary(void) /* MMU, Caches, Vector Table, Interrupts etc */ setup_processor(); - atomic_inc(&mm->mm_users); + mmget(mm); mmgrab(mm); current->active_mm = mm; cpumask_set_cpu(cpu, mm_cpumask(mm)); diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c index bc5617ef7128..a2e6db2ce811 100644 --- a/arch/blackfin/mach-common/smp.c +++ b/arch/blackfin/mach-common/smp.c @@ -307,7 +307,7 @@ void secondary_start_kernel(void) local_irq_disable(); /* Attach the new idle task to the global mm.
*/ - atomic_inc(&mm->mm_users); + mmget(mm); mmgrab(mm); current->active_mm = mm; diff --git a/arch/frv/mm/mmu-context.c b/arch/frv/mm/mmu-context.c index 81757d55a5b5..3473bde77f56 100644 --- a/arch/frv/mm/mmu-context.c +++ b/arch/frv/mm/mmu-context.c @@ -188,7 +188,7 @@ int cxn_pin_by_pid(pid_t pid) task_lock(tsk); if (tsk->mm) { mm = tsk->mm; - atomic_inc(&mm->mm_users); + mmget(mm); ret = 0; } task_unlock(tsk); diff --git a/arch/metag/kernel/smp.c b/arch/metag/kernel/smp.c index af9cff547a19..c622293254e4 100644 --- a/arch/metag/kernel/smp.c +++ b/arch/metag/kernel/smp.c @@ -344,7 +344,7 @@ asmlinkage void secondary_start_kernel(void) * All kernel threads share the same mm context; grab a * reference and switch to it. */ - atomic_inc(&mm->mm_users); + mmget(mm); mmgrab(mm); current->active_mm = mm; cpumask_set_cpu(cpu, mm_cpumask(mm)); diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index ee379c699c08..edc4769b047e 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -179,7 +179,7 @@ asmlinkage void start_secondary(void) enable_mmu(); mmgrab(mm); - atomic_inc(&mm->mm_users); + mmget(mm); current->active_mm = mm; #ifdef CONFIG_MMU enter_lazy_tlb(mm, current); diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c index 9bf5cea3bae4..fcea72019df7 100644 --- a/arch/xtensa/kernel/smp.c +++ b/arch/xtensa/kernel/smp.c @@ -135,7 +135,7 @@ void secondary_start_kernel(void) /* All kernel threads share the same mm context. */ - atomic_inc(&mm->mm_users); + mmget(mm); mmgrab(mm); current->active_mm = mm; cpumask_set_cpu(cpu, mm_cpumask(mm)); diff --git a/include/linux/sched.h b/include/linux/sched.h index 7cfa5546c840..4a28deb5f210 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2948,6 +2948,27 @@ static inline void mmdrop_async(struct mm_struct *mm) } } +/** + * mmget() - Pin the address space associated with a &struct mm_struct. + * @mm: The address space to pin. + * + * Make sure that the address space of the given &struct mm_struct doesn't + * go away. This does not protect against parts of the address space being + * modified or freed, however. + * + * Never use this function to pin this address space for an + * unbounded/indefinite amount of time. + * + * Use mmput() to release the reference acquired by mmget(). + * + * See also <Documentation/vm/active_mm.txt> for an in-depth explanation + * of &mm_struct.mm_count vs &mm_struct.mm_users. + */ +static inline void mmget(struct mm_struct *mm) +{ + atomic_inc(&mm->mm_users); +} + static inline bool mmget_not_zero(struct mm_struct *mm) { return atomic_inc_not_zero(&mm->mm_users); diff --git a/kernel/fork.c b/kernel/fork.c index 348fe73155bc..246bf9aaf9df 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1000,7 +1000,7 @@ struct mm_struct *get_task_mm(struct task_struct *task) if (task->flags & PF_KTHREAD) mm = NULL; else - atomic_inc(&mm->mm_users); + mmget(mm); } task_unlock(task); return mm; @@ -1188,7 +1188,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) vmacache_flush(tsk); if (clone_flags & CLONE_VM) { - atomic_inc(&oldmm->mm_users); + mmget(oldmm); mm = oldmm; goto good_mm; } diff --git a/mm/swapfile.c b/mm/swapfile.c index 2cac12cc9abe..7a0713b76668 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1671,7 +1671,7 @@ int try_to_unuse(unsigned int type, bool frontswap, * that. */ start_mm = &init_mm; - atomic_inc(&init_mm.mm_users); + mmget(&init_mm); /* * Keep on scanning until all entries have gone.
Usually, @@ -1720,7 +1720,7 @@ int try_to_unuse(unsigned int type, bool frontswap, if (atomic_read(&start_mm->mm_users) == 1) { mmput(start_mm); start_mm = &init_mm; - atomic_inc(&init_mm.mm_users); + mmget(&init_mm); } /* @@ -1757,8 +1757,8 @@ int try_to_unuse(unsigned int type, bool frontswap, struct mm_struct *prev_mm = start_mm; struct mm_struct *mm; - atomic_inc(&new_start_mm->mm_users); - atomic_inc(&prev_mm->mm_users); + mmget(new_start_mm); + mmget(prev_mm); spin_lock(&mmlist_lock); while (swap_count(*swap_map) && !retval && (p = p->next) != &start_mm->mmlist) { @@ -1781,7 +1781,7 @@ int try_to_unuse(unsigned int type, bool frontswap, if (set_start_mm && *swap_map < swcount) { mmput(new_start_mm); - atomic_inc(&mm->mm_users); + mmget(mm); new_start_mm = mm; set_start_mm = 0; } diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 3815e940fbea..2366177172f6 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -204,7 +204,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva, work->addr = hva; work->arch = *arch; work->mm = current->mm; - atomic_inc(&work->mm->mm_users); + mmget(work->mm); kvm_get_kvm(work->vcpu->kvm); /* this can't really happen otherwise gfn_to_pfn_async -- cgit v1.2.3 From 388f79345502232d335467e8fa6f8e55a18844e1 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Mon, 27 Feb 2017 14:30:13 -0800 Subject: mm: use mmget_not_zero() helper We already have the helper, we can convert the rest of the kernel mechanically using: git grep -l 'atomic_inc_not_zero.*mm_users' | xargs sed -i 's/atomic_inc_not_zero(&\(.*\)->mm_users)/mmget_not_zero\(\1\)/' This is needed for a later patch that hooks into the helper, but might be a worthwhile cleanup on its own. Link: http://lkml.kernel.org/r/20161218123229.22952-3-vegard.nossum@oracle.com Signed-off-by: Vegard Nossum Acked-by: Michal Hocko Acked-by: Peter Zijlstra (Intel) Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/i915_gem_userptr.c | 2 +- drivers/iommu/intel-svm.c | 2 +- fs/proc/base.c | 4 ++-- fs/proc/task_mmu.c | 4 ++-- fs/proc/task_nommu.c | 2 +- kernel/events/uprobes.c | 2 +- mm/swapfile.c | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 65802d93fdc1..0115989e324a 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -507,7 +507,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) flags |= FOLL_WRITE; ret = -EFAULT; - if (atomic_inc_not_zero(&mm->mm_users)) { + if (mmget_not_zero(mm)) { down_read(&mm->mmap_sem); while (pinned < npages) { ret = get_user_pages_remote diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index cb72e0011310..51f2b228723f 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -579,7 +579,7 @@ static irqreturn_t prq_event_thread(int irq, void *d) if (!svm->mm) goto bad_req; /* If the mm is already defunct, don't handle faults. 
*/ - if (!atomic_inc_not_zero(&svm->mm->mm_users)) + if (!mmget_not_zero(svm->mm)) goto bad_req; down_read(&svm->mm->mmap_sem); vma = find_extend_vma(svm->mm, address); diff --git a/fs/proc/base.c b/fs/proc/base.c index 5d51a188871b..1e1e182d571b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -813,7 +813,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf, return -ENOMEM; copied = 0; - if (!atomic_inc_not_zero(&mm->mm_users)) + if (!mmget_not_zero(mm)) goto free; /* Maybe we should limit FOLL_FORCE to actual ptrace users? */ @@ -921,7 +921,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, return -ENOMEM; ret = 0; - if (!atomic_inc_not_zero(&mm->mm_users)) + if (!mmget_not_zero(mm)) goto free; down_read(&mm->mmap_sem); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8f96a49178d0..ee3efb229ef6 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -167,7 +167,7 @@ static void *m_start(struct seq_file *m, loff_t *ppos) return ERR_PTR(-ESRCH); mm = priv->mm; - if (!mm || !atomic_inc_not_zero(&mm->mm_users)) + if (!mm || !mmget_not_zero(mm)) return NULL; down_read(&mm->mmap_sem); @@ -1352,7 +1352,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, unsigned long end_vaddr; int ret = 0, copied = 0; - if (!mm || !atomic_inc_not_zero(&mm->mm_users)) + if (!mm || !mmget_not_zero(mm)) goto out; ret = -EINVAL; diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 37175621e890..1ef97cfcf422 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -219,7 +219,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) return ERR_PTR(-ESRCH); mm = priv->mm; - if (!mm || !atomic_inc_not_zero(&mm->mm_users)) + if (!mm || !mmget_not_zero(mm)) return NULL; down_read(&mm->mmap_sem); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 18c6b23edd3c..d630f8ac4d2f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -747,7 +747,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) continue; } - if (!atomic_inc_not_zero(&vma->vm_mm->mm_users)) + if (!mmget_not_zero(vma->vm_mm)) continue; info = prev; diff --git a/mm/swapfile.c b/mm/swapfile.c index 7a0713b76668..fadc6a1c0da0 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1763,7 +1763,7 @@ int try_to_unuse(unsigned int type, bool frontswap, while (swap_count(*swap_map) && !retval && (p = p->next) != &start_mm->mmlist) { mm = list_entry(p, struct mm_struct, mmlist); - if (!atomic_inc_not_zero(&mm->mm_users)) + if (!mmget_not_zero(mm)) continue; spin_unlock(&mmlist_lock); mmput(prev_mm); -- cgit v1.2.3 From eba38a968258b5ad9d70722ab8c584e1753f4b16 Mon Sep 17 00:00:00 2001 From: Gary Lin Date: Wed, 1 Mar 2017 16:25:51 +0800 Subject: bpf: update the comment about the length of analysis Commit 07016151a446 ("bpf, verifier: further improve search pruning") increased the limit of processed instructions from 32k to 64k, but the comment still mentioned the 32k limit. This commit updates the comment to reflect the change. Cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: Gary Lin Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3fc6e39b223e..796b68d00119 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -33,7 +33,7 @@ * - out of bounds or malformed jumps * The second pass is all possible path descent from the 1st insn. 
* Since it's analyzing all pathes through the program, the length of the - * analysis is limited to 32k insn, which may be hit even if total number of + * analysis is limited to 64k insn, which may be hit even if total number of * insn is less then 4K, but there are too many branches that change stack/regs. * Number of 'branches to be analyzed' is limited to 1k * -- cgit v1.2.3 From c930b2c0de32f45ce8f67affe936ce7a05b07b00 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Feb 2017 12:22:54 +0100 Subject: sched/core: Convert ___assert_task_state() link time assert to BUILD_BUG_ON() The length of TASK_STATE_TO_CHAR_STR was still checked using the old link-time manual error method - convert it to BUILD_BUG_ON(). This has a couple of advantages: - it's more obvious what's going on - it reduces the size and complexity of <linux/sched.h> - BUILD_BUG_ON() will fail during compilation, with a clearer error message than the link time assert. Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 --- kernel/sched/core.c | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4a28deb5f210..c204613396cd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -223,9 +223,6 @@ extern void proc_sched_set_task(struct task_struct *p); #define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn" -extern char ___assert_task_state[1 - 2*!!( - sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; - /* Convenience macros for the sake of set_current_state */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bbfb917a9b49..7d76ccb79e91 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5233,6 +5233,9 @@ void sched_show_task(struct task_struct *p) int ppid; unsigned long state = p->state; + /* Make sure the string lines up properly with the number of task states: */ + BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1); + if (!try_get_task_stack(p)) return; if (state) -- cgit v1.2.3 From 59ddbcb2f45b958cf1f11f122b666cbcf50cd57b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Feb 2017 23:37:48 +0100 Subject: sched/core: Move the get_preempt_disable_ip() inline to sched/core.c It's defined in <linux/sched.h>, but nothing outside the scheduler uses it - so move it to the sched/core.c usage site.
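As a minimal sketch of the kind of caller this helper serves (a hypothetical debug-report function, assuming the CONFIG_DEBUG_PREEMPT behaviour shown in the diff below; it is not part of the patch):

	static void report_preempt_site(struct task_struct *p)
	{
		/* Returns 0 when CONFIG_DEBUG_PREEMPT is off, so guard the print. */
		unsigned long ip = get_preempt_disable_ip(p);

		if (ip) {
			pr_err("Preemption disabled at:");
			print_ip_sym(ip);
		}
	}

Since only sched/core.c needs this, keeping the inline next to its sole user avoids exposing it to every compilation unit that includes the scheduler header.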
Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 9 --------- kernel/sched/core.c | 9 +++++++++ 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index c204613396cd..df42cac04243 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3419,15 +3419,6 @@ static inline void cond_resched_rcu(void) #endif } -static inline unsigned long get_preempt_disable_ip(struct task_struct *p) -{ -#ifdef CONFIG_DEBUG_PREEMPT - return p->preempt_disable_ip; -#else - return 0; -#endif -} - /* * Does a critical section need to be broken due to another * task waiting?: (technically does not depend on CONFIG_PREEMPT, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7d76ccb79e91..2acdf19c5f7c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3211,6 +3211,15 @@ static inline void preempt_latency_start(int val) { } static inline void preempt_latency_stop(int val) { } #endif +static inline unsigned long get_preempt_disable_ip(struct task_struct *p) +{ +#ifdef CONFIG_DEBUG_PREEMPT + return p->preempt_disable_ip; +#else + return 0; +#endif +} + /* * Print scheduling while atomic bug: */ -- cgit v1.2.3 From 0c98d344fe5c27f6e4bce42ac503e9e9a51c7d1d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 5 Feb 2017 15:38:10 +0100 Subject: sched/core: Remove the tsk_cpus_allowed() wrapper So the original intention of tsk_cpus_allowed() was to 'future-proof' the field - but it's pretty ineffectual at that, because half of the code uses ->cpus_allowed directly ... Also, the wrapper makes the code longer than the original expression! So just get rid of it. This also shrinks a bit. Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/smp.c | 2 +- arch/powerpc/platforms/cell/spufs/sched.c | 2 +- arch/sparc/kernel/sysfs.c | 2 +- drivers/cpufreq/sparc-us2e-cpufreq.c | 4 ++-- drivers/cpufreq/sparc-us3-cpufreq.c | 4 ++-- drivers/infiniband/hw/hfi1/affinity.c | 2 +- drivers/infiniband/hw/hfi1/sdma.c | 2 +- include/linux/sched.h | 3 --- kernel/sched/core.c | 20 ++++++++++---------- kernel/sched/cpudeadline.c | 4 ++-- kernel/sched/cpupri.c | 4 ++-- kernel/sched/deadline.c | 7 +++---- kernel/sched/fair.c | 25 ++++++++++++------------- kernel/sched/rt.c | 5 ++--- lib/smp_processor_id.c | 2 +- samples/trace_events/trace-events-sample.c | 2 +- 16 files changed, 42 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 573fb3a461b5..21fdf02583fe 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -795,7 +795,7 @@ void __init smp_cpus_done(unsigned int max_cpus) * se we pin us down to CPU 0 for a short while */ alloc_cpumask_var(&old_mask, GFP_NOWAIT); - cpumask_copy(old_mask, tsk_cpus_allowed(current)); + cpumask_copy(old_mask, ¤t->cpus_allowed); set_cpus_allowed_ptr(current, cpumask_of(boot_cpuid)); if (smp_ops && smp_ops->setup_cpu) diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index 460f5f31d5cb..9b543df210fb 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -140,7 +140,7 @@ void __spu_update_sched_info(struct spu_context *ctx) * runqueue. 
The context will be rescheduled on the proper node * if it is timesliced or preempted. */ - cpumask_copy(&ctx->cpus_allowed, tsk_cpus_allowed(current)); + cpumask_copy(&ctx->cpus_allowed, ¤t->cpus_allowed); /* Save the current cpu id for spu interrupt routing. */ ctx->last_ran = raw_smp_processor_id(); diff --git a/arch/sparc/kernel/sysfs.c b/arch/sparc/kernel/sysfs.c index 4808b6d23455..d63fc613e7a9 100644 --- a/arch/sparc/kernel/sysfs.c +++ b/arch/sparc/kernel/sysfs.c @@ -106,7 +106,7 @@ static unsigned long run_on_cpu(unsigned long cpu, cpumask_t old_affinity; unsigned long ret; - cpumask_copy(&old_affinity, tsk_cpus_allowed(current)); + cpumask_copy(&old_affinity, ¤t->cpus_allowed); /* should return -EINVAL to userspace */ if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) return 0; diff --git a/drivers/cpufreq/sparc-us2e-cpufreq.c b/drivers/cpufreq/sparc-us2e-cpufreq.c index b73feeb666f9..35ddb6da93aa 100644 --- a/drivers/cpufreq/sparc-us2e-cpufreq.c +++ b/drivers/cpufreq/sparc-us2e-cpufreq.c @@ -234,7 +234,7 @@ static unsigned int us2e_freq_get(unsigned int cpu) cpumask_t cpus_allowed; unsigned long clock_tick, estar; - cpumask_copy(&cpus_allowed, tsk_cpus_allowed(current)); + cpumask_copy(&cpus_allowed, ¤t->cpus_allowed); set_cpus_allowed_ptr(current, cpumask_of(cpu)); clock_tick = sparc64_get_clock_tick(cpu) / 1000; @@ -252,7 +252,7 @@ static int us2e_freq_target(struct cpufreq_policy *policy, unsigned int index) unsigned long clock_tick, divisor, old_divisor, estar; cpumask_t cpus_allowed; - cpumask_copy(&cpus_allowed, tsk_cpus_allowed(current)); + cpumask_copy(&cpus_allowed, ¤t->cpus_allowed); set_cpus_allowed_ptr(current, cpumask_of(cpu)); new_freq = clock_tick = sparc64_get_clock_tick(cpu) / 1000; diff --git a/drivers/cpufreq/sparc-us3-cpufreq.c b/drivers/cpufreq/sparc-us3-cpufreq.c index 9bb42ba50efa..a8d86a449ca1 100644 --- a/drivers/cpufreq/sparc-us3-cpufreq.c +++ b/drivers/cpufreq/sparc-us3-cpufreq.c @@ -82,7 +82,7 @@ static unsigned int us3_freq_get(unsigned int cpu) unsigned long reg; unsigned int ret; - cpumask_copy(&cpus_allowed, tsk_cpus_allowed(current)); + cpumask_copy(&cpus_allowed, ¤t->cpus_allowed); set_cpus_allowed_ptr(current, cpumask_of(cpu)); reg = read_safari_cfg(); @@ -99,7 +99,7 @@ static int us3_freq_target(struct cpufreq_policy *policy, unsigned int index) unsigned long new_bits, new_freq, reg; cpumask_t cpus_allowed; - cpumask_copy(&cpus_allowed, tsk_cpus_allowed(current)); + cpumask_copy(&cpus_allowed, ¤t->cpus_allowed); set_cpus_allowed_ptr(current, cpumask_of(cpu)); new_freq = sparc64_get_clock_tick(cpu) / 1000; diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index 7a3d906b3671..e2cd2cd3b28a 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -576,7 +576,7 @@ int hfi1_get_proc_affinity(int node) struct hfi1_affinity_node *entry; cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask; const struct cpumask *node_mask, - *proc_mask = tsk_cpus_allowed(current); + *proc_mask = ¤t->cpus_allowed; struct hfi1_affinity_node_list *affinity = &node_affinity; struct cpu_mask_set *set = &affinity->proc; diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index 1d81cac1fa6c..5cde1ecda0fe 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -856,7 +856,7 @@ struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd, { struct sdma_rht_node *rht_node; struct sdma_engine *sde = NULL; - const 
struct cpumask *current_mask = tsk_cpus_allowed(current); + const struct cpumask *current_mask = ¤t->cpus_allowed; unsigned long cpu_id; /* diff --git a/include/linux/sched.h b/include/linux/sched.h index df42cac04243..6d1cc20cc477 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1995,9 +1995,6 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) } #endif -/* Future-safe accessor for struct task_struct's cpus_allowed. */ -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) - static inline int tsk_nr_cpus_allowed(struct task_struct *p) { return p->nr_cpus_allowed; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2acdf19c5f7c..ef5bbf760a08 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -981,7 +981,7 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_ return rq; /* Affinity changed (again). */ - if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) + if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) return rq; rq = move_queued_task(rq, p, dest_cpu); @@ -1259,10 +1259,10 @@ static int migrate_swap_stop(void *data) if (task_cpu(arg->src_task) != arg->src_cpu) goto unlock; - if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task))) + if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed)) goto unlock; - if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task))) + if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed)) goto unlock; __migrate_swap_task(arg->src_task, arg->dst_cpu); @@ -1303,10 +1303,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p) if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) goto out; - if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task))) + if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed)) goto out; - if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) + if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed)) goto out; trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); @@ -1490,14 +1490,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for_each_cpu(dest_cpu, nodemask) { if (!cpu_active(dest_cpu)) continue; - if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) + if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) return dest_cpu; } } for (;;) { /* Any allowed, online CPU? 
*/ - for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { + for_each_cpu(dest_cpu, &p->cpus_allowed) { if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu)) continue; if (!cpu_online(dest_cpu)) @@ -1552,7 +1552,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) if (tsk_nr_cpus_allowed(p) > 1) cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); else - cpu = cpumask_any(tsk_cpus_allowed(p)); + cpu = cpumask_any(&p->cpus_allowed); /* * In order not to call set_task_cpu() on a blocking task we need @@ -1564,7 +1564,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) * [ this allows ->select_task() to simply return task_cpu(p) and * not worry about this generic constraint ] */ - if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || + if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || !cpu_online(cpu))) cpu = select_fallback_rq(task_cpu(p), p); @@ -5473,7 +5473,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu) if (curr_cpu == target_cpu) return 0; - if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p))) + if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed)) return -EINVAL; /* TODO: This is not properly updating schedstats */ diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index e73119013c53..fba235c7d026 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -128,10 +128,10 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, const struct sched_dl_entity *dl_se = &p->dl; if (later_mask && - cpumask_and(later_mask, cp->free_cpus, tsk_cpus_allowed(p))) { + cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { best_cpu = cpumask_any(later_mask); goto out; - } else if (cpumask_test_cpu(cpudl_maximum(cp), tsk_cpus_allowed(p)) && + } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && dl_time_before(dl_se->deadline, cp->elements[0].dl)) { best_cpu = cpudl_maximum(cp); if (later_mask) diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 11e9705bf937..981fcd7dc394 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -103,11 +103,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, if (skip) continue; - if (cpumask_any_and(tsk_cpus_allowed(p), vec->mask) >= nr_cpu_ids) + if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) continue; if (lowest_mask) { - cpumask_and(lowest_mask, tsk_cpus_allowed(p), vec->mask); + cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); /* * We have to ensure that we have at least one bit diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 27737f34757d..8e4d6e4e3ccc 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -252,7 +252,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p * If we cannot preempt any rq, fall back to pick any * online cpu. */ - cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); + cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); if (cpu >= nr_cpu_ids) { /* * Fail to find any suitable cpu. @@ -1235,7 +1235,7 @@ static void set_curr_task_dl(struct rq *rq) static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) + cpumask_test_cpu(cpu, &p->cpus_allowed)) return 1; return 0; } @@ -1384,8 +1384,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) /* Retry if something changed. 
*/ if (double_lock_balance(rq, later_rq)) { if (unlikely(task_rq(task) != rq || - !cpumask_test_cpu(later_rq->cpu, - tsk_cpus_allowed(task)) || + !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) || task_running(rq, task) || !dl_task(task) || !task_on_rq_queued(task))) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 274c747a01ce..3b60d73ab290 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1551,7 +1551,7 @@ static void task_numa_compare(struct task_numa_env *env, */ if (cur) { /* Skip this swap candidate if cannot move to the source cpu */ - if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur))) + if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) goto unlock; /* @@ -1661,7 +1661,7 @@ static void task_numa_find_cpu(struct task_numa_env *env, for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { /* Skip this CPU if the source task cannot migrate */ - if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p))) + if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed)) continue; env->dst_cpu = cpu; @@ -5458,7 +5458,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, /* Skip over this group if it has no CPUs allowed */ if (!cpumask_intersects(sched_group_cpus(group), - tsk_cpus_allowed(p))) + &p->cpus_allowed)) continue; local_group = cpumask_test_cpu(this_cpu, @@ -5578,7 +5578,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) return cpumask_first(sched_group_cpus(group)); /* Traverse only the allowed CPUs */ - for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { + for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { if (idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); @@ -5717,7 +5717,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int if (!test_idle_cores(target, false)) return -1; - cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p)); + cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); for_each_cpu_wrap(core, cpus, target, wrap) { bool idle = true; @@ -5751,7 +5751,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t return -1; for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) + if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) continue; if (idle_cpu(cpu)) return cpu; @@ -5803,7 +5803,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t time = local_clock(); for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) { - if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) + if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) continue; if (idle_cpu(cpu)) break; @@ -5958,7 +5958,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) - && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + && cpumask_test_cpu(cpu, &p->cpus_allowed); } rcu_read_lock(); @@ -6698,7 +6698,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0; - if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { + if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) { int cpu; schedstat_inc(p->se.statistics.nr_failed_migrations_affine); @@ -6718,7 +6718,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Prevent to re-select dst_cpu via env's cpus */ for_each_cpu_and(cpu, 
env->dst_grpmask, env->cpus) { - if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { + if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { env->flags |= LBF_DST_PINNED; env->new_dst_cpu = cpu; break; @@ -7252,7 +7252,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) /* * Group imbalance indicates (and tries to solve) the problem where balancing - * groups is inadequate due to tsk_cpus_allowed() constraints. + * groups is inadequate due to ->cpus_allowed constraints. * * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a * cpumask covering 1 cpu of the first group and 3 cpus of the second group. @@ -8211,8 +8211,7 @@ more_balance: * if the curr task on busiest cpu can't be * moved to this_cpu */ - if (!cpumask_test_cpu(this_cpu, - tsk_cpus_allowed(busiest->curr))) { + if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { raw_spin_unlock_irqrestore(&busiest->lock, flags); env.flags |= LBF_ALL_PINNED; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e8836cfc4cdb..cbd356f63883 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1591,7 +1591,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) + cpumask_test_cpu(cpu, &p->cpus_allowed)) return 1; return 0; } @@ -1726,8 +1726,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) * Also make sure that it wasn't scheduled on its rq. */ if (unlikely(task_rq(task) != rq || - !cpumask_test_cpu(lowest_rq->cpu, - tsk_cpus_allowed(task)) || + !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) || task_running(rq, task) || !rt_task(task) || !task_on_rq_queued(task))) { diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 1afec32de6f2..690d75b132fa 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -22,7 +22,7 @@ notrace static unsigned int check_preemption_disabled(const char *what1, * Kernel threads bound to a single CPU can safely use * smp_processor_id(): */ - if (cpumask_equal(tsk_cpus_allowed(current), cpumask_of(this_cpu))) + if (cpumask_equal(¤t->cpus_allowed, cpumask_of(this_cpu))) goto out; /* diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c index 30e282d33d4d..bc7fcf010a5b 100644 --- a/samples/trace_events/trace-events-sample.c +++ b/samples/trace_events/trace-events-sample.c @@ -33,7 +33,7 @@ static void simple_thread_func(int cnt) /* Silly tracepoints */ trace_foo_bar("hello", cnt, array, random_strings[len], - tsk_cpus_allowed(current)); + ¤t->cpus_allowed); trace_foo_with_template_simple("HELLO", cnt); -- cgit v1.2.3 From 4b53a3412d6663214ce9c754eff9373a9cff9dee Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 5 Feb 2017 15:41:03 +0100 Subject: sched/core: Remove the tsk_nr_cpus_allowed() wrapper tsk_nr_cpus_allowed() too is a pretty pointless wrapper that is not used consistently and which makes the code both harder to read and longer as well. So remove it - this also shrinks a bit. 
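A minimal before/after illustration of the conversion done by this pair of cleanups (the caller and the try_push_task() helper are made up for the example):

/* Before: wrappers hide two plain field reads and make the lines longer. */
if (tsk_nr_cpus_allowed(p) > 1 &&
    cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
	try_push_task(p);

/* After: read the task_struct fields directly, as most callers already did. */
if (p->nr_cpus_allowed > 1 &&
    cpumask_test_cpu(cpu, &p->cpus_allowed))
	try_push_task(p);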
Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 ----- kernel/sched/core.c | 2 +- kernel/sched/deadline.c | 28 ++++++++++++++-------------- kernel/sched/rt.c | 24 ++++++++++++------------ 4 files changed, 27 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6d1cc20cc477..e732881517f2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1995,11 +1995,6 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) } #endif -static inline int tsk_nr_cpus_allowed(struct task_struct *p) -{ - return p->nr_cpus_allowed; -} - #define TNF_MIGRATED 0x01 #define TNF_NO_GROUP 0x02 #define TNF_SHARED 0x04 diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ef5bbf760a08..2d51ec65dc73 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1549,7 +1549,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { lockdep_assert_held(&p->pi_lock); - if (tsk_nr_cpus_allowed(p) > 1) + if (p->nr_cpus_allowed > 1) cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); else cpu = cpumask_any(&p->cpus_allowed); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 8e4d6e4e3ccc..99b2c33a9fbc 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -134,7 +134,7 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - if (tsk_nr_cpus_allowed(p) > 1) + if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory++; update_dl_migration(dl_rq); @@ -144,7 +144,7 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - if (tsk_nr_cpus_allowed(p) > 1) + if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory--; update_dl_migration(dl_rq); @@ -958,7 +958,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) enqueue_dl_entity(&p->dl, pi_se, flags); - if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1) + if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } @@ -1032,9 +1032,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) * try to make it stay here, it might be important. */ if (unlikely(dl_task(curr)) && - (tsk_nr_cpus_allowed(curr) < 2 || + (curr->nr_cpus_allowed < 2 || !dl_entity_preempt(&p->dl, &curr->dl)) && - (tsk_nr_cpus_allowed(p) > 1)) { + (p->nr_cpus_allowed > 1)) { int target = find_later_rq(p); if (target != -1 && @@ -1055,7 +1055,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * Current can't be migrated, useless to reschedule, * let's hope p can move out. */ - if (tsk_nr_cpus_allowed(rq->curr) == 1 || + if (rq->curr->nr_cpus_allowed == 1 || cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1) return; @@ -1063,7 +1063,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * p is migratable, so let's not schedule it and * see if it is pushed or pulled somewhere else. 
*/ - if (tsk_nr_cpus_allowed(p) != 1 && + if (p->nr_cpus_allowed != 1 && cpudl_find(&rq->rd->cpudl, p, NULL) != -1) return; @@ -1178,7 +1178,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p) { update_curr_dl(rq); - if (on_dl_rq(&p->dl) && tsk_nr_cpus_allowed(p) > 1) + if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } @@ -1279,7 +1279,7 @@ static int find_later_rq(struct task_struct *task) if (unlikely(!later_mask)) return -1; - if (tsk_nr_cpus_allowed(task) == 1) + if (task->nr_cpus_allowed == 1) return -1; /* @@ -1424,7 +1424,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); - BUG_ON(tsk_nr_cpus_allowed(p) <= 1); + BUG_ON(p->nr_cpus_allowed <= 1); BUG_ON(!task_on_rq_queued(p)); BUG_ON(!dl_task(p)); @@ -1463,7 +1463,7 @@ retry: */ if (dl_task(rq->curr) && dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && - tsk_nr_cpus_allowed(rq->curr) > 1) { + rq->curr->nr_cpus_allowed > 1) { resched_curr(rq); return 0; } @@ -1610,9 +1610,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - tsk_nr_cpus_allowed(p) > 1 && + p->nr_cpus_allowed > 1 && dl_task(rq->curr) && - (tsk_nr_cpus_allowed(rq->curr) < 2 || + (rq->curr->nr_cpus_allowed < 2 || !dl_entity_preempt(&p->dl, &rq->curr->dl))) { push_dl_tasks(rq); } @@ -1726,7 +1726,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) if (rq->curr != p) { #ifdef CONFIG_SMP - if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) + if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) queue_push_tasks(rq); #endif if (dl_task(rq->curr)) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index cbd356f63883..9f3e40226dec 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -335,7 +335,7 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total++; - if (tsk_nr_cpus_allowed(p) > 1) + if (p->nr_cpus_allowed > 1) rt_rq->rt_nr_migratory++; update_rt_migration(rt_rq); @@ -352,7 +352,7 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total--; - if (tsk_nr_cpus_allowed(p) > 1) + if (p->nr_cpus_allowed > 1) rt_rq->rt_nr_migratory--; update_rt_migration(rt_rq); @@ -1324,7 +1324,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) enqueue_rt_entity(rt_se, flags); - if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1) + if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } @@ -1413,7 +1413,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) * will have to sort it out. */ if (curr && unlikely(rt_task(curr)) && - (tsk_nr_cpus_allowed(curr) < 2 || + (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio)) { int target = find_lowest_rq(p); @@ -1437,7 +1437,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * Current can't be migrated, useless to reschedule, * let's hope p can move out. */ - if (tsk_nr_cpus_allowed(rq->curr) == 1 || + if (rq->curr->nr_cpus_allowed == 1 || !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) return; @@ -1445,7 +1445,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * p is migratable, so let's not schedule it and * see if it is pushed or pulled somewhere else. 
*/ - if (tsk_nr_cpus_allowed(p) != 1 + if (p->nr_cpus_allowed != 1 && cpupri_find(&rq->rd->cpupri, p, NULL)) return; @@ -1579,7 +1579,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) * The previous task needs to be made eligible for pushing * if it is still active */ - if (on_rt_rq(&p->rt) && tsk_nr_cpus_allowed(p) > 1) + if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } @@ -1629,7 +1629,7 @@ static int find_lowest_rq(struct task_struct *task) if (unlikely(!lowest_mask)) return -1; - if (tsk_nr_cpus_allowed(task) == 1) + if (task->nr_cpus_allowed == 1) return -1; /* No other targets possible */ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) @@ -1761,7 +1761,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); - BUG_ON(tsk_nr_cpus_allowed(p) <= 1); + BUG_ON(p->nr_cpus_allowed <= 1); BUG_ON(!task_on_rq_queued(p)); BUG_ON(!rt_task(p)); @@ -2121,9 +2121,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - tsk_nr_cpus_allowed(p) > 1 && + p->nr_cpus_allowed > 1 && (dl_task(rq->curr) || rt_task(rq->curr)) && - (tsk_nr_cpus_allowed(rq->curr) < 2 || + (rq->curr->nr_cpus_allowed < 2 || rq->curr->prio <= p->prio)) push_rt_tasks(rq); } @@ -2196,7 +2196,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) */ if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded) + if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) queue_push_tasks(rq); #endif /* CONFIG_SMP */ if (p->prio < rq->curr->prio) -- cgit v1.2.3 From f9411ebe3d85cbbea06298241e6053d031d281fc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 6 Feb 2017 09:50:49 +0100 Subject: rcu: Separate the RCU synchronization types and APIs into So rcupdate.h is a pretty complex header, in particular it includes which includes - creating a dependency that includes in , which prevents the isolation of from the derived header. Solve part of the problem by decoupling rcupdate.h from completions: this can be done by separating out the rcu_synchronize types and APIs, and updating their usage sites. Since this is a mostly RCU-internal types this will not just simplify 's dependencies, but will make all the hundreds of .c files that include rcupdate.h but not completions or wait.h build faster. ( For rcutiny this means that two dependent APIs have to be uninlined, but that shouldn't be much of a problem as they are rare variants. ) Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- fs/autofs4/autofs_i.h | 1 + include/linux/dcache.h | 1 + include/linux/rcupdate.h | 40 ---------------------------------- include/linux/rcupdate_wait.h | 50 +++++++++++++++++++++++++++++++++++++++++++ include/linux/rcutiny.h | 11 ++-------- kernel/rcu/srcu.c | 2 +- kernel/rcu/tiny.c | 14 +++++++++++- kernel/rcu/tree.c | 2 +- kernel/rcu/update.c | 1 + kernel/sched/core.c | 1 + 10 files changed, 71 insertions(+), 52 deletions(-) create mode 100644 include/linux/rcupdate_wait.h (limited to 'kernel') diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index c885daae68c8..beef981aa54f 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -14,6 +14,7 @@ #include #include #include +#include /* This is the range of ioctl() numbers we claim as ours */ #define AUTOFS_IOC_FIRST AUTOFS_IOC_READY diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 591b6c16f9c1..d2e38dc6172c 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -11,6 +11,7 @@ #include #include #include +#include struct path; struct vfsmount; diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 6ade6a52d9d4..de88b33c0974 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include @@ -226,45 +225,6 @@ void call_rcu_sched(struct rcu_head *head, void synchronize_sched(void); -/* - * Structure allowing asynchronous waiting on RCU. - */ -struct rcu_synchronize { - struct rcu_head head; - struct completion completion; -}; -void wakeme_after_rcu(struct rcu_head *head); - -void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, - struct rcu_synchronize *rs_array); - -#define _wait_rcu_gp(checktiny, ...) \ -do { \ - call_rcu_func_t __crcu_array[] = { __VA_ARGS__ }; \ - struct rcu_synchronize __rs_array[ARRAY_SIZE(__crcu_array)]; \ - __wait_rcu_gp(checktiny, ARRAY_SIZE(__crcu_array), \ - __crcu_array, __rs_array); \ -} while (0) - -#define wait_rcu_gp(...) _wait_rcu_gp(false, __VA_ARGS__) - -/** - * synchronize_rcu_mult - Wait concurrently for multiple grace periods - * @...: List of call_rcu() functions for the flavors to wait on. - * - * This macro waits concurrently for multiple flavors of RCU grace periods. - * For example, synchronize_rcu_mult(call_rcu, call_rcu_bh) would wait - * on concurrent RCU and RCU-bh grace periods. Waiting on a give SRCU - * domain requires you to write a wrapper function for that SRCU domain's - * call_srcu() function, supplying the corresponding srcu_struct. - * - * If Tiny RCU, tell _wait_rcu_gp() not to bother waiting for RCU - * or RCU-bh, given that anywhere synchronize_rcu_mult() can be called - * is automatically a grace period. - */ -#define synchronize_rcu_mult(...) \ - _wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), __VA_ARGS__) - /** * call_rcu_tasks() - Queue an RCU for invocation task-based grace period * @head: structure to be used for queueing the RCU updates. diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h new file mode 100644 index 000000000000..e774b4f5f220 --- /dev/null +++ b/include/linux/rcupdate_wait.h @@ -0,0 +1,50 @@ +#ifndef _LINUX_SCHED_RCUPDATE_WAIT_H +#define _LINUX_SCHED_RCUPDATE_WAIT_H + +/* + * RCU synchronization types and methods: + */ + +#include +#include + +/* + * Structure allowing asynchronous waiting on RCU. 
+ */ +struct rcu_synchronize { + struct rcu_head head; + struct completion completion; +}; +void wakeme_after_rcu(struct rcu_head *head); + +void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, + struct rcu_synchronize *rs_array); + +#define _wait_rcu_gp(checktiny, ...) \ +do { \ + call_rcu_func_t __crcu_array[] = { __VA_ARGS__ }; \ + struct rcu_synchronize __rs_array[ARRAY_SIZE(__crcu_array)]; \ + __wait_rcu_gp(checktiny, ARRAY_SIZE(__crcu_array), \ + __crcu_array, __rs_array); \ +} while (0) + +#define wait_rcu_gp(...) _wait_rcu_gp(false, __VA_ARGS__) + +/** + * synchronize_rcu_mult - Wait concurrently for multiple grace periods + * @...: List of call_rcu() functions for the flavors to wait on. + * + * This macro waits concurrently for multiple flavors of RCU grace periods. + * For example, synchronize_rcu_mult(call_rcu, call_rcu_bh) would wait + * on concurrent RCU and RCU-bh grace periods. Waiting on a give SRCU + * domain requires you to write a wrapper function for that SRCU domain's + * call_srcu() function, supplying the corresponding srcu_struct. + * + * If Tiny RCU, tell _wait_rcu_gp() not to bother waiting for RCU + * or RCU-bh, given that anywhere synchronize_rcu_mult() can be called + * is automatically a grace period. + */ +#define synchronize_rcu_mult(...) \ + _wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), __VA_ARGS__) + +#endif /* _LINUX_SCHED_RCUPDATE_WAIT_H */ diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 4f9b2fa2173d..b452953e21c8 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -53,15 +53,8 @@ static inline void cond_synchronize_sched(unsigned long oldstate) might_sleep(); } -static inline void rcu_barrier_bh(void) -{ - wait_rcu_gp(call_rcu_bh); -} - -static inline void rcu_barrier_sched(void) -{ - wait_rcu_gp(call_rcu_sched); -} +extern void rcu_barrier_bh(void); +extern void rcu_barrier_sched(void); static inline void synchronize_rcu_expedited(void) { diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index e773129c8b08..ef3bcfb15b39 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index fa6a48d3917b..6ad330dbbae2 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include @@ -47,6 +47,18 @@ static void __call_rcu(struct rcu_head *head, #include "tiny_plugin.h" +void rcu_barrier_bh(void) +{ + wait_rcu_gp(call_rcu_bh); +} +EXPORT_SYMBOL(rcu_barrier_bh); + +void rcu_barrier_sched(void) +{ + wait_rcu_gp(call_rcu_sched); +} +EXPORT_SYMBOL(rcu_barrier_sched); + #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d80e0d2f68c6..cb62ce23ffc7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 9e03db9ea9c0..a0e90e0afc75 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -49,6 +49,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d51ec65dc73..1bd317db9810 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From 780de9dd2720debc14c501dab4dc80d1f75ad50e Mon Sep 17 00:00:00 2001 
From: Ingo Molnar Date: Thu, 2 Feb 2017 11:50:56 +0100 Subject: sched/headers, cgroups: Remove the threadgroup_change_*() wrappery threadgroup_change_begin()/end() is a pointless wrapper around cgroup_threadgroup_change_begin()/end(), minus a might_sleep() in the !CONFIG_CGROUPS=y case. Remove the wrappery, move the might_sleep() (the down_read() already has a might_sleep() check). This debloats a bit and simplifies this API. Update all call sites. No change in functionality. Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- fs/exec.c | 6 +++--- include/linux/cgroup-defs.h | 13 ++++++++----- include/linux/sched.h | 28 ---------------------------- kernel/cgroup/pids.c | 2 +- kernel/fork.c | 6 +++--- kernel/signal.c | 6 +++--- 6 files changed, 18 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 698a86094f76..e595e1529581 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1088,7 +1088,7 @@ static int de_thread(struct task_struct *tsk) struct task_struct *leader = tsk->group_leader; for (;;) { - threadgroup_change_begin(tsk); + cgroup_threadgroup_change_begin(tsk); write_lock_irq(&tasklist_lock); /* * Do this under tasklist_lock to ensure that @@ -1099,7 +1099,7 @@ static int de_thread(struct task_struct *tsk) break; __set_current_state(TASK_KILLABLE); write_unlock_irq(&tasklist_lock); - threadgroup_change_end(tsk); + cgroup_threadgroup_change_end(tsk); schedule(); if (unlikely(__fatal_signal_pending(tsk))) goto killed; @@ -1157,7 +1157,7 @@ static int de_thread(struct task_struct *tsk) if (unlikely(leader->ptrace)) __wake_up_parent(leader, leader->parent); write_unlock_irq(&tasklist_lock); - threadgroup_change_end(tsk); + cgroup_threadgroup_change_end(tsk); release_task(leader); } diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 3c02404cfce9..6a3f850cabab 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -531,8 +531,8 @@ extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups * @tsk: target task * - * Called from threadgroup_change_begin() and allows cgroup operations to - * synchronize against threadgroup changes using a percpu_rw_semaphore. + * Allows cgroup operations to synchronize against threadgroup changes + * using a percpu_rw_semaphore. */ static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) { @@ -543,8 +543,7 @@ static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups * @tsk: target task * - * Called from threadgroup_change_end(). Counterpart of - * cgroup_threadcgroup_change_begin(). + * Counterpart of cgroup_threadcgroup_change_begin(). 
*/ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) { @@ -555,7 +554,11 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) #define CGROUP_SUBSYS_COUNT 0 -static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {} +static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) +{ + might_sleep(); +} + static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} #endif /* CONFIG_CGROUPS */ diff --git a/include/linux/sched.h b/include/linux/sched.h index e732881517f2..3f61baac928b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3162,34 +3162,6 @@ static inline void unlock_task_sighand(struct task_struct *tsk, spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); } -/** - * threadgroup_change_begin - mark the beginning of changes to a threadgroup - * @tsk: task causing the changes - * - * All operations which modify a threadgroup - a new thread joining the - * group, death of a member thread (the assertion of PF_EXITING) and - * exec(2) dethreading the process and replacing the leader - are wrapped - * by threadgroup_change_{begin|end}(). This is to provide a place which - * subsystems needing threadgroup stability can hook into for - * synchronization. - */ -static inline void threadgroup_change_begin(struct task_struct *tsk) -{ - might_sleep(); - cgroup_threadgroup_change_begin(tsk); -} - -/** - * threadgroup_change_end - mark the end of changes to a threadgroup - * @tsk: task causing the changes - * - * See threadgroup_change_begin(). - */ -static inline void threadgroup_change_end(struct task_struct *tsk) -{ - cgroup_threadgroup_change_end(tsk); -} - #ifdef CONFIG_THREAD_INFO_IN_TASK static inline struct thread_info *task_thread_info(struct task_struct *task) diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 2bd673783f1a..e756dae49300 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -214,7 +214,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset) /* * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies - * on threadgroup_change_begin() held by the copy_process(). + * on cgroup_threadgroup_change_begin() held by the copy_process(). */ static int pids_can_fork(struct task_struct *task) { diff --git a/kernel/fork.c b/kernel/fork.c index 246bf9aaf9df..d043fedc03c8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1746,7 +1746,7 @@ static __latent_entropy struct task_struct *copy_process( INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; - threadgroup_change_begin(current); + cgroup_threadgroup_change_begin(current); /* * Ensure that the cgroup subsystem policies allow the new process to be * forked. 
It should be noted the the new process's css_set can be changed @@ -1843,7 +1843,7 @@ static __latent_entropy struct task_struct *copy_process( proc_fork_connector(p); cgroup_post_fork(p); - threadgroup_change_end(current); + cgroup_threadgroup_change_end(current); perf_event_fork(p); trace_task_newtask(p, clone_flags); @@ -1854,7 +1854,7 @@ static __latent_entropy struct task_struct *copy_process( bad_fork_cancel_cgroup: cgroup_cancel_fork(p); bad_fork_free_pid: - threadgroup_change_end(current); + cgroup_threadgroup_change_end(current); if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_thread: diff --git a/kernel/signal.c b/kernel/signal.c index 214a8feeb771..bae358532d0a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2395,11 +2395,11 @@ void exit_signals(struct task_struct *tsk) * @tsk is about to have PF_EXITING set - lock out users which * expect stable threadgroup. */ - threadgroup_change_begin(tsk); + cgroup_threadgroup_change_begin(tsk); if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { tsk->flags |= PF_EXITING; - threadgroup_change_end(tsk); + cgroup_threadgroup_change_end(tsk); return; } @@ -2410,7 +2410,7 @@ void exit_signals(struct task_struct *tsk) */ tsk->flags |= PF_EXITING; - threadgroup_change_end(tsk); + cgroup_threadgroup_change_end(tsk); if (!signal_pending(tsk)) goto out; -- cgit v1.2.3 From 314ff7851fc8ea66cbf48eaa93d8ebfb5ca084a9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Feb 2017 11:03:31 +0100 Subject: mm/vmacache, sched/headers: Introduce 'struct vmacache' and move it from to The header includes various vmacache related defines, which are arguably misplaced. Move them to mm_types.h and minimize the sched.h impact by putting all task vmacache state into a new 'struct vmacache' structure. No change in functionality. Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/mm_types.h | 12 ++++++++++++ include/linux/sched.h | 11 ++++------- include/linux/vmacache.h | 2 +- kernel/debug/debug_core.c | 4 ++-- mm/nommu.c | 2 +- mm/vmacache.c | 10 +++++----- 6 files changed, 25 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4f6d440ad785..137797cd7b50 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -360,6 +360,18 @@ struct vm_area_struct { struct vm_userfaultfd_ctx vm_userfaultfd_ctx; }; +/* + * The per task VMA cache array: + */ +#define VMACACHE_BITS 2 +#define VMACACHE_SIZE (1U << VMACACHE_BITS) +#define VMACACHE_MASK (VMACACHE_SIZE - 1) + +struct vmacache { + u32 seqnum; + struct vm_area_struct *vmas[VMACACHE_SIZE]; +}; + struct core_thread { struct task_struct *task; struct core_thread *next; diff --git a/include/linux/sched.h b/include/linux/sched.h index 3f61baac928b..e87c97e1a947 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -134,10 +134,6 @@ struct blk_plug; struct filename; struct nameidata; -#define VMACACHE_BITS 2 -#define VMACACHE_SIZE (1U << VMACACHE_BITS) -#define VMACACHE_MASK (VMACACHE_SIZE - 1) - /* * These are the constant used to fake the fixed-point load-average * counting. 
Some notes: @@ -1550,9 +1546,10 @@ struct task_struct { #endif struct mm_struct *mm, *active_mm; - /* per-thread vma caching */ - u32 vmacache_seqnum; - struct vm_area_struct *vmacache[VMACACHE_SIZE]; + + /* Per-thread vma caching: */ + struct vmacache vmacache; + #if defined(SPLIT_RSS_COUNTING) struct task_rss_stat rss_stat; #endif diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h index c3fa0fd43949..1081db987391 100644 --- a/include/linux/vmacache.h +++ b/include/linux/vmacache.h @@ -12,7 +12,7 @@ static inline void vmacache_flush(struct task_struct *tsk) { - memset(tsk->vmacache, 0, sizeof(tsk->vmacache)); + memset(tsk->vmacache.vmas, 0, sizeof(tsk->vmacache.vmas)); } extern void vmacache_flush_all(struct mm_struct *mm); diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 79517e5549f1..a603ef28f70c 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -232,9 +232,9 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) int i; for (i = 0; i < VMACACHE_SIZE; i++) { - if (!current->vmacache[i]) + if (!current->vmacache.vmas[i]) continue; - flush_cache_range(current->vmacache[i], + flush_cache_range(current->vmacache.vmas[i], addr, addr + BREAK_INSTR_SIZE); } } diff --git a/mm/nommu.c b/mm/nommu.c index fe9f4fa4a7a7..aae06e854552 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -757,7 +757,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) mm->map_count--; for (i = 0; i < VMACACHE_SIZE; i++) { /* if the vma is cached, invalidate the entire cache */ - if (curr->vmacache[i] == vma) { + if (curr->vmacache.vmas[i] == vma) { vmacache_invalidate(mm); break; } diff --git a/mm/vmacache.c b/mm/vmacache.c index 035fdeb35b43..7c233f8e20ee 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -60,7 +60,7 @@ static inline bool vmacache_valid_mm(struct mm_struct *mm) void vmacache_update(unsigned long addr, struct vm_area_struct *newvma) { if (vmacache_valid_mm(newvma->vm_mm)) - current->vmacache[VMACACHE_HASH(addr)] = newvma; + current->vmacache.vmas[VMACACHE_HASH(addr)] = newvma; } static bool vmacache_valid(struct mm_struct *mm) @@ -71,12 +71,12 @@ static bool vmacache_valid(struct mm_struct *mm) return false; curr = current; - if (mm->vmacache_seqnum != curr->vmacache_seqnum) { + if (mm->vmacache_seqnum != curr->vmacache.seqnum) { /* * First attempt will always be invalid, initialize * the new cache for this task here. */ - curr->vmacache_seqnum = mm->vmacache_seqnum; + curr->vmacache.seqnum = mm->vmacache_seqnum; vmacache_flush(curr); return false; } @@ -93,7 +93,7 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) return NULL; for (i = 0; i < VMACACHE_SIZE; i++) { - struct vm_area_struct *vma = current->vmacache[i]; + struct vm_area_struct *vma = current->vmacache.vmas[i]; if (!vma) continue; @@ -121,7 +121,7 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, return NULL; for (i = 0; i < VMACACHE_SIZE; i++) { - struct vm_area_struct *vma = current->vmacache[i]; + struct vm_area_struct *vma = current->vmacache.vmas[i]; if (vma && vma->vm_start == start && vma->vm_end == end) { count_vm_vmacache_event(VMACACHE_FIND_HITS); -- cgit v1.2.3 From 105ab3d8ce7269887d24d224054677125e18037c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 1 Feb 2017 16:36:40 +0100 Subject: sched/headers: Prepare for new header dependencies before moving code to We are going to split out of , which will have to be picked up from other headers and a couple of .c files. 
Create a trivial placeholder file that just maps to to make this patch obviously correct and bisectable. Include the new header in the files that are going to need it. Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/arm/kernel/topology.c | 1 + arch/arm64/kernel/topology.c | 1 + arch/powerpc/kernel/smp.c | 1 + arch/s390/kernel/topology.c | 1 + arch/x86/kernel/smpboot.c | 1 + block/blk-mq.c | 1 + block/blk-softirq.c | 1 + include/linux/cpuset.h | 1 + include/linux/sched/topology.h | 6 ++++++ kernel/sched/fair.c | 2 ++ kernel/sched/sched.h | 1 + 11 files changed, 17 insertions(+) create mode 100644 include/linux/sched/topology.h (limited to 'kernel') diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index ebf47d91b804..f8a3ab82e77f 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 565dd69888cc..08243533e5ee 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 21fdf02583fe..fce17789c675 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 2cd5f4f1013c..17660e800e74 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a0d38685f7df..fe7a66e6b5a0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include diff --git a/block/blk-mq.c b/block/blk-mq.c index 9e6b064e5339..746c14e0d157 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 06cf9807f49a..87b7df4851bf 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "blk.h" diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index bfc204e70338..c608c39cb161 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -9,6 +9,7 @@ */ #include +#include #include #include #include diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h new file mode 100644 index 000000000000..4f1159f76f92 --- /dev/null +++ b/include/linux/sched/topology.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_SCHED_TOPOLOGY_H +#define _LINUX_SCHED_TOPOLOGY_H + +#include + +#endif /* _LINUX_SCHED_TOPOLOGY_H */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3b60d73ab290..11e0ab57748a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -21,6 +21,8 @@ */ #include +#include + #include #include #include diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 71b10a9b73cf..9b9cb260d9cf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1,6 +1,7 @@ #include #include +#include #include #include #include -- cgit v1.2.3 From 4c822698cba8bdd93724117eded12bf34eb80252 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: 
Wed, 1 Feb 2017 16:36:40 +0100 Subject: sched/headers: Prepare for new header dependencies before moving code to We are going to split out of , which will have to be picked up from other headers and a couple of .c files. Create a trivial placeholder file that just maps to to make this patch obviously correct and bisectable. Include the new header in the files that are going to need it. Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mwait.h | 1 + arch/x86/kernel/process.c | 1 + drivers/cpuidle/driver.c | 1 + include/linux/sched/idle.h | 6 ++++++ include/linux/sched/topology.h | 2 ++ kernel/sched/idle.c | 1 + kernel/smp.c | 1 + 7 files changed, 13 insertions(+) create mode 100644 include/linux/sched/idle.h (limited to 'kernel') diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index f37f2d8a2989..bda3c27f0da0 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -2,6 +2,7 @@ #define _ASM_X86_MWAIT_H #include +#include #include diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 7780efa635b9..6395e0cd7dd6 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index ab264d393233..e53fb861beb0 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h new file mode 100644 index 000000000000..a3cf79b86986 --- /dev/null +++ b/include/linux/sched/idle.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_SCHED_IDLE_H +#define _LINUX_SCHED_IDLE_H + +#include + +#endif /* _LINUX_SCHED_IDLE_H */ diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 4f1159f76f92..184b56b8aa64 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -3,4 +3,6 @@ #include +#include + #endif /* _LINUX_SCHED_TOPOLOGY_H */ diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 6a4bae0a649d..ac6d5176463d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -2,6 +2,7 @@ * Generic entry point for the idle threads */ #include +#include #include #include #include diff --git a/kernel/smp.c b/kernel/smp.c index 77fcdb9f2775..a817769b53c0 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "smpboot.h" -- cgit v1.2.3 From 84f001e15737f8214b0f5f0f7dfec0fb1027938f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 1 Feb 2017 16:36:40 +0100 Subject: sched/headers: Prepare for new header dependencies before moving code to We are going to split out of , which will have to be picked up from other headers and a couple of .c files. Create a trivial placeholder file that just maps to to make this patch obviously correct and bisectable. Include the new header in the files that are going to need it. 
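The wake-queue API this placeholder will eventually host already lives in <linux/sched.h>; a minimal sketch of the batching pattern it supports (the lock structure and pick_next_waiter() helper are hypothetical):

struct my_lock {			/* hypothetical lock with a wait list */
	spinlock_t wait_lock;
	struct list_head waiters;
};
struct task_struct *pick_next_waiter(struct my_lock *lock);

static void release_all_waiters(struct my_lock *lock)
{
	DEFINE_WAKE_Q(wake_q);		/* on-stack queue of tasks to wake */
	struct task_struct *waiter;

	spin_lock(&lock->wait_lock);
	while ((waiter = pick_next_waiter(lock)))
		wake_q_add(&wake_q, waiter);	/* defer the actual wakeup */
	spin_unlock(&lock->wait_lock);

	wake_up_q(&wake_q);	/* perform the wakeups outside the spinlock */
}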
Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/sched/wake_q.h | 6 ++++++ ipc/mqueue.c | 1 + ipc/msg.c | 1 + ipc/sem.c | 1 + kernel/futex.c | 1 + kernel/locking/mutex.c | 1 + kernel/locking/rtmutex.c | 1 + kernel/locking/rtmutex_common.h | 1 + kernel/locking/rwsem-xadd.c | 3 ++- kernel/sched/sched.h | 1 + 10 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 include/linux/sched/wake_q.h (limited to 'kernel') diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h new file mode 100644 index 000000000000..6383b35e4eba --- /dev/null +++ b/include/linux/sched/wake_q.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_SCHED_WAKE_Q_H +#define _LINUX_SCHED_WAKE_Q_H + +#include + +#endif /* _LINUX_SCHED_WAKE_Q_H */ diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 4fdd97031431..40e448de8a7e 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include "util.h" diff --git a/ipc/msg.c b/ipc/msg.c index e3e52ce01123..ecc387e573f6 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include diff --git a/ipc/sem.c b/ipc/sem.c index e468cd1c12f0..947dc2348271 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -82,6 +82,7 @@ #include #include #include +#include #include #include "util.h" diff --git a/kernel/futex.c b/kernel/futex.c index b687cb22301c..8ddcf9ea953c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index ad2d9e22697b..57f6311e2405 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index d340be3a488f..d4f798491361 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "rtmutex_common.h" diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 990134617b4c..856dfff5c33a 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -13,6 +13,7 @@ #define __KERNEL_RTMUTEX_COMMON_H #include +#include /* * This is the control structure for tasks blocked on a rt_mutex, diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 2ad8d8dc3bb1..4fe8d8ad4396 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -10,10 +10,11 @@ * and Davidlohr Bueso . Based on mutexes. */ #include -#include #include #include +#include #include +#include #include #include "rwsem.h" diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9b9cb260d9cf..683570739f46 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From e601757102cfd3eeae068f53b3bc1234f3a2b2e9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 1 Feb 2017 16:36:40 +0100 Subject: sched/headers: Prepare for new header dependencies before moving code to We are going to split out of , which will have to be picked up from other headers and .c files. Create a trivial placeholder file that just maps to to make this patch obviously correct and bisectable. Include the new header in the files that are going to need it. 
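The interface being split out centers on sched_clock(); a minimal timestamping sketch of the common usage pattern (the workload function is hypothetical):

extern void do_one_pass(void);		/* hypothetical workload */

static void time_one_pass(void)
{
	u64 t0, delta_ns;

	t0 = sched_clock();		/* fast, nanosecond-resolution clock */
	do_one_pass();
	delta_ns = sched_clock() - t0;	/* compare only on the same CPU */

	pr_debug("pass took %llu ns\n", delta_ns);
}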
Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/arm/mach-bcm/platsmp.c | 1 + arch/arm/mach-omap2/pm-debug.c | 1 + arch/arm/probes/kprobes/test-core.c | 1 + arch/cris/kernel/time.c | 2 +- arch/ia64/kernel/setup.c | 1 + arch/microblaze/kernel/timer.c | 1 + arch/mn10300/kernel/time.c | 1 + arch/parisc/kernel/setup.c | 1 + arch/parisc/kernel/time.c | 1 + arch/powerpc/kernel/time.c | 1 + arch/s390/kernel/time.c | 1 + arch/sparc/kernel/ds.c | 1 + arch/sparc/kernel/viohs.c | 1 + arch/tile/kernel/time.c | 1 + arch/x86/events/amd/ibs.c | 1 + arch/x86/events/core.c | 1 + arch/x86/kernel/cpu/amd.c | 1 + arch/x86/kernel/cpu/centaur.c | 1 + arch/x86/kernel/cpu/common.c | 1 + arch/x86/kernel/cpu/cyrix.c | 1 + arch/x86/kernel/cpu/intel.c | 1 + arch/x86/kernel/cpu/transmeta.c | 1 + arch/x86/kernel/kvmclock.c | 1 + arch/x86/kernel/nmi.c | 1 + arch/x86/kernel/tsc.c | 1 + block/cfq-iosched.c | 1 + drivers/acpi/apei/ghes.c | 1 + drivers/clocksource/arm_arch_timer.c | 1 + drivers/clocksource/pxa_timer.c | 1 + drivers/clocksource/timer-digicolor.c | 1 + drivers/cpuidle/cpuidle.c | 1 + drivers/firmware/tegra/bpmp.c | 1 + drivers/gpu/drm/i915/i915_gem_request.c | 2 ++ drivers/gpu/drm/i915/intel_drv.h | 1 + drivers/md/bcache/bset.c | 1 + drivers/md/bcache/btree.c | 1 + drivers/md/bcache/sysfs.c | 1 + drivers/md/bcache/util.c | 1 + drivers/md/bcache/util.h | 1 + drivers/md/bcache/writeback.c | 1 + drivers/misc/cxl/native.c | 1 + drivers/net/irda/pxaficp_ir.c | 1 + drivers/perf/arm_pmu.c | 1 + drivers/vhost/net.c | 1 + include/linux/blkdev.h | 1 + include/linux/sched/clock.h | 6 ++++++ include/linux/skbuff.h | 1 + include/net/busy_poll.h | 1 + kernel/events/core.c | 1 + kernel/locking/lockdep.c | 1 + kernel/locking/qspinlock_stat.h | 1 + kernel/printk/printk.c | 1 + kernel/sched/clock.c | 1 + kernel/sched/core.c | 1 + kernel/sched/sched.h | 1 + kernel/time/sched_clock.c | 1 + kernel/time/tick-sched.c | 1 + kernel/torture.c | 1 + kernel/trace/ring_buffer.c | 1 + kernel/trace/trace_clock.c | 1 + kernel/trace/trace_hwlat.c | 1 + kernel/trace/trace_output.c | 1 + kernel/watchdog.c | 1 + lib/plist.c | 1 + net/ipv4/tcp_cdg.c | 2 ++ 65 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 include/linux/sched/clock.h (limited to 'kernel') diff --git a/arch/arm/mach-bcm/platsmp.c b/arch/arm/mach-bcm/platsmp.c index 582886d0d02f..9e3f275934eb 100644 --- a/arch/arm/mach-bcm/platsmp.c +++ b/arch/arm/mach-bcm/platsmp.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm/mach-omap2/pm-debug.c b/arch/arm/mach-omap2/pm-debug.c index 003a6cb248be..5c46ea6756d7 100644 --- a/arch/arm/mach-omap2/pm-debug.c +++ b/arch/arm/mach-omap2/pm-debug.c @@ -21,6 +21,7 @@ #include #include +#include #include #include #include diff --git a/arch/arm/probes/kprobes/test-core.c b/arch/arm/probes/kprobes/test-core.c index 9775de22e2ff..c893726aa52d 100644 --- a/arch/arm/probes/kprobes/test-core.c +++ b/arch/arm/probes/kprobes/test-core.c @@ -203,6 +203,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/cris/kernel/time.c b/arch/cris/kernel/time.c index 2dda6da71521..bc562cf511a6 100644 --- a/arch/cris/kernel/time.c +++ b/arch/cris/kernel/time.c @@ -29,7 +29,7 @@ #include #include #include -#include /* just for sched_clock() - funny that */ +#include #define D(x) diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index d68322966f33..32a6cc36296f 
100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/microblaze/kernel/timer.c b/arch/microblaze/kernel/timer.c index 1d6fad50fa76..999066192715 100644 --- a/arch/microblaze/kernel/timer.c +++ b/arch/microblaze/kernel/timer.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/mn10300/kernel/time.c b/arch/mn10300/kernel/time.c index 67c6416a58f8..06b83b17c5f1 100644 --- a/arch/mn10300/kernel/time.c +++ b/arch/mn10300/kernel/time.c @@ -10,6 +10,7 @@ * 2 of the Licence, or (at your option) any later version. */ #include +#include #include #include #include diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index 068ed3607bac..dee6f9d6a153 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c index 1e22f981cd81..89421df70160 100644 --- a/arch/parisc/kernel/time.c +++ b/arch/parisc/kernel/time.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index bc84a8d47b9e..37833c5dc274 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index de66abb479c9..c31da46bc037 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sparc/kernel/ds.c b/arch/sparc/kernel/ds.c index f87a55d77094..b542cc7c8d94 100644 --- a/arch/sparc/kernel/ds.c +++ b/arch/sparc/kernel/ds.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sparc/kernel/viohs.c b/arch/sparc/kernel/viohs.c index 526fcb5d8ce9..b30b30ab3ddd 100644 --- a/arch/sparc/kernel/viohs.c +++ b/arch/sparc/kernel/viohs.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include diff --git a/arch/tile/kernel/time.c b/arch/tile/kernel/time.c index c9357012b1c8..5bd4e88c7c60 100644 --- a/arch/tile/kernel/time.c +++ b/arch/tile/kernel/time.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 496e60391fac..786fd875de92 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -12,6 +12,7 @@ #include #include #include +#include #include diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 1635c0c8df23..2ecc0e97772b 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 4e95b2e0d95f..35a5d5dca2fa 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -5,6 +5,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 2c234a6d94c4..adc0ebd8bed0 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -1,5 +1,6 @@ #include +#include #include #include diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c64ca5929cb5..c7e2ca966386 100644 --- 
a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 47416f959a48..0a3bc19de017 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "cpu.h" diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 017ecd3bb553..fe0a615a051b 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index c1ea5b999839..8457b4978668 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index bae6ea6cfb94..d88967659098 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index bfe4d6c96fbd..d17b1a940add 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -20,6 +20,7 @@ #include #include #include +#include #if defined(CONFIG_EDAC) #include diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 2724dc82f992..46bcda4cb1c2 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -2,6 +2,7 @@ #include #include +#include #include #include #include diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 137944777859..440b95ee593c 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -8,6 +8,7 @@ */ #include #include +#include #include #include #include diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index e53bef6cf53c..b192b42a8351 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index 93aa1364376a..7a8a4117f123 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/drivers/clocksource/pxa_timer.c b/drivers/clocksource/pxa_timer.c index 9cae38eebec2..1c24de215c14 100644 --- a/drivers/clocksource/pxa_timer.c +++ b/drivers/clocksource/pxa_timer.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include diff --git a/drivers/clocksource/timer-digicolor.c b/drivers/clocksource/timer-digicolor.c index 10318cc99c0e..e9f50d289362 100644 --- a/drivers/clocksource/timer-digicolor.c +++ b/drivers/clocksource/timer-digicolor.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 62810ff3b00f..548b90be7685 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/firmware/tegra/bpmp.c b/drivers/firmware/tegra/bpmp.c index 4ff02d310868..84e4c9a58a0c 100644 --- a/drivers/firmware/tegra/bpmp.c +++ b/drivers/firmware/tegra/bpmp.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include diff --git a/drivers/gpu/drm/i915/i915_gem_request.c 
b/drivers/gpu/drm/i915/i915_gem_request.c index f31deeb72703..df3fef393dbe 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.c +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -24,6 +24,8 @@ #include #include +#include +#include #include "i915_drv.h" diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h index b9cde116dab3..344f238b283f 100644 --- a/drivers/gpu/drm/i915/intel_drv.h +++ b/drivers/gpu/drm/i915/intel_drv.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include "i915_drv.h" #include diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 646fe85261c1..18526d44688d 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -11,6 +11,7 @@ #include "bset.h" #include +#include #include #include diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index a43eedd5804d..384559af310e 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -32,6 +32,7 @@ #include #include #include +#include #include /* diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index b3ff57d61dde..f90f13616980 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -13,6 +13,7 @@ #include #include +#include static const char * const cache_replacement_policies[] = { "lru", diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index dde6172f3f10..8c3a938f4bf0 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "util.h" diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index cf2cbc211d83..a126919ed102 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 69e1ae59cab8..6ac2e48b9235 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -13,6 +13,7 @@ #include #include +#include #include /* Rate limiting */ diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c index 09505f432eda..7ae710585267 100644 --- a/drivers/misc/cxl/native.c +++ b/drivers/misc/cxl/native.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/drivers/net/irda/pxaficp_ir.c b/drivers/net/irda/pxaficp_ir.c index 6e8f616be48e..1dba16bc7f8d 100644 --- a/drivers/net/irda/pxaficp_ir.c +++ b/drivers/net/irda/pxaficp_ir.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 6d9335865880..9612b84bc3e0 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 2fe35354f20e..5c98ad4d2f4c 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index aecca0e7d9ca..796016e63c1d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -2,6 +2,7 @@ #define _LINUX_BLKDEV_H #include +#include #ifdef CONFIG_BLOCK diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h new file mode 100644 index 000000000000..7cb7d79b2cf9 --- /dev/null +++ b/include/linux/sched/clock.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_SCHED_CLOCK_H +#define _LINUX_SCHED_CLOCK_H + +#include + +#endif /* _LINUX_SCHED_CLOCK_H */ 
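[Note: the six-line file above is the whole of the new header for now; the clock code itself only moves in a later step. For orientation, the interfaces these call sites rely on look roughly like the following. This is an assumed end state, not part of this patch:]

    /* rough sketch of what the dedicated clock header is expected to carry later */
    extern unsigned long long sched_clock(void);	/* scheduler time, in ns */
    extern u64 local_clock(void);			/* sched_clock plus local offset */
    extern u64 cpu_clock(int cpu);			/* remote-CPU variant */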
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 69ccd2636911..c776abd86937 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index b8d637225a07..b96832df239e 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -25,6 +25,7 @@ #define _LINUX_NET_BUSY_POLL_H #include +#include #include #ifdef CONFIG_NET_RX_BUSY_POLL diff --git a/kernel/events/core.c b/kernel/events/core.c index 1031bdf9f012..42fe16a84f97 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -46,6 +46,7 @@ #include #include #include +#include #include "internal.h" diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 9812e5dd409e..cdafaff926ca 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -28,6 +28,7 @@ #define DISABLE_BRANCH_PROFILING #include #include +#include #include #include #include diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index e852be4851fc..4a30ef63c607 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -63,6 +63,7 @@ enum qlock_stats { */ #include #include +#include #include static const char * const qstat_names[qstat_num + 1] = { diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 34da86e73d00..47050eedc206 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index ad64efe41722..dd7817cdbf58 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1bd317db9810..1a7fd3d21e5a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6,6 +6,7 @@ * Copyright (C) 1991-2002 Linus Torvalds */ #include +#include #include #include #include diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 683570739f46..4d386461f13a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index a26036d37a38..ea6b610c4c57 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 2c115fdab397..4fee1c3abd0b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/torture.c b/kernel/torture.c index 01a99976f072..55de96529287 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a85739efcc30..96fc3c043ad6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 0f06532a755b..5fdc779f411d 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/kernel/trace/trace_hwlat.c 
b/kernel/trace/trace_hwlat.c index edfacd954e1b..21ea6ae77d93 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -44,6 +44,7 @@ #include #include #include +#include #include "trace.h" static struct trace_array *hwlat_trace; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 070866c32eb9..107afca97649 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "trace_output.h" diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 63177be0159e..144d7b1b0364 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include diff --git a/lib/plist.c b/lib/plist.c index 3a30c53db061..199408f91057 100644 --- a/lib/plist.c +++ b/lib/plist.c @@ -175,6 +175,7 @@ void plist_requeue(struct plist_node *node, struct plist_head *head) #ifdef CONFIG_DEBUG_PI_LIST #include +#include #include #include diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index 35b280361cb2..50a0f3e51d5b 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -27,6 +27,8 @@ #include #include #include +#include + #include #define HYSTART_ACK_TRAIN 1 -- cgit v1.2.3 From ae7e81c077d60507dcec139e40a6d10cf932cf4b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 1 Feb 2017 18:07:51 +0100 Subject: sched/headers: Prepare for new header dependencies before moving code to We are going to move scheduler ABI details to , which will be used from a number of .c files. Create empty placeholder header that maps to . Include the new header in the files that are going to need it. Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/arm/common/bL_switcher.c | 1 + crypto/crypto_engine.c | 1 + drivers/acpi/acpi_pad.c | 1 + drivers/block/drbd/drbd_receiver.c | 1 + drivers/firmware/psci_checker.c | 1 + drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 1 + drivers/gpu/drm/i915/intel_breadcrumbs.c | 1 + drivers/media/pci/ivtv/ivtv-driver.c | 1 + drivers/mmc/core/sdio_irq.c | 1 + drivers/spi/spi.c | 1 + drivers/staging/android/ion/ion_heap.c | 1 + drivers/thermal/intel_powerclamp.c | 1 + drivers/tty/serial/sc16is7xx.c | 1 + include/uapi/linux/sched/types.h | 6 ++++++ kernel/irq/manage.c | 1 + kernel/kthread.c | 1 + kernel/locking/locktorture.c | 1 + kernel/rcu/rcuperf.c | 1 + kernel/rcu/rcutorture.c | 1 + kernel/rcu/tree.c | 1 + kernel/rcu/tree_plugin.h | 1 + kernel/sched/core.c | 1 + kernel/sched/cpufreq_schedutil.c | 1 + kernel/trace/ring_buffer_benchmark.c | 1 + kernel/trace/trace_selftest.c | 1 + kernel/watchdog.c | 1 + 26 files changed, 31 insertions(+) create mode 100644 include/uapi/linux/sched/types.h (limited to 'kernel') diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index 46730017b3c5..083c9e517d22 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/crypto/crypto_engine.c b/crypto/crypto_engine.c index f1bf3418d968..727bd5c3569e 100644 --- a/crypto/crypto_engine.c +++ b/crypto/crypto_engine.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "internal.h" #define CRYPTO_ENGINE_MAX_QLEN 10 diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c index eb76a4c10dbf..754431031282 100644 --- a/drivers/acpi/acpi_pad.c +++ b/drivers/acpi/acpi_pad.c @@ -20,6 +20,7 @@ #include #include 
#include +#include #include #include #include diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c7728dd77230..8b40a5b2f8e6 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #define __KERNEL_SYSCALLS__ #include diff --git a/drivers/firmware/psci_checker.c b/drivers/firmware/psci_checker.c index 29d58feaf675..6523ce962865 100644 --- a/drivers/firmware/psci_checker.c +++ b/drivers/firmware/psci_checker.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c index 1bf83ed113b3..16f96563cd2b 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "gpu_scheduler.h" diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c index fcfa423d08bd..7044e9a6abf7 100644 --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c @@ -23,6 +23,7 @@ */ #include +#include #include "i915_drv.h" diff --git a/drivers/media/pci/ivtv/ivtv-driver.c b/drivers/media/pci/ivtv/ivtv-driver.c index ab2ae53618e8..e73c153285f0 100644 --- a/drivers/media/pci/ivtv/ivtv-driver.c +++ b/drivers/media/pci/ivtv/ivtv-driver.c @@ -59,6 +59,7 @@ #include #include #include "tuner-xc2028.h" +#include /* If you have already X v4l cards, then set this to X. This way the device numbers stay matched. Example: you have a WinTV card diff --git a/drivers/mmc/core/sdio_irq.c b/drivers/mmc/core/sdio_irq.c index d29faf2addfe..6d4b72080d51 100644 --- a/drivers/mmc/core/sdio_irq.c +++ b/drivers/mmc/core/sdio_irq.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 44222ef9471e..90b5b2efafbf 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/staging/android/ion/ion_heap.c b/drivers/staging/android/ion/ion_heap.c index 4e5c0f17f579..c69d0bd53693 100644 --- a/drivers/staging/android/ion/ion_heap.c +++ b/drivers/staging/android/ion/ion_heap.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include "ion.h" diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c index a47103a659fa..d718cd179ddb 100644 --- a/drivers/thermal/intel_powerclamp.c +++ b/drivers/thermal/intel_powerclamp.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c index 793395451982..ca54ce074a5f 100644 --- a/drivers/tty/serial/sc16is7xx.c +++ b/drivers/tty/serial/sc16is7xx.c @@ -29,6 +29,7 @@ #include #include #include +#include #define SC16IS7XX_NAME "sc16is7xx" #define SC16IS7XX_MAX_DEVS 8 diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h new file mode 100644 index 000000000000..d162d315f4b5 --- /dev/null +++ b/include/uapi/linux/sched/types.h @@ -0,0 +1,6 @@ +#ifndef _UAPI_LINUX_SCHED_TYPES_H +#define _UAPI_LINUX_SCHED_TYPES_H + +#include + +#endif /* _UAPI_LINUX_SCHED_TYPES_H */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 944d068b6c48..09740952e4de 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -17,6 +17,7 @@ 
#include #include #include +#include #include #include "internals.h" diff --git a/kernel/kthread.c b/kernel/kthread.c index 8461a4372e8a..ef9b9eb809c7 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -5,6 +5,7 @@ * even if we're invoked from userspace (think modprobe, hotplug cpu, * etc.). */ +#include #include #include #include diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 28350dc8ecbb..5ea0a8969ee2 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 123ccbd22449..a4a86fb47e4a 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d81345be730e..6a28b79710f0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cb62ce23ffc7..e456327a63d6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index a240f3308be6..9dabb04003be 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -28,6 +28,7 @@ #include #include #include +#include #include "../time/tick-internal.h" #ifdef CONFIG_RCU_BOOST diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1a7fd3d21e5a..ed39d1d0b64a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7,6 +7,7 @@ */ #include #include +#include #include #include #include diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index fd4659313640..8f8de3d4d6b7 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -13,6 +13,7 @@ #include #include +#include #include #include diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 6df9a83e20d7..c190a4d5013c 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index b0f86ea77881..cb917cebae29 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1,5 +1,6 @@ /* Include in trace.c */ +#include #include #include #include diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 144d7b1b0364..52718f4512e9 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 4f17722c7256af8e17c2c4f29f170247264bdf48 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Feb 2017 08:45:17 +0100 Subject: sched/headers: Prepare for new header dependencies before moving code to We are going to split out of , which will have to be picked up from a couple of .c files. Create a trivial placeholder file that just maps to to make this patch obviously correct and bisectable. Include the new header in the files that are going to need it. 
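[Note: the load-average interface is a small, self-contained corner of the monolithic header, which is what makes this split cheap. The samples in avenrun[] are fixed-point with 11 fractional bits, decoded by the LOAD_INT()/LOAD_FRAC() helpers; the sketch below paraphrases those definitions as assumed from the existing header:]

    #define FSHIFT		11			/* bits of fractional precision */
    #define FIXED_1		(1 << FSHIFT)		/* 1.0 in fixed point */

    #define LOAD_INT(x)	((x) >> FSHIFT)
    #define LOAD_FRAC(x)	LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

    extern unsigned long avenrun[];			/* 1, 5 and 15 minute averages */

    /*
     * Worked example: a raw sample of 2662 gives LOAD_INT = 2662 >> 11 = 1 and
     * LOAD_FRAC = (614 * 100) >> 11 = 29, i.e. a load of 1.29, which is the
     * form fs/proc/loadavg.c prints.
     */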
Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/m68k/kernel/time.c | 1 + arch/microblaze/kernel/heartbeat.c | 1 + arch/powerpc/platforms/cell/cpufreq_spudemand.c | 1 + arch/powerpc/platforms/cell/spufs/sched.c | 1 + arch/s390/appldata/appldata_os.c | 1 + arch/sh/drivers/heartbeat.c | 1 + arch/sparc/kernel/led.c | 1 + drivers/cpuidle/governors/menu.c | 1 + drivers/leds/trigger/ledtrig-heartbeat.c | 1 + drivers/platform/x86/intel_ips.c | 1 + fs/proc/loadavg.c | 1 + include/linux/sched/loadavg.h | 6 ++++++ kernel/debug/kdb/kdb_main.c | 1 + kernel/sched/core.c | 1 + kernel/sched/loadavg.c | 1 + kernel/sys.c | 1 + kernel/time/timekeeping.c | 1 + net/sched/em_meta.c | 1 + 18 files changed, 23 insertions(+) create mode 100644 include/linux/sched/loadavg.h (limited to 'kernel') diff --git a/arch/m68k/kernel/time.c b/arch/m68k/kernel/time.c index 4e5aa2f4f522..87160b4415fb 100644 --- a/arch/m68k/kernel/time.c +++ b/arch/m68k/kernel/time.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/microblaze/kernel/heartbeat.c b/arch/microblaze/kernel/heartbeat.c index 4643e3ab9414..2022130139d2 100644 --- a/arch/microblaze/kernel/heartbeat.c +++ b/arch/microblaze/kernel/heartbeat.c @@ -9,6 +9,7 @@ */ #include +#include #include #include diff --git a/arch/powerpc/platforms/cell/cpufreq_spudemand.c b/arch/powerpc/platforms/cell/cpufreq_spudemand.c index 88301e53f085..882944c36ef5 100644 --- a/arch/powerpc/platforms/cell/cpufreq_spudemand.c +++ b/arch/powerpc/platforms/cell/cpufreq_spudemand.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index 9b543df210fb..d317c84dc794 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -24,6 +24,7 @@ #include #include +#include #include #include #include diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c index 08b9e942a262..079446619f89 100644 --- a/arch/s390/appldata/appldata_os.c +++ b/arch/s390/appldata/appldata_os.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/arch/sh/drivers/heartbeat.c b/arch/sh/drivers/heartbeat.c index 49bace446a1a..c6d96049a0bb 100644 --- a/arch/sh/drivers/heartbeat.c +++ b/arch/sh/drivers/heartbeat.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sparc/kernel/led.c b/arch/sparc/kernel/led.c index 3ae36f36e758..44a3ed93c214 100644 --- a/arch/sparc/kernel/led.c +++ b/arch/sparc/kernel/led.c @@ -8,6 +8,7 @@ #include #include #include +#include #include diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 8d6d25c38c02..fe86060e48f7 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/drivers/leds/trigger/ledtrig-heartbeat.c b/drivers/leds/trigger/ledtrig-heartbeat.c index e6f2f8b9f09a..afa3b4099214 100644 --- a/drivers/leds/trigger/ledtrig-heartbeat.c +++ b/drivers/leds/trigger/ledtrig-heartbeat.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/platform/x86/intel_ips.c b/drivers/platform/x86/intel_ips.c index 55663b3d7282..58dcee562d64 100644 --- a/drivers/platform/x86/intel_ips.c +++ 
b/drivers/platform/x86/intel_ips.c @@ -68,6 +68,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index aec66e6c2060..add5255ce7e0 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h new file mode 100644 index 000000000000..3ae74be327e8 --- /dev/null +++ b/include/linux/sched/loadavg.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_SCHED_LOADAVG_H +#define _LINUX_SCHED_LOADAVG_H + +#include + +#endif /* _LINUX_SCHED_LOADAVG_H */ diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index ca183919d302..308937c7a687 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ed39d1d0b64a..11a0684a29a7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index a2d6eb71f06b..7296b7308eca 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -7,6 +7,7 @@ */ #include +#include #include "sched.h" diff --git a/kernel/sys.c b/kernel/sys.c index b07adca97ea3..28b8a4c1bc7e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -49,6 +49,7 @@ #include #include +#include #include #include #include diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 95b258dd75db..fb564acee0f3 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 41c80b6c3906..ae7e4f5b348b 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 4eb5aaa3af8a54e5e9bac90e2b42bbab1f1ee5a3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Feb 2017 18:51:29 +0100 Subject: sched/headers: Prepare for new header dependencies before moving code to We are going to split out of , which will have to be picked up from other headers and a couple of .c files. Create a trivial placeholder file that just maps to to make this patch obviously correct and bisectable. Include the new header in the files that are going to need it. 
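[Note: autogroup shows why these carve-outs stay mechanical: the interface is a handful of hooks that already degrade to empty stubs when CONFIG_SCHED_AUTOGROUP is off, so moving the declarations changes nothing for either configuration. A sketch of that shape, paraphrased from the existing declarations and not introduced by this patch:]

    #ifdef CONFIG_SCHED_AUTOGROUP
    extern void sched_autogroup_create_attach(struct task_struct *p);
    extern void sched_autogroup_detach(struct task_struct *p);
    extern void sched_autogroup_fork(struct signal_struct *sig);
    extern void sched_autogroup_exit(struct signal_struct *sig);
    #else
    static inline void sched_autogroup_create_attach(struct task_struct *p) { }
    static inline void sched_autogroup_detach(struct task_struct *p) { }
    static inline void sched_autogroup_fork(struct signal_struct *sig) { }
    static inline void sched_autogroup_exit(struct signal_struct *sig) { }
    #endif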
Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- fs/proc/base.c | 1 + include/linux/sched/autogroup.h | 6 ++++++ kernel/exit.c | 1 + kernel/fork.c | 1 + kernel/sched/autogroup.h | 1 + kernel/sys.c | 1 + 6 files changed, 11 insertions(+) create mode 100644 include/linux/sched/autogroup.h (limited to 'kernel') diff --git a/fs/proc/base.c b/fs/proc/base.c index 1e1e182d571b..27af4ea315a3 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -85,6 +85,7 @@ #include #include #include +#include #include #include #ifdef CONFIG_HARDWALL diff --git a/include/linux/sched/autogroup.h b/include/linux/sched/autogroup.h new file mode 100644 index 000000000000..d745f22e57dc --- /dev/null +++ b/include/linux/sched/autogroup.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_SCHED_AUTOGROUP_H +#define _LINUX_SCHED_AUTOGROUP_H + +#include + +#endif /* _LINUX_SCHED_AUTOGROUP_H */ diff --git a/kernel/exit.c b/kernel/exit.c index 8a768a3672a5..9c0b92833fbb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index d043fedc03c8..234f4bfb8efd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -12,6 +12,7 @@ */ #include +#include #include #include #include diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h index 890c95f2587a..ce40c810cd5c 100644 --- a/kernel/sched/autogroup.h +++ b/kernel/sched/autogroup.h @@ -2,6 +2,7 @@ #include #include +#include struct autogroup { /* diff --git a/kernel/sys.c b/kernel/sys.c index 28b8a4c1bc7e..cfe82ae62423 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -49,6 +49,7 @@ #include #include +#include #include #include #include -- cgit v1.2.3 From 6e84f31522f931027bf695752087ece278c10d3f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Feb 2017 18:51:29 +0100 Subject: sched/headers: Prepare for new header dependencies before moving code to We are going to split out of , which will have to be picked up from other headers and a couple of .c files. Create a trivial placeholder file that just maps to to make this patch obviously correct and bisectable. The APIs that are going to be moved first are: mm_alloc() __mmdrop() mmdrop() mmdrop_async_fn() mmdrop_async() mmget_not_zero() mmput() mmput_async() get_task_mm() mm_access() mm_release() Include the new header in the files that are going to need it. 
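[Note: the helpers listed above are the mm_struct lifetime API, and the split is easier to follow with the two counters in mind: mm_count pins the structure itself and is dropped with mmdrop(), while mm_users pins the address space and is dropped with mmput(). A sketch of two of the listed helpers, paraphrased from the monolithic header as it stands at this point (assumed, not quoted):]

    static inline void mmdrop(struct mm_struct *mm)
    {
    	if (unlikely(atomic_dec_and_test(&mm->mm_count)))
    		__mmdrop(mm);	/* last reference: free page tables and the mm */
    }

    static inline bool mmget_not_zero(struct mm_struct *mm)
    {
    	/* take an mm_users reference unless the address space is already gone */
    	return atomic_inc_not_zero(&mm->mm_users);
    }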
Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/arc/include/asm/mmu_context.h | 1 + arch/arc/kernel/troubleshoot.c | 2 ++ arch/arm/mach-rpc/ecard.c | 1 + arch/frv/mm/mmu-context.c | 1 + arch/m68k/sun3/mmu_emu.c | 1 + arch/powerpc/platforms/cell/spufs/context.c | 2 ++ drivers/android/binder.c | 1 + drivers/gpu/drm/amd/amdkfd/kfd_process.c | 1 + drivers/gpu/drm/etnaviv/etnaviv_gem.c | 1 + drivers/gpu/drm/i915/i915_gem_userptr.c | 1 + drivers/infiniband/core/umem.c | 1 + drivers/infiniband/core/umem_odp.c | 1 + drivers/infiniband/hw/hfi1/file_ops.c | 1 + drivers/infiniband/hw/mlx4/main.c | 2 ++ drivers/infiniband/hw/mlx5/main.c | 1 + drivers/infiniband/hw/usnic/usnic_uiom.c | 1 + drivers/iommu/amd_iommu_v2.c | 1 + drivers/iommu/intel-svm.c | 1 + drivers/lguest/lguest_user.c | 1 + drivers/misc/cxl/fault.c | 1 + drivers/misc/mic/scif/scif_rma.c | 2 ++ drivers/oprofile/buffer_sync.c | 1 + drivers/vfio/vfio_iommu_spapr_tce.c | 2 ++ drivers/vfio/vfio_iommu_type1.c | 1 + drivers/vhost/vhost.c | 1 + drivers/xen/gntdev.c | 1 + fs/exec.c | 1 + fs/proc/array.c | 1 + fs/proc/base.c | 1 + fs/proc/task_mmu.c | 1 + fs/proc/task_nommu.c | 2 ++ fs/userfaultfd.c | 1 + include/linux/sched/mm.h | 6 ++++++ kernel/cgroup/cpuset.c | 1 + kernel/events/core.c | 1 + kernel/events/uprobes.c | 1 + kernel/exit.c | 1 + kernel/fork.c | 1 + kernel/futex.c | 1 + kernel/ptrace.c | 1 + kernel/sched/sched.h | 1 + kernel/sys.c | 1 + kernel/trace/trace_output.c | 1 + kernel/tsacct.c | 1 + mm/khugepaged.c | 1 + mm/ksm.c | 1 + mm/memcontrol.c | 1 + mm/memory.c | 1 + mm/mempolicy.c | 1 + mm/migrate.c | 1 + mm/mmu_context.c | 1 + mm/mmu_notifier.c | 1 + mm/nommu.c | 1 + mm/oom_kill.c | 1 + mm/process_vm_access.c | 1 + mm/rmap.c | 1 + mm/swapfile.c | 1 + mm/util.c | 1 + virt/kvm/async_pf.c | 1 + virt/kvm/kvm_main.c | 1 + 60 files changed, 71 insertions(+) create mode 100644 include/linux/sched/mm.h (limited to 'kernel') diff --git a/arch/arc/include/asm/mmu_context.h b/arch/arc/include/asm/mmu_context.h index b0b87f2447f5..64b5ebae1ae8 100644 --- a/arch/arc/include/asm/mmu_context.h +++ b/arch/arc/include/asm/mmu_context.h @@ -20,6 +20,7 @@ #include #include +#include #include diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c index 82f9bc819f4a..0c48907f56a9 100644 --- a/arch/arc/kernel/troubleshoot.c +++ b/arch/arc/kernel/troubleshoot.c @@ -13,6 +13,8 @@ #include #include #include +#include + #include #include diff --git a/arch/arm/mach-rpc/ecard.c b/arch/arm/mach-rpc/ecard.c index dc67a7fb3831..6b279d037774 100644 --- a/arch/arm/mach-rpc/ecard.c +++ b/arch/arm/mach-rpc/ecard.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/frv/mm/mmu-context.c b/arch/frv/mm/mmu-context.c index 3473bde77f56..4031289bf2ec 100644 --- a/arch/frv/mm/mmu-context.c +++ b/arch/frv/mm/mmu-context.c @@ -10,6 +10,7 @@ */ #include +#include #include #include diff --git a/arch/m68k/sun3/mmu_emu.c b/arch/m68k/sun3/mmu_emu.c index e9d7fbe4d5ae..7fdc61525e0b 100644 --- a/arch/m68k/sun3/mmu_emu.c +++ b/arch/m68k/sun3/mmu_emu.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include diff --git a/arch/powerpc/platforms/cell/spufs/context.c b/arch/powerpc/platforms/cell/spufs/context.c index 3b4152faeb1f..b500b17254a0 100644 --- a/arch/powerpc/platforms/cell/spufs/context.c +++ b/arch/powerpc/platforms/cell/spufs/context.c @@ -25,6 +25,8 @@ #include #include #include 
+#include + #include #include #include "spufs.h" diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 2bbcdc6fdfee..e33e7fbe870b 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index ca5f2aa7232d..84d1ffd1eef9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index e78f1406885d..ae2882b64b82 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -16,6 +16,7 @@ #include #include +#include #include "etnaviv_drv.h" #include "etnaviv_gem.h" diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 0115989e324a..22b46398831e 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -31,6 +31,7 @@ #include #include #include +#include struct i915_mm_struct { struct mm_struct *mm; diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 446b56a5260b..d525b1a2986a 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index f2fc0431512d..7279f259494f 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -32,6 +32,7 @@ #include #include +#include #include #include #include diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index 3b19c16a9e45..f78c739b330a 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -48,6 +48,7 @@ #include #include #include +#include #include diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 88608906ce25..56da8f0d71bb 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -39,6 +39,8 @@ #include #include #include +#include + #include #include #include diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 5b3355268725..4b1ec3ff152a 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -41,6 +41,7 @@ #include #endif #include +#include #include #include #include diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index 1ccee6ea5bc3..ba868be5af2e 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index f8ed8c95b685..063343909b0d 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 51f2b228723f..23c427602c55 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/lguest/lguest_user.c 
b/drivers/lguest/lguest_user.c index 30c60687d277..1a6787bc9386 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c index 377e650a2a1d..e33011e3e7e2 100644 --- a/drivers/misc/cxl/fault.c +++ b/drivers/misc/cxl/fault.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/drivers/misc/mic/scif/scif_rma.c b/drivers/misc/mic/scif/scif_rma.c index f806a4471eb9..d0e9c60f944e 100644 --- a/drivers/misc/mic/scif/scif_rma.c +++ b/drivers/misc/mic/scif/scif_rma.c @@ -17,6 +17,8 @@ */ #include #include +#include + #include "scif_main.h" #include "scif_map.h" diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c index 642478d35e99..eb0c35994b54 100644 --- a/drivers/oprofile/buffer_sync.c +++ b/drivers/oprofile/buffer_sync.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include "oprofile_stats.h" diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 59b3f62a2d64..185d50ee1b12 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -20,6 +20,8 @@ #include #include #include +#include + #include #include #include diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index bd6f293c4ebd..7e94c7fc90ae 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 4269e621e254..4a00140a7624 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "vhost.h" diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 2ef2b61b69df..c77a0751a311 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/exec.c b/fs/exec.c index e595e1529581..8929da96cca1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/proc/array.c b/fs/proc/array.c index fe12b519d09b..61bea3a4ecc5 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/proc/base.c b/fs/proc/base.c index 27af4ea315a3..9efe12376af1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -86,6 +86,7 @@ #include #include #include +#include #include #include #ifdef CONFIG_HARDWALL diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ee3efb229ef6..f08bd31c1081 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 1ef97cfcf422..23266694db11 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -7,6 +7,8 @@ #include #include #include +#include + #include "internal.h" /* diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 3c421d06a18e..18d12bfff770 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h new file mode 100644 index 000000000000..a7adba1cd0a9 --- /dev/null +++ b/include/linux/sched/mm.h @@ -0,0 +1,6 
@@ +#ifndef _LINUX_SCHED_MM_H +#define _LINUX_SCHED_MM_H + +#include + +#endif /* _LINUX_SCHED_MM_H */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index b3088886cd37..a5c46db2855c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/events/core.c b/kernel/events/core.c index 42fe16a84f97..6f41548f2e32 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -47,6 +47,7 @@ #include #include #include +#include #include "internal.h" diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d630f8ac4d2f..62f2dbd13efc 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -27,6 +27,7 @@ #include /* read_mapping_page */ #include #include +#include #include #include /* anon_vma_prepare */ #include /* set_pte_at_notify */ diff --git a/kernel/exit.c b/kernel/exit.c index 9c0b92833fbb..48649ccbb649 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 234f4bfb8efd..1875468e2a81 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include diff --git a/kernel/futex.c b/kernel/futex.c index 8ddcf9ea953c..229a744b1781 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 49ba7c1ade9d..6fe5a2897912 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4d386461f13a..e2307a6c29f1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sys.c b/kernel/sys.c index cfe82ae62423..dc71ec1e1967 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 107afca97649..02a4aeb22c47 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "trace_output.h" diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 5c21f0535056..571a2d3821d8 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 34bce5c308e3..212a2afe8830 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2,6 +2,7 @@ #include #include +#include #include #include #include diff --git a/mm/ksm.c b/mm/ksm.c index 520e4c37fec7..5632c9d02457 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 45867e439d31..c52ec893e241 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/memory.c b/mm/memory.c index 14fc0b40f0bb..b0495ec74d29 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -40,6 +40,7 @@ #include #include +#include #include #include #include diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1e7873e40c9a..90892416ed75 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -73,6 +73,7 @@ #include #include 
#include +#include #include #include #include diff --git a/mm/migrate.c b/mm/migrate.c index 2c63ac06791b..9a0897a14d37 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -40,6 +40,7 @@ #include #include #include +#include #include diff --git a/mm/mmu_context.c b/mm/mmu_context.c index daf67bb02b4a..8888b124a386 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -5,6 +5,7 @@ #include #include +#include #include #include diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 32bc9f2ff7eb..a7652acd2ab9 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -17,6 +17,7 @@ #include #include #include +#include #include /* global SRCU for all MMs */ diff --git a/mm/nommu.c b/mm/nommu.c index aae06e854552..79abed514a4b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 51c091849dcb..d28936618698 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 84d0c7eada2b..8973cd231ece 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/rmap.c b/mm/rmap.c index 8774791e2809..0367a0506667 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -46,6 +46,7 @@ */ #include +#include #include #include #include diff --git a/mm/swapfile.c b/mm/swapfile.c index fadc6a1c0da0..ff2bf3f61b14 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -6,6 +6,7 @@ */ #include +#include #include #include #include diff --git a/mm/util.c b/mm/util.c index b8f538863b5a..14f4e13323e2 100644 --- a/mm/util.c +++ b/mm/util.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 2366177172f6..bb298a200cd3 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "async_pf.h" #include diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 35f71409d9ee..dd5de09bf362 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From f7ccbae45c5e2c1077654b0e857e7efb1aa31c92 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Feb 2017 18:51:30 +0100 Subject: sched/headers: Prepare for new header dependencies before moving code to We are going to split out of , which will have to be picked up from other headers and a couple of .c files. Create a trivial placeholder file that just maps to to make this patch obviously correct and bisectable. Include the new header in the files that are going to need it. 
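[Note: the coredump name is a little narrow; alongside the dumpability helpers this header is where the per-mm MMF_* flag bits live, which is why khugepaged.h below trades its old include carrying MMF_VM_HUGEPAGE for the new one. A sketch of the typical consumer; mm_is_thp_candidate() is a hypothetical name, not a kernel function:]

    #include <linux/mm_types.h>
    #include <linux/sched/coredump.h>	/* MMF_* bits; still maps to <linux/sched.h> */

    static bool mm_is_thp_candidate(struct mm_struct *mm)	/* hypothetical */
    {
    	return test_bit(MMF_VM_HUGEPAGE, &mm->flags);
    }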
Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- fs/binfmt_elf.c | 1 + fs/binfmt_elf_fdpic.c | 1 + fs/coredump.c | 1 + fs/exec.c | 1 + fs/proc/base.c | 1 + fs/proc/internal.h | 1 + include/linux/khugepaged.h | 3 ++- include/linux/ksm.h | 1 + include/linux/sched/coredump.h | 6 ++++++ kernel/cred.c | 1 + kernel/events/uprobes.c | 1 + kernel/fork.c | 1 + kernel/ptrace.c | 1 + kernel/sys.c | 1 + kernel/sysctl.c | 1 + mm/huge_memory.c | 1 + mm/khugepaged.c | 1 + mm/ksm.c | 1 + mm/memory.c | 1 + mm/oom_kill.c | 1 + 20 files changed, 26 insertions(+), 1 deletion(-) create mode 100644 include/linux/sched/coredump.h (limited to 'kernel') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 443a6f537d56..fbbe52e1250e 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index ffca4bbc3d63..222873bb689c 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include diff --git a/fs/coredump.c b/fs/coredump.c index ae6b05629ca1..23a539b29225 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/exec.c b/fs/exec.c index 8929da96cca1..9c80d011594e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/proc/base.c b/fs/proc/base.c index 9efe12376af1..4c5fa704e38c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -87,6 +87,7 @@ #include #include #include +#include #include #include #ifdef CONFIG_HARDWALL diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 5d6960f5f1c0..550e8b183abe 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -14,6 +14,7 @@ #include #include #include +#include struct ctl_table_header; struct mempolicy; diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 1e032a1ddb3e..5d9a400af509 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -1,7 +1,8 @@ #ifndef _LINUX_KHUGEPAGED_H #define _LINUX_KHUGEPAGED_H -#include /* MMF_VM_HUGEPAGE */ +#include /* MMF_VM_HUGEPAGE */ + #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern struct attribute_group khugepaged_attr_group; diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 481c8c4627ca..e1cfda4bee58 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -12,6 +12,7 @@ #include #include #include +#include struct stable_node; struct mem_cgroup; diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h new file mode 100644 index 000000000000..ab81e564e076 --- /dev/null +++ b/include/linux/sched/coredump.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_SCHED_COREDUMP_H +#define _LINUX_SCHED_COREDUMP_H + +#include + +#endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/kernel/cred.c b/kernel/cred.c index 5f264fb5737d..2bc66075740f 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 62f2dbd13efc..0e137f98a50c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include /* anon_vma_prepare */ #include /* set_pte_at_notify */ diff --git a/kernel/fork.c b/kernel/fork.c index 
1875468e2a81..7332448b668a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -14,6 +14,7 @@ #include #include #include +#include <linux/sched/coredump.h> #include #include #include
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 6fe5a2897912..bcdbdc19eb35 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -11,6 +11,7 @@ #include #include #include +#include <linux/sched/coredump.h> #include #include #include
diff --git a/kernel/sys.c b/kernel/sys.c index dc71ec1e1967..e2d743fb7f55 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -52,6 +52,7 @@ #include #include #include +#include <linux/sched/coredump.h> #include #include #include
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bb260ceb3718..acf0a5a06da7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -63,6 +63,7 @@ #include #include #include +#include <linux/sched/coredump.h> #include #include #include
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 71e3dede95b4..dee92fa26bbb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -9,6 +9,7 @@ #include #include +#include <linux/sched/coredump.h> #include #include #include
diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 212a2afe8830..ba40b7f673f4 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -3,6 +3,7 @@ #include #include #include +#include <linux/sched/coredump.h> #include #include #include
diff --git a/mm/ksm.c b/mm/ksm.c index 5632c9d02457..19b4f2dea7a5 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -20,6 +20,7 @@ #include #include #include +#include <linux/sched/coredump.h> #include #include #include
diff --git a/mm/memory.c b/mm/memory.c index b0495ec74d29..6fc918b2c459 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -41,6 +41,7 @@ #include #include #include +#include <linux/sched/coredump.h> #include #include #include
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d28936618698..69f75b014607 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -23,6 +23,7 @@ #include #include #include +#include <linux/sched/coredump.h> #include #include #include
-- cgit v1.2.3

From 3f07c0144132e4f59d88055ac8ff3e691a5fa2b8 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Wed, 8 Feb 2017 18:51:30 +0100
Subject: sched/headers: Prepare for new header dependencies before moving code to <linux/sched/signal.h>

We are going to split <linux/sched/signal.h> out of <linux/sched.h>, which will have to be picked up from other headers and a couple of .c files.

Create a trivial placeholder <linux/sched/signal.h> file that just maps to <linux/sched.h> to make this patch obviously correct and bisectable.

Include the new header in the files that are going to need it.
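Since the same placeholder pattern repeats throughout the series, it is worth spelling out once. A sketch of the include/linux/sched/signal.h file this patch creates, with the wrapped include filled in per the commit message (it maps straight to <linux/sched.h>):

#ifndef _LINUX_SCHED_SIGNAL_H
#define _LINUX_SCHED_SIGNAL_H

/*
 * Transitional placeholder: signal_struct and friends still live in the
 * monolithic header, so mapping straight to it keeps every converted
 * file compiling until the code itself moves.
 */
#include <linux/sched.h>

#endif /* _LINUX_SCHED_SIGNAL_H */

Because the placeholder is a pure alias, each converted file builds identically before and after this patch, which is what keeps the series bisectable.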
Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/alpha/kernel/osf_sys.c | 2 +- arch/alpha/kernel/signal.c | 2 +- arch/alpha/kernel/traps.c | 2 +- arch/alpha/mm/fault.c | 2 +- arch/arc/kernel/traps.c | 2 +- arch/arc/mm/fault.c | 2 +- arch/arm/kernel/ptrace.c | 2 +- arch/arm/kernel/traps.c | 2 +- arch/arm/mm/alignment.c | 2 +- arch/arm/mm/fault.c | 2 +- arch/arm/mm/init.c | 1 + arch/arm/mm/mmap.c | 2 +- arch/arm/vfp/vfpmodule.c | 2 +- arch/arm64/kernel/fpsimd.c | 2 +- arch/arm64/kernel/ptrace.c | 2 +- arch/arm64/kernel/traps.c | 2 +- arch/arm64/mm/fault.c | 2 +- arch/arm64/mm/mmap.c | 2 +- arch/avr32/kernel/traps.c | 2 +- arch/blackfin/kernel/trace.c | 2 +- arch/blackfin/kernel/traps.c | 1 + arch/cris/mm/fault.c | 1 + arch/frv/kernel/traps.c | 2 +- arch/h8300/kernel/ptrace_s.c | 2 +- arch/hexagon/kernel/traps.c | 2 +- arch/hexagon/mm/vm_fault.c | 1 + arch/ia64/kernel/asm-offsets.c | 2 +- arch/ia64/kernel/brl_emu.c | 2 +- arch/ia64/kernel/mca.c | 2 +- arch/ia64/kernel/traps.c | 2 +- arch/ia64/kernel/unaligned.c | 2 +- arch/ia64/mm/fault.c | 2 +- arch/ia64/mm/init.c | 1 + arch/mips/kernel/branch.c | 2 +- arch/mips/kernel/signal_o32.c | 1 + arch/mips/mm/mmap.c | 2 +- arch/mips/sgi-ip22/ip22-berr.c | 2 +- arch/mips/sgi-ip22/ip22-reset.c | 2 +- arch/mn10300/kernel/fpu.c | 2 ++ arch/openrisc/mm/fault.c | 2 +- arch/parisc/kernel/sys_parisc.c | 1 + arch/parisc/kernel/unaligned.c | 2 +- arch/parisc/math-emu/driver.c | 3 ++- arch/powerpc/kvm/book3s_64_vio.c | 1 + arch/powerpc/mm/mmap.c | 2 +- arch/powerpc/mm/mmu_context_iommu.c | 2 +- arch/powerpc/platforms/cell/spufs/fault.c | 2 +- arch/powerpc/xmon/xmon.c | 2 +- arch/s390/kernel/nmi.c | 3 +++ arch/s390/mm/mmap.c | 1 + arch/score/kernel/traps.c | 2 +- arch/sh/kernel/cpu/sh2a/fpu.c | 2 +- arch/sh/kernel/hw_breakpoint.c | 1 + arch/sh/kernel/traps.c | 2 ++ arch/sh/math-emu/math.c | 2 +- arch/sh/mm/asids-debugfs.c | 2 ++ arch/sh/mm/fault.c | 1 + arch/sparc/kernel/sys_sparc_32.c | 2 +- arch/sparc/kernel/sys_sparc_64.c | 2 +- arch/sparc/kernel/unaligned_32.c | 2 +- arch/tile/mm/mmap.c | 2 +- arch/um/drivers/line.c | 3 ++- arch/um/kernel/reboot.c | 2 +- arch/um/kernel/skas/mmu.c | 3 ++- arch/um/kernel/tlb.c | 3 ++- arch/um/kernel/trap.c | 2 +- arch/unicore32/kernel/fpu-ucf64.c | 2 +- arch/unicore32/kernel/traps.c | 1 + arch/unicore32/mm/fault.c | 2 +- arch/x86/entry/vsyscall/vsyscall_64.c | 1 + arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 2 +- arch/x86/kvm/mmu.c | 1 + arch/x86/mm/mmap.c | 2 +- arch/xtensa/kernel/traps.c | 2 +- drivers/android/binder.c | 2 +- drivers/block/drbd/drbd_int.h | 2 +- drivers/char/snsc_event.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 2 +- drivers/gpu/drm/drm_lock.c | 2 ++ drivers/gpu/drm/ttm/ttm_lock.c | 2 +- drivers/infiniband/core/umem.c | 2 +- drivers/infiniband/hw/hfi1/user_pages.c | 2 +- drivers/infiniband/hw/qib/qib_user_pages.c | 1 + drivers/infiniband/hw/usnic/usnic_uiom.c | 2 +- drivers/isdn/i4l/isdn_tty.c | 1 + drivers/isdn/mISDN/l1oip_core.c | 2 ++ drivers/md/md.c | 1 + drivers/md/raid1.c | 3 +++ drivers/md/raid5.c | 2 ++ drivers/misc/genwqe/card_dev.c | 2 +- drivers/misc/mic/cosm/cosm_scif_server.c | 2 ++ drivers/misc/mic/cosm_client/cosm_scif_client.c | 2 ++ drivers/misc/mic/scif/scif_rma.c | 1 + drivers/net/slip/slip.c | 2 +- drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c | 2 +- drivers/parisc/power.c | 2 +- drivers/ps3/ps3-sys-manager.c | 1 + drivers/s390/char/fs3270.c | 1 + 
drivers/s390/char/keyboard.c | 2 +- drivers/scsi/bnx2fc/bnx2fc.h | 2 +- drivers/scsi/bnx2i/bnx2i.h | 2 +- drivers/scsi/libiscsi.c | 1 + drivers/staging/android/lowmemorykiller.c | 2 +- drivers/staging/lustre/lnet/libcfs/linux/linux-prim.c | 2 +- drivers/staging/rtl8188eu/include/osdep_service.h | 2 +- drivers/staging/rtl8712/osdep_service.h | 2 +- drivers/staging/rtl8712/rtl8712_cmd.c | 1 + drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c | 1 + drivers/staging/vc04_services/interface/vchiq_arm/vchiq_util.h | 2 +- drivers/target/iscsi/iscsi_target.c | 1 + drivers/target/iscsi/iscsi_target_erl0.c | 2 ++ drivers/target/iscsi/iscsi_target_login.c | 1 + drivers/target/iscsi/iscsi_target_nego.c | 1 + drivers/tty/pty.c | 2 +- drivers/tty/sysrq.c | 2 +- drivers/tty/tty_io.c | 2 +- drivers/tty/vt/keyboard.c | 2 +- drivers/tty/vt/vt.c | 2 +- drivers/tty/vt/vt_ioctl.c | 2 +- drivers/usb/atm/usbatm.c | 2 +- drivers/usb/core/devio.c | 1 + drivers/usb/gadget/function/f_mass_storage.c | 1 + drivers/vfio/vfio_iommu_spapr_tce.c | 1 + drivers/vfio/vfio_iommu_type1.c | 2 +- drivers/w1/w1_family.c | 2 +- drivers/w1/w1_int.c | 1 + fs/attr.c | 1 + fs/autofs4/waitq.c | 1 + fs/cifs/connect.c | 1 + fs/coda/upcall.c | 2 +- fs/coredump.c | 2 +- fs/exec.c | 1 + fs/file.c | 2 +- fs/fs_struct.c | 2 +- fs/jffs2/background.c | 2 +- fs/lockd/svc.c | 2 +- fs/ncpfs/inode.c | 1 + fs/ncpfs/sock.c | 1 + fs/nfs/callback.c | 1 + fs/nfsd/nfssvc.c | 2 +- fs/proc/fd.c | 2 +- fs/select.c | 4 ++-- include/linux/oom.h | 2 +- include/linux/ptrace.h | 1 + include/linux/sched/signal.h | 6 ++++++ include/linux/signalfd.h | 2 +- include/linux/taskstats_kern.h | 2 +- ipc/mqueue.c | 1 + kernel/bpf/syscall.c | 1 + kernel/cpu.c | 2 +- kernel/debug/gdbstub.c | 1 + kernel/debug/kdb/kdb_bt.c | 2 +- kernel/hung_task.c | 2 ++ kernel/rcu/update.c | 2 +- kernel/sched/sched.h | 1 + kernel/time/itimer.c | 1 + kernel/time/posix-cpu-timers.c | 2 +- kernel/time/tick-sched.c | 2 +- kernel/tracepoint.c | 2 +- kernel/tsacct.c | 2 +- kernel/user_namespace.c | 1 + lib/is_single_threaded.c | 3 +-- mm/filemap.c | 1 + mm/kmemleak.c | 2 +- mm/memory-failure.c | 2 +- mm/vmacache.c | 2 +- net/9p/client.c | 2 +- net/atm/common.c | 2 +- net/ax25/af_ax25.c | 2 +- net/caif/caif_socket.c | 2 +- net/core/stream.c | 1 + net/decnet/af_decnet.c | 2 +- net/irda/af_irda.c | 1 + net/netrom/af_netrom.c | 2 +- net/rose/af_rose.c | 2 +- net/sctp/socket.c | 1 + net/sunrpc/svc.c | 2 +- net/unix/af_unix.c | 2 +- net/x25/af_x25.c | 2 +- security/selinux/hooks.c | 2 +- 180 files changed, 204 insertions(+), 121 deletions(-) create mode 100644 include/linux/sched/signal.h (limited to 'kernel') diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 9d27a7d333dc..568ca29f2ad9 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -11,7 +11,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c index 17308f925306..b8221f112eee 100644 --- a/arch/alpha/kernel/signal.c +++ b/arch/alpha/kernel/signal.c @@ -6,7 +6,7 @@ * 1997-11-02 Modified for POSIX.1b signals by Richard Henderson */ -#include +#include #include #include #include diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index af2994206b4b..6448ab00043d 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -10,7 +10,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 
47948b4dd157..c25e8827e7cd 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -4,7 +4,7 @@ * Copyright (C) 1995 Linus Torvalds */ -#include +#include #include #include #include diff --git a/arch/arc/kernel/traps.c b/arch/arc/kernel/traps.c index c927aa84e652..ff83e78d0cfb 100644 --- a/arch/arc/kernel/traps.c +++ b/arch/arc/kernel/traps.c @@ -13,7 +13,7 @@ * Rahul Trivedi: Codito Technologies 2004 */ -#include +#include #include #include #include diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index e94e5aa33985..162c97528872 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -9,7 +9,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index ae738a6319f6..46f7bab81c40 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -10,7 +10,7 @@ * published by the Free Software Foundation. */ #include -#include +#include #include #include #include diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 9688ec0c6ef4..dc5f1a22b3c9 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c index 7d5f4c736a16..d7fe2de9cc9e 100644 --- a/arch/arm/mm/alignment.c +++ b/arch/arm/mm/alignment.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index c2b5b9892fd1..520c7778d330 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index bf4d3bc41a7a..2a9040dcf47e 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c index 66353caa35b9..d448f9cd7715 100644 --- a/arch/arm/mm/mmap.c +++ b/arch/arm/mm/mmap.c @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c index 569d5a650a4a..a71a48e71fff 100644 --- a/arch/arm/vfp/vfpmodule.c +++ b/arch/arm/vfp/vfpmodule.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index b883f1f75216..06da8ea16bbe 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index a22161ccf447..64fc32ea3422 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 7d47c2cdfd93..5b8779a849a2 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 81283851c9af..30dd60ab8a65 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 01c171723bb3..1e0a2650c88b 100644 --- 
a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/avr32/kernel/traps.c b/arch/avr32/kernel/traps.c index eb4a3fcfbaff..50b541325025 100644 --- a/arch/avr32/kernel/traps.c +++ b/arch/avr32/kernel/traps.c @@ -14,7 +14,7 @@ #include #include /* print_modules */ #include -#include +#include #include #include diff --git a/arch/blackfin/kernel/trace.c b/arch/blackfin/kernel/trace.c index 719dd796c12c..01e546ad2f7a 100644 --- a/arch/blackfin/kernel/trace.c +++ b/arch/blackfin/kernel/trace.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/blackfin/kernel/traps.c b/arch/blackfin/kernel/traps.c index 1ed85ddadc0d..32676d7721b1 100644 --- a/arch/blackfin/kernel/traps.c +++ b/arch/blackfin/kernel/traps.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c index 94183d3639ef..1fca464f1b9e 100644 --- a/arch/cris/mm/fault.c +++ b/arch/cris/mm/fault.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include diff --git a/arch/frv/kernel/traps.c b/arch/frv/kernel/traps.c index 31221fb4348e..43c134d6081c 100644 --- a/arch/frv/kernel/traps.c +++ b/arch/frv/kernel/traps.c @@ -9,7 +9,7 @@ * 2 of the License, or (at your option) any later version. */ -#include +#include #include #include #include diff --git a/arch/h8300/kernel/ptrace_s.c b/arch/h8300/kernel/ptrace_s.c index ef5a9c13e76d..c0af930052c0 100644 --- a/arch/h8300/kernel/ptrace_s.c +++ b/arch/h8300/kernel/ptrace_s.c @@ -10,7 +10,7 @@ */ #include -#include +#include #include #include diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c index 110dab152f82..4496bcf605ef 100644 --- a/arch/hexagon/kernel/traps.c +++ b/arch/hexagon/kernel/traps.c @@ -19,7 +19,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c index 489875fd2be4..3eec33c5cfd7 100644 --- a/arch/hexagon/mm/vm_fault.c +++ b/arch/hexagon/mm/vm_fault.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c index 60ef83e6db71..8786c8b4f187 100644 --- a/arch/ia64/kernel/asm-offsets.c +++ b/arch/ia64/kernel/asm-offsets.c @@ -6,7 +6,7 @@ #define ASM_OFFSETS_C 1 -#include +#include #include #include #include diff --git a/arch/ia64/kernel/brl_emu.c b/arch/ia64/kernel/brl_emu.c index 8682df6263d6..987b11be0021 100644 --- a/arch/ia64/kernel/brl_emu.c +++ b/arch/ia64/kernel/brl_emu.c @@ -8,7 +8,7 @@ */ #include -#include +#include #include #include diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 9509cc73b9c6..5ac51069e453 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -72,7 +72,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c index 8981ce98afb3..48ba46b025e1 100644 --- a/arch/ia64/kernel/traps.c +++ b/arch/ia64/kernel/traps.c @@ -9,7 +9,7 @@ #include #include -#include +#include #include #include /* For unblank_screen() */ #include diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c index 99348d7f2255..a13680ca1e61 100644 --- a/arch/ia64/kernel/unaligned.c +++ b/arch/ia64/kernel/unaligned.c @@ -15,7 +15,7 @@ */ #include #include -#include +#include #include #include #include diff --git 
a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 7f2feb21753c..15f09cfff335 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -4,7 +4,7 @@ * Copyright (C) 1998-2002 Hewlett-Packard Co * David Mosberger-Tang */ -#include +#include #include #include #include diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 06cdaef54b2e..8f3efa682ee8 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/mips/kernel/branch.c b/arch/mips/kernel/branch.c index ae037a304ee4..b11facd11c9d 100644 --- a/arch/mips/kernel/branch.c +++ b/arch/mips/kernel/branch.c @@ -7,7 +7,7 @@ * Copyright (C) 2001 MIPS Technologies, Inc. */ #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/signal_o32.c b/arch/mips/kernel/signal_o32.c index 5e169fc5ca5c..2b3572fb5f1b 100644 --- a/arch/mips/kernel/signal_o32.c +++ b/arch/mips/kernel/signal_o32.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index d6d92c02308d..374d71e61ef6 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include unsigned long shm_align_mask = PAGE_SIZE - 1; /* Sane caches */ EXPORT_SYMBOL(shm_align_mask); diff --git a/arch/mips/sgi-ip22/ip22-berr.c b/arch/mips/sgi-ip22/ip22-berr.c index 3f6ccd53c15d..ff8e1935c873 100644 --- a/arch/mips/sgi-ip22/ip22-berr.c +++ b/arch/mips/sgi-ip22/ip22-berr.c @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include diff --git a/arch/mips/sgi-ip22/ip22-reset.c b/arch/mips/sgi-ip22/ip22-reset.c index a36f6b87548a..03a39ac5ead9 100644 --- a/arch/mips/sgi-ip22/ip22-reset.c +++ b/arch/mips/sgi-ip22/ip22-reset.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mn10300/kernel/fpu.c b/arch/mn10300/kernel/fpu.c index 2578b7ae7dd5..50ce7b447fed 100644 --- a/arch/mn10300/kernel/fpu.c +++ b/arch/mn10300/kernel/fpu.c @@ -9,6 +9,8 @@ * 2 of the Licence, or (at your option) any later version. 
*/ #include +#include + #include #include #include diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index 53592a639744..e310ab499385 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c index bf3294171230..ce07cd3f2507 100644 --- a/arch/parisc/kernel/sys_parisc.c +++ b/arch/parisc/kernel/sys_parisc.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/parisc/kernel/unaligned.c b/arch/parisc/kernel/unaligned.c index 0a21067ac0a3..a08ab481e556 100644 --- a/arch/parisc/kernel/unaligned.c +++ b/arch/parisc/kernel/unaligned.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/parisc/math-emu/driver.c b/arch/parisc/math-emu/driver.c index 09ef4136c693..2fb59d2e2b29 100644 --- a/arch/parisc/math-emu/driver.c +++ b/arch/parisc/math-emu/driver.c @@ -27,7 +27,8 @@ * Copyright (C) 2001 Hewlett-Packard */ -#include +#include + #include "float.h" #include "math-emu.h" diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index ab9d14c0e460..3e26cd4979f9 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index 2f1e44362198..8013861aeaa7 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index 7de7124ac91b..497130c5c742 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -10,7 +10,7 @@ * */ -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c index e29e4d5afa2d..870c0a82d560 100644 --- a/arch/powerpc/platforms/cell/spufs/fault.c +++ b/arch/powerpc/platforms/cell/spufs/fault.c @@ -19,7 +19,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ -#include +#include #include #include diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 26fa03fc9f3c..16321ad9e70c 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -13,7 +13,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 80c093e0c6f1..9bf8327154ee 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -13,6 +13,9 @@ #include #include #include +#include +#include + #include #include #include diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 7ae1282d5be9..5ea09403bb87 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/score/kernel/traps.c b/arch/score/kernel/traps.c index 569ac02f68df..fa624f30f783 100644 --- a/arch/score/kernel/traps.c +++ b/arch/score/kernel/traps.c @@ -24,7 +24,7 @@ */ #include -#include +#include #include #include diff --git a/arch/sh/kernel/cpu/sh2a/fpu.c b/arch/sh/kernel/cpu/sh2a/fpu.c index 98bbaa447c93..352f894bece1 100644 --- a/arch/sh/kernel/cpu/sh2a/fpu.c +++ b/arch/sh/kernel/cpu/sh2a/fpu.c @@ -9,7 +9,7 @@ * * FIXME! These routines can be optimized in big endian case. */ -#include +#include #include #include #include diff --git a/arch/sh/kernel/hw_breakpoint.c b/arch/sh/kernel/hw_breakpoint.c index 2197fc584186..afe965712a69 100644 --- a/arch/sh/kernel/hw_breakpoint.c +++ b/arch/sh/kernel/hw_breakpoint.c @@ -11,6 +11,7 @@ */ #include #include +#include #include #include #include diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c index 9513fa7840aa..3036dee854d1 100644 --- a/arch/sh/kernel/traps.c +++ b/arch/sh/kernel/traps.c @@ -8,6 +8,8 @@ #include #include #include +#include + #include #include /* print_modules */ #include diff --git a/arch/sh/math-emu/math.c b/arch/sh/math-emu/math.c index 5078cb809750..c86f4360c6ce 100644 --- a/arch/sh/math-emu/math.c +++ b/arch/sh/math-emu/math.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/sh/mm/asids-debugfs.c b/arch/sh/mm/asids-debugfs.c index bf95fdaedd0c..110bd35165bf 100644 --- a/arch/sh/mm/asids-debugfs.c +++ b/arch/sh/mm/asids-debugfs.c @@ -20,6 +20,8 @@ #include #include #include +#include + #include #include diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 9bf876780cef..6fd1bf7481c7 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -13,6 +13,7 @@ */ #include #include +#include #include #include #include diff --git a/arch/sparc/kernel/sys_sparc_32.c b/arch/sparc/kernel/sys_sparc_32.c index fb7b185ee941..ae49639a484e 100644 --- a/arch/sparc/kernel/sys_sparc_32.c +++ b/arch/sparc/kernel/sys_sparc_32.c @@ -7,7 +7,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 884c70331345..54d3999d8119 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -7,7 +7,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/sparc/kernel/unaligned_32.c b/arch/sparc/kernel/unaligned_32.c index d20d4e3fd129..8367dce5f41b 100644 --- a/arch/sparc/kernel/unaligned_32.c +++ b/arch/sparc/kernel/unaligned_32.c @@ -8,7 +8,7 @@ #include -#include +#include #include #include #include diff --git a/arch/tile/mm/mmap.c b/arch/tile/mm/mmap.c index ef61c597898b..377e312dc27e 100644 --- a/arch/tile/mm/mmap.c +++ b/arch/tile/mm/mmap.c @@ 
-17,7 +17,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c index 62087028a9ce..366e57f5e8d6 100644 --- a/arch/um/drivers/line.c +++ b/arch/um/drivers/line.c @@ -5,8 +5,9 @@ #include #include -#include +#include #include + #include "chan.h" #include #include diff --git a/arch/um/kernel/reboot.c b/arch/um/kernel/reboot.c index b60a9f8cda75..79218106a033 100644 --- a/arch/um/kernel/reboot.c +++ b/arch/um/kernel/reboot.c @@ -3,7 +3,7 @@ * Licensed under the GPL */ -#include +#include #include #include #include diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 3943e9d7d13d..7a1f2a936fd1 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -5,8 +5,9 @@ */ #include -#include +#include #include + #include #include #include diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c index 3777b82759bd..37508b190106 100644 --- a/arch/um/kernel/tlb.c +++ b/arch/um/kernel/tlb.c @@ -5,7 +5,8 @@ #include #include -#include +#include + #include #include #include diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index ad8f206ab5e8..9711ae4aaa6a 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -4,7 +4,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/unicore32/kernel/fpu-ucf64.c b/arch/unicore32/kernel/fpu-ucf64.c index a53343a90ca2..12c8c9527b8e 100644 --- a/arch/unicore32/kernel/fpu-ucf64.c +++ b/arch/unicore32/kernel/fpu-ucf64.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c index c54e32410ead..7f5e06f9a202 100644 --- a/arch/unicore32/kernel/traps.c +++ b/arch/unicore32/kernel/traps.c @@ -14,6 +14,7 @@ */ #include #include +#include #include #include #include diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c index b656d216a8a8..bbefcc46a45e 100644 --- a/arch/unicore32/mm/fault.c +++ b/arch/unicore32/mm/fault.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 636c4b341f36..df91fb393a01 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -27,6 +27,7 @@ #include #include +#include #include #include diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 8af04afdfcb9..d48af18e7baf 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1cda35277278..ac7810513d0e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index d2dc0438d654..5eabf34008f1 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include struct va_alignment __read_mostly va_align = { diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index 282bf721a4d6..84abd66e680d 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -24,7 +24,7 @@ */ #include -#include +#include #include #include #include diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 
e33e7fbe870b..aae4d8d4be36 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 4cb8f21ff4ef..724d1c50fc52 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/char/snsc_event.c b/drivers/char/snsc_event.c index 59bcefd6ec7c..e452673dff66 100644 --- a/drivers/char/snsc_event.c +++ b/drivers/char/snsc_event.c @@ -16,7 +16,7 @@ */ #include -#include +#include #include #include #include diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 6a3470f84998..d1ce83d73a87 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/gpu/drm/drm_lock.c b/drivers/gpu/drm/drm_lock.c index 32d43f86a8f2..96bb6badb818 100644 --- a/drivers/gpu/drm/drm_lock.c +++ b/drivers/gpu/drm/drm_lock.c @@ -34,6 +34,8 @@ */ #include +#include + #include #include "drm_legacy.h" #include "drm_internal.h" diff --git a/drivers/gpu/drm/ttm/ttm_lock.c b/drivers/gpu/drm/ttm/ttm_lock.c index f154fb1929bd..913f4318cdc0 100644 --- a/drivers/gpu/drm/ttm/ttm_lock.c +++ b/drivers/gpu/drm/ttm/ttm_lock.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #define TTM_WRITE_LOCK_PENDING (1 << 0) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index d525b1a2986a..27f155d2df8d 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -34,7 +34,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c index 20f4ddcac3b0..68295a12b771 100644 --- a/drivers/infiniband/hw/hfi1/user_pages.c +++ b/drivers/infiniband/hw/hfi1/user_pages.c @@ -46,7 +46,7 @@ */ #include -#include +#include #include #include diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index 75f08624ac05..ce83ba9a12ef 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -32,6 +32,7 @@ */ #include +#include #include #include "qib.h" diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index ba868be5af2e..c49db7c33979 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -34,7 +34,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/isdn/i4l/isdn_tty.c b/drivers/isdn/i4l/isdn_tty.c index 63eaa0a9f8a1..1b169559a240 100644 --- a/drivers/isdn/i4l/isdn_tty.c +++ b/drivers/isdn/i4l/isdn_tty.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "isdn_common.h" #include "isdn_tty.h" #ifdef CONFIG_ISDN_AUDIO diff --git a/drivers/isdn/mISDN/l1oip_core.c b/drivers/isdn/mISDN/l1oip_core.c index 67c21876c35f..6ceca7db62ad 100644 --- a/drivers/isdn/mISDN/l1oip_core.c +++ b/drivers/isdn/mISDN/l1oip_core.c @@ -234,6 +234,8 @@ #include #include #include +#include + #include #include "core.h" #include "l1oip.h" diff --git a/drivers/md/md.c b/drivers/md/md.c index 985374f20e2e..548d1b8014f8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -44,6 +44,7 
@@ */ +#include #include #include #include diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 7453d94eeed7..fbc2d7851b49 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -37,7 +37,10 @@ #include #include #include +#include + #include + #include "md.h" #include "raid1.h" #include "bitmap.h" diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2ce23b01dbb2..4fb09b3fcb41 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -55,6 +55,8 @@ #include #include #include +#include + #include #include "md.h" diff --git a/drivers/misc/genwqe/card_dev.c b/drivers/misc/genwqe/card_dev.c index cb290b8ca0c8..dd4617764f14 100644 --- a/drivers/misc/genwqe/card_dev.c +++ b/drivers/misc/genwqe/card_dev.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/misc/mic/cosm/cosm_scif_server.c b/drivers/misc/mic/cosm/cosm_scif_server.c index 5696df4326b5..85f7d09cc65f 100644 --- a/drivers/misc/mic/cosm/cosm_scif_server.c +++ b/drivers/misc/mic/cosm/cosm_scif_server.c @@ -19,6 +19,8 @@ * */ #include +#include + #include "cosm_main.h" /* diff --git a/drivers/misc/mic/cosm_client/cosm_scif_client.c b/drivers/misc/mic/cosm_client/cosm_scif_client.c index 03e98bf1ac15..aa530fcceaa9 100644 --- a/drivers/misc/mic/cosm_client/cosm_scif_client.c +++ b/drivers/misc/mic/cosm_client/cosm_scif_client.c @@ -22,6 +22,8 @@ #include #include #include +#include + #include "../cosm/cosm_main.h" #define COSM_SCIF_MAX_RETRIES 10 diff --git a/drivers/misc/mic/scif/scif_rma.c b/drivers/misc/mic/scif/scif_rma.c index d0e9c60f944e..329727e00e97 100644 --- a/drivers/misc/mic/scif/scif_rma.c +++ b/drivers/misc/mic/scif/scif_rma.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "scif_main.h" #include "scif_map.h" diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c index 08db4d687533..1da31dc47f86 100644 --- a/drivers/net/slip/slip.c +++ b/drivers/net/slip/slip.c @@ -66,7 +66,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c index c5744b45ec8f..65689469c5a1 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/parisc/power.c b/drivers/parisc/power.c index ef31b77404ef..e2a3112f1c98 100644 --- a/drivers/parisc/power.c +++ b/drivers/parisc/power.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/ps3/ps3-sys-manager.c b/drivers/ps3/ps3-sys-manager.c index f2ab435954f6..73e496a72113 100644 --- a/drivers/ps3/ps3-sys-manager.c +++ b/drivers/ps3/ps3-sys-manager.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/drivers/s390/char/fs3270.c b/drivers/s390/char/fs3270.c index 85eca1cef063..c4518168fd02 100644 --- a/drivers/s390/char/fs3270.c +++ b/drivers/s390/char/fs3270.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/s390/char/keyboard.c b/drivers/s390/char/keyboard.c index 82c913318b73..ba0e4f93503d 100644 --- a/drivers/s390/char/keyboard.c +++ b/drivers/s390/char/keyboard.c @@ -7,7 +7,7 @@ */ #include -#include +#include #include #include diff --git a/drivers/scsi/bnx2fc/bnx2fc.h b/drivers/scsi/bnx2fc/bnx2fc.h index fdd4eb4e41b2..4fc8ed5fe067 100644 --- 
a/drivers/scsi/bnx2fc/bnx2fc.h +++ b/drivers/scsi/bnx2fc/bnx2fc.h @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/scsi/bnx2i/bnx2i.h b/drivers/scsi/bnx2i/bnx2i.h index ed7f3228e234..89ef1a1678d1 100644 --- a/drivers/scsi/bnx2i/bnx2i.h +++ b/drivers/scsi/bnx2i/bnx2i.h @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 834d1212b6d5..07c08ce68d70 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c index ec3b66561412..054660049395 100644 --- a/drivers/staging/android/lowmemorykiller.c +++ b/drivers/staging/android/lowmemorykiller.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/staging/lustre/lnet/libcfs/linux/linux-prim.c b/drivers/staging/lustre/lnet/libcfs/linux/linux-prim.c index cf902154f0aa..bcf9f3dd0310 100644 --- a/drivers/staging/lustre/lnet/libcfs/linux/linux-prim.c +++ b/drivers/staging/lustre/lnet/libcfs/linux/linux-prim.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include "../../../include/linux/libcfs/libcfs.h" diff --git a/drivers/staging/rtl8188eu/include/osdep_service.h b/drivers/staging/rtl8188eu/include/osdep_service.h index ee3f5ee06529..9e390648d93e 100644 --- a/drivers/staging/rtl8188eu/include/osdep_service.h +++ b/drivers/staging/rtl8188eu/include/osdep_service.h @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/staging/rtl8712/osdep_service.h b/drivers/staging/rtl8712/osdep_service.h index b8a170978434..5d33020554cd 100644 --- a/drivers/staging/rtl8712/osdep_service.h +++ b/drivers/staging/rtl8712/osdep_service.h @@ -33,7 +33,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/staging/rtl8712/rtl8712_cmd.c b/drivers/staging/rtl8712/rtl8712_cmd.c index f19b6b27aa71..5346c657485d 100644 --- a/drivers/staging/rtl8712/rtl8712_cmd.c +++ b/drivers/staging/rtl8712/rtl8712_cmd.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c index cb0b7ca36b1e..8a0d214f6e9b 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c @@ -34,6 +34,7 @@ #include #include +#include #include #include #include diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_util.h b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_util.h index 4055d4bf9f74..e63964f5a18a 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_util.h +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_util.h @@ -47,7 +47,7 @@ #include #include #include -#include +#include #include #include #include /* for time_t */ diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index da2c73a255de..fa1d578d56bd 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/target/iscsi/iscsi_target_erl0.c 
b/drivers/target/iscsi/iscsi_target_erl0.c index b54e72c7ab0f..a4d5e6749932 100644 --- a/drivers/target/iscsi/iscsi_target_erl0.c +++ b/drivers/target/iscsi/iscsi_target_erl0.c @@ -17,6 +17,8 @@ * GNU General Public License for more details. ******************************************************************************/ +#include + #include #include #include diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c index eab274d17b5c..b03cc03423c1 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include /* TCP_NODELAY */ #include /* ipv6_addr_v4mapped() */ diff --git a/drivers/target/iscsi/iscsi_target_nego.c b/drivers/target/iscsi/iscsi_target_nego.c index 46388c9e08da..29eb09e0cd15 100644 --- a/drivers/target/iscsi/iscsi_target_nego.c +++ b/drivers/target/iscsi/iscsi_target_nego.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index a23fa5ed1d67..66b59a15780d 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 71136742e606..65db0aeb3d80 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -14,7 +14,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include +#include #include #include #include diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index a1fd3f7d487a..985d33f0f315 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -69,7 +69,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c index 397e1509fe51..6b3a2c00974d 100644 --- a/drivers/tty/vt/keyboard.c +++ b/drivers/tty/vt/keyboard.c @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index 9d3ce505e7ab..5c4933bb4b53 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -72,7 +72,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/tty/vt/vt_ioctl.c b/drivers/tty/vt/vt_ioctl.c index a56edf2d58eb..0cbfe1ff6f6c 100644 --- a/drivers/tty/vt/vt_ioctl.c +++ b/drivers/tty/vt/vt_ioctl.c @@ -10,7 +10,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/usb/atm/usbatm.c b/drivers/usb/atm/usbatm.c index 5a59da0dc98a..3e80aa3b917a 100644 --- a/drivers/usb/atm/usbatm.c +++ b/drivers/usb/atm/usbatm.c @@ -74,7 +74,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index ca425e8099ea..cfc3cff6e8d5 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -36,6 +36,7 @@ #include #include +#include #include #include #include diff --git a/drivers/usb/gadget/function/f_mass_storage.c b/drivers/usb/gadget/function/f_mass_storage.c index 8f3659b65f53..4c8aacc232c0 100644 --- a/drivers/usb/gadget/function/f_mass_storage.c +++ b/drivers/usb/gadget/function/f_mass_storage.c @@ -207,6 +207,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 185d50ee1b12..cf3de91fbfe7 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -21,6 
+21,7 @@ #include #include #include +#include #include #include diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 7e94c7fc90ae..c26fa1f3ed86 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/w1/w1_family.c b/drivers/w1/w1_family.c index df1c9bb90eb5..2096f460498f 100644 --- a/drivers/w1/w1_family.c +++ b/drivers/w1/w1_family.c @@ -14,7 +14,7 @@ #include #include -#include /* schedule_timeout() */ +#include #include #include diff --git a/drivers/w1/w1_int.c b/drivers/w1/w1_int.c index 4ce1b66d5092..2cae7b29bb5f 100644 --- a/drivers/w1/w1_int.c +++ b/drivers/w1/w1_int.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/fs/attr.c b/fs/attr.c index c902b3d53508..135304146120 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 79fbd85db4ba..24a58bf9ca72 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include "autofs_i.h" diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 777ad9f4fc3c..9bf25be05636 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index f6c6c8adbc01..e82357c89979 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include #include diff --git a/fs/coredump.c b/fs/coredump.c index 23a539b29225..c74ab43b8383 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -34,7 +35,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/exec.c b/fs/exec.c index 9c80d011594e..aa228b98ad5b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/file.c b/fs/file.c index 69d6990e3021..ad6f094f2eff 100644 --- a/fs/file.c +++ b/fs/file.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 7dca743b2ce1..543ed50f0387 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -1,5 +1,5 @@ #include -#include +#include #include #include #include diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index e5c1783ab64a..453a6a1fff34 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include "nodelist.h" diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 7e4ea3b9f472..e7c8b9c76e48 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 7eb89c23c847..d5606099712a 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c index 97b111d79489..bdea177aa405 100644 --- a/fs/ncpfs/sock.c +++ b/fs/ncpfs/sock.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 484bebc20bca..bb79972dc638 100644 --- 
a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index efd66da99201..786a4a2cb2d7 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -6,7 +6,7 @@ * Copyright (C) 1995, 1996, 1997 Olaf Kirch */ -#include +#include #include #include #include diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 00ce1531b2f5..c330495c3115 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -1,4 +1,4 @@ -#include +#include #include #include #include diff --git a/fs/select.c b/fs/select.c index 305c0daf5d67..e2112270d75a 100644 --- a/fs/select.c +++ b/fs/select.c @@ -15,7 +15,8 @@ */ #include -#include +#include +#include #include #include #include @@ -26,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/include/linux/oom.h b/include/linux/oom.h index b4e36e92bc87..8a266e2be5a6 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -2,7 +2,7 @@ #define __INCLUDE_LINUX_OOM_H -#include +#include #include #include #include diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index e0e539321ab9..422bc2e4cb6a 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -3,6 +3,7 @@ #include /* For unlikely. */ #include /* For struct task_struct. */ +#include /* For send_sig(), same_thread_group(), etc. */ #include /* for IS_ERR_VALUE */ #include /* For BUG_ON. */ #include /* For task_active_pid_ns. */ diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h new file mode 100644 index 000000000000..da69cd05cd68 --- /dev/null +++ b/include/linux/sched/signal.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_SCHED_SIGNAL_H +#define _LINUX_SCHED_SIGNAL_H + +#include + +#endif /* _LINUX_SCHED_SIGNAL_H */ diff --git a/include/linux/signalfd.h b/include/linux/signalfd.h index eadbe227c256..4985048640a7 100644 --- a/include/linux/signalfd.h +++ b/include/linux/signalfd.h @@ -8,7 +8,7 @@ #define _LINUX_SIGNALFD_H #include - +#include #ifdef CONFIG_SIGNALFD diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h index 58de6edf751f..e2a5daf8d14f 100644 --- a/include/linux/taskstats_kern.h +++ b/include/linux/taskstats_kern.h @@ -8,7 +8,7 @@ #define _LINUX_TASKSTATS_KERN_H #include -#include +#include #include #ifdef CONFIG_TASKSTATS diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 40e448de8a7e..4f7241fbeff3 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include "util.h" diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 461eb1e66a0f..7af0dcc5d755 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/cpu.c b/kernel/cpu.c index 0a5f630f5c54..0aae3183b029 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 19d9a578c753..7510dc687c0d 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -29,6 +29,7 @@ */ #include +#include #include #include #include diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index fe15fff5df53..9b976a42376d 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include "kdb_private.h" diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 
40c07e4fa116..129247e56902 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -16,6 +16,8 @@ #include #include #include +#include + #include /* diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index a0e90e0afc75..da128deb10ec 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e2307a6c29f1..641249471952 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index a95f13c31464..f6b961c5e58c 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index b4377a5e4269..a2475a9f57d8 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -2,7 +2,7 @@ * Implement CPU time clocks for the POSIX clock interface. */ -#include +#include #include #include #include diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 4fee1c3abd0b..0f411a4690a0 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 1f9a31f934a4..9f6984d52fec 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include extern struct tracepoint * const __start___tracepoints_ptrs[]; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 571a2d3821d8..d9a03b80b75d 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -17,7 +17,7 @@ */ #include -#include +#include #include #include #include diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 86b7854fec8e..2f735cbe05e8 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/lib/is_single_threaded.c b/lib/is_single_threaded.c index 391fd23976a2..9745cfffcb69 100644 --- a/lib/is_single_threaded.c +++ b/lib/is_single_threaded.c @@ -9,8 +9,7 @@ * as published by the Free Software Foundation; either version * 2 of the Licence, or (at your option) any later version. */ - -#include +#include /* * Returns true if the task does not share ->mm with another thread/process. diff --git a/mm/filemap.c b/mm/filemap.c index 1944c631e3e6..1694623a6289 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/kmemleak.c b/mm/kmemleak.c index da3436953022..2df6d3687b2a 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -73,7 +73,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 3d0f2fd4bf73..b74984db386e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/vmacache.c b/mm/vmacache.c index 7c233f8e20ee..4355d34c68a6 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -1,7 +1,7 @@ /* * Copyright (C) 2014 Davidlohr Bueso. 
*/ -#include +#include #include #include diff --git a/net/9p/client.c b/net/9p/client.c index 3fc94a49ccd5..25cfd8a4bc36 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/atm/common.c b/net/atm/common.c index a3ca922d307b..9613381f5db0 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -13,7 +13,7 @@ #include /* error codes */ #include #include -#include +#include #include /* struct timeval */ #include #include diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 90fcf5fc2e0a..a8e42cedf1db 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 92cbbd2afddb..adcad344c843 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/core/stream.c b/net/core/stream.c index f575bcf64af2..20231dbb1da0 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -13,6 +13,7 @@ */ #include +#include #include #include #include diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index a90ed67027b0..e6e79eda9763 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -106,7 +106,7 @@ Version 0.0.6 2.1.110 07-aug-98 Eduardo Marcelo Serrat #include #include #include -#include +#include #include #include #include diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index ab254041dab7..81adc29a448d 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index ed212ffc1d9d..4bbf4526b885 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 9ad301c46b88..b8a1df2c9785 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 465a9c8464f9..6f0a9be50f50 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index b94efd93d3e4..a08aeb56b8e4 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -11,7 +11,7 @@ */ #include -#include +#include #include #include #include diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e2d18b9f910f..ee37b390260a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -85,7 +85,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 079c883aa96e..fd28a49dbe8f 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -41,7 +41,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 9a8f12f8d5b7..b12f873f92ba 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3 From 8703e8a465b1e9cadc3680b4b1248f5987e54518 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Feb 2017 18:51:30 +0100 Subject: 
sched/headers: Prepare for new header dependencies before moving code to <linux/sched/user.h>

We are going to split <linux/sched/user.h> out of <linux/sched.h>, which will have to be picked up from other headers and a couple of .c files.

Create a trivial placeholder <linux/sched/user.h> file that just maps to <linux/sched.h> to make this patch obviously correct and bisectable.

Include the new header in the files that are going to need it.

Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar
---
block/ioprio.c | 1 + include/linux/cred.h | 2 +- include/linux/sched/user.h | 6 ++++++ ipc/mqueue.c | 1 + kernel/fork.c | 1 + kernel/signal.c | 1 + kernel/user.c | 1 + mm/mlock.c | 1 + net/core/scm.c | 1 + security/keys/process_keys.c | 1 + 10 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 include/linux/sched/user.h (limited to 'kernel')
diff --git a/block/ioprio.c b/block/ioprio.c index 3790669232ff..89c43e07787d 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -25,6 +25,7 @@ #include #include #include +#include <linux/sched/user.h> #include #include #include
diff --git a/include/linux/cred.h b/include/linux/cred.h index f0e70a1bb3ac..045d33e48069 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -18,8 +18,8 @@ #include #include #include +#include <linux/sched/user.h> -struct user_struct; struct cred; struct inode;
diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h new file mode 100644 index 000000000000..83238e5ddadd --- /dev/null +++ b/include/linux/sched/user.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_SCHED_USER_H +#define _LINUX_SCHED_USER_H + +#include <linux/sched.h> + +#endif /* _LINUX_SCHED_USER_H */
diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 4f7241fbeff3..e8d41ff57241 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -37,6 +37,7 @@ #include #include #include +#include <linux/sched/user.h> #include #include "util.h"
diff --git a/kernel/fork.c b/kernel/fork.c index 7332448b668a..a5ad1e4ab604 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -15,6 +15,7 @@ #include #include #include +#include <linux/sched/user.h> #include #include #include
diff --git a/kernel/signal.c b/kernel/signal.c index bae358532d0a..762fcf64f0c3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -14,6 +14,7 @@ #include #include #include +#include <linux/sched/user.h> #include #include #include
diff --git a/kernel/user.c b/kernel/user.c index b069ccbfb0b0..00281add65b2 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -13,6 +13,7 @@ #include #include #include +#include <linux/sched/user.h> #include #include #include
diff --git a/mm/mlock.c b/mm/mlock.c index cdbed8aaa426..1050511f8b2b 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -8,6 +8,7 @@ #include #include #include +#include <linux/sched/user.h> #include #include #include
diff --git a/net/core/scm.c b/net/core/scm.c index b6d83686e149..b1ff8a441748 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -14,6 +14,7 @@ #include #include #include +#include <linux/sched/user.h> #include #include #include
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 918cddcd4516..b6fdd22205b1 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -12,6 +12,7 @@ #include #include #include +#include <linux/sched/user.h> #include #include #include
-- cgit v1.2.3

From 38b8d208a4544c9a26b10baec89b8a21042e5305 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Wed, 8 Feb 2017 18:51:31 +0100
Subject: sched/headers: Prepare for new header dependencies before moving code to <linux/nmi.h>

We are going to move softlockup APIs out of <linux/sched.h>, which will have to be picked up from other headers and a couple of .c files. <linux/nmi.h> already includes <linux/sched.h>.

Include the <linux/nmi.h> header in the files that are going to need it.
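A hedged sketch of the call pattern this prepares for: touch_softlockup_watchdog() is one of the softlockup APIs in question, while the surrounding helpers are made up for illustration:

#include <linux/nmi.h>	/* named explicitly instead of inherited via <linux/sched.h> */

static void crunch_all_items(void)
{
	while (have_more_items()) {	/* have_more_items() is hypothetical */
		process_one_item();	/* process_one_item() is hypothetical */
		/* tell the soft-lockup watchdog this CPU is busy, not hung */
		touch_softlockup_watchdog();
	}
}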
Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/ia64/kernel/time.c | 1 + arch/ia64/kernel/uncached.c | 1 + arch/powerpc/kernel/swsusp_64.c | 1 + arch/x86/kernel/pvclock.c | 2 ++ arch/x86/xen/smp.c | 1 + drivers/ide/ide-taskfile.c | 1 + drivers/mtd/nand/nand_base.c | 1 + drivers/video/fbdev/nvidia/nv_accel.c | 2 ++ init/main.c | 1 + kernel/debug/debug_core.c | 1 + kernel/power/hibernate.c | 1 + kernel/power/snapshot.c | 1 + kernel/sched/clock.c | 1 + kernel/time/tick-sched.c | 1 + kernel/time/timekeeping.c | 1 + 15 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index faa116822c4c..144f9db7a876 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -16,6 +16,7 @@ #include #include #include +#include <linux/nmi.h> #include #include #include diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c index f3976da36721..583f7ff6b589 100644 --- a/arch/ia64/kernel/uncached.c +++ b/arch/ia64/kernel/uncached.c @@ -19,6 +19,7 @@ #include #include #include +#include <linux/nmi.h> #include #include #include diff --git a/arch/powerpc/kernel/swsusp_64.c b/arch/powerpc/kernel/swsusp_64.c index 0e899e47c325..51db012808f5 100644 --- a/arch/powerpc/kernel/swsusp_64.c +++ b/arch/powerpc/kernel/swsusp_64.c @@ -10,6 +10,7 @@ #include #include #include +#include <linux/nmi.h> void do_after_copyback(void) { diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 9e93fe5803b4..5c3f6d6a5078 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -21,6 +21,8 @@ #include #include #include +#include <linux/nmi.h> + #include #include diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 0dee6f59ea82..7ff2f1bfb7ec 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -18,6 +18,7 @@ #include #include #include +#include <linux/nmi.h> #include #include diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 247b9faccce1..4c0007cb74e3 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -19,6 +19,7 @@ #include #include #include +#include <linux/nmi.h> #include #include diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 1492c12906f6..b0524f8accb6 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -36,6 +36,7 @@ #include #include #include +#include <linux/nmi.h> #include #include #include diff --git a/drivers/video/fbdev/nvidia/nv_accel.c b/drivers/video/fbdev/nvidia/nv_accel.c index ad6472a894ea..7341fed63e35 100644 --- a/drivers/video/fbdev/nvidia/nv_accel.c +++ b/drivers/video/fbdev/nvidia/nv_accel.c @@ -48,6 +48,8 @@ */ #include +#include <linux/nmi.h> + #include "nv_type.h" #include "nv_proto.h" #include "nv_dma.h" diff --git a/init/main.c b/init/main.c index ae9f2008fb86..c891cfb8928f 100644 --- a/init/main.c +++ b/init/main.c @@ -27,6 +27,7 @@ #include #include #include +#include <linux/nmi.h> #include #include #include diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index a603ef28f70c..65c0f1363788 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -49,6 +49,7 @@ #include #include #include +#include <linux/nmi.h> #include #include #include diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 86385af1080f..b8be5c803cdd 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -21,6 +21,7 @@ #include #include #include +#include <linux/nmi.h> #include #include #include diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index
905d5bbd595f..d79a38de425a 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -22,6 +22,7 @@ #include #include #include +#include <linux/nmi.h> #include #include #include diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index dd7817cdbf58..a08795e21628 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -58,6 +58,7 @@ #include #include #include +#include <linux/nmi.h> #include #include #include diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 0f411a4690a0..5e9e123b4321 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -17,6 +17,7 @@ #include #include #include +#include <linux/nmi.h> #include #include #include diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index fb564acee0f3..5b63a2102c29 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -14,6 +14,7 @@ #include #include #include +#include <linux/nmi.h> #include #include #include -- cgit v1.2.3 From 55687da166bf51129ed6b110d7711f4c7560abe2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Feb 2017 18:51:31 +0100 Subject: sched/headers: Prepare for new header dependencies before moving code to <linux/sched/cpufreq.h> We are going to split <linux/sched/cpufreq.h> out of <linux/sched.h>, which will have to be picked up from other headers and a couple of .c files. Create a trivial placeholder <linux/sched/cpufreq.h> file that just maps to <linux/sched.h> to make this patch obviously correct and bisectable. Include the new header in the files that are going to need it. Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- drivers/cpufreq/cpufreq_governor.c | 1 - drivers/cpufreq/cpufreq_governor.h | 1 + drivers/cpufreq/cpufreq_ondemand.c | 1 + drivers/cpufreq/intel_pstate.c | 2 +- include/linux/sched/cpufreq.h | 6 ++++++ kernel/sched/sched.h | 1 + 6 files changed, 10 insertions(+), 2 deletions(-) create mode 100644 include/linux/sched/cpufreq.h (limited to 'kernel') diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 631bd2c86c5e..47e24b5384b3 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -18,7 +18,6 @@ #include #include -#include #include #include "cpufreq_governor.h" diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index f5717ca070cc..0236ec2cd654 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -20,6 +20,7 @@ #include #include #include +#include <linux/sched/cpufreq.h> #include #include #include diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 4a017e895296..3937acf7e026 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -16,6 +16,7 @@ #include #include #include +#include <linux/sched/cpufreq.h> #include "cpufreq_ondemand.h" diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index eb0f7fb71685..8f6d29302aac 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include <linux/sched/cpufreq.h> #include #include #include diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h new file mode 100644 index 000000000000..07ed9664fa20 --- /dev/null +++ b/include/linux/sched/cpufreq.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_SCHED_CPUFREQ_H +#define _LINUX_SCHED_CPUFREQ_H + +#include <linux/sched.h> + +#endif /* _LINUX_SCHED_CPUFREQ_H */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 641249471952..7dfeb14fa43c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -7,6 +7,7 @@ #include #include #include +#include <linux/sched/cpufreq.h> #include #include #include -- cgit v1.2.3
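The placeholder pattern these preparatory patches rely on is simple enough to state once in full: the new header initially forwards straight back to <linux/sched.h>, so converting include sites ahead of the actual code move cannot change what any translation unit sees, and every step of the series builds. A generic sketch (the header name here is illustrative, not one of the real ones):

#ifndef _LINUX_SCHED_EXAMPLE_H
#define _LINUX_SCHED_EXAMPLE_H

/*
 * Temporary forwarder: everything still lives in <linux/sched.h>.
 * A later patch moves the real declarations here, after which this
 * include is dropped and consumers keep compiling unchanged.
 */
#include <linux/sched.h>

#endif /* _LINUX_SCHED_EXAMPLE_H */

Consumers switch to #include <linux/sched/example.h> up front, which is what keeps the series bisectable.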
From 6a3827d7509cbf96b7e961f8957c1f01d1bcf894 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Feb 2017 18:51:31 +0100 Subject: sched/headers: Prepare for new header dependencies before moving code to <linux/sched/numa_balancing.h> We are going to split <linux/sched/numa_balancing.h> out of <linux/sched.h>, which will have to be picked up from other headers and a couple of .c files. Create a trivial placeholder <linux/sched/numa_balancing.h> file that just maps to <linux/sched.h> to make this patch obviously correct and bisectable. Include the new header in the files that are going to need it. Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- fs/exec.c | 1 + fs/proc/array.c | 1 + include/linux/sched/numa_balancing.h | 6 ++++++ include/trace/events/sched.h | 2 +- kernel/fork.c | 1 + kernel/sched/sched.h | 1 + mm/huge_memory.c | 1 + mm/memory.c | 1 + mm/mempolicy.c | 1 + 9 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 include/linux/sched/numa_balancing.h (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index aa228b98ad5b..e857c7260b1f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -35,6 +35,7 @@ #include #include #include +#include <linux/sched/numa_balancing.h> #include #include #include diff --git a/fs/proc/array.c b/fs/proc/array.c index 61bea3a4ecc5..cfe7f934a300 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -61,6 +61,7 @@ #include #include #include +#include <linux/sched/numa_balancing.h> #include #include #include diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h new file mode 100644 index 000000000000..999182279f78 --- /dev/null +++ b/include/linux/sched/numa_balancing.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_SCHED_NUMA_BALANCING_H +#define _LINUX_SCHED_NUMA_BALANCING_H + +#include <linux/sched.h> + +#endif /* _LINUX_SCHED_NUMA_BALANCING_H */ diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 9b90c57517a9..9e3ef6c99e4b 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -4,7 +4,7 @@ #if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SCHED_H -#include +#include <linux/sched/numa_balancing.h> #include #include diff --git a/kernel/fork.c b/kernel/fork.c index a5ad1e4ab604..08d6c091ac7c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -16,6 +16,7 @@ #include #include #include +#include <linux/sched/numa_balancing.h> #include #include #include diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7dfeb14fa43c..7e8ce0347fbf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -6,6 +6,7 @@ #include #include #include +#include <linux/sched/numa_balancing.h> #include #include #include diff --git a/mm/huge_memory.c b/mm/huge_memory.c index dee92fa26bbb..d36b2af4d1bf 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -10,6 +10,7 @@ #include #include #include +#include <linux/sched/numa_balancing.h> #include #include #include diff --git a/mm/memory.c b/mm/memory.c index 6fc918b2c459..d4994a28dc85 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -42,6 +42,7 @@ #include #include #include +#include <linux/sched/numa_balancing.h> #include #include #include diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 90892416ed75..18e0105810df 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -74,6 +74,7 @@ #include #include #include +#include <linux/sched/numa_balancing.h> #include #include #include -- cgit v1.2.3
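Why the mm/ and fs/ files above are converted now: they are the callers of the NUMA-balancing hooks that will move into the new header. A consumer-side sketch (the wrapper function is hypothetical; task_numa_fault() is the in-tree hook, shown with its current signature):

#include <linux/sched/numa_balancing.h>

/* Record one page's NUMA hinting fault against the current task. */
static void note_numa_fault(int last_cpupid, int node, int flags)
{
	task_numa_fault(last_cpupid, node, 1, flags);
}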
From 7fce777cd4eacc0bdcb33017e5a4c495d28afed1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 2 Feb 2017 14:47:27 +0100 Subject: sched/headers: Prepare header dependency changes, move the <asm/paravirt.h> include to kernel/sched/sched.h Recent header reorganizations unearthed this hidden dependency: kernel/sched/core.c:199:25: error: 'paravirt_steal_rq_enabled' undeclared (first use in this function) kernel/sched/core.c:200:11: error: implicit declaration of function 'paravirt_steal_clock' [-Werror=implicit-function-declaration] So move the asm/paravirt.h include from kernel/sched/cputime.c to kernel/sched/sched.h. ( NOTE: We do this change before doing the changes that introduce the build failure, so the series remains fully bisectable. ) Reported-by: kbuild test robot Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 4 ---- kernel/sched/sched.h | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 2ecec3a4f1ee..54031fa681e2 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -6,10 +6,6 @@ #include #include #include "sched.h" -#ifdef CONFIG_PARAVIRT -#include <asm/paravirt.h> -#endif - #ifdef CONFIG_IRQ_TIME_ACCOUNTING diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7e8ce0347fbf..04f376cf7ba9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -20,6 +20,10 @@ #include #include +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#endif + #include "cpupri.h" #include "cpudeadline.h" #include "cpuacct.h" -- cgit v1.2.3 From 5b825c3af1d8a0af4deb4a5eb349d0d0050c62e5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 2 Feb 2017 17:54:15 +0100 Subject: sched/headers: Prepare to remove <linux/cred.h> inclusion from <linux/sched.h> Add #include <linux/cred.h> dependencies to all .c files that rely on sched.h doing that for them. Note that even if the number of files where we need to add extra headers seems high, it's still a net win, because <linux/sched.h> is included in over 2,200 files ... Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/arm/kernel/sys_oabi-compat.c | 1 + arch/mips/kernel/mips-mt-fpaff.c | 1 + arch/x86/include/asm/intel_rdt.h | 1 + arch/x86/kernel/cpu/intel_cacheinfo.c | 1 + block/ioprio.c | 1 + drivers/misc/eeprom/eeprom.c | 1 + drivers/misc/lkdtm_heap.c | 1 + drivers/misc/vmw_vmci/vmci_context.c | 1 + drivers/misc/vmw_vmci/vmci_host.c | 1 + drivers/staging/lustre/lustre/include/lustre_compat.h | 1 + drivers/staging/lustre/lustre/ptlrpc/sec.c | 1 + drivers/xen/balloon.c | 1 + fs/9p/v9fs.c | 1 + fs/affs/inode.c | 1 + fs/affs/super.c | 1 + fs/autofs4/dev-ioctl.c | 1 + fs/befs/linuxvfs.c | 1 + fs/binfmt_elf.c | 1 + fs/cachefiles/internal.h | 1 + fs/compat.c | 1 + fs/exportfs/expfs.c | 1 + fs/ext2/balloc.c | 1 + fs/ext4/ialloc.c | 2 ++ fs/file_table.c | 1 + fs/gfs2/inode.c | 1 + fs/gfs2/sys.c | 1 + fs/hfs/inode.c | 1 + fs/hfsplus/inode.c | 1 + fs/isofs/inode.c | 1 + fs/jffs2/fs.c | 1 + fs/libfs.c | 1 + fs/namespace.c | 1 + fs/ncpfs/ioctl.c | 1 + fs/notify/fanotify/fanotify.c | 1 + fs/notify/inotify/inotify_fsnotify.c | 1 + fs/omfs/inode.c | 1 + fs/overlayfs/copy_up.c | 1 + fs/overlayfs/inode.c | 1 + fs/overlayfs/namei.c | 1 + fs/overlayfs/super.c | 1 + fs/overlayfs/util.c | 1 + fs/posix_acl.c | 1 + fs/proc/proc_sysctl.c | 1 + fs/proc/root.c | 1 + fs/quota/dquot.c | 1 + fs/stat.c | 1 + fs/xfs/xfs_ioctl.c | 1 + include/linux/cred.h | 1 + include/linux/sched/signal.h | 1 + include/linux/wait.h | 1 + include/net/scm.h | 1 + include/rdma/ib.h | 1 + ipc/namespace.c | 1 + kernel/pid_namespace.c | 1 + kernel/ucount.c | 1 + kernel/uid16.c | 1 + kernel/utsname.c | 1 + mm/usercopy.c | 1 + net/dns_resolver/dns_query.c | 2 ++ net/netfilter/xt_owner.c | 2 ++
net/sunrpc/auth.c | 1 + security/apparmor/policy.c | 1 + security/keys/internal.h | 1 + security/keys/keyctl.c | 1 + security/keys/persistent.c | 2 ++ 65 files changed, 69 insertions(+) (limited to 'kernel') diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c index 5f221acd21ae..b9786f491873 100644 --- a/arch/arm/kernel/sys_oabi-compat.c +++ b/arch/arm/kernel/sys_oabi-compat.c @@ -76,6 +76,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c index 1a0a3b4ecc3e..ffc6bd3464d9 100644 --- a/arch/mips/kernel/mips-mt-fpaff.c +++ b/arch/mips/kernel/mips-mt-fpaff.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h index 95ce5c85b009..0d64397cee58 100644 --- a/arch/x86/include/asm/intel_rdt.h +++ b/arch/x86/include/asm/intel_rdt.h @@ -3,6 +3,7 @@ #ifdef CONFIG_INTEL_RDT_A +#include #include #include diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 0282b0df004a..c55fb2cb2acc 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include diff --git a/block/ioprio.c b/block/ioprio.c index 89c43e07787d..4b9ab8367dda 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/misc/eeprom/eeprom.c b/drivers/misc/eeprom/eeprom.c index 3d1d55157e5f..2fad790db3bf 100644 --- a/drivers/misc/eeprom/eeprom.c +++ b/drivers/misc/eeprom/eeprom.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/misc/lkdtm_heap.c b/drivers/misc/lkdtm_heap.c index 0f1581664c1c..ffb6aeac07b3 100644 --- a/drivers/misc/lkdtm_heap.c +++ b/drivers/misc/lkdtm_heap.c @@ -4,6 +4,7 @@ */ #include "lkdtm.h" #include +#include /* * This tries to stay within the next largest power-of-2 kmalloc cache diff --git a/drivers/misc/vmw_vmci/vmci_context.c b/drivers/misc/vmw_vmci/vmci_context.c index f35f0c8606b9..21d0fa592145 100644 --- a/drivers/misc/vmw_vmci/vmci_context.c +++ b/drivers/misc/vmw_vmci/vmci_context.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "vmci_queue_pair.h" diff --git a/drivers/misc/vmw_vmci/vmci_host.c b/drivers/misc/vmw_vmci/vmci_host.c index ec090105eb4b..8a16a26e9658 100644 --- a/drivers/misc/vmw_vmci/vmci_host.c +++ b/drivers/misc/vmw_vmci/vmci_host.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/staging/lustre/lustre/include/lustre_compat.h b/drivers/staging/lustre/lustre/include/lustre_compat.h index 300e96fb032a..da9ce195c52e 100644 --- a/drivers/staging/lustre/lustre/include/lustre_compat.h +++ b/drivers/staging/lustre/lustre/include/lustre_compat.h @@ -35,6 +35,7 @@ #include #include +#include #include "lustre_patchless_compat.h" diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec.c b/drivers/staging/lustre/lustre/ptlrpc/sec.c index e860df7c45a2..49f34fd655c3 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/sec.c +++ b/drivers/staging/lustre/lustre/ptlrpc/sec.c @@ -38,6 +38,7 @@ #include "../../include/linux/libcfs/libcfs.h" #include +#include #include #include "../include/obd.h" diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index db107fa50ca1..a6d4378eb8d9 100644 --- a/drivers/xen/balloon.c 
+++ b/drivers/xen/balloon.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 072e7599583a..a89f3cfe3c7d 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/affs/inode.c b/fs/affs/inode.c index a5e6097eb5a9..abcc59899229 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -10,6 +10,7 @@ * (C) 1991 Linus Torvalds - minix filesystem */ #include +#include #include #include "affs.h" diff --git a/fs/affs/super.c b/fs/affs/super.c index 37532538e8ab..c2c27a8f128e 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 806df746f1a9..734cbf8d9676 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 19407165f4aa..c500e954debb 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "befs.h" diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index fbbe52e1250e..2c95257fa4da 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index cd1effee8a49..9bf90bcc56ac 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include diff --git a/fs/compat.c b/fs/compat.c index e50a2114f474..c61b506f5bc9 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index a4b531be9168..9ec1038f937e 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -15,6 +15,7 @@ #include #include #include +#include #define dprintk(fmt, args...) 
do{}while(0) diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 4c40c0786e16..d0bdb74f0e15 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index b14bae2598bc..17bc043308f3 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -21,6 +21,8 @@ #include #include #include +#include + #include #include "ext4.h" diff --git a/fs/file_table.c b/fs/file_table.c index 6d982b57de92..954d510b765a 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index eb7724b8578a..9d28f55fbd1d 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index f8d30e41d1d3..7a515345610c 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -10,6 +10,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index f776acf2378a..bfbba799430f 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 2e796f8302ff..e8638d528195 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include "hfsplus_fs.h" diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 871c8b392099..020ba0936146 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 567653f7c0ce..76fa814df3d1 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/libfs.c b/fs/libfs.c index 28d6f35feed6..217896ca4fae 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/namespace.c b/fs/namespace.c index 8bfad42c1ccf..131cd7b94f47 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include /* init_rootfs */ #include /* get_fs_root et.al. 
*/ diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 4434e4977cf3..12550c2320cc 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -19,6 +19,7 @@ #include #include #include +#include #include diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index a4c46221755e..e5f7e47de68e 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -6,6 +6,7 @@ #include /* UINT_MAX */ #include #include +#include #include #include diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index f36c29398de3..1aeb837ae414 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -30,6 +30,7 @@ #include /* kmem_* */ #include #include +#include #include "inotify.h" diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index df7ea8543a2e..8c9034ee7383 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index f57043dace62..53d3f830358f 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 08643ac44a02..6639f487f835 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include "overlayfs.h" diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 023bb0b03352..b8b077821fb0 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 20f48abbb82f..9aa37c2f7f7d 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -7,6 +7,7 @@ * the Free Software Foundation. 
*/ +#include #include #include #include diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 952286f4826c..9dc1c0af586b 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include "overlayfs.h" #include "ovl_entry.h" diff --git a/fs/posix_acl.c b/fs/posix_acl.c index c9d48dc78495..eebf5f6cf6d5 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 3e64c6502dc8..3d203b1f5a02 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/proc/root.c b/fs/proc/root.c index b90da888b81a..5d5fed20bfff 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "internal.h" diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 406fed92362a..74b489e3714d 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -72,6 +72,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/stat.c b/fs/stat.c index 3f14d1ef0868..95bd41762770 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index cf1363dbf32b..2fd7fdf5438f 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -43,6 +43,7 @@ #include "xfs_acl.h" #include +#include #include #include #include diff --git a/include/linux/cred.h b/include/linux/cred.h index 045d33e48069..b03e7d049a64 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -18,6 +18,7 @@ #include #include #include +#include #include struct cred; diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index ce93c4a02b79..0f4e9f4a43fd 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -1,6 +1,7 @@ #ifndef _LINUX_SCHED_SIGNAL_H #define _LINUX_SCHED_SIGNAL_H +#include #include #include diff --git a/include/linux/wait.h b/include/linux/wait.h index 1421132e9086..aacb1282d19a 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -6,6 +6,7 @@ #include #include #include + #include #include diff --git a/include/net/scm.h b/include/net/scm.h index 59fa93c01d2a..142ea9e7a6d0 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -3,6 +3,7 @@ #include #include +#include #include #include #include diff --git a/include/rdma/ib.h b/include/rdma/ib.h index a6b93706b0fc..9b4c22a36931 100644 --- a/include/rdma/ib.h +++ b/include/rdma/ib.h @@ -35,6 +35,7 @@ #include #include +#include struct ib_addr { union { diff --git a/ipc/namespace.c b/ipc/namespace.c index 0abdea496493..1f1d713ac19c 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index eef2ce968636..25314c96fbe6 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/ucount.c b/kernel/ucount.c index 8a11fc0cb459..62630a40ab3a 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include diff --git a/kernel/uid16.c b/kernel/uid16.c index 71645ae9303a..5c2dc5b2bf4f 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -12,6 +12,7 @@ #include #include #include +#include 
#include #include diff --git a/kernel/utsname.c b/kernel/utsname.c index 6976cd47dcf6..06585ad296ff 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/mm/usercopy.c b/mm/usercopy.c index 8345299e3e3b..7ccad05c0d5c 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -16,6 +16,7 @@ #include #include +#include #include enum { diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index ecc28cff08ab..ab9fec00a788 100644 --- a/net/dns_resolver/dns_query.c +++ b/net/dns_resolver/dns_query.c @@ -37,8 +37,10 @@ #include #include +#include #include #include + #include #include diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index 16477df45b3b..3d705c688a27 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -13,6 +13,8 @@ #include #include #include +#include + #include #include #include diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index a1ee933e3029..d2623b9f23d6 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index f44312a19522..462c5d36b871 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -76,6 +76,7 @@ #include #include #include +#include #include #include "include/apparmor.h" diff --git a/security/keys/internal.h b/security/keys/internal.h index a705a7d92ad7..a2f4c0abb8d8 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -13,6 +13,7 @@ #define _INTERNAL_H #include +#include #include #include #include diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 04a764f71ec8..bcb0b597c391 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/security/keys/persistent.c b/security/keys/persistent.c index 1edc1f0a0ce2..d0cb5b32eff7 100644 --- a/security/keys/persistent.c +++ b/security/keys/persistent.c @@ -10,6 +10,8 @@ */ #include +#include + #include "internal.h" unsigned persistent_keyring_expiry = 3 * 24 * 3600; /* Expire after 3 days of non-use */ -- cgit v1.2.3 From 037741a6d4ab2e4b0580dae97aca963d8a8dc68f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Feb 2017 10:08:30 +0100 Subject: sched/headers: Prepare for the removal of from Fix up missing #includes in other places that rely on sched.h doing that for them. 
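In miniature, the failure mode being prepared for (an illustrative file, and assuming purely for the sketch that the header being detached is <linux/rwsem.h>):

#include <linux/sched.h>
#include <linux/rwsem.h> /* previously inherited via <linux/sched.h> */

static DECLARE_RWSEM(demo_rwsem); /* hypothetical user of the rwsem API */

static void demo_critical_section(void)
{
	down_read(&demo_rwsem);
	/* read-side work */
	up_read(&demo_rwsem);
}

Files like this compiled only by accident of sched.h's fat include list; spelling the dependency out lets sched.h shrink later without churn.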
Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/i2c.h | 1 + kernel/fork.c | 1 + kernel/locking/locktorture.c | 1 + kernel/rcu/tree.h | 1 + 4 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index bed8fbb45f31..6b183521c616 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -30,6 +30,7 @@ #include /* for struct device */ #include /* for completion */ #include +#include #include /* for Host Notify IRQ */ #include /* for struct device_node */ #include /* for swab16 */ diff --git a/kernel/fork.c b/kernel/fork.c index 08d6c091ac7c..8fbe8bcd1e20 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 5ea0a8969ee2..f24582d4dad3 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index b60f2b6caa14..ec62a05bfdb3 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -24,6 +24,7 @@ #include #include +#include #include #include #include -- cgit v1.2.3 From 174cd4b1e5fbd0d74c68cf3a74f5bd4923485512 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 2 Feb 2017 19:15:33 +0100 Subject: sched/headers: Prepare to move signal wakeup & sigpending methods from into Fix up affected files that include this signal functionality via sched.h. Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/arm/common/bL_switcher.c | 2 +- arch/cris/arch-v10/drivers/sync_serial.c | 2 +- arch/cris/arch-v32/drivers/sync_serial.c | 2 +- arch/mips/kernel/rtlx.c | 2 ++ arch/mips/kvm/mips.c | 2 ++ arch/powerpc/kvm/book3s_hv.c | 2 +- arch/powerpc/kvm/powerpc.c | 1 + arch/powerpc/platforms/83xx/suspend.c | 1 + arch/powerpc/platforms/cell/spufs/sched.c | 2 +- arch/powerpc/platforms/cell/spufs/spufs.h | 1 + arch/s390/crypto/prng.c | 2 ++ arch/s390/kvm/kvm-s390.c | 2 ++ arch/s390/kvm/vsie.c | 2 ++ arch/sh/kernel/cpu/fpu.c | 2 +- arch/sh/kernel/disassemble.c | 2 ++ arch/sh/kernel/process.c | 2 +- arch/um/drivers/random.c | 2 +- arch/x86/kernel/apm_32.c | 2 +- block/blk-cgroup.c | 1 + block/blk-mq.c | 1 + crypto/algboss.c | 2 +- crypto/algif_aead.c | 1 + crypto/algif_skcipher.c | 1 + crypto/api.c | 2 +- drivers/atm/horizon.c | 1 + drivers/base/core.c | 1 + drivers/base/power/wakeup.c | 2 +- drivers/block/drbd/drbd_main.c | 1 + drivers/block/drbd/drbd_receiver.c | 1 + drivers/block/drbd/drbd_worker.c | 2 +- drivers/block/swim3.c | 2 +- drivers/char/applicom.c | 2 +- drivers/char/hpet.c | 1 + drivers/char/hw_random/core.c | 1 + drivers/char/ipmi/ipmi_watchdog.c | 1 + drivers/char/lp.c | 2 +- drivers/char/ppdev.c | 2 +- drivers/char/rtc.c | 2 +- drivers/char/snsc.c | 2 +- drivers/dma-buf/dma-fence.c | 1 + drivers/gpu/drm/omapdrm/displays/panel-dsi-cm.c | 2 +- drivers/gpu/drm/vc4/vc4_gem.c | 1 + drivers/gpu/vga/vgaarb.c | 2 +- drivers/hid/hid-debug.c | 2 +- drivers/hid/hid-roccat.c | 2 +- drivers/hid/hidraw.c | 2 +- drivers/hid/usbhid/hiddev.c | 1 + drivers/hsi/clients/cmt_speech.c | 2 +- drivers/i2c/busses/i2c-ibm_iic.c | 2 ++ drivers/i2c/busses/i2c-mpc.c | 2 +- drivers/iio/industrialio-buffer.c | 2 +- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 1 + 
drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 1 + drivers/isdn/capi/kcapi.c | 2 +- drivers/isdn/mISDN/timerdev.c | 2 ++ drivers/lguest/core.c | 1 + drivers/macintosh/adb.c | 2 +- drivers/macintosh/smu.c | 1 + drivers/macintosh/via-pmu.c | 2 +- drivers/mailbox/mailbox-test.c | 1 + drivers/md/dm.c | 1 + drivers/media/dvb-core/dvb_ca_en50221.c | 2 +- drivers/media/dvb-core/dvb_demux.c | 2 +- drivers/media/dvb-core/dvb_frontend.c | 2 +- drivers/media/pci/cx18/cx18-driver.h | 2 +- drivers/media/pci/ivtv/ivtv-driver.h | 35 +++++++++++----------- drivers/media/pci/pt1/pt1.c | 1 + drivers/media/pci/pt3/pt3.c | 1 + drivers/media/pci/solo6x10/solo6x10-i2c.c | 1 + drivers/media/pci/zoran/zoran_device.c | 1 + drivers/media/platform/vivid/vivid-radio-rx.c | 2 ++ drivers/media/platform/vivid/vivid-radio-tx.c | 1 + drivers/media/rc/lirc_dev.c | 2 +- drivers/media/usb/cpia2/cpia2_core.c | 1 + drivers/media/usb/gspca/cpia1.c | 2 ++ drivers/misc/cxl/fault.c | 2 +- drivers/misc/cxl/file.c | 2 +- drivers/misc/ibmasm/r_heartbeat.c | 2 +- drivers/misc/lis3lv02d/lis3lv02d.c | 1 + drivers/misc/mei/bus.c | 2 +- drivers/misc/mei/client.c | 2 +- drivers/misc/mei/main.c | 2 +- drivers/misc/mic/scif/scif_main.h | 2 +- drivers/misc/vexpress-syscfg.c | 2 +- drivers/mtd/tests/mtd_test.h | 2 +- drivers/net/bonding/bond_options.c | 2 ++ drivers/net/bonding/bond_sysfs.c | 2 +- drivers/net/can/softing/softing_fw.c | 2 +- drivers/net/ethernet/broadcom/tg3.c | 1 + drivers/net/ethernet/cavium/liquidio/octeon_main.h | 2 ++ drivers/net/ethernet/sfc/falcon/falcon.c | 2 ++ drivers/net/irda/stir4200.c | 1 + drivers/net/macvtap.c | 2 +- drivers/net/ppp/ppp_generic.c | 1 + drivers/net/tun.c | 1 + drivers/net/usb/hso.c | 2 +- drivers/net/usb/qmi_wwan.c | 1 + drivers/net/wan/cosa.c | 2 +- drivers/net/wireless/ath/ath6kl/cfg80211.c | 1 + drivers/net/wireless/broadcom/b43legacy/main.c | 2 +- drivers/net/wireless/intersil/hostap/hostap_hw.c | 2 +- .../net/wireless/intersil/hostap/hostap_ioctl.c | 2 +- drivers/oprofile/event_buffer.c | 2 +- drivers/parport/daisy.c | 2 +- drivers/parport/ieee1284.c | 2 +- drivers/parport/ieee1284_ops.c | 2 +- drivers/parport/parport_ip32.c | 2 +- drivers/parport/parport_pc.c | 2 +- drivers/parport/share.c | 2 +- drivers/pci/access.c | 2 +- drivers/pci/hotplug/cpci_hotplug_core.c | 1 + drivers/pci/hotplug/cpqphp.h | 2 +- drivers/pci/hotplug/pciehp.h | 2 +- drivers/pci/hotplug/shpchp.h | 2 +- drivers/rtc/rtc-dev.c | 2 +- drivers/s390/cio/device.c | 1 + drivers/scsi/lpfc/lpfc_vport.c | 1 + drivers/scsi/osst.c | 2 +- drivers/scsi/st.c | 2 +- drivers/soc/fsl/qbman/dpaa_sys.h | 1 + drivers/staging/comedi/comedi_fops.c | 2 +- drivers/staging/dgnc/dgnc_tty.c | 2 +- drivers/staging/dgnc/dgnc_utils.c | 2 +- drivers/staging/greybus/uart.c | 2 +- .../lustre/lustre/include/lustre/lustre_user.h | 1 + drivers/staging/lustre/lustre/include/lustre_lib.h | 2 +- .../staging/lustre/lustre/include/obd_support.h | 2 ++ drivers/staging/media/lirc/lirc_sir.c | 2 +- drivers/staging/media/lirc/lirc_zilog.c | 2 +- drivers/staging/speakup/speakup_soft.c | 2 +- drivers/target/iscsi/cxgbit/cxgbit_target.c | 2 ++ drivers/tty/n_gsm.c | 2 +- drivers/tty/serial/crisv10.c | 2 +- drivers/tty/serial/serial_core.c | 1 + drivers/tty/tty_ioctl.c | 2 +- drivers/tty/tty_port.c | 2 +- drivers/uio/uio.c | 2 +- drivers/usb/class/cdc-acm.c | 1 + drivers/usb/class/usblp.c | 2 +- drivers/usb/gadget/function/f_fs.c | 1 + drivers/usb/image/mdc800.c | 2 +- drivers/usb/misc/adutux.c | 1 + drivers/usb/misc/idmouse.c | 1 + drivers/usb/misc/rio500.c | 2 +- 
drivers/usb/misc/uss720.c | 1 + drivers/usb/mon/mon_bin.c | 1 + drivers/usb/mon/mon_text.c | 1 + drivers/usb/serial/digi_acceleport.c | 1 + drivers/usb/serial/generic.c | 1 + drivers/vhost/net.c | 1 + drivers/vhost/vhost.c | 1 + drivers/video/fbdev/cobalt_lcdfb.c | 1 + .../fbdev/omap2/omapfb/displays/panel-dsi-cm.c | 2 +- fs/afs/rxrpc.c | 2 ++ fs/aio.c | 2 +- fs/btrfs/ctree.h | 1 + fs/ceph/caps.c | 2 +- fs/cifs/inode.c | 2 ++ fs/coda/psdev.c | 2 +- fs/dlm/user.c | 1 + fs/ecryptfs/read_write.c | 2 ++ fs/eventfd.c | 2 +- fs/eventpoll.c | 2 +- fs/ext4/ext4.h | 1 + fs/f2fs/data.c | 1 + fs/fuse/dev.c | 1 + fs/gfs2/lock_dlm.c | 1 + fs/gfs2/super.c | 2 +- fs/hpfs/hpfs_fn.h | 2 +- fs/hugetlbfs/inode.c | 2 +- fs/jffs2/nodemgmt.c | 2 +- fs/nfs/inode.c | 2 +- fs/nilfs2/segment.c | 2 ++ fs/notify/fanotify/fanotify_user.c | 1 + fs/notify/inotify/inotify_user.c | 2 +- fs/ntfs/file.c | 2 +- fs/ocfs2/alloc.c | 1 + fs/ocfs2/dlm/dlmdomain.c | 1 + fs/ocfs2/dlmfs/userdlm.c | 1 + fs/ocfs2/dlmglue.c | 1 + fs/orangefs/orangefs-kernel.h | 2 +- fs/overlayfs/copy_up.c | 2 +- fs/splice.c | 2 ++ fs/userfaultfd.c | 2 +- fs/xfs/xfs_linux.h | 2 +- include/drm/drm_os_linux.h | 1 + include/linux/sunrpc/types.h | 1 + include/media/v4l2-ioctl.h | 1 + include/net/busy_poll.h | 1 + kernel/locking/mutex.c | 2 +- kernel/locking/rtmutex.c | 2 +- kernel/locking/rwsem-spinlock.c | 2 +- kernel/locking/rwsem-xadd.c | 2 +- kernel/rcu/rcutorture.c | 2 +- kernel/sched/completion.c | 2 +- kernel/sched/swait.c | 2 +- kernel/sched/wait.c | 2 +- kernel/time/alarmtimer.c | 1 + kernel/time/hrtimer.c | 2 +- kernel/time/timer.c | 2 +- lib/percpu_ida.c | 2 +- mm/compaction.c | 1 + mm/gup.c | 2 +- mm/hugetlb.c | 1 + mm/memory_hotplug.c | 1 + mm/shmem.c | 1 + mm/userfaultfd.c | 1 + net/atm/svc.c | 2 +- net/bluetooth/af_bluetooth.c | 2 ++ net/bluetooth/cmtp/capi.c | 2 +- net/bluetooth/hci_request.c | 2 ++ net/bluetooth/l2cap_sock.c | 1 + net/bluetooth/rfcomm/sock.c | 1 + net/bluetooth/sco.c | 1 + net/bridge/br_sysfs_br.c | 1 + net/bridge/br_sysfs_if.c | 1 + net/core/ethtool.c | 2 +- net/core/net-sysfs.c | 1 + net/dccp/output.c | 1 + net/ipv4/devinet.c | 1 + net/ipv6/addrconf.c | 1 + net/irda/ircomm/ircomm_tty.c | 2 +- net/irda/irnet/irnet_ppp.c | 3 +- net/iucv/af_iucv.c | 2 +- net/kcm/kcmsock.c | 2 ++ net/llc/af_llc.c | 2 ++ net/nfc/llcp_sock.c | 1 + net/phonet/pep.c | 1 + net/phonet/socket.c | 2 ++ net/rxrpc/conn_client.c | 2 ++ net/rxrpc/recvmsg.c | 2 ++ net/rxrpc/sendmsg.c | 2 ++ net/tipc/socket.c | 2 ++ net/vmw_vsock/af_vsock.c | 1 + net/vmw_vsock/virtio_transport_common.c | 1 + sound/core/control.c | 1 + sound/core/hwdep.c | 1 + sound/core/oss/pcm_oss.c | 1 + sound/core/pcm_lib.c | 1 + sound/core/pcm_native.c | 1 + sound/core/rawmidi.c | 2 +- sound/core/seq/oss/seq_oss_device.h | 2 +- sound/core/seq/oss/seq_oss_writeq.c | 1 + sound/core/seq/seq_fifo.c | 2 ++ sound/core/seq/seq_memory.c | 1 + sound/core/timer.c | 1 + sound/firewire/bebob/bebob.h | 1 + sound/firewire/dice/dice.h | 1 + sound/firewire/digi00x/digi00x.h | 1 + sound/firewire/fireworks/fireworks.h | 1 + sound/firewire/oxfw/oxfw.h | 1 + sound/firewire/tascam/tascam.h | 1 + sound/isa/gus/gus_pcm.c | 2 ++ sound/isa/msnd/msnd.c | 1 + sound/isa/sb/emu8000.c | 2 +- sound/isa/sb/emu8000_patch.c | 2 ++ sound/isa/sb/emu8000_pcm.c | 2 ++ sound/isa/wavefront/wavefront_synth.c | 1 + sound/oss/dmabuf.c | 2 ++ sound/oss/dmasound/dmasound_core.c | 1 + sound/oss/midibuf.c | 2 ++ sound/oss/msnd_pinnacle.c | 2 ++ sound/oss/sound_config.h | 1 + sound/oss/swarm_cs4297a.c | 2 +- virt/kvm/kvm_main.c | 2 
+- 265 files changed, 319 insertions(+), 139 deletions(-) (limited to 'kernel') diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index 083c9e517d22..57f3b7512636 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/cris/arch-v10/drivers/sync_serial.c b/arch/cris/arch-v10/drivers/sync_serial.c index 9ac75d68f184..cc62572c1b94 100644 --- a/arch/cris/arch-v10/drivers/sync_serial.c +++ b/arch/cris/arch-v10/drivers/sync_serial.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/cris/arch-v32/drivers/sync_serial.c b/arch/cris/arch-v32/drivers/sync_serial.c index ef515af1a377..8efcc1a899a8 100644 --- a/arch/cris/arch-v32/drivers/sync_serial.c +++ b/arch/cris/arch-v32/drivers/sync_serial.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/rtlx.c b/arch/mips/kernel/rtlx.c index c5c4fd54d797..b80dd8b17a76 100644 --- a/arch/mips/kernel/rtlx.c +++ b/arch/mips/kernel/rtlx.c @@ -12,6 +12,8 @@ #include #include #include +#include + #include #include #include diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index ed81e5ac1426..15a1b1716c2e 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -16,8 +16,10 @@ #include #include #include +#include #include #include + #include #include #include diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 1e107ece4e37..fc56a9ce785d 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 2b38d824e9e5..95c91a9de351 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/83xx/suspend.c b/arch/powerpc/platforms/83xx/suspend.c index 08f92f6ed228..978b85bb3233 100644 --- a/arch/powerpc/platforms/83xx/suspend.c +++ b/arch/powerpc/platforms/83xx/suspend.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index d317c84dc794..1fbb5da17dd2 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -23,7 +23,7 @@ #undef DEBUG #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h index aac733966092..5e59f80e95db 100644 --- a/arch/powerpc/platforms/cell/spufs/spufs.h +++ b/arch/powerpc/platforms/cell/spufs/spufs.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c index 85b7f5efe06a..5a3ec04a7082 100644 --- a/arch/s390/crypto/prng.c +++ b/arch/s390/crypto/prng.c @@ -20,6 +20,8 @@ #include #include #include +#include + #include #include #include diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index f5694838234d..fd6cd05bb6a7 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -29,6 +29,8 @@ #include #include #include +#include + #include #include #include diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 
38556e395915..5491be39776b 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -14,6 +14,8 @@ #include #include #include +#include + #include #include #include diff --git a/arch/sh/kernel/cpu/fpu.c b/arch/sh/kernel/cpu/fpu.c index 4d4737d099c6..37e35330aae6 100644 --- a/arch/sh/kernel/cpu/fpu.c +++ b/arch/sh/kernel/cpu/fpu.c @@ -1,4 +1,4 @@ -#include +#include #include #include #include diff --git a/arch/sh/kernel/disassemble.c b/arch/sh/kernel/disassemble.c index 64d5d8dded7c..015fee58014b 100644 --- a/arch/sh/kernel/disassemble.c +++ b/arch/sh/kernel/disassemble.c @@ -12,6 +12,8 @@ #include #include +#include + /* * Format of an instruction in memory. */ diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c index 30bff33e810e..80a61c5e3015 100644 --- a/arch/sh/kernel/process.c +++ b/arch/sh/kernel/process.c @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c index 57f03050c850..37c51a6be690 100644 --- a/arch/um/drivers/random.c +++ b/arch/um/drivers/random.c @@ -6,7 +6,7 @@ * This software may be used and distributed according to the terms * of the GNU General Public License, incorporated herein by reference. */ -#include +#include #include #include #include diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 4a7080c84a5a..dc04b30cbd60 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -218,7 +218,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 295e98c2c8cc..bbe7ee00bd3d 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/block/blk-mq.c b/block/blk-mq.c index 746c14e0d157..6f35b6fd4799 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/crypto/algboss.c b/crypto/algboss.c index ccb85e1798f2..960d8548171b 100644 --- a/crypto/algboss.c +++ b/crypto/algboss.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c index 533265f110e0..5a8053758657 100644 --- a/crypto/algif_aead.c +++ b/crypto/algif_aead.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index a9e79d8eff87..43839b00fe6c 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/crypto/api.c b/crypto/api.c index b16ce1653284..941cd4c6c7ec 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include "internal.h" diff --git a/drivers/atm/horizon.c b/drivers/atm/horizon.c index 2bf1ef1c3c78..0f18480b33b5 100644 --- a/drivers/atm/horizon.c +++ b/drivers/atm/horizon.c @@ -27,6 +27,7 @@ #include #include +#include #include #include #include diff --git a/drivers/base/core.c b/drivers/base/core.c index 3050e6f99403..684bda4d14a1 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include "base.h" diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index f546f8f107b0..136854970489 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ 
-8,7 +8,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 116509852a34..c7d530a95e53 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -52,6 +52,7 @@ #define __KERNEL_SYSCALLS__ #include #include +#include #include #include "drbd_int.h" diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 8b40a5b2f8e6..aa6bf9692eff 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #define __KERNEL_SYSCALLS__ #include diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index c6755c9a0aea..3bff33f21435 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index aabd8e9d3035..61b3ffa4f458 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/char/applicom.c b/drivers/char/applicom.c index e5c62dcf2c11..e770ad977472 100644 --- a/drivers/char/applicom.c +++ b/drivers/char/applicom.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index 20b32bb8c2af..8bdc38d81adf 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 5c654b5d4adf..503a41dfa193 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c index 30b9e83bf1bf..5ca24d9b101b 100644 --- a/drivers/char/ipmi/ipmi_watchdog.c +++ b/drivers/char/ipmi/ipmi_watchdog.c @@ -53,6 +53,7 @@ #include #include #include +#include #ifdef CONFIG_X86 /* diff --git a/drivers/char/lp.c b/drivers/char/lp.c index 5b6742770656..565e4cf04a02 100644 --- a/drivers/char/lp.c +++ b/drivers/char/lp.c @@ -117,7 +117,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/char/ppdev.c b/drivers/char/ppdev.c index 87885d146dbb..2a558c706581 100644 --- a/drivers/char/ppdev.c +++ b/drivers/char/ppdev.c @@ -58,7 +58,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c index 35259961cc38..974d48927b07 100644 --- a/drivers/char/rtc.c +++ b/drivers/char/rtc.c @@ -74,7 +74,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/char/snsc.c b/drivers/char/snsc.c index ec07f0e99732..6aa32679fd58 100644 --- a/drivers/char/snsc.c +++ b/drivers/char/snsc.c @@ -16,7 +16,7 @@ */ #include -#include +#include #include #include #include diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c index d1f1f456f5c4..d195d617076d 100644 --- a/drivers/dma-buf/dma-fence.c +++ b/drivers/dma-buf/dma-fence.c @@ -22,6 +22,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include diff --git a/drivers/gpu/drm/omapdrm/displays/panel-dsi-cm.c b/drivers/gpu/drm/omapdrm/displays/panel-dsi-cm.c index 
a2bb855a2851..ac5800c72cb4 100644 --- a/drivers/gpu/drm/omapdrm/displays/panel-dsi-cm.c +++ b/drivers/gpu/drm/omapdrm/displays/panel-dsi-cm.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c index ab3016982466..1eef98c3331d 100644 --- a/drivers/gpu/drm/vc4/vc4_gem.c +++ b/drivers/gpu/drm/vc4/vc4_gem.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "uapi/drm/vc4_drm.h" #include "vc4_drv.h" diff --git a/drivers/gpu/vga/vgaarb.c b/drivers/gpu/vga/vgaarb.c index 0f5b2dd24507..92f1452dad57 100644 --- a/drivers/gpu/vga/vgaarb.c +++ b/drivers/gpu/vga/vgaarb.c @@ -41,7 +41,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c index acfb522a432a..c6c9c51c806f 100644 --- a/drivers/hid/hid-debug.c +++ b/drivers/hid/hid-debug.c @@ -30,7 +30,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/hid/hid-roccat.c b/drivers/hid/hid-roccat.c index 76d06cf87b2a..fb77dec720a4 100644 --- a/drivers/hid/hid-roccat.c +++ b/drivers/hid/hid-roccat.c @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c index f0e2757cb909..ec530454e6f6 100644 --- a/drivers/hid/hidraw.c +++ b/drivers/hid/hidraw.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/hid/usbhid/hiddev.c b/drivers/hid/usbhid/hiddev.c index 700145b15088..774bd701dae0 100644 --- a/drivers/hid/usbhid/hiddev.c +++ b/drivers/hid/usbhid/hiddev.c @@ -27,6 +27,7 @@ #include #include +#include #include #include #include diff --git a/drivers/hsi/clients/cmt_speech.c b/drivers/hsi/clients/cmt_speech.c index 7175e6bedf21..727f968ac1cb 100644 --- a/drivers/hsi/clients/cmt_speech.c +++ b/drivers/hsi/clients/cmt_speech.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/i2c/busses/i2c-ibm_iic.c b/drivers/i2c/busses/i2c-ibm_iic.c index 412b91d255ad..961c5f42d956 100644 --- a/drivers/i2c/busses/i2c-ibm_iic.c +++ b/drivers/i2c/busses/i2c-ibm_iic.c @@ -37,6 +37,8 @@ #include #include #include +#include + #include #include #include diff --git a/drivers/i2c/busses/i2c-mpc.c b/drivers/i2c/busses/i2c-mpc.c index 565a49a0c564..96caf378b1dc 100644 --- a/drivers/i2c/busses/i2c-mpc.c +++ b/drivers/i2c/busses/i2c-mpc.c @@ -15,7 +15,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c index 4972986f6455..d2b465140a6b 100644 --- a/drivers/iio/industrialio-buffer.c +++ b/drivers/iio/industrialio-buffer.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include "iio_core.h" diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index a6d6c617b597..0cdf2b7f272f 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "ipoib.h" diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index deedb6fc1b05..3e10e3dac2e7 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -31,6 +31,7 @@ */ #include +#include #include #include diff --git a/drivers/isdn/capi/kcapi.c 
b/drivers/isdn/capi/kcapi.c index 49d0f70c2bae..1dfd1085a04f 100644 --- a/drivers/isdn/capi/kcapi.c +++ b/drivers/isdn/capi/kcapi.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/isdn/mISDN/timerdev.c b/drivers/isdn/mISDN/timerdev.c index 9438d7ec3308..b1e135fc1fb5 100644 --- a/drivers/isdn/mISDN/timerdev.c +++ b/drivers/isdn/mISDN/timerdev.c @@ -25,6 +25,8 @@ #include #include #include +#include + #include "core.h" static DEFINE_MUTEX(mISDN_mutex); diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index ac219045daf7..395ed1961dbf 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c index 152414e6378a..fee939efc4fc 100644 --- a/drivers/macintosh/adb.c +++ b/drivers/macintosh/adb.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c index 227869159ac0..1ac66421877a 100644 --- a/drivers/macintosh/smu.c +++ b/drivers/macintosh/smu.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c index 43b8db2b5445..cce99f72e4ae 100644 --- a/drivers/macintosh/via-pmu.c +++ b/drivers/macintosh/via-pmu.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mailbox/mailbox-test.c b/drivers/mailbox/mailbox-test.c index 9c79f8019d2a..97fb956bb6e0 100644 --- a/drivers/mailbox/mailbox-test.c +++ b/drivers/mailbox/mailbox-test.c @@ -21,6 +21,7 @@ #include #include #include +#include #define MBOX_MAX_SIG_LEN 8 #define MBOX_MAX_MSG_LEN 128 diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 9f37d7fc2786..f4ffd1eb8f44 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/dvb-core/dvb_ca_en50221.c b/drivers/media/dvb-core/dvb_ca_en50221.c index 000d737ad827..8d65028c7a74 100644 --- a/drivers/media/dvb-core/dvb_ca_en50221.c +++ b/drivers/media/dvb-core/dvb_ca_en50221.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include "dvb_ca_en50221.h" diff --git a/drivers/media/dvb-core/dvb_demux.c b/drivers/media/dvb-core/dvb_demux.c index 4eac71e50c5f..6628f80d184f 100644 --- a/drivers/media/dvb-core/dvb_demux.c +++ b/drivers/media/dvb-core/dvb_demux.c @@ -19,7 +19,7 @@ #define pr_fmt(fmt) "dvb_demux: " fmt -#include +#include #include #include #include diff --git a/drivers/media/dvb-core/dvb_frontend.c b/drivers/media/dvb-core/dvb_frontend.c index 85ae3669aa66..e3fff8f64d37 100644 --- a/drivers/media/dvb-core/dvb_frontend.c +++ b/drivers/media/dvb-core/dvb_frontend.c @@ -29,7 +29,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/media/pci/cx18/cx18-driver.h b/drivers/media/pci/cx18/cx18-driver.h index fef3c736fcba..7be2088c45fe 100644 --- a/drivers/media/pci/cx18/cx18-driver.h +++ b/drivers/media/pci/cx18/cx18-driver.h @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/media/pci/ivtv/ivtv-driver.h b/drivers/media/pci/ivtv/ivtv-driver.h index cde452e30746..d27c5c2c07ea 100644 --- a/drivers/media/pci/ivtv/ivtv-driver.h +++ b/drivers/media/pci/ivtv/ivtv-driver.h @@ -38,37 +38,38 @@ * using information provided by 
Jiun-Kuei Jung @ AVerMedia. */ -#include +#include +#include #include -#include +#include #include +#include +#include +#include #include #include -#include -#include -#include -#include -#include #include -#include -#include +#include #include -#include #include -#include +#include +#include #include -#include #include -#include +#include -#include -#include -#include +#include +#include #include +#include #include #include #include -#include +#include +#include +#include + +#include /* Memory layout */ #define IVTV_ENCODER_OFFSET 0x00000000 diff --git a/drivers/media/pci/pt1/pt1.c b/drivers/media/pci/pt1/pt1.c index da1eebd2016f..3219d2f3271e 100644 --- a/drivers/media/pci/pt1/pt1.c +++ b/drivers/media/pci/pt1/pt1.c @@ -18,6 +18,7 @@ */ #include +#include #include #include #include diff --git a/drivers/media/pci/pt3/pt3.c b/drivers/media/pci/pt3/pt3.c index 77f4d15f322b..e8b5d0992157 100644 --- a/drivers/media/pci/pt3/pt3.c +++ b/drivers/media/pci/pt3/pt3.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "dmxdev.h" #include "dvbdev.h" diff --git a/drivers/media/pci/solo6x10/solo6x10-i2c.c b/drivers/media/pci/solo6x10/solo6x10-i2c.c index c908672b2c40..e83bb79f9349 100644 --- a/drivers/media/pci/solo6x10/solo6x10-i2c.c +++ b/drivers/media/pci/solo6x10/solo6x10-i2c.c @@ -27,6 +27,7 @@ * thread context, ACK the interrupt, and move on. -- BenC */ #include +#include #include "solo6x10.h" diff --git a/drivers/media/pci/zoran/zoran_device.c b/drivers/media/pci/zoran/zoran_device.c index 671907a6e6b6..40adceebca7e 100644 --- a/drivers/media/pci/zoran/zoran_device.c +++ b/drivers/media/pci/zoran/zoran_device.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include diff --git a/drivers/media/platform/vivid/vivid-radio-rx.c b/drivers/media/platform/vivid/vivid-radio-rx.c index f99092ca8f5c..47c36c26096b 100644 --- a/drivers/media/platform/vivid/vivid-radio-rx.c +++ b/drivers/media/platform/vivid/vivid-radio-rx.c @@ -22,6 +22,8 @@ #include #include #include +#include + #include #include #include diff --git a/drivers/media/platform/vivid/vivid-radio-tx.c b/drivers/media/platform/vivid/vivid-radio-tx.c index 8c59d4f53200..0e8025b7b4dd 100644 --- a/drivers/media/platform/vivid/vivid-radio-tx.c +++ b/drivers/media/platform/vivid/vivid-radio-tx.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index a54ca531d8ef..393dccaabdd0 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/media/usb/cpia2/cpia2_core.c b/drivers/media/usb/cpia2/cpia2_core.c index 431dd0b4b332..b1d13444ff30 100644 --- a/drivers/media/usb/cpia2/cpia2_core.c +++ b/drivers/media/usb/cpia2/cpia2_core.c @@ -32,6 +32,7 @@ #include #include #include +#include #define FIRMWARE "cpia2/stv0672_vp4.bin" MODULE_FIRMWARE(FIRMWARE); diff --git a/drivers/media/usb/gspca/cpia1.c b/drivers/media/usb/gspca/cpia1.c index 23d3285f182a..e91d00762e94 100644 --- a/drivers/media/usb/gspca/cpia1.c +++ b/drivers/media/usb/gspca/cpia1.c @@ -27,6 +27,8 @@ #define MODULE_NAME "cpia1" #include +#include + #include "gspca.h" MODULE_AUTHOR("Hans de Goede "); diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c index e33011e3e7e2..2fa015c05561 100644 --- a/drivers/misc/cxl/fault.c +++ b/drivers/misc/cxl/fault.c @@ -8,7 +8,7 @@ */ #include -#include +#include #include #include #include diff --git 
a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c index 859959f19f10..e7139c76f961 100644 --- a/drivers/misc/cxl/file.c +++ b/drivers/misc/cxl/file.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/misc/ibmasm/r_heartbeat.c b/drivers/misc/ibmasm/r_heartbeat.c index 232034f5da48..5c7dd26db716 100644 --- a/drivers/misc/ibmasm/r_heartbeat.c +++ b/drivers/misc/ibmasm/r_heartbeat.c @@ -20,7 +20,7 @@ * */ -#include +#include #include "ibmasm.h" #include "dot_command.h" diff --git a/drivers/misc/lis3lv02d/lis3lv02d.c b/drivers/misc/lis3lv02d/lis3lv02d.c index fb8705fc3aca..e389b0b5278d 100644 --- a/drivers/misc/lis3lv02d/lis3lv02d.c +++ b/drivers/misc/lis3lv02d/lis3lv02d.c @@ -23,6 +23,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include diff --git a/drivers/misc/mei/bus.c b/drivers/misc/mei/bus.c index cb3e9e0ca049..df5f78ae3d25 100644 --- a/drivers/misc/mei/bus.c +++ b/drivers/misc/mei/bus.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/misc/mei/client.c b/drivers/misc/mei/client.c index 68fe37b5bc52..d3e3372424d6 100644 --- a/drivers/misc/mei/client.c +++ b/drivers/misc/mei/client.c @@ -14,7 +14,7 @@ * */ -#include +#include #include #include #include diff --git a/drivers/misc/mei/main.c b/drivers/misc/mei/main.c index 9d0b7050c79a..bf816449cd40 100644 --- a/drivers/misc/mei/main.c +++ b/drivers/misc/mei/main.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/misc/mic/scif/scif_main.h b/drivers/misc/mic/scif/scif_main.h index a08f0b600a9e..0e5eff9ad080 100644 --- a/drivers/misc/mic/scif/scif_main.h +++ b/drivers/misc/mic/scif/scif_main.h @@ -18,7 +18,7 @@ #ifndef SCIF_MAIN_H #define SCIF_MAIN_H -#include +#include #include #include #include diff --git a/drivers/misc/vexpress-syscfg.c b/drivers/misc/vexpress-syscfg.c index c344483fa7d6..2cde80c7bb93 100644 --- a/drivers/misc/vexpress-syscfg.c +++ b/drivers/misc/vexpress-syscfg.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/tests/mtd_test.h b/drivers/mtd/tests/mtd_test.h index 4b7bee17c924..04afd0e7074f 100644 --- a/drivers/mtd/tests/mtd_test.h +++ b/drivers/mtd/tests/mtd_test.h @@ -1,5 +1,5 @@ #include -#include +#include static inline int mtdtest_relax(void) { diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 577e57cad1dc..1bcbb8913e17 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -16,6 +16,8 @@ #include #include #include +#include + #include static int bond_option_active_slave_set(struct bonding *bond, diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c index e23c3ed737de..770623a0cc01 100644 --- a/drivers/net/bonding/bond_sysfs.c +++ b/drivers/net/bonding/bond_sysfs.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/can/softing/softing_fw.c b/drivers/net/can/softing/softing_fw.c index 4063215c9b54..aac58ce6e371 100644 --- a/drivers/net/can/softing/softing_fw.c +++ b/drivers/net/can/softing/softing_fw.c @@ -17,7 +17,7 @@ */ #include -#include +#include #include #include diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index a448177990fe..30d1eb9ebec9 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ 
b/drivers/net/ethernet/broadcom/tg3.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_main.h b/drivers/net/ethernet/cavium/liquidio/octeon_main.h index 8cd389148166..aa36e9ae7676 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_main.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_main.h @@ -23,6 +23,8 @@ #ifndef _OCTEON_MAIN_H_ #define _OCTEON_MAIN_H_ +#include + #if BITS_PER_LONG == 32 #define CVM_CAST64(v) ((long long)(v)) #elif BITS_PER_LONG == 64 diff --git a/drivers/net/ethernet/sfc/falcon/falcon.c b/drivers/net/ethernet/sfc/falcon/falcon.c index c6ff0cc5ef18..93c713c1f627 100644 --- a/drivers/net/ethernet/sfc/falcon/falcon.c +++ b/drivers/net/ethernet/sfc/falcon/falcon.c @@ -16,6 +16,8 @@ #include #include #include +#include + #include "net_driver.h" #include "bitfield.h" #include "efx.h" diff --git a/drivers/net/irda/stir4200.c b/drivers/net/irda/stir4200.c index 42da094b68dd..7ee514879531 100644 --- a/drivers/net/irda/stir4200.c +++ b/drivers/net/irda/stir4200.c @@ -40,6 +40,7 @@ #include #include +#include #include #include #include diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index a4bfc10b61dd..da85057680d6 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index a411b43a69eb..f9c0e62716ea 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -24,6 +24,7 @@ #include #include +#include #include #include #include diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 30863e378925..dc1b1dd9157c 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/usb/hso.c b/drivers/net/usb/hso.c index e7b516342678..4f2e8141dbe2 100644 --- a/drivers/net/usb/hso.c +++ b/drivers/net/usb/hso.c @@ -52,7 +52,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include +#include #include #include #include diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 24d5272cdce5..805674550683 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -11,6 +11,7 @@ */ #include +#include #include #include #include diff --git a/drivers/net/wan/cosa.c b/drivers/net/wan/cosa.c index 087eb266601f..4ca71bca39ac 100644 --- a/drivers/net/wan/cosa.c +++ b/drivers/net/wan/cosa.c @@ -78,7 +78,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c index b7fe0af4cb24..363b30a549c2 100644 --- a/drivers/net/wireless/ath/ath6kl/cfg80211.c +++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "core.h" #include "cfg80211.h" diff --git a/drivers/net/wireless/broadcom/b43legacy/main.c b/drivers/net/wireless/broadcom/b43legacy/main.c index e97ab2b91663..cdafebb9c936 100644 --- a/drivers/net/wireless/broadcom/b43legacy/main.c +++ b/drivers/net/wireless/broadcom/b43legacy/main.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/wireless/intersil/hostap/hostap_hw.c b/drivers/net/wireless/intersil/hostap/hostap_hw.c index 544ef7adde7d..04dfd040a650 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_hw.c +++ 
b/drivers/net/wireless/intersil/hostap/hostap_hw.c @@ -43,7 +43,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/wireless/intersil/hostap/hostap_ioctl.c b/drivers/net/wireless/intersil/hostap/hostap_ioctl.c index a5656bc0e6aa..b2c6b065b542 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_ioctl.c +++ b/drivers/net/wireless/intersil/hostap/hostap_ioctl.c @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/oprofile/event_buffer.c b/drivers/oprofile/event_buffer.c index 67935fbbbcab..32888f2bd1a9 100644 --- a/drivers/oprofile/event_buffer.c +++ b/drivers/oprofile/event_buffer.c @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/parport/daisy.c b/drivers/parport/daisy.c index d998d0ed2bec..46eb15fb57ff 100644 --- a/drivers/parport/daisy.c +++ b/drivers/parport/daisy.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/parport/ieee1284.c b/drivers/parport/ieee1284.c index f9fd4b33a546..74cc6dd982d2 100644 --- a/drivers/parport/ieee1284.c +++ b/drivers/parport/ieee1284.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #undef DEBUG /* undef me for production */ diff --git a/drivers/parport/ieee1284_ops.c b/drivers/parport/ieee1284_ops.c index 75071605d22f..a959224d011b 100644 --- a/drivers/parport/ieee1284_ops.c +++ b/drivers/parport/ieee1284_ops.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #undef DEBUG /* undef me for production */ diff --git a/drivers/parport/parport_ip32.c b/drivers/parport/parport_ip32.c index 30e981be14c2..dcbeeb220dda 100644 --- a/drivers/parport/parport_ip32.c +++ b/drivers/parport/parport_ip32.c @@ -102,7 +102,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/parport/parport_pc.c b/drivers/parport/parport_pc.c index 3e56e7deab8e..9d42dfe65d44 100644 --- a/drivers/parport/parport_pc.c +++ b/drivers/parport/parport_pc.c @@ -44,7 +44,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/parport/share.c b/drivers/parport/share.c index 3308427ed9f7..bc090daa850a 100644 --- a/drivers/parport/share.c +++ b/drivers/parport/share.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/pci/access.c b/drivers/pci/access.c index b9dd37c8c9ce..8b7382705bf2 100644 --- a/drivers/pci/access.c +++ b/drivers/pci/access.c @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/pci/hotplug/cpci_hotplug_core.c b/drivers/pci/hotplug/cpci_hotplug_core.c index 7ec8a8f72c69..95f689f53920 100644 --- a/drivers/pci/hotplug/cpci_hotplug_core.c +++ b/drivers/pci/hotplug/cpci_hotplug_core.c @@ -27,6 +27,7 @@ #include #include +#include #include #include #include diff --git a/drivers/pci/hotplug/cpqphp.h b/drivers/pci/hotplug/cpqphp.h index 9103a7b9f3b9..48c8a066a6b7 100644 --- a/drivers/pci/hotplug/cpqphp.h +++ b/drivers/pci/hotplug/cpqphp.h @@ -32,7 +32,7 @@ #include /* for read? and write? 
functions */ #include /* for delays */ #include -#include /* for signal_pending() */ +#include /* for signal_pending() */ #define MY_NAME "cpqphp" diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h index 37d70b5ad22f..06109d40c4ac 100644 --- a/drivers/pci/hotplug/pciehp.h +++ b/drivers/pci/hotplug/pciehp.h @@ -33,7 +33,7 @@ #include #include #include -#include /* signal_pending() */ +#include /* signal_pending() */ #include #include #include diff --git a/drivers/pci/hotplug/shpchp.h b/drivers/pci/hotplug/shpchp.h index 4da8fc601467..70c7ea6af034 100644 --- a/drivers/pci/hotplug/shpchp.h +++ b/drivers/pci/hotplug/shpchp.h @@ -33,7 +33,7 @@ #include #include #include -#include /* signal_pending(), struct timer_list */ +#include /* signal_pending(), struct timer_list */ #include #include diff --git a/drivers/rtc/rtc-dev.c b/drivers/rtc/rtc-dev.c index a6d9434addf6..6dc8f29697ab 100644 --- a/drivers/rtc/rtc-dev.c +++ b/drivers/rtc/rtc-dev.c @@ -15,7 +15,7 @@ #include #include -#include +#include #include "rtc-core.h" static dev_t rtc_devt; diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index 79823ee9c100..b8006ea9099c 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/drivers/scsi/lpfc/lpfc_vport.c b/drivers/scsi/lpfc/lpfc_vport.c index e18bbc66e83b..4e36998a266c 100644 --- a/drivers/scsi/lpfc/lpfc_vport.c +++ b/drivers/scsi/lpfc/lpfc_vport.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c index 75ac662793a3..c47f4b349bac 100644 --- a/drivers/scsi/osst.c +++ b/drivers/scsi/osst.c @@ -35,7 +35,7 @@ static const char * osst_version = "0.99.4"; #include #include -#include +#include #include #include #include diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 81212d4bd9bf..e5ef78a6848e 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -23,7 +23,7 @@ static const char *verstr = "20160209"; #include #include -#include +#include #include #include #include diff --git a/drivers/soc/fsl/qbman/dpaa_sys.h b/drivers/soc/fsl/qbman/dpaa_sys.h index 2eaf3184f61d..2ce394aa4c95 100644 --- a/drivers/soc/fsl/qbman/dpaa_sys.h +++ b/drivers/soc/fsl/qbman/dpaa_sys.h @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/staging/comedi/comedi_fops.c b/drivers/staging/comedi/comedi_fops.c index 57e8599b54e6..8deac8d9225d 100644 --- a/drivers/staging/comedi/comedi_fops.c +++ b/drivers/staging/comedi/comedi_fops.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/staging/dgnc/dgnc_tty.c b/drivers/staging/dgnc/dgnc_tty.c index c63e591631f6..c3b8fc54883d 100644 --- a/drivers/staging/dgnc/dgnc_tty.c +++ b/drivers/staging/dgnc/dgnc_tty.c @@ -19,7 +19,7 @@ */ #include -#include /* For jiffies, task states */ +#include /* For jiffies, task states, etc. 
*/ #include /* For tasklet and interrupt structs/defines */ #include #include diff --git a/drivers/staging/dgnc/dgnc_utils.c b/drivers/staging/dgnc/dgnc_utils.c index 95272f4765fc..6f59240024d1 100644 --- a/drivers/staging/dgnc/dgnc_utils.c +++ b/drivers/staging/dgnc/dgnc_utils.c @@ -1,5 +1,5 @@ #include -#include +#include #include "dgnc_utils.h" /* diff --git a/drivers/staging/greybus/uart.c b/drivers/staging/greybus/uart.c index ab0dbf5cab5a..43255e2e9276 100644 --- a/drivers/staging/greybus/uart.c +++ b/drivers/staging/greybus/uart.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_user.h b/drivers/staging/lustre/lustre/include/lustre/lustre_user.h index 21aec0ca9ad3..7d8628ce0d3b 100644 --- a/drivers/staging/lustre/lustre/include/lustre/lustre_user.h +++ b/drivers/staging/lustre/lustre/include/lustre/lustre_user.h @@ -44,6 +44,7 @@ #ifdef __KERNEL__ # include +# include # include /* snprintf() */ # include #else /* !__KERNEL__ */ diff --git a/drivers/staging/lustre/lustre/include/lustre_lib.h b/drivers/staging/lustre/lustre/include/lustre_lib.h index 27f3148c4344..b04d613846ee 100644 --- a/drivers/staging/lustre/lustre/include/lustre_lib.h +++ b/drivers/staging/lustre/lustre/include/lustre_lib.h @@ -42,7 +42,7 @@ * @{ */ -#include +#include #include #include #include "../../include/linux/libcfs/libcfs.h" diff --git a/drivers/staging/lustre/lustre/include/obd_support.h b/drivers/staging/lustre/lustre/include/obd_support.h index aaedec7d793c..dace6591a0a4 100644 --- a/drivers/staging/lustre/lustre/include/obd_support.h +++ b/drivers/staging/lustre/lustre/include/obd_support.h @@ -34,6 +34,8 @@ #define _OBD_SUPPORT #include +#include + #include "../../include/linux/libcfs/libcfs.h" #include "lustre_compat.h" #include "lprocfs_status.h" diff --git a/drivers/staging/media/lirc/lirc_sir.c b/drivers/staging/media/lirc/lirc_sir.c index c75ae43095ba..c6c3de94adaa 100644 --- a/drivers/staging/media/lirc/lirc_sir.c +++ b/drivers/staging/media/lirc/lirc_sir.c @@ -36,7 +36,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include -#include +#include #include #include #include diff --git a/drivers/staging/media/lirc/lirc_zilog.c b/drivers/staging/media/lirc/lirc_zilog.c index 34aac3e2eb87..e4a533b6beb3 100644 --- a/drivers/staging/media/lirc/lirc_zilog.c +++ b/drivers/staging/media/lirc/lirc_zilog.c @@ -42,7 +42,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/staging/speakup/speakup_soft.c b/drivers/staging/speakup/speakup_soft.c index ff68a384f9c2..d2ff0afd685a 100644 --- a/drivers/staging/speakup/speakup_soft.c +++ b/drivers/staging/speakup/speakup_soft.c @@ -22,7 +22,7 @@ #include #include /* for misc_register, and SYNTH_MINOR */ #include /* for poll_wait() */ -#include /* schedule(), signal_pending(), TASK_INTERRUPTIBLE */ +#include /* schedule(), signal_pending(), TASK_INTERRUPTIBLE */ #include "spk_priv.h" #include "speakup.h" diff --git a/drivers/target/iscsi/cxgbit/cxgbit_target.c b/drivers/target/iscsi/cxgbit/cxgbit_target.c index 8bcb9b71f764..0c3e8fce3695 100644 --- a/drivers/target/iscsi/cxgbit/cxgbit_target.c +++ b/drivers/target/iscsi/cxgbit/cxgbit_target.c @@ -8,6 +8,8 @@ #include #include +#include + #include #include #include diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c index f3932baed07d..55577cf9b6a4 100644 --- a/drivers/tty/n_gsm.c +++ b/drivers/tty/n_gsm.c @@ -39,7 +39,7 @@ #include #include #include -#include 
+#include #include #include #include diff --git a/drivers/tty/serial/crisv10.c b/drivers/tty/serial/crisv10.c index e92c23470e51..59a2a7e18b5a 100644 --- a/drivers/tty/serial/crisv10.c +++ b/drivers/tty/serial/crisv10.c @@ -12,7 +12,7 @@ static char *serial_version = "$Revision: 1.25 $"; #include #include #include -#include +#include #include #include #include diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 9939c3d9912b..3fe56894974a 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/tty/tty_ioctl.c b/drivers/tty/tty_ioctl.c index f27fc0f14c11..a9a978731c5b 100644 --- a/drivers/tty/tty_ioctl.c +++ b/drivers/tty/tty_ioctl.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c index 5cd3cd932293..1d21a9c1d33e 100644 --- a/drivers/tty/tty_port.c +++ b/drivers/tty/tty_port.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index 31d95dc9c202..60ce7fd54e89 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index 235e305f8473..d5388938bc7a 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -32,6 +32,7 @@ #undef VERBOSE_DEBUG #include +#include #include #include #include diff --git a/drivers/usb/class/usblp.c b/drivers/usb/class/usblp.c index 071964c7847f..cc61055fb9be 100644 --- a/drivers/usb/class/usblp.c +++ b/drivers/usb/class/usblp.c @@ -49,7 +49,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 87fccf611b69..a5b7cd615698 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include diff --git a/drivers/usb/image/mdc800.c b/drivers/usb/image/mdc800.c index 5cf2633cdb04..e92540a21b6b 100644 --- a/drivers/usb/image/mdc800.c +++ b/drivers/usb/image/mdc800.c @@ -85,7 +85,7 @@ * (20/10/1999) */ -#include +#include #include #include #include diff --git a/drivers/usb/misc/adutux.c b/drivers/usb/misc/adutux.c index c5fa584d8f0a..db9a9e6ff6be 100644 --- a/drivers/usb/misc/adutux.c +++ b/drivers/usb/misc/adutux.c @@ -21,6 +21,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include diff --git a/drivers/usb/misc/idmouse.c b/drivers/usb/misc/idmouse.c index debc1fd74b0d..8b9fd7534f69 100644 --- a/drivers/usb/misc/idmouse.c +++ b/drivers/usb/misc/idmouse.c @@ -17,6 +17,7 @@ */ #include +#include #include #include #include diff --git a/drivers/usb/misc/rio500.c b/drivers/usb/misc/rio500.c index fc329c98a6e8..b106ce76997b 100644 --- a/drivers/usb/misc/rio500.c +++ b/drivers/usb/misc/rio500.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/usb/misc/uss720.c b/drivers/usb/misc/uss720.c index 0a643fa74cab..e45a3a680db8 100644 --- a/drivers/usb/misc/uss720.c +++ b/drivers/usb/misc/uss720.c @@ -50,6 +50,7 @@ #include #include #include +#include /* * Version Information diff --git a/drivers/usb/mon/mon_bin.c b/drivers/usb/mon/mon_bin.c index 
9fb8b1e6ecc2..b6d8bf475c92 100644 --- a/drivers/usb/mon/mon_bin.c +++ b/drivers/usb/mon/mon_bin.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include diff --git a/drivers/usb/mon/mon_text.c b/drivers/usb/mon/mon_text.c index db1a4abf2806..19c416d69eb9 100644 --- a/drivers/usb/mon/mon_text.c +++ b/drivers/usb/mon/mon_text.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/usb/serial/digi_acceleport.c b/drivers/usb/serial/digi_acceleport.c index eb433922598c..ab78111e0968 100644 --- a/drivers/usb/serial/digi_acceleport.c +++ b/drivers/usb/serial/digi_acceleport.c @@ -27,6 +27,7 @@ #include #include #include +#include #include /* Defines */ diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c index 944de657a07a..49ce2be90fa0 100644 --- a/drivers/usb/serial/generic.c +++ b/drivers/usb/serial/generic.c @@ -10,6 +10,7 @@ */ #include +#include #include #include #include diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 5c98ad4d2f4c..9b519897cc17 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 4a00140a7624..dcbe2e29bf17 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include "vhost.h" diff --git a/drivers/video/fbdev/cobalt_lcdfb.c b/drivers/video/fbdev/cobalt_lcdfb.c index 038ac6934fe9..9da90bd242f4 100644 --- a/drivers/video/fbdev/cobalt_lcdfb.c +++ b/drivers/video/fbdev/cobalt_lcdfb.c @@ -26,6 +26,7 @@ #include #include #include +#include /* * Cursor position address diff --git a/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c b/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c index 8b810696a42b..fd2b372d0264 100644 --- a/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c +++ b/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 95f42872b787..4f6b00efc27c 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -10,6 +10,8 @@ */ #include +#include + #include #include #include diff --git a/fs/aio.c b/fs/aio.c index 7e2ab9c8e39c..0bb108476de2 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include #include diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 105d4d43993e..a8812d95359d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -20,6 +20,7 @@ #define __BTRFS_CTREE__ #include +#include #include #include #include diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index cd966f276a8d..68c78be19d5b 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include #include diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 7ab5be7944aa..1858fc20eb7d 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -23,6 +23,8 @@ #include #include #include +#include + #include #include "cifsfs.h" #include "cifspdu.h" diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 822629126e89..f40e3953e7fe 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 1ce908c2232c..23488f559cf9 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -17,6 +17,7 @@ #include #include #include +#include 
#include "dlm_internal.h" #include "lockspace.h" diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 158a3a39f82d..039e627194a9 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -22,6 +22,8 @@ #include #include +#include + #include "ecryptfs_kernel.h" /** diff --git a/fs/eventfd.c b/fs/eventfd.c index 1231cd1999d8..68b9fffcb2c8 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 5ec16313da1a..341251421ced 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -13,7 +13,7 @@ #include #include -#include +#include #include #include #include diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2fd17e8e4984..77798a46b0c6 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1375fef11146..1602b4bccae6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index f11792672977..b681b43c766e 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 8b907c5cc913..0515f0a68637 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "incore.h" #include "glock.h" diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index e3ee387a6dfe..361796a84fce 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -10,7 +10,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include -#include +#include #include #include #include diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index aebb78f9e47f..d352f3a6af7f 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 54de77e78775..8f96461236f6 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -11,7 +11,7 @@ #include #include -#include /* remove ASAP */ +#include /* remove ASAP */ #include #include #include diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c index cda0774c2c9c..a7bbe879cfc3 100644 --- a/fs/jffs2/nodemgmt.c +++ b/fs/jffs2/nodemgmt.c @@ -14,7 +14,7 @@ #include #include #include -#include /* For cond_resched() */ +#include #include "nodelist.h" #include "debug.h" diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 5ca4d96b1942..685565b229c3 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -15,7 +15,7 @@ #include #include -#include +#include #include #include #include diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 7d18d62e8e07..febed1217b3f 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -30,6 +30,8 @@ #include #include #include +#include + #include "nilfs.h" #include "btnode.h" #include "page.h" diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 7ebfca6a1427..2b37f2785834 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -14,6 +14,7 @@ #include #include #include +#include #include diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 1cf41c623be1..498d609b26c7 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -30,7 
+30,7 @@ #include #include /* roundup() */ #include /* LOOKUP_FOLLOW */ -#include /* struct user */ +#include #include /* struct kmem_cache */ #include #include diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 358ed7e1195a..c4f68c338735 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index d4ec0d8961a6..fb15a96df0b6 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -30,6 +30,7 @@ #include #include #include +#include #include diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 32fd261ae13d..a2b19fbdcf46 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "cluster/heartbeat.h" #include "cluster/nodemanager.h" diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c index f70cda2f090d..9cecf4857195 100644 --- a/fs/ocfs2/dlmfs/userdlm.c +++ b/fs/ocfs2/dlmfs/userdlm.c @@ -28,6 +28,7 @@ */ #include +#include #include #include diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 8dce4099a6ca..3b7c937a36b5 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -33,6 +33,7 @@ #include #include #include +#include #define MLOG_MASK_PREFIX ML_DLM_GLUE #include diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 70355a9a2596..8948683b367f 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -41,7 +41,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 53d3f830358f..a34aa7aa2563 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/splice.c b/fs/splice.c index 4ef78aa8ef61..e49336555739 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -33,6 +33,8 @@ #include #include #include +#include + #include "internal.h" /* diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 18d12bfff770..973607df579d 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include #include diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 7a989de224f4..592fdf7111cb 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -55,7 +55,7 @@ typedef __u32 xfs_nlink_t; #include #include #include -#include +#include #include #include #include diff --git a/include/drm/drm_os_linux.h b/include/drm/drm_os_linux.h index 86ab99bc0ac5..35e1482ba8a1 100644 --- a/include/drm/drm_os_linux.h +++ b/include/drm/drm_os_linux.h @@ -4,6 +4,7 @@ */ #include /* For task queue support */ +#include #include #ifndef readq diff --git a/include/linux/sunrpc/types.h b/include/linux/sunrpc/types.h index d222f47550af..11a7536c0fd2 100644 --- a/include/linux/sunrpc/types.h +++ b/include/linux/sunrpc/types.h @@ -10,6 +10,7 @@ #define _LINUX_SUNRPC_TYPES_H_ #include +#include #include #include #include diff --git a/include/media/v4l2-ioctl.h b/include/media/v4l2-ioctl.h index 574ff2ae94be..6cd94e5ee113 100644 --- a/include/media/v4l2-ioctl.h +++ b/include/media/v4l2-ioctl.h @@ -12,6 +12,7 @@ #include #include #include +#include #include /* need __user */ #include diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index b96832df239e..c0452de83086 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -26,6 +26,7 @@ #include #include 
+#include #include #ifdef CONFIG_NET_RX_BUSY_POLL diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 57f6311e2405..82ff5726bc1b 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -19,7 +19,7 @@ */ #include #include -#include +#include #include #include #include diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index d4f798491361..1c8c8707853a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -12,7 +12,7 @@ */ #include #include -#include +#include #include #include #include diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 5eacab880f67..91e7bc8e9d5a 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -6,7 +6,7 @@ * - Derived also from comments by Linus */ #include -#include +#include #include enum rwsem_waiter_type { diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 4fe8d8ad4396..9bd6e768164e 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6a28b79710f0..cccc417a8135 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index f063a25d4449..b294a8ee2842 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -11,7 +11,7 @@ * Waiting for completion is a typically sync point, but not an exclusion point. */ -#include +#include #include /** diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 82f0dff90030..3d5610dcce11 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -1,4 +1,4 @@ -#include +#include #include void __init_swait_queue_head(struct swait_queue_head *q, const char *name, diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 9453efe9b25a..1fedfcf6fc9b 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -5,7 +5,7 @@ */ #include #include -#include +#include #include #include #include diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index e6dc9a538efa..04939053c823 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 8e11d8d9f419..df512e8c04d4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -43,7 +43,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 82a6bfa0c307..f4465d2a25d1 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c index 6d40944960de..010410990bc6 100644 --- a/lib/percpu_ida.c +++ b/lib/percpu_ida.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/compaction.c b/mm/compaction.c index 0fdfde016ee2..81e1eaa2a2cf 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/gup.c b/mm/gup.c index 94fab8fa432b..9c047e951aa3 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -10,7 +10,7 @@ #include #include -#include +#include #include #include diff 
--git a/mm/hugetlb.c b/mm/hugetlb.c index 2e0e8159ce8e..a7aa811b7d14 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1d3ed58f92ab..295479b792ec 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include diff --git a/mm/shmem.c b/mm/shmem.c index a26649a6633f..de8cdef4ef9b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 9f0ad2a4f102..479e631d43c2 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include diff --git a/net/atm/svc.c b/net/atm/svc.c index 878563a8354d..db9794ec61d8 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -10,7 +10,7 @@ #include /* printk */ #include #include -#include /* jiffies and HZ */ +#include #include /* O_NONBLOCK */ #include #include /* ATM stuff */ diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index cfb2faba46de..69e1f7d362a8 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -27,6 +27,8 @@ #include #include #include +#include + #include #include diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c index 46ac686c8911..bb308224099c 100644 --- a/net/bluetooth/cmtp/capi.c +++ b/net/bluetooth/cmtp/capi.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 1015d9c8d97d..b5faff458d8b 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -21,6 +21,8 @@ SOFTWARE IS DISCLAIMED. 
*/ +#include + #include #include #include diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index a8ba752732c9..f307b145ea54 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -29,6 +29,7 @@ #include #include +#include #include #include diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 7511df72347f..aa1a814ceddc 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -27,6 +27,7 @@ #include #include +#include #include #include diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 3125ce670c2f..e4e9a2da1e7e 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 0f4034934d56..0b5dd607444c 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "br_private.h" diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 05e8946ccc03..79aee759aba5 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "br_private.h" diff --git a/net/core/ethtool.c b/net/core/ethtool.c index be7bab1adcde..aecb2c7241b6 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include /* diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index b0c04cf4851d..3945821e9c1f 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/net/dccp/output.c b/net/dccp/output.c index b66c84db0766..91a15b3c4915 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 5d367b7ff542..cebedd545e5e 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 3a2025f5bf2c..77362b88a661 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c index 817b1b186aff..f6061c4bb0a8 100644 --- a/net/irda/ircomm/ircomm_tty.c +++ b/net/irda/ircomm/ircomm_tty.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c index 35dbf3dc3d28..7025dcb853d0 100644 --- a/net/irda/irnet/irnet_ppp.c +++ b/net/irda/irnet/irnet_ppp.c @@ -13,8 +13,9 @@ * 2) as a control channel (write commands, read events) */ -#include +#include #include + #include "irnet_ppp.h" /* Private header */ /* Please put other headers in irnet.h - Thanks */ diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 13190b38f22e..89bbde1081ce 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index a646f3481240..309062f3debe 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -24,6 +24,8 @@ #include #include #include +#include + #include #include #include diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 5e9296382420..06186d608a27 100644 
--- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -26,6 +26,8 @@ #include #include #include +#include + #include #include #include diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index b9edf5fae6ae..879885b31cce 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "nfc.h" #include "llcp.h" diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 8bad5624a27a..222bedcd9575 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -23,6 +23,7 @@ */ #include +#include #include #include #include diff --git a/net/phonet/socket.c b/net/phonet/socket.c index ffd5f2297584..a6c8da3ee893 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -27,6 +27,8 @@ #include #include #include +#include + #include #include diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 40a1ef2adeb4..c3be03e8d098 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -76,6 +76,8 @@ #include #include #include +#include + #include "ar-internal.h" __read_mostly unsigned int rxrpc_max_client_connections = 1000; diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index f3a688e10843..28274a3c9831 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -14,6 +14,8 @@ #include #include #include +#include + #include #include #include "ar-internal.h" diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 0a6ef217aa8a..19b36c60fb4c 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -15,6 +15,8 @@ #include #include #include +#include + #include #include #include "ar-internal.h" diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 6b09a778cc71..43e4045e72bc 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -35,6 +35,8 @@ */ #include +#include + #include "core.h" #include "name_table.h" #include "node.h" diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 8a398b3fb532..9192ead66751 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -90,6 +90,7 @@ #include #include #include +#include #include #include #include diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 849c4ad0411e..8d592a45b597 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -9,6 +9,7 @@ */ #include #include +#include #include #include #include diff --git a/sound/core/control.c b/sound/core/control.c index fb096cb20a80..c109b82eef4b 100644 --- a/sound/core/control.c +++ b/sound/core/control.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/sound/core/hwdep.c b/sound/core/hwdep.c index 36d2416f90d9..9602a7e38d8a 100644 --- a/sound/core/hwdep.c +++ b/sound/core/hwdep.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c index 698a01419515..36baf962f9b0 100644 --- a/sound/core/oss/pcm_oss.c +++ b/sound/core/oss/pcm_oss.c @@ -28,6 +28,7 @@ #include #include +#include #include #include #include diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c index bb1261591a1f..5088d4b8db22 100644 --- a/sound/core/pcm_lib.c +++ b/sound/core/pcm_lib.c @@ -21,6 +21,7 @@ */ #include +#include #include #include #include diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index aec9c92250fd..13dec5ec93f2 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff 
--git a/sound/core/rawmidi.c b/sound/core/rawmidi.c index 8da9cb245d01..ab890336175f 100644 --- a/sound/core/rawmidi.c +++ b/sound/core/rawmidi.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/sound/core/seq/oss/seq_oss_device.h b/sound/core/seq/oss/seq_oss_device.h index d7b4d016b547..afa007c0cc2d 100644 --- a/sound/core/seq/oss/seq_oss_device.h +++ b/sound/core/seq/oss/seq_oss_device.h @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/sound/core/seq/oss/seq_oss_writeq.c b/sound/core/seq/oss/seq_oss_writeq.c index 1f6788a18444..5e04f4df10e4 100644 --- a/sound/core/seq/oss/seq_oss_writeq.c +++ b/sound/core/seq/oss/seq_oss_writeq.c @@ -28,6 +28,7 @@ #include "../seq_clientmgr.h" #include #include +#include /* diff --git a/sound/core/seq/seq_fifo.c b/sound/core/seq/seq_fifo.c index 86240d02b530..448efd4e980e 100644 --- a/sound/core/seq/seq_fifo.c +++ b/sound/core/seq/seq_fifo.c @@ -21,6 +21,8 @@ #include #include +#include + #include "seq_fifo.h" #include "seq_lock.h" diff --git a/sound/core/seq/seq_memory.c b/sound/core/seq/seq_memory.c index dfa5156f3585..1a1acf3ddda4 100644 --- a/sound/core/seq/seq_memory.c +++ b/sound/core/seq/seq_memory.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include diff --git a/sound/core/timer.c b/sound/core/timer.c index ad153149b231..6d4fbc439246 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include diff --git a/sound/firewire/bebob/bebob.h b/sound/firewire/bebob/bebob.h index 175da875162d..17678d6ab5a2 100644 --- a/sound/firewire/bebob/bebob.h +++ b/sound/firewire/bebob/bebob.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/sound/firewire/dice/dice.h b/sound/firewire/dice/dice.h index e6c07857f475..da00e75e09d4 100644 --- a/sound/firewire/dice/dice.h +++ b/sound/firewire/dice/dice.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include diff --git a/sound/firewire/digi00x/digi00x.h b/sound/firewire/digi00x/digi00x.h index 2cd465c0caae..9dc761bdacca 100644 --- a/sound/firewire/digi00x/digi00x.h +++ b/sound/firewire/digi00x/digi00x.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include diff --git a/sound/firewire/fireworks/fireworks.h b/sound/firewire/fireworks/fireworks.h index d73c12b8753d..9b19c7f05d57 100644 --- a/sound/firewire/fireworks/fireworks.h +++ b/sound/firewire/fireworks/fireworks.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/sound/firewire/oxfw/oxfw.h b/sound/firewire/oxfw/oxfw.h index 2047dcb27625..d54d4a9ac4a1 100644 --- a/sound/firewire/oxfw/oxfw.h +++ b/sound/firewire/oxfw/oxfw.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include diff --git a/sound/firewire/tascam/tascam.h b/sound/firewire/tascam/tascam.h index 1f61011579a7..d3cd4065722b 100644 --- a/sound/firewire/tascam/tascam.h +++ b/sound/firewire/tascam/tascam.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/sound/isa/gus/gus_pcm.c b/sound/isa/gus/gus_pcm.c index 25f6788ccef3..06505999155f 100644 --- a/sound/isa/gus/gus_pcm.c +++ b/sound/isa/gus/gus_pcm.c @@ -27,6 +27,8 @@ #include #include +#include + #include #include #include diff --git a/sound/isa/msnd/msnd.c b/sound/isa/msnd/msnd.c index 835d4aa26761..8109ab3d29d1 100644 --- a/sound/isa/msnd/msnd.c +++ b/sound/isa/msnd/msnd.c @@ -36,6 +36,7 
@@ ********************************************************************/ #include +#include <linux/sched/signal.h> #include #include diff --git a/sound/isa/sb/emu8000.c b/sound/isa/sb/emu8000.c index 94c411299e5a..ec180708f160 100644 --- a/sound/isa/sb/emu8000.c +++ b/sound/isa/sb/emu8000.c @@ -21,7 +21,7 @@ */ #include -#include <linux/sched.h> +#include <linux/sched/signal.h> #include #include #include diff --git a/sound/isa/sb/emu8000_patch.c b/sound/isa/sb/emu8000_patch.c index 71d13c0bb746..c2e41d2762f7 100644 --- a/sound/isa/sb/emu8000_patch.c +++ b/sound/isa/sb/emu8000_patch.c @@ -20,6 +20,8 @@ */ #include "emu8000_local.h" + +#include <linux/sched/signal.h> #include #include diff --git a/sound/isa/sb/emu8000_pcm.c b/sound/isa/sb/emu8000_pcm.c index 250fd0006b53..32f234f494e5 100644 --- a/sound/isa/sb/emu8000_pcm.c +++ b/sound/isa/sb/emu8000_pcm.c @@ -19,6 +19,8 @@ */ #include "emu8000_local.h" + +#include <linux/sched/signal.h> #include #include #include diff --git a/sound/isa/wavefront/wavefront_synth.c b/sound/isa/wavefront/wavefront_synth.c index 718d5e3b7806..4dae9ff9ef5a 100644 --- a/sound/isa/wavefront/wavefront_synth.c +++ b/sound/isa/wavefront/wavefront_synth.c @@ -26,6 +26,7 @@ #include #include #include +#include <linux/sched/signal.h> #include #include #include diff --git a/sound/oss/dmabuf.c b/sound/oss/dmabuf.c index e3f29132d3ac..c5dd396c66a2 100644 --- a/sound/oss/dmabuf.c +++ b/sound/oss/dmabuf.c @@ -27,6 +27,8 @@ #include #include +#include <linux/sched/signal.h> + #include "sound_config.h" #include "sleep.h" diff --git a/sound/oss/dmasound/dmasound_core.c b/sound/oss/dmasound/dmasound_core.c index 5f248fb41bea..fb3bbceb1fef 100644 --- a/sound/oss/dmasound/dmasound_core.c +++ b/sound/oss/dmasound/dmasound_core.c @@ -182,6 +182,7 @@ #include #include #include +#include <linux/sched/signal.h> #include diff --git a/sound/oss/midibuf.c b/sound/oss/midibuf.c index 8f45cd999965..701c7625c971 100644 --- a/sound/oss/midibuf.c +++ b/sound/oss/midibuf.c @@ -16,6 +16,8 @@ #include #include #include +#include <linux/sched/signal.h> + #define MIDIBUF_C #include "sound_config.h" diff --git a/sound/oss/msnd_pinnacle.c b/sound/oss/msnd_pinnacle.c index a8bb4a06ba6f..f34ec01d2239 100644 --- a/sound/oss/msnd_pinnacle.c +++ b/sound/oss/msnd_pinnacle.c @@ -41,6 +41,8 @@ #include #include #include +#include <linux/sched/signal.h> + #include #include #include "sound_config.h" diff --git a/sound/oss/sound_config.h b/sound/oss/sound_config.h index f2554ab78f5e..5253b0a70437 100644 --- a/sound/oss/sound_config.h +++ b/sound/oss/sound_config.h @@ -16,6 +16,7 @@ #include #include +#include <linux/sched/signal.h> #include "os.h" #include "soundvers.h" diff --git a/sound/oss/swarm_cs4297a.c b/sound/oss/swarm_cs4297a.c index f3af63e58b36..97899352b15f 100644 --- a/sound/oss/swarm_cs4297a.c +++ b/sound/oss/swarm_cs4297a.c @@ -64,7 +64,7 @@ #include #include #include -#include <linux/sched.h> +#include <linux/sched/signal.h> #include #include #include diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index dd5de09bf362..ae79f7679cc2 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -32,7 +32,7 @@ #include #include #include -#include <linux/sched.h> +#include <linux/sched/signal.h> #include #include #include -- cgit v1.2.3 From 03441a3482a31462c93509939a388877e3cd9261 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Feb 2017 18:51:35 +0100 Subject: sched/headers: Prepare for new header dependencies before moving code to <linux/sched/stat.h> We are going to split <linux/sched/stat.h> out of <linux/sched.h>, which will have to be picked up from other headers and a couple of .c files. Create a trivial placeholder <linux/sched/stat.h> file that just maps to <linux/sched.h> to make this patch obviously correct and bisectable. Include the new header in the files that are going to need it.
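The "trivial placeholder" pattern that these preparatory commits lean on is small enough to spell out in full. The sketch below is an illustration only, using hypothetical names (an umbrella header <linux/foo.h> being split into <linux/foo/bar.h>); it is not one of the kernel's actual headers:

/* include/linux/foo/bar.h -- the placeholder created by a "prepare" patch */
#ifndef _LINUX_FOO_BAR_H
#define _LINUX_FOO_BAR_H

/*
 * For now this simply maps back to the umbrella header, so any file
 * that includes <linux/foo/bar.h> sees exactly the same declarations
 * as before.  A later patch moves the real declarations here and
 * drops this include.
 */
#include <linux/foo.h>

#endif /* _LINUX_FOO_BAR_H */

Because the placeholder is a strict alias for the old header, every "+#include" hunk added by these patches is a compile-time no-op, which is what makes each step of the series obviously correct.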
Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/powerpc/kvm/book3s_hv.c | 1 + arch/s390/appldata/appldata_os.c | 1 + crypto/mcryptd.c | 1 + drivers/cpuidle/governors/menu.c | 1 + fs/proc/loadavg.c | 1 + fs/proc/root.c | 1 + fs/proc/stat.c | 1 + include/linux/sched/stat.h | 6 ++++++ kernel/debug/kdb/kdb_main.c | 1 + kernel/exit.c | 1 + kernel/fork.c | 1 + kernel/sched/sched.h | 1 + kernel/sys.c | 1 + kernel/time/tick-sched.c | 1 + virt/kvm/kvm_main.c | 1 + 15 files changed, 20 insertions(+) create mode 100644 include/linux/sched/stat.h (limited to 'kernel') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index fc56a9ce785d..1ec86d9e2a82 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -23,6 +23,7 @@ #include #include #include +#include <linux/sched/stat.h> #include #include #include diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c index 079446619f89..45b3178200ab 100644 --- a/arch/s390/appldata/appldata_os.c +++ b/arch/s390/appldata/appldata_os.c @@ -18,6 +18,7 @@ #include #include #include +#include <linux/sched/stat.h> #include #include diff --git a/crypto/mcryptd.c b/crypto/mcryptd.c index c207458d6299..4e6472658852 100644 --- a/crypto/mcryptd.c +++ b/crypto/mcryptd.c @@ -24,6 +24,7 @@ #include #include #include +#include <linux/sched/stat.h> #include #include diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index fe86060e48f7..6d8a4026a903 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -19,6 +19,7 @@ #include #include #include +#include <linux/sched/stat.h> #include #include diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index add5255ce7e0..983fce5c2418 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -4,6 +4,7 @@ #include #include #include +#include <linux/sched/stat.h> #include #include #include diff --git a/fs/proc/root.c b/fs/proc/root.c index 5d5fed20bfff..a50ba388255f 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -14,6 +14,7 @@ #include #include #include +#include <linux/sched/stat.h> #include #include #include diff --git a/fs/proc/stat.c b/fs/proc/stat.c index e47c3e8c4dfe..b95556e036bb 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -5,6 +5,7 @@ #include #include #include +#include <linux/sched/stat.h> #include #include #include diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h new file mode 100644 index 000000000000..30fe012aecb0 --- /dev/null +++ b/include/linux/sched/stat.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_SCHED_STAT_H +#define _LINUX_SCHED_STAT_H + +#include <linux/sched.h> + +#endif /* _LINUX_SCHED_STAT_H */ diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 308937c7a687..bf48a492b4be 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -19,6 +19,7 @@ #include #include #include +#include <linux/sched/stat.h> #include #include #include diff --git a/kernel/exit.c b/kernel/exit.c index 48649ccbb649..5bad7dce1b7b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -8,6 +8,7 @@ #include #include #include +#include <linux/sched/stat.h> #include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 8fbe8bcd1e20..8632905f64c5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -17,6 +17,7 @@ #include #include #include +#include <linux/sched/stat.h> #include #include #include diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 04f376cf7ba9..6c8a1db44dde 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -9,6 +9,7 @@ #include #include #include +#include <linux/sched/stat.h> #include #include #include diff --git a/kernel/sys.c b/kernel/sys.c
index e2d743fb7f55..8c0392c8be51 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -51,6 +51,7 @@ #include #include #include +#include <linux/sched/stat.h> #include #include #include
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 5e9e123b4321..29d79b175141 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -21,6 +21,7 @@ #include #include #include +#include <linux/sched/stat.h> #include #include #include
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ae79f7679cc2..799499417f5b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -34,6 +34,7 @@ #include #include #include +#include <linux/sched/stat.h> #include #include #include
-- cgit v1.2.3
From 370c91355c76cbcaad8ff79b4bb15a7f2ea59433 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Feb 2017 18:51:35 +0100
Subject: sched/headers: Prepare for new header dependencies before moving code to <linux/sched/nohz.h>
We are going to split <linux/sched/nohz.h> out of <linux/sched.h>, which will have to be picked up from other headers and a couple of .c files. Create a trivial placeholder <linux/sched/nohz.h> file that just maps to <linux/sched.h> to make this patch obviously correct and bisectable. Include the new header in the files that are going to need it.
Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar
--- include/linux/sched/nohz.h | 6 ++++++ kernel/sched/sched.h | 1 + kernel/time/hrtimer.c | 1 + kernel/time/tick-sched.c | 1 + kernel/time/timer.c | 1 + 5 files changed, 10 insertions(+) create mode 100644 include/linux/sched/nohz.h (limited to 'kernel')
diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h new file mode 100644 index 000000000000..fe6caf0dab10 --- /dev/null +++ b/include/linux/sched/nohz.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_SCHED_NOHZ_H +#define _LINUX_SCHED_NOHZ_H + +#include <linux/sched.h> + +#endif /* _LINUX_SCHED_NOHZ_H */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6c8a1db44dde..5979f47c422c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -10,6 +10,7 @@ #include #include #include +#include <linux/sched/nohz.h> #include #include #include
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index df512e8c04d4..aed2207f5b2d 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -47,6 +47,7 @@ #include #include #include +#include <linux/sched/nohz.h> #include #include
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 29d79b175141..7fe53be86077 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -22,6 +22,7 @@ #include #include #include +#include <linux/sched/nohz.h> #include #include #include
diff --git a/kernel/time/timer.c b/kernel/time/timer.c index f4465d2a25d1..d7999cb06229 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -40,6 +40,7 @@ #include #include #include +#include <linux/sched/nohz.h> #include #include
-- cgit v1.2.3
From b17b01533b719e9949e437abf66436a875739b40 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Feb 2017 18:51:35 +0100
Subject: sched/headers: Prepare for new header dependencies before moving code to <linux/sched/debug.h>
We are going to split <linux/sched/debug.h> out of <linux/sched.h>, which will have to be picked up from other headers and a couple of .c files. Create a trivial placeholder <linux/sched/debug.h> file that just maps to <linux/sched.h> to make this patch obviously correct and bisectable. Include the new header in the files that are going to need it.
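The same two-step scheme applies to <linux/sched/debug.h>, created below. A sketch of both stages follows; stage 2 is an assumption, since this commit only adds the stage-1 alias, and sched_show_task()/show_state_filter() are merely plausible examples of debugging interfaces a later commit would relocate:

    /* Stage 1 (this commit): the header is an alias and nothing more. */
    #ifndef _LINUX_SCHED_DEBUG_H
    #define _LINUX_SCHED_DEBUG_H

    #include <linux/sched.h>

    #endif /* _LINUX_SCHED_DEBUG_H */

    /* Stage 2 (a later commit; assumed shape): the alias is replaced
     * by the real declarations, e.g.
     *
     *   extern void sched_show_task(struct task_struct *p);
     *   extern void show_state_filter(unsigned long state_filter);
     *
     * Every file converted below already names <linux/sched/debug.h>
     * directly, so it keeps building across the move. */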
Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/alpha/kernel/process.c | 1 + arch/alpha/kernel/traps.c | 1 + arch/arc/kernel/ctx_sw.c | 1 + arch/arc/kernel/stacktrace.c | 2 ++ arch/arc/kernel/troubleshoot.c | 1 + arch/arm/kernel/process.c | 1 + arch/arm/kernel/stacktrace.c | 1 + arch/arm/kernel/traps.c | 1 + arch/arm/mm/alignment.c | 1 + arch/arm/mm/fault.c | 1 + arch/arm/probes/kprobes/core.c | 1 + arch/arm64/kernel/probes/kprobes.c | 1 + arch/arm64/kernel/process.c | 1 + arch/arm64/kernel/stacktrace.c | 1 + arch/arm64/kernel/traps.c | 1 + arch/arm64/mm/fault.c | 1 + arch/avr32/kernel/nmi_debug.c | 1 + arch/avr32/kernel/process.c | 1 + arch/blackfin/kernel/dumpstack.c | 2 ++ arch/blackfin/kernel/early_printk.c | 1 + arch/blackfin/kernel/nmi.c | 1 + arch/blackfin/kernel/process.c | 1 + arch/blackfin/kernel/trace.c | 1 + arch/blackfin/kernel/traps.c | 1 + arch/blackfin/mach-common/ints-priority.c | 1 + arch/blackfin/mm/isram-driver.c | 1 + arch/c6x/kernel/traps.c | 1 + arch/cris/arch-v10/kernel/process.c | 1 + arch/cris/arch-v10/kernel/traps.c | 2 ++ arch/cris/arch-v32/kernel/process.c | 1 + arch/cris/arch-v32/kernel/traps.c | 2 ++ arch/cris/kernel/irq.c | 1 + arch/cris/kernel/stacktrace.c | 2 +- arch/cris/kernel/traps.c | 1 + arch/frv/kernel/process.c | 1 + arch/frv/kernel/traps.c | 1 + arch/h8300/kernel/process.c | 1 + arch/h8300/kernel/traps.c | 1 + arch/hexagon/kernel/process.c | 1 + arch/hexagon/kernel/traps.c | 1 + arch/hexagon/kernel/vm_events.c | 1 + arch/ia64/hp/sim/simserial.c | 1 + arch/ia64/kernel/mca.c | 1 + arch/ia64/kernel/process.c | 1 + arch/ia64/kernel/traps.c | 1 + arch/m32r/kernel/process.c | 1 + arch/m32r/kernel/traps.c | 1 + arch/m68k/kernel/process.c | 1 + arch/m68k/kernel/traps.c | 1 + arch/m68k/mac/macints.c | 1 + arch/metag/kernel/process.c | 1 + arch/metag/kernel/stacktrace.c | 1 + arch/metag/kernel/traps.c | 1 + arch/metag/mm/fault.c | 1 + arch/microblaze/kernel/exceptions.c | 1 + arch/microblaze/kernel/process.c | 1 + arch/microblaze/kernel/traps.c | 1 + arch/mips/kernel/process.c | 1 + arch/mips/kernel/stacktrace.c | 1 + arch/mips/kernel/traps.c | 1 + arch/mips/sgi-ip22/ip28-berr.c | 1 + arch/mips/sgi-ip27/ip27-berr.c | 1 + arch/mips/sgi-ip32/ip32-berr.c | 1 + arch/mips/sgi-ip32/ip32-irq.c | 1 + arch/mn10300/kernel/process.c | 1 + arch/mn10300/kernel/traps.c | 1 + arch/nios2/kernel/process.c | 1 + arch/nios2/kernel/traps.c | 1 + arch/nios2/mm/fault.c | 1 + arch/openrisc/kernel/process.c | 1 + arch/openrisc/kernel/traps.c | 1 + arch/parisc/kernel/pa7300lc.c | 1 + arch/parisc/kernel/process.c | 1 + arch/parisc/kernel/signal.c | 1 + arch/parisc/kernel/traps.c | 1 + arch/parisc/kernel/unaligned.c | 1 + arch/parisc/mm/fault.c | 1 + arch/powerpc/kernel/process.c | 1 + arch/powerpc/kernel/stacktrace.c | 1 + arch/powerpc/kernel/traps.c | 1 + arch/s390/kernel/dumpstack.c | 1 + arch/s390/kernel/process.c | 1 + arch/s390/kernel/stacktrace.c | 1 + arch/s390/kernel/traps.c | 1 + arch/s390/mm/fault.c | 1 + arch/score/kernel/traps.c | 1 + arch/sh/kernel/dumpstack.c | 1 + arch/sh/kernel/nmi_debug.c | 1 + arch/sh/kernel/process_32.c | 1 + arch/sh/kernel/process_64.c | 1 + arch/sh/kernel/stacktrace.c | 1 + arch/sh/kernel/traps.c | 1 + arch/sh/kernel/traps_64.c | 1 + arch/sparc/kernel/process_32.c | 1 + arch/sparc/kernel/process_64.c | 1 + arch/sparc/kernel/stacktrace.c | 1 + arch/sparc/kernel/sun4m_irq.c | 1 + arch/sparc/kernel/sys_sparc_32.c | 1 + 
arch/sparc/kernel/sys_sparc_64.c | 1 + arch/sparc/kernel/traps_32.c | 1 + arch/sparc/kernel/traps_64.c | 1 + arch/sparc/mm/fault_64.c | 1 + arch/tile/include/asm/stack.h | 2 ++ arch/tile/kernel/process.c | 1 + arch/tile/kernel/signal.c | 1 + arch/tile/kernel/stack.c | 1 + arch/tile/kernel/traps.c | 1 + arch/tile/kernel/unaligned.c | 1 + arch/tile/mm/fault.c | 1 + arch/um/drivers/mconsole_kern.c | 1 + arch/um/kernel/process.c | 1 + arch/um/kernel/sysrq.c | 2 ++ arch/um/kernel/trap.c | 1 + arch/unicore32/kernel/process.c | 1 + arch/unicore32/kernel/stacktrace.c | 1 + arch/unicore32/kernel/traps.c | 1 + arch/unicore32/mm/alignment.c | 1 + arch/x86/kernel/amd_gart_64.c | 1 + arch/x86/kernel/doublefault.c | 1 + arch/x86/kernel/dumpstack.c | 1 + arch/x86/kernel/dumpstack_32.c | 1 + arch/x86/kernel/dumpstack_64.c | 1 + arch/x86/kernel/kprobes/core.c | 1 + arch/x86/kernel/nmi.c | 1 + arch/x86/kernel/process.c | 1 + arch/x86/kernel/stacktrace.c | 1 + arch/x86/mm/extable.c | 2 ++ arch/x86/platform/uv/uv_nmi.c | 1 + arch/x86/um/sysrq_32.c | 1 + arch/x86/um/sysrq_64.c | 1 + arch/xtensa/kernel/process.c | 1 + arch/xtensa/kernel/traps.c | 1 + drivers/base/power/main.c | 1 + drivers/tty/sysrq.c | 2 ++ drivers/tty/tty_ldsem.c | 1 + drivers/tty/vt/keyboard.c | 2 ++ fs/proc/base.c | 1 + include/linux/sched/debug.h | 6 ++++++ kernel/debug/kdb/kdb_bt.c | 1 + kernel/debug/kdb/kdb_main.c | 1 + kernel/hung_task.c | 1 + kernel/latencytop.c | 1 + kernel/locking/mutex.c | 1 + kernel/locking/rtmutex-debug.c | 1 + kernel/locking/rtmutex.c | 1 + kernel/locking/rwsem-spinlock.c | 1 + kernel/locking/rwsem-xadd.c | 1 + kernel/locking/rwsem.c | 1 + kernel/locking/semaphore.c | 1 + kernel/panic.c | 1 + kernel/power/process.c | 1 + kernel/printk/printk.c | 1 + kernel/rcu/tree.c | 1 + kernel/rcu/tree_plugin.h | 1 + kernel/rcu/update.c | 1 + kernel/sched/completion.c | 1 + kernel/sched/sched.h | 2 +- kernel/sched/wait.c | 1 + kernel/signal.c | 1 + kernel/time/alarmtimer.c | 1 + kernel/time/hrtimer.c | 1 + kernel/time/timer.c | 1 + kernel/watchdog.c | 1 + kernel/watchdog_hld.c | 2 ++ lib/dump_stack.c | 1 + lib/nmi_backtrace.c | 1 + 166 files changed, 181 insertions(+), 2 deletions(-) create mode 100644 include/linux/sched/debug.h (limited to 'kernel') diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index bca963a4aa48..88f7a974d311 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index 6448ab00043d..b137390e87e7 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/arc/kernel/ctx_sw.c b/arch/arc/kernel/ctx_sw.c index 6f4cb0dab1b9..9e1ae9d41925 100644 --- a/arch/arc/kernel/ctx_sw.c +++ b/arch/arc/kernel/ctx_sw.c @@ -16,6 +16,7 @@ #include #include +#include #ifdef CONFIG_ARC_PLAT_EZNPS #include #endif diff --git a/arch/arc/kernel/stacktrace.c b/arch/arc/kernel/stacktrace.c index b9192a653b7e..74315f302971 100644 --- a/arch/arc/kernel/stacktrace.c +++ b/arch/arc/kernel/stacktrace.c @@ -28,6 +28,8 @@ #include #include #include +#include + #include #include #include diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c index 0c48907f56a9..f9caf79186d4 100644 --- a/arch/arc/kernel/troubleshoot.c +++ b/arch/arc/kernel/troubleshoot.c @@ -14,6 +14,7 @@ #include #include #include +#include #include 
#include diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 91d2d5b01414..03d46395d1ff 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include diff --git a/arch/arm/kernel/stacktrace.c b/arch/arm/kernel/stacktrace.c index 92b72375c4c7..3a2fa203637a 100644 --- a/arch/arm/kernel/stacktrace.c +++ b/arch/arm/kernel/stacktrace.c @@ -1,5 +1,6 @@ #include #include +#include #include #include diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index dc5f1a22b3c9..a9dad001b97d 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c index d7fe2de9cc9e..2c96190e018b 100644 --- a/arch/arm/mm/alignment.c +++ b/arch/arm/mm/alignment.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 520c7778d330..ff8b0aa2dfde 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm/probes/kprobes/core.c b/arch/arm/probes/kprobes/core.c index a4ec240ee7ba..b6dc9d838a9a 100644 --- a/arch/arm/probes/kprobes/core.c +++ b/arch/arm/probes/kprobes/core.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index f0593c92279b..2a07aae5b8a2 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 1ad48f93abdd..40a16cdd9873 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 8a552a33c6ef..7597e42feeea 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 5b8779a849a2..5a5d83791837 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 30dd60ab8a65..4bf899fb451b 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/avr32/kernel/nmi_debug.c b/arch/avr32/kernel/nmi_debug.c index 3414b8566c29..25823049bb99 100644 --- a/arch/avr32/kernel/nmi_debug.c +++ b/arch/avr32/kernel/nmi_debug.c @@ -9,6 +9,7 @@ #include #include #include +#include #include diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index 68e5b9dac059..dac022d88d9d 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -6,6 +6,7 @@ * published by the Free Software Foundation. 
*/ #include +#include #include #include #include diff --git a/arch/blackfin/kernel/dumpstack.c b/arch/blackfin/kernel/dumpstack.c index 95ba6d9e9a3d..3c992c1f8ef2 100644 --- a/arch/blackfin/kernel/dumpstack.c +++ b/arch/blackfin/kernel/dumpstack.c @@ -10,6 +10,8 @@ #include #include #include +#include + #include /* diff --git a/arch/blackfin/kernel/early_printk.c b/arch/blackfin/kernel/early_printk.c index 61fbd2de993d..4b89af9243d3 100644 --- a/arch/blackfin/kernel/early_printk.c +++ b/arch/blackfin/kernel/early_printk.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include diff --git a/arch/blackfin/kernel/nmi.c b/arch/blackfin/kernel/nmi.c index 9919d29287dc..633c37083e87 100644 --- a/arch/blackfin/kernel/nmi.c +++ b/arch/blackfin/kernel/nmi.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 4aa5545c4fde..e95d094c20a6 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/blackfin/kernel/trace.c b/arch/blackfin/kernel/trace.c index 01e546ad2f7a..23c05b8effb3 100644 --- a/arch/blackfin/kernel/trace.c +++ b/arch/blackfin/kernel/trace.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/blackfin/kernel/traps.c b/arch/blackfin/kernel/traps.c index 32676d7721b1..a323a40a46e9 100644 --- a/arch/blackfin/kernel/traps.c +++ b/arch/blackfin/kernel/traps.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/blackfin/mach-common/ints-priority.c b/arch/blackfin/mach-common/ints-priority.c index 4986b4fbcee9..13e94bf9d8ba 100644 --- a/arch/blackfin/mach-common/ints-priority.c +++ b/arch/blackfin/mach-common/ints-priority.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/blackfin/mm/isram-driver.c b/arch/blackfin/mm/isram-driver.c index 7e2e674ed444..aaa1e64b753b 100644 --- a/arch/blackfin/mm/isram-driver.c +++ b/arch/blackfin/mm/isram-driver.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/arch/c6x/kernel/traps.c b/arch/c6x/kernel/traps.c index dcc2c2f6d67c..09b8a40d5680 100644 --- a/arch/c6x/kernel/traps.c +++ b/arch/c6x/kernel/traps.c @@ -10,6 +10,7 @@ */ #include #include +#include #include #include diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c index 96e5afef6b47..068a40374200 100644 --- a/arch/cris/arch-v10/kernel/process.c +++ b/arch/cris/arch-v10/kernel/process.c @@ -11,6 +11,7 @@ */ #include +#include #include #include #include diff --git a/arch/cris/arch-v10/kernel/traps.c b/arch/cris/arch-v10/kernel/traps.c index 96d004fe9740..c0a501f29bd8 100644 --- a/arch/cris/arch-v10/kernel/traps.c +++ b/arch/cris/arch-v10/kernel/traps.c @@ -10,6 +10,8 @@ #include #include +#include + #include #include diff --git a/arch/cris/arch-v32/kernel/process.c b/arch/cris/arch-v32/kernel/process.c index 4d1afa9f9fd3..613c2d71bf10 100644 --- a/arch/cris/arch-v32/kernel/process.c +++ b/arch/cris/arch-v32/kernel/process.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include diff --git a/arch/cris/arch-v32/kernel/traps.c b/arch/cris/arch-v32/kernel/traps.c index ad6174e217c9..a34256515036 100644 --- a/arch/cris/arch-v32/kernel/traps.c +++ b/arch/cris/arch-v32/kernel/traps.c @@ -5,6 +5,8 @@ #include #include #include +#include + 
#include #include #include diff --git a/arch/cris/kernel/irq.c b/arch/cris/kernel/irq.c index 694850e8f077..09b864f46f8a 100644 --- a/arch/cris/kernel/irq.c +++ b/arch/cris/kernel/irq.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/arch/cris/kernel/stacktrace.c b/arch/cris/kernel/stacktrace.c index 99838c74456d..f1cc3aaacd8d 100644 --- a/arch/cris/kernel/stacktrace.c +++ b/arch/cris/kernel/stacktrace.c @@ -1,5 +1,5 @@ #include -#include +#include #include #include diff --git a/arch/cris/kernel/traps.c b/arch/cris/kernel/traps.c index b2a312a7afc6..a01636a12a6e 100644 --- a/arch/cris/kernel/traps.c +++ b/arch/cris/kernel/traps.c @@ -15,6 +15,7 @@ #include #include #include +#include #ifdef CONFIG_KALLSYMS #include #endif diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c index b306241c4ef2..cf4c34c09ab3 100644 --- a/arch/frv/kernel/process.c +++ b/arch/frv/kernel/process.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/frv/kernel/traps.c b/arch/frv/kernel/traps.c index 43c134d6081c..ce29991e4219 100644 --- a/arch/frv/kernel/traps.c +++ b/arch/frv/kernel/traps.c @@ -10,6 +10,7 @@ */ #include +#include #include #include #include diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index 891974a11704..d7cabf373fe3 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/h8300/kernel/traps.c b/arch/h8300/kernel/traps.c index 044a36125846..ee7e3ce40d78 100644 --- a/arch/h8300/kernel/traps.c +++ b/arch/h8300/kernel/traps.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index d9edfd3fc52a..3bdd9592d025 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -19,6 +19,7 @@ */ #include +#include #include #include #include diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c index 4496bcf605ef..b55f13c70b34 100644 --- a/arch/hexagon/kernel/traps.c +++ b/arch/hexagon/kernel/traps.c @@ -20,6 +20,7 @@ #include #include +#include #include #include #include diff --git a/arch/hexagon/kernel/vm_events.c b/arch/hexagon/kernel/vm_events.c index 741aaa917cda..04f57ef22009 100644 --- a/arch/hexagon/kernel/vm_events.c +++ b/arch/hexagon/kernel/vm_events.c @@ -19,6 +19,7 @@ */ #include +#include #include #include #include diff --git a/arch/ia64/hp/sim/simserial.c b/arch/ia64/hp/sim/simserial.c index 21fd50def270..de8cba121013 100644 --- a/arch/ia64/hp/sim/simserial.c +++ b/arch/ia64/hp/sim/simserial.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 5ac51069e453..f9fe3ac64242 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 52deab683ba1..804b251ee5d1 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c index 48ba46b025e1..7b1fe9462158 100644 --- a/arch/ia64/kernel/traps.c +++ b/arch/ia64/kernel/traps.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include /* For unblank_screen() 
*/ #include diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index e0568bee60c0..1a227c1fdb1a 100644 --- a/arch/m32r/kernel/process.c +++ b/arch/m32r/kernel/process.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #include diff --git a/arch/m32r/kernel/traps.c b/arch/m32r/kernel/traps.c index c3c5fdfae920..7c05bb393597 100644 --- a/arch/m32r/kernel/traps.c +++ b/arch/m32r/kernel/traps.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index f0a8e9b332cd..b4a4675f4119 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c index 558f38402737..a926d2c88898 100644 --- a/arch/m68k/kernel/traps.c +++ b/arch/m68k/kernel/traps.c @@ -19,6 +19,7 @@ */ #include +#include #include #include #include diff --git a/arch/m68k/mac/macints.c b/arch/m68k/mac/macints.c index b5cd06df71fd..9637dee90dac 100644 --- a/arch/m68k/mac/macints.c +++ b/arch/m68k/mac/macints.c @@ -110,6 +110,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/metag/kernel/process.c b/arch/metag/kernel/process.c index 35062796edf2..2e611bd37515 100644 --- a/arch/metag/kernel/process.c +++ b/arch/metag/kernel/process.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/metag/kernel/stacktrace.c b/arch/metag/kernel/stacktrace.c index 5510361d5bea..4f806d66a58c 100644 --- a/arch/metag/kernel/stacktrace.c +++ b/arch/metag/kernel/stacktrace.c @@ -1,5 +1,6 @@ #include #include +#include #include #include diff --git a/arch/metag/kernel/traps.c b/arch/metag/kernel/traps.c index 17b2e2e38d5a..5ce67f9124aa 100644 --- a/arch/metag/kernel/traps.c +++ b/arch/metag/kernel/traps.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c index c765b3621b9b..5055477486b6 100644 --- a/arch/metag/mm/fault.c +++ b/arch/metag/mm/fault.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include diff --git a/arch/microblaze/kernel/exceptions.c b/arch/microblaze/kernel/exceptions.c index 42dd12a62ff5..e6f338d0496b 100644 --- a/arch/microblaze/kernel/exceptions.c +++ b/arch/microblaze/kernel/exceptions.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index b2dd37196b3b..8c5fdb2e2850 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/microblaze/kernel/traps.c b/arch/microblaze/kernel/traps.c index cb619533a192..45bbba9d919f 100644 --- a/arch/microblaze/kernel/traps.c +++ b/arch/microblaze/kernel/traps.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 803e255b6fc3..97574cdb532d 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -11,6 +11,7 @@ */ #include #include +#include #include #include #include diff --git a/arch/mips/kernel/stacktrace.c b/arch/mips/kernel/stacktrace.c index 506021f62549..986f910961d9 100644 --- a/arch/mips/kernel/stacktrace.c +++ b/arch/mips/kernel/stacktrace.c @@ -4,6 +4,7 @@ * Copyright (C) 2006 Atsushi Nemoto */ 
#include +#include #include #include #include diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 49c6df20672a..1a9366a157cb 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/mips/sgi-ip22/ip28-berr.c b/arch/mips/sgi-ip22/ip28-berr.c index 9960a8302eac..1f2a5bc4779e 100644 --- a/arch/mips/sgi-ip22/ip28-berr.c +++ b/arch/mips/sgi-ip22/ip28-berr.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include diff --git a/arch/mips/sgi-ip27/ip27-berr.c b/arch/mips/sgi-ip27/ip27-berr.c index f8919b6a24c8..d12879eb2b1f 100644 --- a/arch/mips/sgi-ip27/ip27-berr.c +++ b/arch/mips/sgi-ip27/ip27-berr.c @@ -11,6 +11,7 @@ #include #include /* for SIGBUS */ #include /* schow_regs(), force_sig() */ +#include #include #include diff --git a/arch/mips/sgi-ip32/ip32-berr.c b/arch/mips/sgi-ip32/ip32-berr.c index ba8f46d80ab8..57d8c7486fe6 100644 --- a/arch/mips/sgi-ip32/ip32-berr.c +++ b/arch/mips/sgi-ip32/ip32-berr.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/mips/sgi-ip32/ip32-irq.c b/arch/mips/sgi-ip32/ip32-irq.c index 838d8589a1c0..a6a0ff7f5aed 100644 --- a/arch/mips/sgi-ip32/ip32-irq.c +++ b/arch/mips/sgi-ip32/ip32-irq.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index e5def2217f72..a1e2f8301d94 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/mn10300/kernel/traps.c b/arch/mn10300/kernel/traps.c index a7a987c7954f..800fd0801969 100644 --- a/arch/mn10300/kernel/traps.c +++ b/arch/mn10300/kernel/traps.c @@ -10,6 +10,7 @@ * 2 of the Licence, or (at your option) any later version. 
*/ #include +#include #include #include #include diff --git a/arch/nios2/kernel/process.c b/arch/nios2/kernel/process.c index 2f8c74f93e70..3ad87a6b119b 100644 --- a/arch/nios2/kernel/process.c +++ b/arch/nios2/kernel/process.c @@ -14,6 +14,7 @@ #include #include +#include #include #include diff --git a/arch/nios2/kernel/traps.c b/arch/nios2/kernel/traps.c index 72ed30a93c85..8184e7d6b385 100644 --- a/arch/nios2/kernel/traps.c +++ b/arch/nios2/kernel/traps.c @@ -11,6 +11,7 @@ */ #include +#include #include #include #include diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c index e7a14e1e0d6b..b804dd06ea1c 100644 --- a/arch/nios2/mm/fault.c +++ b/arch/nios2/mm/fault.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index 6e9d1cb519f2..899339dac938 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #include diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c index 7e81ad258bca..2354ab88d948 100644 --- a/arch/openrisc/kernel/traps.c +++ b/arch/openrisc/kernel/traps.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #include diff --git a/arch/parisc/kernel/pa7300lc.c b/arch/parisc/kernel/pa7300lc.c index 8a89780223aa..9b245fc67560 100644 --- a/arch/parisc/kernel/pa7300lc.c +++ b/arch/parisc/kernel/pa7300lc.c @@ -5,6 +5,7 @@ * Copyright (C) 2000 Philipp Rumpf */ #include +#include #include #include #include diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index ea6603ee8d24..8c283caf2b4d 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index e58925ac64d1..9e03296641d7 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -13,6 +13,7 @@ */ #include +#include #include #include #include diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index 378df9207406..991654c88eec 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -11,6 +11,7 @@ */ #include +#include #include #include #include diff --git a/arch/parisc/kernel/unaligned.c b/arch/parisc/kernel/unaligned.c index a08ab481e556..e36f7b75ab07 100644 --- a/arch/parisc/kernel/unaligned.c +++ b/arch/parisc/kernel/unaligned.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 1a0b4f63f0e9..c3cac4ddfe9c 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 4379a079b3c2..76f58b87dcb2 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index 4f24606afc3f..66711958493c 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index e6cc56b61d01..ff365f9de27a 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -17,6 
+17,7 @@ #include #include +#include #include #include #include diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index 55d4fe174fd9..72c584d62f59 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 54281660582c..ed99ff911b67 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index 0085b2d8ed7d..e66687dc6144 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -6,6 +6,7 @@ */ #include +#include #include #include #include diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 283ad7840335..f787b9d8f54c 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index bb5560eb2435..5845d3028ffc 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/score/kernel/traps.c b/arch/score/kernel/traps.c index fa624f30f783..8a54d320fb41 100644 --- a/arch/score/kernel/traps.c +++ b/arch/score/kernel/traps.c @@ -25,6 +25,7 @@ #include #include +#include #include #include diff --git a/arch/sh/kernel/dumpstack.c b/arch/sh/kernel/dumpstack.c index 8dfe645bcc4b..d00cab2f50f9 100644 --- a/arch/sh/kernel/dumpstack.c +++ b/arch/sh/kernel/dumpstack.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sh/kernel/nmi_debug.c b/arch/sh/kernel/nmi_debug.c index ff0abbd1e652..730d928f0d12 100644 --- a/arch/sh/kernel/nmi_debug.c +++ b/arch/sh/kernel/nmi_debug.c @@ -9,6 +9,7 @@ #include #include #include +#include #include enum nmi_action { diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index 51741850a715..5098274e0f23 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -15,6 +15,7 @@ */ #include #include +#include #include #include #include diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c index e0b271bffd6a..ced21a417d9d 100644 --- a/arch/sh/kernel/process_64.c +++ b/arch/sh/kernel/process_64.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sh/kernel/stacktrace.c b/arch/sh/kernel/stacktrace.c index bf989e063a0c..7a73d2763e1b 100644 --- a/arch/sh/kernel/stacktrace.c +++ b/arch/sh/kernel/stacktrace.c @@ -10,6 +10,7 @@ * for more details. */ #include +#include #include #include #include diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c index 3036dee854d1..bc5c9f347b9e 100644 --- a/arch/sh/kernel/traps.c +++ b/arch/sh/kernel/traps.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sh/kernel/traps_64.c b/arch/sh/kernel/traps_64.c index 00835edb6e20..014fb08cf133 100644 --- a/arch/sh/kernel/traps_64.c +++ b/arch/sh/kernel/traps_64.c @@ -10,6 +10,7 @@ * for more details. 
*/ #include +#include #include #include #include diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index 48ffc3e7d1dd..237a471c8ed5 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index d249ca10b203..98f6ff6eaa55 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sparc/kernel/stacktrace.c b/arch/sparc/kernel/stacktrace.c index e78386a0029f..be4c14cccc05 100644 --- a/arch/sparc/kernel/stacktrace.c +++ b/arch/sparc/kernel/stacktrace.c @@ -1,4 +1,5 @@ #include +#include #include #include #include diff --git a/arch/sparc/kernel/sun4m_irq.c b/arch/sparc/kernel/sun4m_irq.c index da737c712fa8..aa84da0b2d30 100644 --- a/arch/sparc/kernel/sun4m_irq.c +++ b/arch/sparc/kernel/sun4m_irq.c @@ -10,6 +10,7 @@ */ #include +#include #include #include diff --git a/arch/sparc/kernel/sys_sparc_32.c b/arch/sparc/kernel/sys_sparc_32.c index ff3573059936..7aecb239626d 100644 --- a/arch/sparc/kernel/sys_sparc_32.c +++ b/arch/sparc/kernel/sys_sparc_32.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index c521ee277083..ef4520efc813 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sparc/kernel/traps_32.c b/arch/sparc/kernel/traps_32.c index ecddac5a4c96..26740a2f285d 100644 --- a/arch/sparc/kernel/traps_32.c +++ b/arch/sparc/kernel/traps_32.c @@ -10,6 +10,7 @@ */ #include /* for jiffies */ +#include #include #include #include diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index e022d7b00390..4ff4c35f76b2 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 643c149a3151..b84c4dd14954 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/tile/include/asm/stack.h b/arch/tile/include/asm/stack.h index c3cb42615a9f..3573325e340b 100644 --- a/arch/tile/include/asm/stack.h +++ b/arch/tile/include/asm/stack.h @@ -17,6 +17,8 @@ #include #include +#include + #include #include #include diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index c84c54a1ac55..557b4c8435d0 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -13,6 +13,7 @@ */ #include +#include #include #include #include diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c index 87299a6cfec8..8cc92ae6eb27 100644 --- a/arch/tile/kernel/signal.c +++ b/arch/tile/kernel/signal.c @@ -14,6 +14,7 @@ */ #include +#include #include #include #include diff --git a/arch/tile/kernel/stack.c b/arch/tile/kernel/stack.c index 22bbbd3ff4a3..a1c0787fe9d8 100644 --- a/arch/tile/kernel/stack.c +++ b/arch/tile/kernel/stack.c @@ -13,6 +13,7 @@ */ #include +#include #include #include #include diff --git a/arch/tile/kernel/traps.c b/arch/tile/kernel/traps.c index 39f427bb0de2..54804866f238 100644 --- a/arch/tile/kernel/traps.c +++ 
b/arch/tile/kernel/traps.c @@ -13,6 +13,7 @@ */ #include +#include #include #include #include diff --git a/arch/tile/kernel/unaligned.c b/arch/tile/kernel/unaligned.c index f229e979584e..142acae78316 100644 --- a/arch/tile/kernel/unaligned.c +++ b/arch/tile/kernel/unaligned.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index 709f8e9ba3e9..005c91c2adca 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 8a4c72af3bc0..af326fb6510d 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 078630d6448c..c6c45ea4021f 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index aa1b56f5ac68..34ae555c3e70 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -11,6 +11,8 @@ #include #include #include +#include + #include #include #include diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 9711ae4aaa6a..59158871b9fc 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c index d7c6b676b3a5..c1a0eec88f1f 100644 --- a/arch/unicore32/kernel/process.c +++ b/arch/unicore32/kernel/process.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include diff --git a/arch/unicore32/kernel/stacktrace.c b/arch/unicore32/kernel/stacktrace.c index b34030bdabe3..9976e767d51c 100644 --- a/arch/unicore32/kernel/stacktrace.c +++ b/arch/unicore32/kernel/stacktrace.c @@ -11,6 +11,7 @@ */ #include #include +#include #include #include diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c index 7f5e06f9a202..506e1a2127c6 100644 --- a/arch/unicore32/kernel/traps.c +++ b/arch/unicore32/kernel/traps.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/unicore32/mm/alignment.c b/arch/unicore32/mm/alignment.c index 24e836023e6c..3a7f6faa8794 100644 --- a/arch/unicore32/mm/alignment.c +++ b/arch/unicore32/mm/alignment.c @@ -15,6 +15,7 @@ */ #include #include +#include #include #include #include diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 82dfe32faaf4..df083efe6ee0 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c index b2f7207ba86c..f9c324e08d85 100644 --- a/arch/x86/kernel/doublefault.c +++ b/arch/x86/kernel/doublefault.c @@ -1,5 +1,6 @@ #include #include +#include #include #include diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 0cfd01d2754c..7b9e7e68f316 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index bb3b5b9a6899..b0b3a3df7c20 
100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -2,6 +2,7 @@ * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs */ +#include #include #include #include diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index fac189efcc34..a8b117e93b46 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -2,6 +2,7 @@ * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs */ +#include #include #include #include diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 520b8dfe1640..6384eb754a58 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index d17b1a940add..f088ea4c66e7 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 6395e0cd7dd6..505be5589e52 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 0653788026e2..330cae0025d0 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -4,6 +4,7 @@ * Copyright (C) 2006-2009 Red Hat, Inc., Ingo Molnar */ #include +#include #include #include #include diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 61a7e9ea9aa1..35ea061010a1 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,5 +1,7 @@ #include #include +#include + #include #include diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index 9743d0ccfec6..c34bd8233f7c 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/um/sysrq_32.c b/arch/x86/um/sysrq_32.c index 16ee0e450e3e..f2383484840d 100644 --- a/arch/x86/um/sysrq_32.c +++ b/arch/x86/um/sysrq_32.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/um/sysrq_64.c b/arch/x86/um/sysrq_64.c index 38b4e4abd0f8..903ad91b624f 100644 --- a/arch/x86/um/sysrq_64.c +++ b/arch/x86/um/sysrq_64.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 826d25104846..afaa3588d869 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index 84abd66e680d..d8bb2e62393e 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -25,6 +25,7 @@ #include #include +#include #include #include #include diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 249e0304597f..9faee1c893e5 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 65db0aeb3d80..e5f0c7d0fe8d 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -16,6 +16,8 @@ #include #include +#include 
+#include #include #include
diff --git a/drivers/tty/tty_ldsem.c b/drivers/tty/tty_ldsem.c index 9229de43e19d..4f6b1b10b537 100644 --- a/drivers/tty/tty_ldsem.c +++ b/drivers/tty/tty_ldsem.c @@ -32,6 +32,7 @@ #include #include #include +#include <linux/sched/debug.h> #ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c index 6b3a2c00974d..c5f0fc906136 100644 --- a/drivers/tty/vt/keyboard.c +++ b/drivers/tty/vt/keyboard.c @@ -27,6 +27,8 @@ #include #include #include +#include +#include #include #include #include
diff --git a/fs/proc/base.c b/fs/proc/base.c index 4c5fa704e38c..5914fed3712d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -88,6 +88,7 @@ #include #include #include +#include <linux/sched/debug.h> #include #include #ifdef CONFIG_HARDWALL
diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h new file mode 100644 index 000000000000..55bc0cd35fd5 --- /dev/null +++ b/include/linux/sched/debug.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_SCHED_DEBUG_H +#define _LINUX_SCHED_DEBUG_H + +#include <linux/sched.h> + +#endif /* _LINUX_SCHED_DEBUG_H */
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 9b976a42376d..6ad4a9fcbd6f 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -13,6 +13,7 @@ #include #include #include +#include <linux/sched/debug.h> #include #include #include "kdb_private.h"
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index bf48a492b4be..c8146d53ca67 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -20,6 +20,7 @@ #include #include #include +#include <linux/sched/debug.h> #include #include #include
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 129247e56902..f0f8e2a8496f 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -17,6 +17,7 @@ #include #include #include +#include <linux/sched/debug.h> #include
diff --git a/kernel/latencytop.c b/kernel/latencytop.c index b5c30d9f46c5..545839b838d6 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -55,6 +55,7 @@ #include #include #include +#include <linux/sched/debug.h> #include #include
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 82ff5726bc1b..198527a62149 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -22,6 +22,7 @@ #include #include #include +#include <linux/sched/debug.h> #include #include #include
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 62b6cee8ea7f..97ee9df32e0f 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -18,6 +18,7 @@ */ #include #include +#include <linux/sched/debug.h> #include #include #include
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 1c8c8707853a..6edc32ecd9c5 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -16,6 +16,7 @@ #include #include #include +#include <linux/sched/debug.h> #include #include "rtmutex_common.h"
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 91e7bc8e9d5a..7bc24d477805 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -7,6 +7,7 @@ */ #include #include +#include <linux/sched/debug.h> #include enum rwsem_waiter_type {
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 9bd6e768164e..34e727f18e49 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -15,6 +15,7 @@ #include #include #include +#include <linux/sched/debug.h> #include #include "rwsem.h"
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 45ba475d4be3..90a74ccd85a4 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -7,6 +7,7 @@ #include #include #include +#include <linux/sched/debug.h> #include
#include #include diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 9512e37637dc..561acdd39960 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/panic.c b/kernel/panic.c index 3ec16e603e88..a58932b41700 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -9,6 +9,7 @@ * to indicate a major problem. */ #include +#include #include #include #include diff --git a/kernel/power/process.c b/kernel/power/process.c index 2fba066e125f..85e2915d8961 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 47050eedc206..3caaaa04b92e 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e456327a63d6..50fee7689e71 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 9dabb04003be..0a62a8f1caac 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include "../time/tick-internal.h" diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index da128deb10ec..55c8530316c7 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index b294a8ee2842..53f9558fa925 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -12,6 +12,7 @@ */ #include +#include #include /** diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5979f47c422c..e1e819f731b2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -1830,7 +1831,6 @@ extern void print_rt_stats(struct seq_file *m, int cpu); extern void print_dl_stats(struct seq_file *m, int cpu); extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); - #ifdef CONFIG_NUMA_BALANCING extern void show_numa_stats(struct task_struct *p, struct seq_file *m); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 1fedfcf6fc9b..4d2ea6f25568 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/signal.c b/kernel/signal.c index 762fcf64f0c3..15092fd8c7b1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 04939053c823..ce3a31e8eb36 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index aed2207f5b2d..ec08f527d7ee 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include diff --git a/kernel/time/timer.c b/kernel/time/timer.c index d7999cb06229..1dc0256bfb6e 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -41,6 +41,7 @@ 
#include #include #include +#include <linux/sched/debug.h> #include #include
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 52718f4512e9..03e0b69bb5bf 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -23,6 +23,7 @@ #include #include #include +#include <linux/sched/debug.h> #include #include
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index b5de262a9eb9..54a427d1f344 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -13,6 +13,8 @@ #include #include +#include <linux/sched/debug.h> + #include #include
diff --git a/lib/dump_stack.c b/lib/dump_stack.c index c30d07e99dba..625375e7f11f 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -6,6 +6,7 @@ #include #include #include +#include <linux/sched/debug.h> #include #include
diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index 5f7999eacad5..4e8a30d1c22f 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -17,6 +17,7 @@ #include #include #include +#include <linux/sched/debug.h> #ifdef arch_trigger_cpumask_backtrace /* For reliability, we're prepared to waste bits here. */
-- cgit v1.2.3
From ef8bd77f332bb0a4e467d7171bbfc6c57aa08a88 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Feb 2017 18:51:36 +0100
Subject: sched/headers: Prepare for new header dependencies before moving code to <linux/sched/hotplug.h>
We are going to split <linux/sched/hotplug.h> out of <linux/sched.h>, which will have to be picked up from other headers and a couple of .c files. Create a trivial placeholder <linux/sched/hotplug.h> file that just maps to <linux/sched.h> to make this patch obviously correct and bisectable. Include the new header in the files that are going to need it.
Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar
--- arch/arm/kernel/smp.c | 1 + arch/arm64/include/asm/mmu_context.h | 1 + arch/arm64/kernel/smp.c | 1 + arch/ia64/kernel/process.c | 1 + arch/metag/kernel/smp.c | 1 + arch/mips/cavium-octeon/smp.c | 1 + arch/mips/kernel/smp-bmips.c | 1 + arch/mips/kernel/smp-cps.c | 1 + arch/mips/loongson64/loongson-3/smp.c | 1 + arch/powerpc/platforms/85xx/smp.c | 1 + arch/powerpc/platforms/powermac/smp.c | 1 + arch/powerpc/platforms/powernv/smp.c | 1 + arch/powerpc/platforms/pseries/hotplug-cpu.c | 1 + arch/s390/kernel/smp.c | 1 + arch/sh/kernel/smp.c | 1 + arch/sparc/kernel/smp_64.c | 1 + arch/x86/kernel/smpboot.c | 1 + arch/xtensa/kernel/smp.c | 1 + include/linux/sched/hotplug.h | 6 ++++++ kernel/cpu.c | 1 + kernel/sched/core.c | 1 + kernel/sched/sched.h | 4 +++- 22 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 include/linux/sched/hotplug.h (limited to 'kernel')
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 5a07c5a4b894..2083a5370308 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -12,6 +12,7 @@ #include #include #include +#include <linux/sched/hotplug.h> #include #include #include
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 1ef40d82cfd3..3c9f7d18a7e6 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -25,6 +25,7 @@ #include #include +#include <linux/sched/hotplug.h> #include #include
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 827d52d78b67..f66e58523b96 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -22,6 +22,7 @@ #include #include #include +#include <linux/sched/hotplug.h> #include #include #include
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 804b251ee5d1..2204ae450d65 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -21,6 +21,7 @@ #include #include #include +#include <linux/sched/hotplug.h> #include #include #include
diff --git
a/arch/metag/kernel/smp.c b/arch/metag/kernel/smp.c index c622293254e4..198eda75846b 100644 --- a/arch/metag/kernel/smp.c +++ b/arch/metag/kernel/smp.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/mips/cavium-octeon/smp.c b/arch/mips/cavium-octeon/smp.c index 4355a4cf4d74..4b94b7fbafa3 100644 --- a/arch/mips/cavium-octeon/smp.c +++ b/arch/mips/cavium-octeon/smp.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include diff --git a/arch/mips/kernel/smp-bmips.c b/arch/mips/kernel/smp-bmips.c index 16e37a28f876..3daa2cae50b0 100644 --- a/arch/mips/kernel/smp-bmips.c +++ b/arch/mips/kernel/smp-bmips.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/arch/mips/kernel/smp-cps.c b/arch/mips/kernel/smp-cps.c index a2544c2394e4..1d3188c23bb8 100644 --- a/arch/mips/kernel/smp-cps.c +++ b/arch/mips/kernel/smp-cps.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/mips/loongson64/loongson-3/smp.c b/arch/mips/loongson64/loongson-3/smp.c index cfcf240cedbe..66ede5b11355 100644 --- a/arch/mips/loongson64/loongson-3/smp.c +++ b/arch/mips/loongson64/loongson-3/smp.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c index a83a6d26090d..078097a0b09d 100644 --- a/arch/powerpc/platforms/85xx/smp.c +++ b/arch/powerpc/platforms/85xx/smp.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c index c9eb7d6540ea..746ca7321b03 100644 --- a/arch/powerpc/platforms/powermac/smp.c +++ b/arch/powerpc/platforms/powermac/smp.c @@ -23,6 +23,7 @@ */ #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index e39e6c428af1..8b67e1eefb5c 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index a1b63e00b2f7..7bc0e91f8715 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -24,6 +24,7 @@ #include #include #include /* for idle_task_exit */ +#include #include #include #include diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index d0a74d7ce433..b5766e03cdcb 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index edc4769b047e..4abf119c129c 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 8e3e13924594..15052d364e04 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fe7a66e6b5a0..f3eb266d75ff 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/xtensa/kernel/smp.c 
b/arch/xtensa/kernel/smp.c index fcea72019df7..f2b3e1725349 100644 --- a/arch/xtensa/kernel/smp.c +++ b/arch/xtensa/kernel/smp.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/sched/hotplug.h b/include/linux/sched/hotplug.h new file mode 100644 index 000000000000..34670ed24894 --- /dev/null +++ b/include/linux/sched/hotplug.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_SCHED_HOTPLUG_H +#define _LINUX_SCHED_HOTPLUG_H + +#include <linux/sched.h> + +#endif /* _LINUX_SCHED_HOTPLUG_H */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 0aae3183b029..78c206579d01 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 11a0684a29a7..956383844116 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e1e819f731b2..0974eb2ef50d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -12,8 +13,9 @@ #include #include #include +#include + #include -#include #include #include #include -- cgit v1.2.3 From 299300258d1bc4e997b7db340a2e06636757fe2e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Feb 2017 18:51:36 +0100 Subject: sched/headers: Prepare for new header dependencies before moving code to <linux/sched/task.h> We are going to split <linux/sched/task.h> out of <linux/sched.h>, which will have to be picked up from other headers and a couple of .c files. Create a trivial placeholder <linux/sched/task.h> file that just maps to <linux/sched.h> to make this patch obviously correct and bisectable. Include the new header in the files that are going to need it. 
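The placeholder created above is worth spelling out, since the whole series leans on it. With the angle-bracketed names restored, include/linux/sched/hotplug.h begins life as nothing more than the following sketch, reconstructed from the hunk that creates the file: a pure pass-through, so converting a file to include it cannot change the build result, and each step of the series stays bisectable.

	/*
	 * include/linux/sched/hotplug.h -- transitional placeholder.
	 * Everything still lives in <linux/sched.h> for now; the CPU-hotplug
	 * related declarations are moved here only by a later patch.
	 */
	#ifndef _LINUX_SCHED_HOTPLUG_H
	#define _LINUX_SCHED_HOTPLUG_H

	#include <linux/sched.h>

	#endif /* _LINUX_SCHED_HOTPLUG_H */
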
Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/alpha/kernel/process.c | 1 + arch/arc/kernel/process.c | 2 ++ arch/arm/kernel/process.c | 1 + arch/arm/mm/init.c | 1 + arch/arm64/kernel/process.c | 1 + arch/avr32/kernel/process.c | 1 + arch/blackfin/kernel/process.c | 1 + arch/blackfin/kernel/trace.c | 1 + arch/c6x/kernel/process.c | 1 + arch/cris/arch-v10/kernel/process.c | 1 + arch/cris/arch-v32/kernel/process.c | 1 + arch/cris/kernel/process.c | 1 + arch/frv/kernel/process.c | 1 + arch/frv/mm/mmu-context.c | 1 + arch/h8300/kernel/process.c | 1 + arch/hexagon/kernel/process.c | 1 + arch/ia64/kernel/mca.c | 1 + arch/ia64/kernel/perfmon.c | 1 + arch/ia64/kernel/process.c | 1 + arch/ia64/kernel/ptrace.c | 1 + arch/m32r/kernel/process.c | 1 + arch/m32r/kernel/smpboot.c | 1 + arch/m68k/kernel/process.c | 1 + arch/metag/kernel/process.c | 1 + arch/metag/kernel/traps.c | 1 + arch/microblaze/kernel/process.c | 1 + arch/mips/kernel/mips-mt-fpaff.c | 1 + arch/mips/kernel/process.c | 1 + arch/mn10300/kernel/process.c | 1 + arch/mn10300/kernel/smp.c | 1 + arch/nios2/kernel/process.c | 1 + arch/openrisc/kernel/process.c | 1 + arch/parisc/kernel/process.c | 1 + arch/powerpc/kernel/process.c | 1 + arch/s390/kernel/process.c | 1 + arch/s390/kernel/setup.c | 1 + arch/score/kernel/process.c | 1 + arch/sh/kernel/cpu/fpu.c | 1 + arch/sh/kernel/process_32.c | 1 + arch/sh/kernel/process_64.c | 1 + arch/sh/mm/asids-debugfs.c | 1 + arch/sparc/kernel/process_32.c | 1 + arch/sparc/kernel/process_64.c | 1 + arch/tile/kernel/process.c | 1 + arch/tile/kernel/smpboot.c | 1 + arch/tile/kernel/unaligned.c | 1 + arch/tile/mm/fault.c | 2 ++ arch/um/kernel/exec.c | 1 + arch/um/kernel/process.c | 1 + arch/um/kernel/reboot.c | 1 + arch/unicore32/kernel/process.c | 1 + arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 1 + arch/x86/kernel/fpu/init.c | 1 + arch/x86/kernel/process.c | 1 + arch/x86/kernel/process_32.c | 1 + arch/x86/kernel/process_64.c | 1 + arch/x86/kernel/unwind_frame.c | 1 + arch/xtensa/kernel/process.c | 1 + drivers/misc/kgdbts.c | 2 ++ drivers/tty/sysrq.c | 2 +- drivers/tty/tty_io.c | 1 + fs/exec.c | 1 + fs/fcntl.c | 1 + fs/fs_struct.c | 1 + fs/proc/array.c | 1 + fs/proc/kcore.c | 1 + include/linux/sched/task.h | 6 ++++++ init/main.c | 1 + kernel/cgroup/cgroup.c | 1 + kernel/cpu.c | 1 + kernel/exit.c | 1 + kernel/fork.c | 1 + kernel/kmod.c | 1 + kernel/kthread.c | 1 + kernel/locking/lockdep.c | 1 + kernel/pid.c | 1 + kernel/pid_namespace.c | 1 + kernel/power/process.c | 1 + kernel/ptrace.c | 1 + kernel/sched/sched.h | 1 + kernel/signal.c | 1 + kernel/smpboot.c | 1 + kernel/sys.c | 1 + kernel/trace/ftrace.c | 1 + kernel/tracepoint.c | 1 + lib/dma-debug.c | 1 + mm/kmemleak.c | 1 + mm/memory-failure.c | 1 + mm/memory.c | 1 + mm/oom_kill.c | 1 + mm/rmap.c | 1 + mm/swapfile.c | 1 + mm/usercopy.c | 2 ++ security/keys/keyctl.c | 1 + security/selinux/hooks.c | 1 + 95 files changed, 104 insertions(+), 1 deletion(-) create mode 100644 include/linux/sched/task.h (limited to 'kernel') diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 88f7a974d311..713b4fac998e 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index a41a79a4f4fe..d618d26721ab 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -11,6 +11,8 
@@ #include #include #include +#include + #include #include #include diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 03d46395d1ff..d4c7c9a1afa9 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 2a9040dcf47e..1d8558ff9827 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 40a16cdd9873..f9b8e74ff8f2 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index dac022d88d9d..75b944a5107c 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -7,6 +7,7 @@ */ #include #include +#include #include #include #include diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index e95d094c20a6..1a1f444004b1 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/blackfin/kernel/trace.c b/arch/blackfin/kernel/trace.c index 23c05b8effb3..151f22196ab6 100644 --- a/arch/blackfin/kernel/trace.c +++ b/arch/blackfin/kernel/trace.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/c6x/kernel/process.c b/arch/c6x/kernel/process.c index 0ee7686a78f3..6b61779d426a 100644 --- a/arch/c6x/kernel/process.c +++ b/arch/c6x/kernel/process.c @@ -17,6 +17,7 @@ #include #include #include +#include #include diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c index 068a40374200..808c2186b704 100644 --- a/arch/cris/arch-v10/kernel/process.c +++ b/arch/cris/arch-v10/kernel/process.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include diff --git a/arch/cris/arch-v32/kernel/process.c b/arch/cris/arch-v32/kernel/process.c index 613c2d71bf10..c852df262948 100644 --- a/arch/cris/arch-v32/kernel/process.c +++ b/arch/cris/arch-v32/kernel/process.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c index 50a7dd451456..0bbd3a0c3d70 100644 --- a/arch/cris/kernel/process.c +++ b/arch/cris/kernel/process.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c index cf4c34c09ab3..c96dbd4b8626 100644 --- a/arch/frv/kernel/process.c +++ b/arch/frv/kernel/process.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/frv/mm/mmu-context.c b/arch/frv/mm/mmu-context.c index 4031289bf2ec..16946a58f64d 100644 --- a/arch/frv/mm/mmu-context.c +++ b/arch/frv/mm/mmu-context.c @@ -11,6 +11,7 @@ #include #include +#include #include #include diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index d7cabf373fe3..54e4d6f01865 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index 3bdd9592d025..a2a822a875b6 100644 --- 
a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -20,6 +20,7 @@ #include #include +#include #include #include #include diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index f9fe3ac64242..79c7c46d7dc1 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 677a86826771..7e943d3c05ed 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 2204ae450d65..054facf22156 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 0b1153e610ea..04fe1436e1cc 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -11,6 +11,7 @@ */ #include #include +#include #include #include #include diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index 1a227c1fdb1a..2a450382934c 100644 --- a/arch/m32r/kernel/process.c +++ b/arch/m32r/kernel/process.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/m32r/kernel/smpboot.c b/arch/m32r/kernel/smpboot.c index f98d2f6519d6..a7d04684d2c7 100644 --- a/arch/m32r/kernel/smpboot.c +++ b/arch/m32r/kernel/smpboot.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index b4a4675f4119..5d335d178706 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/metag/kernel/process.c b/arch/metag/kernel/process.c index 2e611bd37515..801e6f927e62 100644 --- a/arch/metag/kernel/process.c +++ b/arch/metag/kernel/process.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/metag/kernel/traps.c b/arch/metag/kernel/traps.c index 5ce67f9124aa..23c5ac33a93f 100644 --- a/arch/metag/kernel/traps.c +++ b/arch/metag/kernel/traps.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index 8c5fdb2e2850..a642fe5ac5ff 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c index ffc6bd3464d9..8cab633e0e5a 100644 --- a/arch/mips/kernel/mips-mt-fpaff.c +++ b/arch/mips/kernel/mips-mt-fpaff.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 97574cdb532d..8bfb833b3158 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index a1e2f8301d94..8ca7a9de09a5 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/mn10300/kernel/smp.c 
b/arch/mn10300/kernel/smp.c index e65b5cc2fa67..d924f7a79708 100644 --- a/arch/mn10300/kernel/smp.c +++ b/arch/mn10300/kernel/smp.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/nios2/kernel/process.c b/arch/nios2/kernel/process.c index 3ad87a6b119b..5ee1ddf6a924 100644 --- a/arch/nios2/kernel/process.c +++ b/arch/nios2/kernel/process.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index 899339dac938..0c5ad3bde0f5 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 8c283caf2b4d..e5f97239cc5e 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 76f58b87dcb2..b99b12656f6f 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index ed99ff911b67..250c41f05721 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index e4d811f17971..3de07004c02e 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/score/kernel/process.c b/arch/score/kernel/process.c index aae9480706c2..32924159d8c9 100644 --- a/arch/score/kernel/process.c +++ b/arch/score/kernel/process.c @@ -28,6 +28,7 @@ #include #include #include +#include void (*pm_power_off)(void); EXPORT_SYMBOL(pm_power_off); diff --git a/arch/sh/kernel/cpu/fpu.c b/arch/sh/kernel/cpu/fpu.c index 37e35330aae6..b76370a47bf9 100644 --- a/arch/sh/kernel/cpu/fpu.c +++ b/arch/sh/kernel/cpu/fpu.c @@ -1,4 +1,5 @@ #include +#include #include #include #include diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index 5098274e0f23..493c3eb86aa5 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c index ced21a417d9d..056607185158 100644 --- a/arch/sh/kernel/process_64.c +++ b/arch/sh/kernel/process_64.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sh/mm/asids-debugfs.c b/arch/sh/mm/asids-debugfs.c index 110bd35165bf..e5539e0f8e3b 100644 --- a/arch/sh/mm/asids-debugfs.c +++ b/arch/sh/mm/asids-debugfs.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index 237a471c8ed5..ed87c45a785b 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 98f6ff6eaa55..4c82a73af893 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -15,6 +15,7 @@ 
#include #include #include +#include #include #include #include diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index 557b4c8435d0..dbb4fa76d1dd 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -14,6 +14,7 @@ #include #include +#include #include #include #include diff --git a/arch/tile/kernel/smpboot.c b/arch/tile/kernel/smpboot.c index 53ce940a5016..f3fdd0c39b12 100644 --- a/arch/tile/kernel/smpboot.c +++ b/arch/tile/kernel/smpboot.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/tile/kernel/unaligned.c b/arch/tile/kernel/unaligned.c index 142acae78316..8149c38f67b6 100644 --- a/arch/tile/kernel/unaligned.c +++ b/arch/tile/kernel/unaligned.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index 005c91c2adca..f58fa06a2214 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include #include diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c index 770ec07b6a6a..2df4e5d11939 100644 --- a/arch/um/kernel/exec.c +++ b/arch/um/kernel/exec.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index c6c45ea4021f..e76dc7c11251 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/um/kernel/reboot.c b/arch/um/kernel/reboot.c index 79218106a033..0bc921cee0b1 100644 --- a/arch/um/kernel/reboot.c +++ b/arch/um/kernel/reboot.c @@ -4,6 +4,7 @@ */ #include +#include #include #include #include diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c index c1a0eec88f1f..d58f1600b477 100644 --- a/arch/unicore32/kernel/process.c +++ b/arch/unicore32/kernel/process.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index d48af18e7baf..0bbe0f3a039f 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 19bdd1bf8160..c2f8dde3255c 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -7,6 +7,7 @@ #include #include +#include #include /* diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 505be5589e52..441f00e7be8f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index a0ac3e81518a..d79ffa47aab8 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index a61e141b6891..124f5652ae3c 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 23d15565d02a..7c0abdc9a732 100644 --- a/arch/x86/kernel/unwind_frame.c +++ 
b/arch/x86/kernel/unwind_frame.c @@ -1,4 +1,5 @@ #include +#include #include #include #include diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index afaa3588d869..af33f9d4c624 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/misc/kgdbts.c b/drivers/misc/kgdbts.c index 99635dd9dbac..fc7efedbc4be 100644 --- a/drivers/misc/kgdbts.c +++ b/drivers/misc/kgdbts.c @@ -103,6 +103,8 @@ #include #include #include +#include + #include #define v1printk(a...) do { \ diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index e5f0c7d0fe8d..c6fc7141d7b2 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 985d33f0f315..e6d1a6510886 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/exec.c b/fs/exec.c index e857c7260b1f..65145a3df065 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/fcntl.c b/fs/fcntl.c index e1c54f20325c..be8fbe289087 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 543ed50f0387..be0250788b73 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include diff --git a/fs/proc/array.c b/fs/proc/array.c index cfe7f934a300..f3169b58af38 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index ea9f3d1ae830..4ee55274f155 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include "internal.h" diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h new file mode 100644 index 000000000000..0023c91ff821 --- /dev/null +++ b/include/linux/sched/task.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_SCHED_TASK_H +#define _LINUX_SCHED_TASK_H + +#include <linux/sched.h> + +#endif /* _LINUX_SCHED_TASK_H */ diff --git a/init/main.c b/init/main.c index c891cfb8928f..47956b22add1 100644 --- a/init/main.c +++ b/init/main.c @@ -76,6 +76,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e8f87bf9840c..0125589c7428 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/cpu.c b/kernel/cpu.c index 78c206579d01..f7c063239fa5 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/exit.c b/kernel/exit.c index 5bad7dce1b7b..e53408d156df 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 8632905f64c5..0af64d5d8b73 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/kmod.c b/kernel/kmod.c index 0c407f905ca4..ac5f5c2d098d 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c 
@@ -20,6 +20,7 @@ */ #include #include +#include #include #include #include diff --git a/kernel/kthread.c b/kernel/kthread.c index ef9b9eb809c7..2f26adea0f84 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -7,6 +7,7 @@ */ #include #include +#include #include #include #include diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index cdafaff926ca..12e38c213b70 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/pid.c b/kernel/pid.c index 0291804151b5..0143ac0ddceb 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -38,6 +38,7 @@ #include #include #include +#include #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 25314c96fbe6..7c8367854dc4 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -19,6 +19,7 @@ #include #include #include +#include struct pid_cache { int nr_ids; diff --git a/kernel/power/process.c b/kernel/power/process.c index 85e2915d8961..c7209f060eeb 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/ptrace.c b/kernel/ptrace.c index bcdbdc19eb35..0af928712174 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0974eb2ef50d..6b7155ae5c33 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/kernel/signal.c b/kernel/signal.c index 15092fd8c7b1..25d099fd80f7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 4a5c6e73ecd4..1d71c051a951 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sys.c b/kernel/sys.c index 8c0392c8be51..7f1ecd2e41c1 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0c0609326391..0d1597c9ee30 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 9f6984d52fec..685c50ae6300 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -25,6 +25,7 @@ #include #include #include +#include #include extern struct tracepoint * const __start___tracepoints_ptrs[]; diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 60c57ec936db..942adf13418d 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 2df6d3687b2a..a3970573359e 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b74984db386e..27f7210e7fab 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/memory.c b/mm/memory.c index 
d4994a28dc85..a97a4cec2e1f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 69f75b014607..d083714a2bb9 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/rmap.c b/mm/rmap.c index 0367a0506667..2da487d6cea8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -47,6 +47,7 @@ #include #include +#include #include #include #include diff --git a/mm/swapfile.c b/mm/swapfile.c index ff2bf3f61b14..521ef9b6064f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include diff --git a/mm/usercopy.c b/mm/usercopy.c index 7ccad05c0d5c..d155e12563b1 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include enum { diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index bcb0b597c391..52c34532c785 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index b12f873f92ba..57ff53696144 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 68db0cf10678630d286f4bbbbdfa102951a35faa Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Feb 2017 18:51:37 +0100 Subject: sched/headers: Prepare for new header dependencies before moving code to <linux/sched/task_stack.h> We are going to split <linux/sched/task_stack.h> out of <linux/sched.h>, which will have to be picked up from other headers and a couple of .c files. Create a trivial placeholder <linux/sched/task_stack.h> file that just maps to <linux/sched.h> to make this patch obviously correct and bisectable. Include the new header in the files that are going to need it. 
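On the consumer side the conversions are equally mechanical. A hypothetical converted file would look roughly like the sketch below; the file and function are illustrative, not from the patch, and task_stack_page() is the kind of accessor expected to move into <linux/sched/task_stack.h> once the actual split lands. The extra include is a no-op while the new header is still a pass-through.

	/* example.c -- hypothetical consumer of the new header */
	#include <linux/sched.h>
	#include <linux/sched/task_stack.h>	/* will provide the task-stack accessors after the split */

	void *example_task_stack(struct task_struct *tsk)
	{
		/* resolved via <linux/sched.h> today, via <linux/sched/task_stack.h> later */
		return task_stack_page(tsk);
	}
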
Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/alpha/kernel/osf_sys.c | 1 + arch/alpha/kernel/process.c | 1 + arch/alpha/kernel/ptrace.c | 1 + arch/alpha/kernel/signal.c | 1 + arch/arc/kernel/kgdb.c | 1 + arch/arc/kernel/process.c | 1 + arch/arc/kernel/ptrace.c | 1 + arch/arc/kernel/signal.c | 2 ++ arch/arm/kernel/perf_regs.c | 1 + arch/arm/kernel/process.c | 1 + arch/arm/kernel/ptrace.c | 1 + arch/arm/kernel/smp.c | 1 + arch/arm/kernel/traps.c | 1 + arch/arm64/include/asm/compat.h | 1 + arch/arm64/kernel/debug-monitors.c | 1 + arch/arm64/kernel/kgdb.c | 2 ++ arch/arm64/kernel/perf_regs.c | 1 + arch/arm64/kernel/process.c | 1 + arch/arm64/kernel/ptrace.c | 1 + arch/arm64/kernel/smp.c | 1 + arch/arm64/kernel/stacktrace.c | 1 + arch/arm64/kernel/traps.c | 1 + arch/avr32/kernel/process.c | 1 + arch/avr32/kernel/ptrace.c | 1 + arch/avr32/kernel/stacktrace.c | 1 + arch/blackfin/kernel/process.c | 1 + arch/blackfin/kernel/ptrace.c | 1 + arch/blackfin/kernel/signal.c | 1 + arch/blackfin/kernel/stacktrace.c | 1 + arch/blackfin/mach-common/smp.c | 1 + arch/c6x/kernel/process.c | 1 + arch/c6x/kernel/ptrace.c | 1 + arch/cris/arch-v10/kernel/process.c | 1 + arch/cris/arch-v10/kernel/ptrace.c | 1 + arch/cris/arch-v10/kernel/signal.c | 1 + arch/cris/arch-v32/kernel/process.c | 1 + arch/cris/arch-v32/kernel/ptrace.c | 1 + arch/cris/arch-v32/kernel/signal.c | 1 + arch/frv/kernel/process.c | 1 + arch/h8300/kernel/process.c | 1 + arch/h8300/kernel/signal.c | 1 + arch/hexagon/kernel/kgdb.c | 1 + arch/hexagon/kernel/process.c | 1 + arch/hexagon/kernel/ptrace.c | 1 + arch/hexagon/kernel/signal.c | 2 ++ arch/hexagon/kernel/stacktrace.c | 1 + arch/hexagon/kernel/traps.c | 1 + arch/ia64/kernel/perfmon.c | 1 + arch/ia64/kernel/process.c | 1 + arch/ia64/kernel/ptrace.c | 1 + arch/ia64/kernel/setup.c | 1 + arch/ia64/kernel/sys_ia64.c | 1 + arch/m32r/kernel/process.c | 1 + arch/m32r/kernel/ptrace.c | 1 + arch/m68k/kernel/process.c | 1 + arch/m68k/kernel/ptrace.c | 1 + arch/metag/kernel/process.c | 1 + arch/metag/kernel/ptrace.c | 2 ++ arch/metag/kernel/signal.c | 1 + arch/metag/kernel/smp.c | 1 + arch/metag/kernel/traps.c | 1 + arch/microblaze/kernel/process.c | 1 + arch/microblaze/kernel/ptrace.c | 1 + arch/microblaze/kernel/unwind.c | 1 + arch/mips/include/asm/fpu.h | 1 + arch/mips/kernel/crash.c | 1 + arch/mips/kernel/perf_event.c | 1 + arch/mips/kernel/process.c | 1 + arch/mips/kernel/ptrace.c | 1 + arch/mips/kernel/ptrace32.c | 1 + arch/mips/kernel/stacktrace.c | 1 + arch/mips/kernel/syscall.c | 1 + arch/mips/loongson64/loongson-3/smp.c | 1 + arch/mips/paravirt/paravirt-smp.c | 1 + arch/mips/sibyte/bcm1480/smp.c | 1 + arch/mn10300/kernel/process.c | 1 + arch/mn10300/kernel/ptrace.c | 1 + arch/nios2/kernel/process.c | 1 + arch/nios2/kernel/ptrace.c | 1 + arch/openrisc/kernel/process.c | 1 + arch/openrisc/kernel/ptrace.c | 1 + arch/parisc/kernel/process.c | 1 + arch/powerpc/kernel/process.c | 1 + arch/powerpc/mm/fault.c | 1 + arch/powerpc/perf/perf_regs.c | 1 + arch/s390/include/asm/compat.h | 1 + arch/s390/include/asm/kprobes.h | 1 + arch/s390/kernel/compat_signal.c | 1 + arch/s390/kernel/dumpstack.c | 1 + arch/s390/kernel/process.c | 1 + arch/s390/kernel/ptrace.c | 1 + arch/s390/kernel/runtime_instr.c | 2 ++ arch/s390/kernel/signal.c | 1 + arch/s390/kernel/smp.c | 1 + arch/s390/kernel/uprobes.c | 2 ++ arch/score/kernel/process.c | 1 + arch/score/kernel/ptrace.c | 1 + arch/sh/kernel/cpu/fpu.c | 1 + 
arch/sh/kernel/dumpstack.c | 1 + arch/sh/kernel/kgdb.c | 2 ++ arch/sh/kernel/process.c | 1 + arch/sh/kernel/process_32.c | 1 + arch/sh/kernel/process_64.c | 1 + arch/sh/kernel/ptrace_32.c | 1 + arch/sh/kernel/ptrace_64.c | 1 + arch/sh/kernel/signal_32.c | 1 + arch/sh/kernel/sys_sh32.c | 1 + arch/sh/kernel/traps.c | 1 + arch/sh/kernel/traps_32.c | 2 ++ arch/sparc/kernel/process_32.c | 1 + arch/sparc/kernel/process_64.c | 1 + arch/sparc/kernel/ptrace_64.c | 1 + arch/tile/kernel/compat_signal.c | 1 + arch/tile/kernel/kgdb.c | 2 ++ arch/tile/kernel/process.c | 1 + arch/tile/kernel/ptrace.c | 2 ++ arch/tile/kernel/signal.c | 1 + arch/tile/kernel/stack.c | 1 + arch/um/kernel/exec.c | 1 + arch/um/kernel/process.c | 1 + arch/um/kernel/skas/process.c | 1 + arch/unicore32/kernel/process.c | 1 + arch/unicore32/kernel/ptrace.c | 1 + arch/unicore32/kernel/traps.c | 1 + arch/x86/entry/common.c | 1 + arch/x86/entry/vdso/vma.c | 1 + arch/x86/ia32/ia32_aout.c | 1 + arch/x86/ia32/ia32_signal.c | 1 + arch/x86/kernel/dumpstack.c | 1 + arch/x86/kernel/fpu/regset.c | 1 + arch/x86/kernel/ioport.c | 1 + arch/x86/kernel/irq_64.c | 1 + arch/x86/kernel/perf_regs.c | 1 + arch/x86/kernel/process.c | 1 + arch/x86/kernel/process_32.c | 1 + arch/x86/kernel/process_64.c | 1 + arch/x86/kernel/ptrace.c | 1 + arch/x86/kernel/signal.c | 1 + arch/x86/kernel/smpboot.c | 1 + arch/x86/kernel/stacktrace.c | 1 + arch/x86/kernel/step.c | 1 + arch/x86/kernel/traps.c | 1 + arch/x86/kernel/unwind_frame.c | 1 + arch/x86/kernel/vm86_32.c | 1 + arch/x86/mm/fault.c | 1 + arch/xtensa/kernel/process.c | 1 + arch/xtensa/kernel/ptrace.c | 1 + arch/xtensa/kernel/signal.c | 1 + arch/xtensa/kernel/smp.c | 1 + block/blk-map.c | 1 + drivers/hv/vmbus_drv.c | 2 ++ drivers/ide/ide-cd.c | 1 + drivers/md/bcache/closure.h | 1 + drivers/misc/lkdtm_usercopy.c | 1 + drivers/mtd/nand/gpmi-nand/gpmi-nand.c | 1 + fs/binfmt_aout.c | 1 + fs/binfmt_elf.c | 1 + fs/binfmt_elf_fdpic.c | 3 ++- fs/binfmt_flat.c | 1 + fs/coredump.c | 1 + include/linux/elfcore.h | 2 ++ include/linux/perf_regs.h | 2 ++ include/linux/sched/task_stack.h | 6 ++++++ init/main.c | 1 + kernel/events/callchain.c | 2 ++ kernel/exit.c | 1 + kernel/fork.c | 1 + kernel/printk/printk.c | 1 + kernel/sched/sched.h | 1 + kernel/seccomp.c | 1 + kernel/signal.c | 1 + kernel/trace/trace_stack.c | 1 + lib/debugobjects.c | 1 + lib/dma-debug.c | 1 + lib/syscall.c | 1 + mm/kasan/kasan.c | 1 + mm/kmemleak.c | 1 + mm/util.c | 1 + 178 files changed, 198 insertions(+), 1 deletion(-) create mode 100644 include/linux/sched/task_stack.h (limited to 'kernel') diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 3b9b2a382ba2..73446baa632e 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 713b4fac998e..0b9635040721 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c index bc4d2cdcf21d..285a82d491ef 100644 --- a/arch/alpha/kernel/ptrace.c +++ b/arch/alpha/kernel/ptrace.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c index b8221f112eee..8129dd92cadc 100644 --- a/arch/alpha/kernel/signal.c +++ b/arch/alpha/kernel/signal.c @@ -7,6 +7,7 @@ */ #include 
+#include #include #include #include diff --git a/arch/arc/kernel/kgdb.c b/arch/arc/kernel/kgdb.c index ecf6a7869375..9a3c34af2ae8 100644 --- a/arch/arc/kernel/kgdb.c +++ b/arch/arc/kernel/kgdb.c @@ -10,6 +10,7 @@ #include #include +#include #include #include diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index d618d26721ab..2a018de6d6cd 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include diff --git a/arch/arc/kernel/ptrace.c b/arch/arc/kernel/ptrace.c index 4442204fe238..31150060d38b 100644 --- a/arch/arc/kernel/ptrace.c +++ b/arch/arc/kernel/ptrace.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include diff --git a/arch/arc/kernel/signal.c b/arch/arc/kernel/signal.c index d347bbc086fe..48685445002e 100644 --- a/arch/arc/kernel/signal.c +++ b/arch/arc/kernel/signal.c @@ -53,6 +53,8 @@ #include #include #include +#include + #include struct rt_sigframe { diff --git a/arch/arm/kernel/perf_regs.c b/arch/arm/kernel/perf_regs.c index 592dda3f21ff..c366b83bf955 100644 --- a/arch/arm/kernel/perf_regs.c +++ b/arch/arm/kernel/perf_regs.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index d4c7c9a1afa9..939e8b58c59d 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 46f7bab81c40..58e3771e4c5b 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -11,6 +11,7 @@ */ #include #include +#include #include #include #include diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 2083a5370308..b724cff7ad60 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index a9dad001b97d..948c648fea00 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h index eb8432bb82b8..e39d487bf724 100644 --- a/arch/arm64/include/asm/compat.h +++ b/arch/arm64/include/asm/compat.h @@ -23,6 +23,7 @@ */ #include #include +#include #define COMPAT_USER_HZ 100 #ifdef __AARCH64EB__ diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 2bd426448fc1..32913567da08 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c index d217c9e95b06..2122cd187f19 100644 --- a/arch/arm64/kernel/kgdb.c +++ b/arch/arm64/kernel/kgdb.c @@ -24,6 +24,8 @@ #include #include #include +#include + #include #include #include diff --git a/arch/arm64/kernel/perf_regs.c b/arch/arm64/kernel/perf_regs.c index 3f62b35fb6f1..bd1b74c2436f 100644 --- a/arch/arm64/kernel/perf_regs.c +++ b/arch/arm64/kernel/perf_regs.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index f9b8e74ff8f2..043d373b8369 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include 
#include diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 64fc32ea3422..c142459a88f3 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index f66e58523b96..83c0a839a6ad 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 7597e42feeea..feac80c22f61 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 5a5d83791837..bdbd0c3febf4 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index 75b944a5107c..ad0dfccedb79 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/avr32/kernel/ptrace.c b/arch/avr32/kernel/ptrace.c index a89b893279bb..41a14e96a1db 100644 --- a/arch/avr32/kernel/ptrace.c +++ b/arch/avr32/kernel/ptrace.c @@ -8,6 +8,7 @@ #undef DEBUG #include #include +#include #include #include #include diff --git a/arch/avr32/kernel/stacktrace.c b/arch/avr32/kernel/stacktrace.c index c09f0d8dd679..f8cc995cf0e0 100644 --- a/arch/avr32/kernel/stacktrace.c +++ b/arch/avr32/kernel/stacktrace.c @@ -8,6 +8,7 @@ * published by the Free Software Foundation. 
*/ #include +#include #include #include #include diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 1a1f444004b1..b691ef875a40 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/blackfin/kernel/ptrace.c b/arch/blackfin/kernel/ptrace.c index 360d99645163..a6827095b99a 100644 --- a/arch/blackfin/kernel/ptrace.c +++ b/arch/blackfin/kernel/ptrace.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include diff --git a/arch/blackfin/kernel/signal.c b/arch/blackfin/kernel/signal.c index ea570db598e5..5f5172779204 100644 --- a/arch/blackfin/kernel/signal.c +++ b/arch/blackfin/kernel/signal.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include diff --git a/arch/blackfin/kernel/stacktrace.c b/arch/blackfin/kernel/stacktrace.c index 30301e1eace5..17198f3650b6 100644 --- a/arch/blackfin/kernel/stacktrace.c +++ b/arch/blackfin/kernel/stacktrace.c @@ -6,6 +6,7 @@ */ #include +#include #include #include #include diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c index a2e6db2ce811..3ac98252d299 100644 --- a/arch/blackfin/mach-common/smp.c +++ b/arch/blackfin/mach-common/smp.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/c6x/kernel/process.c b/arch/c6x/kernel/process.c index 6b61779d426a..c4ecb24c2d5c 100644 --- a/arch/c6x/kernel/process.c +++ b/arch/c6x/kernel/process.c @@ -18,6 +18,7 @@ #include #include #include +#include #include diff --git a/arch/c6x/kernel/ptrace.c b/arch/c6x/kernel/ptrace.c index 3c494e84444d..a27e1f02ce18 100644 --- a/arch/c6x/kernel/ptrace.c +++ b/arch/c6x/kernel/ptrace.c @@ -14,6 +14,7 @@ #include #include #include +#include #include diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c index 808c2186b704..e299d30105b5 100644 --- a/arch/cris/arch-v10/kernel/process.c +++ b/arch/cris/arch-v10/kernel/process.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/cris/arch-v10/kernel/ptrace.c b/arch/cris/arch-v10/kernel/ptrace.c index eca94c7d56e7..c2f2b9b83cc4 100644 --- a/arch/cris/arch-v10/kernel/ptrace.c +++ b/arch/cris/arch-v10/kernel/ptrace.c @@ -4,6 +4,7 @@ #include #include +#include #include #include #include diff --git a/arch/cris/arch-v10/kernel/signal.c b/arch/cris/arch-v10/kernel/signal.c index db30c98e4926..bab4a8dd6bfd 100644 --- a/arch/cris/arch-v10/kernel/signal.c +++ b/arch/cris/arch-v10/kernel/signal.c @@ -14,6 +14,7 @@ */ #include +#include #include #include #include diff --git a/arch/cris/arch-v32/kernel/process.c b/arch/cris/arch-v32/kernel/process.c index c852df262948..c530a8fa87ce 100644 --- a/arch/cris/arch-v32/kernel/process.c +++ b/arch/cris/arch-v32/kernel/process.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/cris/arch-v32/kernel/ptrace.c b/arch/cris/arch-v32/kernel/ptrace.c index c366bc05466a..0461e95bbb62 100644 --- a/arch/cris/arch-v32/kernel/ptrace.c +++ b/arch/cris/arch-v32/kernel/ptrace.c @@ -4,6 +4,7 @@ #include #include +#include #include #include #include diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c index 816bf2ca93ef..ea2e8e1398e8 100644 --- a/arch/cris/arch-v32/kernel/signal.c +++ b/arch/cris/arch-v32/kernel/signal.c @@ -3,6 +3,7 @@ */ #include +#include #include #include #include diff --git 
a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c index c96dbd4b8626..5a4c92abc99e 100644 --- a/arch/frv/kernel/process.c +++ b/arch/frv/kernel/process.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index 54e4d6f01865..0f5db5bb561b 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c index d784f7117f9a..1e8070d08770 100644 --- a/arch/h8300/kernel/signal.c +++ b/arch/h8300/kernel/signal.c @@ -25,6 +25,7 @@ */ #include +#include #include #include #include diff --git a/arch/hexagon/kernel/kgdb.c b/arch/hexagon/kernel/kgdb.c index 62dece3ad827..16c24b22d0b2 100644 --- a/arch/hexagon/kernel/kgdb.c +++ b/arch/hexagon/kernel/kgdb.c @@ -20,6 +20,7 @@ #include #include +#include #include #include diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index a2a822a875b6..de715bab7956 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/hexagon/kernel/ptrace.c b/arch/hexagon/kernel/ptrace.c index 390a9ad14ca1..ecd75e2e8eb3 100644 --- a/arch/hexagon/kernel/ptrace.c +++ b/arch/hexagon/kernel/ptrace.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #include diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c index c6b22b9945a7..78aa7304a5c9 100644 --- a/arch/hexagon/kernel/signal.c +++ b/arch/hexagon/kernel/signal.c @@ -21,6 +21,8 @@ #include #include #include +#include + #include #include #include diff --git a/arch/hexagon/kernel/stacktrace.c b/arch/hexagon/kernel/stacktrace.c index f94918b449a8..41866a06adf7 100644 --- a/arch/hexagon/kernel/stacktrace.c +++ b/arch/hexagon/kernel/stacktrace.c @@ -19,6 +19,7 @@ */ #include +#include #include #include #include diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c index b55f13c70b34..2942a9204a9a 100644 --- a/arch/hexagon/kernel/traps.c +++ b/arch/hexagon/kernel/traps.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 7e943d3c05ed..09f86ebfcc7b 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 054facf22156..d344d0d691aa 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 04fe1436e1cc..3f8293378a83 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 32a6cc36296f..b2b4f5216180 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c index ce4cc60d519b..5ce927c854a6 100644 --- a/arch/ia64/kernel/sys_ia64.c +++ b/arch/ia64/kernel/sys_ia64.c @@ -11,6 +11,7 @@ #include #include #include +#include 
#include #include /* doh, must come after sched.h... */ #include diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index 2a450382934c..d8ffcfec599c 100644 --- a/arch/m32r/kernel/process.c +++ b/arch/m32r/kernel/process.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/m32r/kernel/ptrace.c b/arch/m32r/kernel/ptrace.c index a68acb9fa515..2d887400e30e 100644 --- a/arch/m32r/kernel/ptrace.c +++ b/arch/m32r/kernel/ptrace.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 5d335d178706..e475c945c8b2 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/m68k/kernel/ptrace.c b/arch/m68k/kernel/ptrace.c index 9cd86d7343a6..748c63bd0081 100644 --- a/arch/m68k/kernel/ptrace.c +++ b/arch/m68k/kernel/ptrace.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include diff --git a/arch/metag/kernel/process.c b/arch/metag/kernel/process.c index 801e6f927e62..c4606ce743d2 100644 --- a/arch/metag/kernel/process.c +++ b/arch/metag/kernel/process.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/metag/kernel/ptrace.c b/arch/metag/kernel/ptrace.c index 7563628822bd..5fd16ee5280c 100644 --- a/arch/metag/kernel/ptrace.c +++ b/arch/metag/kernel/ptrace.c @@ -15,6 +15,8 @@ #include #include #include +#include + #include #define CREATE_TRACE_POINTS diff --git a/arch/metag/kernel/signal.c b/arch/metag/kernel/signal.c index ce49d429c74a..338925d808e6 100644 --- a/arch/metag/kernel/signal.c +++ b/arch/metag/kernel/signal.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include diff --git a/arch/metag/kernel/smp.c b/arch/metag/kernel/smp.c index 198eda75846b..cab2aa64ca82 100644 --- a/arch/metag/kernel/smp.c +++ b/arch/metag/kernel/smp.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/metag/kernel/traps.c b/arch/metag/kernel/traps.c index 23c5ac33a93f..444851e510d5 100644 --- a/arch/metag/kernel/traps.c +++ b/arch/metag/kernel/traps.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index a642fe5ac5ff..e92a817e645f 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/microblaze/kernel/ptrace.c b/arch/microblaze/kernel/ptrace.c index 8cfa98cadf3d..badd286882ae 100644 --- a/arch/microblaze/kernel/ptrace.c +++ b/arch/microblaze/kernel/ptrace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/microblaze/kernel/unwind.c b/arch/microblaze/kernel/unwind.c index 61c04eed14d5..34c270cb11fc 100644 --- a/arch/microblaze/kernel/unwind.c +++ b/arch/microblaze/kernel/unwind.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/mips/include/asm/fpu.h b/arch/mips/include/asm/fpu.h index f06f97bd62df..321752bcbab6 100644 --- a/arch/mips/include/asm/fpu.h +++ b/arch/mips/include/asm/fpu.h @@ -11,6 +11,7 @@ #define _ASM_FPU_H #include +#include #include #include diff --git a/arch/mips/kernel/crash.c b/arch/mips/kernel/crash.c index 5a71518be0f1..ca25cd393b1c 100644 --- 
a/arch/mips/kernel/crash.c +++ b/arch/mips/kernel/crash.c @@ -8,6 +8,7 @@ #include #include #include +#include /* This keeps a track of which one is crashing cpu. */ static int crashing_cpu = -1; diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c index d64056e0bb56..f298eb2ff6c2 100644 --- a/arch/mips/kernel/perf_event.c +++ b/arch/mips/kernel/perf_event.c @@ -15,6 +15,7 @@ */ #include +#include #include diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 8bfb833b3158..fb6b6b650719 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index fdef26382c37..339601267265 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/mips/kernel/ptrace32.c b/arch/mips/kernel/ptrace32.c index 4f0998525626..40e212d6b26b 100644 --- a/arch/mips/kernel/ptrace32.c +++ b/arch/mips/kernel/ptrace32.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/mips/kernel/stacktrace.c b/arch/mips/kernel/stacktrace.c index 986f910961d9..7c7c902249f2 100644 --- a/arch/mips/kernel/stacktrace.c +++ b/arch/mips/kernel/stacktrace.c @@ -5,6 +5,7 @@ */ #include #include +#include #include #include #include diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c index c86ddbaa4598..f1d17ece4181 100644 --- a/arch/mips/kernel/syscall.c +++ b/arch/mips/kernel/syscall.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include diff --git a/arch/mips/loongson64/loongson-3/smp.c b/arch/mips/loongson64/loongson-3/smp.c index 66ede5b11355..64659fc73940 100644 --- a/arch/mips/loongson64/loongson-3/smp.c +++ b/arch/mips/loongson64/loongson-3/smp.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/mips/paravirt/paravirt-smp.c b/arch/mips/paravirt/paravirt-smp.c index f8d3e081b2eb..72eb1a56c645 100644 --- a/arch/mips/paravirt/paravirt-smp.c +++ b/arch/mips/paravirt/paravirt-smp.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include diff --git a/arch/mips/sibyte/bcm1480/smp.c b/arch/mips/sibyte/bcm1480/smp.c index 4c71aea25663..d0e94ffcc1b8 100644 --- a/arch/mips/sibyte/bcm1480/smp.c +++ b/arch/mips/sibyte/bcm1480/smp.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index 8ca7a9de09a5..c9fa42619c6a 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/mn10300/kernel/ptrace.c b/arch/mn10300/kernel/ptrace.c index 976020f469c1..8009876a7ac4 100644 --- a/arch/mn10300/kernel/ptrace.c +++ b/arch/mn10300/kernel/ptrace.c @@ -11,6 +11,7 @@ */ #include #include +#include #include #include #include diff --git a/arch/nios2/kernel/process.c b/arch/nios2/kernel/process.c index 5ee1ddf6a924..869a4d59de32 100644 --- a/arch/nios2/kernel/process.c +++ b/arch/nios2/kernel/process.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include diff --git a/arch/nios2/kernel/ptrace.c b/arch/nios2/kernel/ptrace.c index 681dda92eff1..de97bcb7dd44 100644 --- a/arch/nios2/kernel/ptrace.c +++ b/arch/nios2/kernel/ptrace.c @@ -14,6 +14,7 @@ #include #include #include 
+#include #include #include #include diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index 0c5ad3bde0f5..828a29110459 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/openrisc/kernel/ptrace.c b/arch/openrisc/kernel/ptrace.c index 228288887d74..eb97a8e7c8aa 100644 --- a/arch/openrisc/kernel/ptrace.c +++ b/arch/openrisc/kernel/ptrace.c @@ -18,6 +18,7 @@ #include #include +#include #include #include diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index e5f97239cc5e..06f7ca7fe70b 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index b99b12656f6f..d645da302bf2 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 8dc758658972..51def8a515be 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include diff --git a/arch/powerpc/perf/perf_regs.c b/arch/powerpc/perf/perf_regs.c index d24a8a3668fa..cbd82fde5770 100644 --- a/arch/powerpc/perf/perf_regs.c +++ b/arch/powerpc/perf/perf_regs.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index 352f7bdaf11f..0ddd37e6c29d 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -5,6 +5,7 @@ */ #include #include +#include #include #define __TYPE_IS_PTR(t) (!__builtin_types_compatible_p(typeof(0?(t)0:0ULL), u64)) diff --git a/arch/s390/include/asm/kprobes.h b/arch/s390/include/asm/kprobes.h index 84c0f9086483..1293c4066cfc 100644 --- a/arch/s390/include/asm/kprobes.h +++ b/arch/s390/include/asm/kprobes.h @@ -35,6 +35,7 @@ #include #include #include +#include #define __ARCH_WANT_KPROBES_INSN_SLOT diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 362350cc485c..c620049c61f2 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index 72c584d62f59..829e1c53005c 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 250c41f05721..20cd339e11ae 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 12020b55887b..c14df0a1ec3c 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include diff --git a/arch/s390/kernel/runtime_instr.c b/arch/s390/kernel/runtime_instr.c index fffa0e5462af..429d3a782f1c 100644 --- a/arch/s390/kernel/runtime_instr.c +++ b/arch/s390/kernel/runtime_instr.c @@ -11,6 +11,8 @@ #include #include #include +#include + #include #include #include diff --git 
a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 62a4c263e887..289dd50f9744 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -10,6 +10,7 @@ */ #include +#include #include #include #include diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index b5766e03cdcb..47a973b5b4f1 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/s390/kernel/uprobes.c b/arch/s390/kernel/uprobes.c index 66956c09d5bf..314e0ee3016a 100644 --- a/arch/s390/kernel/uprobes.c +++ b/arch/s390/kernel/uprobes.c @@ -9,6 +9,8 @@ #include #include #include +#include + #include #include #include diff --git a/arch/score/kernel/process.c b/arch/score/kernel/process.c index 32924159d8c9..eb64d7a677cb 100644 --- a/arch/score/kernel/process.c +++ b/arch/score/kernel/process.c @@ -29,6 +29,7 @@ #include #include #include +#include void (*pm_power_off)(void); EXPORT_SYMBOL(pm_power_off); diff --git a/arch/score/kernel/ptrace.c b/arch/score/kernel/ptrace.c index 8b75e54816c1..d8455e60bce0 100644 --- a/arch/score/kernel/ptrace.c +++ b/arch/score/kernel/ptrace.c @@ -28,6 +28,7 @@ #include #include #include +#include #include diff --git a/arch/sh/kernel/cpu/fpu.c b/arch/sh/kernel/cpu/fpu.c index b76370a47bf9..547c73478459 100644 --- a/arch/sh/kernel/cpu/fpu.c +++ b/arch/sh/kernel/cpu/fpu.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include diff --git a/arch/sh/kernel/dumpstack.c b/arch/sh/kernel/dumpstack.c index d00cab2f50f9..b564b1eae4ae 100644 --- a/arch/sh/kernel/dumpstack.c +++ b/arch/sh/kernel/dumpstack.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sh/kernel/kgdb.c b/arch/sh/kernel/kgdb.c index adad46e41a1d..4f04c6638a4d 100644 --- a/arch/sh/kernel/kgdb.c +++ b/arch/sh/kernel/kgdb.c @@ -14,6 +14,8 @@ #include #include #include +#include + #include #include diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c index 80a61c5e3015..f8a695a223dd 100644 --- a/arch/sh/kernel/process.c +++ b/arch/sh/kernel/process.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index 493c3eb86aa5..2c7bdf8cb934 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c index 056607185158..ee2abe96f9f3 100644 --- a/arch/sh/kernel/process_64.c +++ b/arch/sh/kernel/process_64.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c index 1aabfd356b35..5fc3ff606210 100644 --- a/arch/sh/kernel/ptrace_32.c +++ b/arch/sh/kernel/ptrace_32.c @@ -12,6 +12,7 @@ */ #include #include +#include #include #include #include diff --git a/arch/sh/kernel/ptrace_64.c b/arch/sh/kernel/ptrace_64.c index c49d0d05a215..1e0656d9e7af 100644 --- a/arch/sh/kernel/ptrace_64.c +++ b/arch/sh/kernel/ptrace_64.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c index 5128d3001ee5..08bce11badc6 100644 --- a/arch/sh/kernel/signal_32.c +++ b/arch/sh/kernel/signal_32.c @@ -9,6 +9,7 @@ * */ #include +#include #include #include #include diff --git a/arch/sh/kernel/sys_sh32.c 
b/arch/sh/kernel/sys_sh32.c index d5287d76809c..a2e1231a90a3 100644 --- a/arch/sh/kernel/sys_sh32.c +++ b/arch/sh/kernel/sys_sh32.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c index bc5c9f347b9e..b32d1c3a4655 100644 --- a/arch/sh/kernel/traps.c +++ b/arch/sh/kernel/traps.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c index ff639342a8be..57cff00cad17 100644 --- a/arch/sh/kernel/traps_32.c +++ b/arch/sh/kernel/traps_32.c @@ -25,6 +25,8 @@ #include #include #include +#include + #include #include #include diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index ed87c45a785b..b6dac8e980f0 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 4c82a73af893..1badc493e62e 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c index 901063c1cf7e..df9e731a76f5 100644 --- a/arch/sparc/kernel/ptrace_64.c +++ b/arch/sparc/kernel/ptrace_64.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include diff --git a/arch/tile/kernel/compat_signal.c b/arch/tile/kernel/compat_signal.c index c667e104a0c2..0e863f1ee08c 100644 --- a/arch/tile/kernel/compat_signal.c +++ b/arch/tile/kernel/compat_signal.c @@ -13,6 +13,7 @@ */ #include +#include #include #include #include diff --git a/arch/tile/kernel/kgdb.c b/arch/tile/kernel/kgdb.c index 9247d6b562f4..d4eb5fb2df9d 100644 --- a/arch/tile/kernel/kgdb.c +++ b/arch/tile/kernel/kgdb.c @@ -19,6 +19,8 @@ #include #include #include +#include + #include static tile_bundle_bits singlestep_insn = TILEGX_BPT_BUNDLE | DIE_SSTEPBP; diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index dbb4fa76d1dd..f0a0e18e4dfb 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/tile/kernel/ptrace.c b/arch/tile/kernel/ptrace.c index e279572824b1..e1a078e6828e 100644 --- a/arch/tile/kernel/ptrace.c +++ b/arch/tile/kernel/ptrace.c @@ -23,6 +23,8 @@ #include #include #include +#include + #include #include diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c index 8cc92ae6eb27..f2bf557bb005 100644 --- a/arch/tile/kernel/signal.c +++ b/arch/tile/kernel/signal.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include diff --git a/arch/tile/kernel/stack.c b/arch/tile/kernel/stack.c index a1c0787fe9d8..94ecbc6676e5 100644 --- a/arch/tile/kernel/stack.c +++ b/arch/tile/kernel/stack.c @@ -14,6 +14,7 @@ #include #include +#include #include #include #include diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c index 2df4e5d11939..31968677a0d0 100644 --- a/arch/um/kernel/exec.c +++ b/arch/um/kernel/exec.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index e76dc7c11251..a9bd61820042 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git 
a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c index 527fa5881915..ddecf326dab2 100644 --- a/arch/um/kernel/skas/process.c +++ b/arch/um/kernel/skas/process.c @@ -5,6 +5,7 @@ #include #include +#include #include #include #include diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c index d58f1600b477..d22c1dc7e39e 100644 --- a/arch/unicore32/kernel/process.c +++ b/arch/unicore32/kernel/process.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/unicore32/kernel/ptrace.c b/arch/unicore32/kernel/ptrace.c index 9f07c08da050..a102c2b4f358 100644 --- a/arch/unicore32/kernel/ptrace.c +++ b/arch/unicore32/kernel/ptrace.c @@ -15,6 +15,7 @@ #include #include #include +#include /* * this routine will get a word off of the processes privileged stack. diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c index 506e1a2127c6..5f25b39f04d4 100644 --- a/arch/unicore32/kernel/traps.c +++ b/arch/unicore32/kernel/traps.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index b83c61cfd154..370c42c7f046 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 572cee3fccff..226ca70dc6bd 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 7c0a711989d2..8d0879f1d42c 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 95c0b4ae09b0..724153797209 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 7b9e7e68f316..09d4ac0d2661 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index c114b132d121..b188b16841e3 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -5,6 +5,7 @@ #include #include #include +#include /* * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index b01bc8517450..ca49bab3e467 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -4,6 +4,7 @@ */ #include +#include #include #include #include diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 6b0678a541e2..3be74fbdeff2 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index da8cb987b973..587d887f7f17 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 441f00e7be8f..56b059486c3b 100644 --- a/arch/x86/kernel/process.c +++ 
b/arch/x86/kernel/process.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index d79ffa47aab8..4c818f8bc135 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 124f5652ae3c..d6b784a5520d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 9cc7d5a330ef..2364b23ea3e5 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 763af1d0de64..396c042e9d0e 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -10,6 +10,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f3eb266d75ff..bd1f1ad35284 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 330cae0025d0..8e2b79b88e51 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -5,6 +5,7 @@ */ #include #include +#include #include #include #include diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index a23ce84a3f6c..f07f83b3611b 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -2,6 +2,7 @@ * x86 single-step support code, common to 32-bit and 64-bit. */ #include +#include #include #include #include diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1dc86ee60a03..948443e115c1 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 7c0abdc9a732..478d15dbaee4 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 0442d98367ae..23ee89ce59a9 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e3254ca0eec4..428e31763cb9 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -4,6 +4,7 @@ * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar */ #include /* test_thread_flag(), ... */ +#include /* task_stack_*(), ... */ #include /* oops_begin/end, ... 
*/ #include /* search_exception_tables */ #include /* max_low_pfn */ diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index af33f9d4c624..58f96d1230d4 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/xtensa/kernel/ptrace.c b/arch/xtensa/kernel/ptrace.c index 32519b71d914..e0f583fed06a 100644 --- a/arch/xtensa/kernel/ptrace.c +++ b/arch/xtensa/kernel/ptrace.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index c41294745731..70a131945443 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c index f2b3e1725349..fd894eaa63f3 100644 --- a/arch/xtensa/kernel/smp.c +++ b/arch/xtensa/kernel/smp.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/block/blk-map.c b/block/blk-map.c index 2f18c2a0be1b..3b5cb863318f 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -2,6 +2,7 @@ * Functions related to mapping data to requests */ #include +#include #include #include #include diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index f7f6b9144b07..da6b59ba5940 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -34,6 +34,8 @@ #include #include #include +#include + #include #include #include diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index aef00511ca86..74f1b7dc03f7 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 9b2fe2d3e3a9..1ec84ca81146 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -3,6 +3,7 @@ #include #include +#include #include /* diff --git a/drivers/misc/lkdtm_usercopy.c b/drivers/misc/lkdtm_usercopy.c index 1dd611423d8b..df6ac985fbb5 100644 --- a/drivers/misc/lkdtm_usercopy.c +++ b/drivers/misc/lkdtm_usercopy.c @@ -5,6 +5,7 @@ #include "lkdtm.h" #include #include +#include #include #include #include diff --git a/drivers/mtd/nand/gpmi-nand/gpmi-nand.c b/drivers/mtd/nand/gpmi-nand/gpmi-nand.c index 6c062b8251d2..d52139635b67 100644 --- a/drivers/mtd/nand/gpmi-nand/gpmi-nand.c +++ b/drivers/mtd/nand/gpmi-nand/gpmi-nand.c @@ -20,6 +20,7 @@ */ #include #include +#include #include #include #include diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 2a59139f520b..9be82c4e14a4 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 2c95257fa4da..92c00a13b28b 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 222873bb689c..6103a8149ccd 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include #include #include @@ -35,7 +37,6 @@ #include #include #include -#include #include #include diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 9b2917a30294..2edcefc0a294 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -19,6 +19,7 @@ #include #include +#include 
#include #include #include diff --git a/fs/coredump.c b/fs/coredump.c index c74ab43b8383..592683711c64 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h index 698d51a0eea3..c8240a12c42d 100644 --- a/include/linux/elfcore.h +++ b/include/linux/elfcore.h @@ -3,6 +3,8 @@ #include #include +#include + #include #include diff --git a/include/linux/perf_regs.h b/include/linux/perf_regs.h index a5f98d53d732..9b7dd59fe28d 100644 --- a/include/linux/perf_regs.h +++ b/include/linux/perf_regs.h @@ -1,6 +1,8 @@ #ifndef _LINUX_PERF_REGS_H #define _LINUX_PERF_REGS_H +#include + struct perf_regs { __u64 abi; struct pt_regs *regs; diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h new file mode 100644 index 000000000000..933cd49824c2 --- /dev/null +++ b/include/linux/sched/task_stack.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_SCHED_TASK_STACK_H +#define _LINUX_SCHED_TASK_STACK_H + +#include + +#endif /* _LINUX_SCHED_TASK_STACK_H */ diff --git a/init/main.c b/init/main.c index 47956b22add1..c2d337a136d1 100644 --- a/init/main.c +++ b/init/main.c @@ -77,6 +77,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index e9fdb5203de5..c04917cad1bf 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -11,6 +11,8 @@ #include #include +#include + #include "internal.h" struct callchain_cpus_entries { diff --git a/kernel/exit.c b/kernel/exit.c index e53408d156df..771c33fc9952 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 0af64d5d8b73..c87b6f03c710 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 3caaaa04b92e..2984fb0f0257 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6b7155ae5c33..4e2fec3f12a0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e15185c28de5..65f61077ad50 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/signal.c b/kernel/signal.c index 25d099fd80f7..e6d470a5f75a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 2a1abbaca10e..1d68b5b7ad41 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -2,6 +2,7 @@ * Copyright (C) 2008 Steven Rostedt * */ +#include #include #include #include diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 8c28cbd7e104..17afb0430161 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 942adf13418d..b157b46cc9a6 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -17,6 +17,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, 
Boston, MA 02111-1307 USA */ +#include <linux/sched/task_stack.h> #include #include diff --git a/lib/syscall.c b/lib/syscall.c index 63239e097b13..17d5ff5fa6a3 100644 --- a/lib/syscall.c +++ b/lib/syscall.c @@ -1,5 +1,6 @@ #include #include +#include <linux/sched/task_stack.h> #include #include diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index d99312ed7c22..98b27195e38b 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -29,6 +29,7 @@ #include #include #include +#include <linux/sched/task_stack.h> #include #include #include diff --git a/mm/kmemleak.c b/mm/kmemleak.c index a3970573359e..26c874e90b12 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -75,6 +75,7 @@ #include #include #include +#include <linux/sched/task_stack.h> #include #include #include diff --git a/mm/util.c b/mm/util.c index 14f4e13323e2..656dc5e37a87 100644 --- a/mm/util.c +++ b/mm/util.c @@ -6,6 +6,7 @@ #include #include #include +#include <linux/sched/task_stack.h> #include #include #include -- cgit v1.2.3 From dfc3401a33086a3fd465468e171ea0e82430569b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Feb 2017 23:15:21 +0100 Subject: sched/headers: Prepare to move the 'root_task_group' declaration to <linux/sched/autogroup.h> Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 1 + kernel/sched/sched.h | 1 + 2 files changed, 2 insertions(+) (limited to 'kernel') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 3a85d61f7614..452c9799318c 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -12,6 +12,7 @@ #include #include #include +#include <linux/sched/autogroup.h> #include #include diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4e2fec3f12a0..b1f1c8443837 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1,5 +1,6 @@ #include +#include <linux/sched/autogroup.h> #include #include #include -- cgit v1.2.3 From f361bf4a66c9bfabace46f6ff5d97005c9b524fe Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Feb 2017 23:47:37 +0100 Subject: sched/headers: Prepare for the reduction of <linux/sched.h>'s signal API dependency Instead of including the full <linux/signal.h>, we are going to include the types-only <linux/signal_types.h> header in <linux/sched.h>, to further decouple the scheduler header from the signal headers. This means that various files which relied on the full <linux/signal.h> need to be updated to gain an explicit dependency on it. Update the code that relies on sched.h's inclusion of the <linux/signal.h> header.
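To make the decoupling pattern concrete, here is a minimal sketch of the types-only header split; the foo_types.h/foo.h/consumer.c names and helpers are invented for illustration and are not part of this patch. The types header carries bare definitions that are cheap to include anywhere, the full header layers the API on top, and a consumer that formerly compiled only because a catch-all header dragged the full API in now names that dependency itself:

/* Illustration only -- hypothetical file names, written as one
 * translation unit so the sketch compiles standalone. */

/* foo_types.h: bare type definitions, safe for wide inclusion */
struct foo {
	int state;
};

/* foo.h: the full API -- would #include "foo_types.h" and add helpers */
static inline int foo_ready(const struct foo *f)
{
	return f->state != 0;	/* helper that types-only users never need */
}

/* consumer.c: used to rely on an implicit include chain; after the
 * split it must #include "foo.h" (the full API header) explicitly */
int consumer_check(const struct foo *f)
{
	return foo_ready(f);
}

The series below applies the same shape: <linux/sched.h> keeps only the types-only <linux/signal_types.h>, and each user of the full signal API gains its own explicit #include <linux/signal.h>.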
Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/arm64/kernel/sys_compat.c | 1 + drivers/gpu/drm/i915/i915_gem_request.c | 1 + drivers/isdn/mISDN/stack.c | 2 ++ fs/btrfs/extent-tree.c | 1 + fs/btrfs/free-space-cache.c | 1 + fs/buffer.c | 1 + fs/ceph/addr.c | 1 + fs/dax.c | 1 + fs/ioctl.c | 2 ++ fs/iomap.c | 2 ++ fs/ocfs2/super.c | 1 + include/linux/sched.h | 1 + include/linux/sched/signal.h | 1 + kernel/pid_namespace.c | 1 + mm/page-writeback.c | 1 + 15 files changed, 18 insertions(+) (limited to 'kernel') diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c index abaf582fc7a8..8b8bbd3eaa52 100644 --- a/arch/arm64/kernel/sys_compat.c +++ b/arch/arm64/kernel/sys_compat.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c index df3fef393dbe..e7c3c0318ff6 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.c +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "i915_drv.h" diff --git a/drivers/isdn/mISDN/stack.c b/drivers/isdn/mISDN/stack.c index b324474c0c12..696f22fd5ab4 100644 --- a/drivers/isdn/mISDN/stack.c +++ b/drivers/isdn/mISDN/stack.c @@ -19,6 +19,8 @@ #include #include #include +#include + #include "core.h" static u_int *debug; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c35b96633554..dad395b0b9fc 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. */ #include +#include #include #include #include diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 1a131f7d6c1b..493a654b6012 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include diff --git a/fs/buffer.c b/fs/buffer.c index 28484b3ebc98..9196f2a270da 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -19,6 +19,7 @@ */ #include +#include #include #include #include diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index f297a9e18642..1a3e1b40799a 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "super.h" #include "mds_client.h" diff --git a/fs/dax.c b/fs/dax.c index 7436c98b92c8..de622d4282a6 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/ioctl.c b/fs/ioctl.c index cb9b02940805..569db68d02b3 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -15,6 +15,8 @@ #include #include #include +#include + #include "internal.h" #include diff --git a/fs/iomap.c b/fs/iomap.c index 0f85f2410605..3ca1a8e44135 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -26,6 +26,8 @@ #include #include #include +#include + #include "internal.h" /* diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index a24e42f95341..ca1646fbcaef 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -42,6 +42,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include "ocfs2_trace.h" diff --git a/include/linux/sched.h b/include/linux/sched.h index 8ae7b3d85658..ea05116bc3c2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -36,6 +36,7 @@ struct sched_param { #include #include #include +#include #include #include #include diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 
0f4e9f4a43fd..7e10a7824523 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -1,6 +1,7 @@ #ifndef _LINUX_SCHED_SIGNAL_H #define _LINUX_SCHED_SIGNAL_H +#include #include #include #include diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 7c8367854dc4..de461aa0bf9a 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -20,6 +20,7 @@ #include #include #include +#include struct pid_cache { int nr_ids; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 26a60818a8fc..d8ac2a7fb9e7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From 589ee62844e042b0b7d19ef57fb4cff77f3ca294 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 4 Feb 2017 00:16:44 +0100 Subject: sched/headers: Prepare to remove the dependency from Update code that relied on sched.h including various MM types for them. This will allow us to remove the include from . Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/alpha/include/asm/a.out-core.h | 1 + arch/alpha/include/asm/mmu_context.h | 2 ++ arch/arc/mm/tlb.c | 2 ++ arch/arm/include/asm/mmu_context.h | 2 ++ arch/arm/include/asm/tlbflush.h | 7 ++++--- arch/arm/kernel/suspend.c | 1 + arch/arm/kernel/swp_emulate.c | 1 + arch/arm/mm/idmap.c | 1 + arch/arm64/include/asm/mmu_context.h | 1 + arch/arm64/kernel/traps.c | 1 + arch/avr32/include/asm/mmu_context.h | 2 ++ arch/blackfin/include/asm/mmu_context.h | 2 ++ arch/blackfin/kernel/flat.c | 1 + arch/blackfin/kernel/process.c | 1 + arch/blackfin/mm/sram-alloc.c | 2 ++ arch/cris/arch-v10/mm/tlb.c | 2 ++ arch/cris/arch-v32/mm/tlb.c | 1 + arch/cris/include/asm/pgtable.h | 2 +- arch/cris/mm/tlb.c | 2 ++ arch/h8300/kernel/traps.c | 1 + arch/hexagon/include/asm/mmu_context.h | 2 ++ arch/hexagon/kernel/smp.c | 1 + arch/ia64/include/asm/mmu_context.h | 1 + arch/ia64/include/asm/pgtable.h | 2 +- arch/ia64/sn/kernel/sn2/sn2_smp.c | 1 + arch/m32r/include/asm/mmu_context.h | 2 ++ arch/m68k/include/asm/a.out-core.h | 1 + arch/m68k/include/asm/mmu_context.h | 1 + arch/metag/include/asm/mmu_context.h | 1 + arch/microblaze/include/asm/mmu_context_mm.h | 2 ++ arch/microblaze/mm/pgtable.c | 1 + arch/mips/include/asm/elf.h | 2 ++ arch/mips/include/asm/mmu_context.h | 2 ++ arch/mips/kernel/smp.c | 2 +- arch/mips/math-emu/dsemul.c | 1 + arch/mips/mm/ioremap.c | 1 + arch/mn10300/include/asm/mmu_context.h | 2 ++ arch/mn10300/kernel/smp.c | 2 +- arch/mn10300/mm/tlb-smp.c | 2 +- arch/nios2/include/asm/mmu_context.h | 2 ++ arch/nios2/kernel/process.c | 1 + arch/powerpc/kernel/io-workarounds.c | 2 +- arch/powerpc/kvm/e500_mmu_host.c | 2 +- arch/powerpc/lib/feature-fixups.c | 1 + arch/powerpc/mm/hash_utils_64.c | 2 +- arch/powerpc/mm/pgtable-book3s64.c | 2 ++ arch/powerpc/mm/pgtable-hash64.c | 2 ++ arch/powerpc/mm/pgtable-radix.c | 2 +- arch/powerpc/mm/slb.c | 2 ++ arch/s390/include/asm/elf.h | 2 +- arch/s390/include/asm/mmu_context.h | 1 + arch/s390/kernel/processor.c | 1 + arch/s390/kvm/gaccess.c | 2 ++ arch/s390/kvm/priv.c | 2 ++ arch/score/include/asm/mmu_context.h | 2 ++ arch/score/kernel/traps.c | 1 + arch/sh/include/asm/mmu_context.h | 2 ++ arch/sparc/include/asm/mmu_context_64.h | 2 ++ arch/sparc/include/asm/pgtable_64.h | 3 +++ arch/sparc/kernel/asm-offsets.c | 1 + arch/sparc/kernel/traps_32.c | 1 + arch/sparc/mm/tsb.c | 2 ++ arch/tile/include/asm/mmu_context.h | 2 ++ 
arch/um/include/asm/mmu_context.h | 2 ++ arch/um/kernel/exec.c | 2 +- arch/um/kernel/reboot.c | 1 + arch/um/kernel/skas/process.c | 3 ++- arch/x86/entry/vsyscall/vsyscall_64.c | 1 + arch/x86/events/core.c | 2 +- arch/x86/include/asm/a.out-core.h | 2 ++ arch/x86/include/asm/mpx.h | 2 ++ arch/x86/mm/mpx.c | 1 + arch/x86/um/syscalls_64.c | 1 + arch/x86/xen/mmu.c | 2 +- arch/xtensa/include/asm/mmu_context.h | 1 + drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 2 ++ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 2 ++ drivers/infiniband/hw/cxgb3/iwch_provider.c | 2 +- drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 2 +- drivers/media/v4l2-core/videobuf-dma-sg.c | 2 +- fs/binfmt_misc.c | 2 +- fs/kernfs/file.c | 2 +- include/drm/drm_mm.h | 1 + include/linux/init_task.h | 1 + include/linux/sched/mm.h | 1 + kernel/sched/debug.c | 2 +- kernel/sched/fair.c | 2 +- kernel/signal.c | 2 +- lib/is_single_threaded.c | 1 + 89 files changed, 125 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/include/asm/a.out-core.h b/arch/alpha/include/asm/a.out-core.h index 9e33e92e524c..1610d078b064 100644 --- a/arch/alpha/include/asm/a.out-core.h +++ b/arch/alpha/include/asm/a.out-core.h @@ -15,6 +15,7 @@ #ifdef __KERNEL__ #include +#include /* * Fill in the user structure for an ECOFF core dump. diff --git a/arch/alpha/include/asm/mmu_context.h b/arch/alpha/include/asm/mmu_context.h index 4c51c05333c6..384bd47b5187 100644 --- a/arch/alpha/include/asm/mmu_context.h +++ b/arch/alpha/include/asm/mmu_context.h @@ -7,6 +7,8 @@ * Copyright (C) 1996, Linus Torvalds */ +#include + #include #include #include diff --git a/arch/arc/mm/tlb.c b/arch/arc/mm/tlb.c index bdb295e09160..d0126fdfe2d8 100644 --- a/arch/arc/mm/tlb.c +++ b/arch/arc/mm/tlb.c @@ -53,6 +53,8 @@ #include #include +#include + #include #include #include diff --git a/arch/arm/include/asm/mmu_context.h b/arch/arm/include/asm/mmu_context.h index 3cc14dd8587c..7f303295ef19 100644 --- a/arch/arm/include/asm/mmu_context.h +++ b/arch/arm/include/asm/mmu_context.h @@ -15,7 +15,9 @@ #include #include +#include #include + #include #include #include diff --git a/arch/arm/include/asm/tlbflush.h b/arch/arm/include/asm/tlbflush.h index def9e570199f..1897b5196fb5 100644 --- a/arch/arm/include/asm/tlbflush.h +++ b/arch/arm/include/asm/tlbflush.h @@ -10,6 +10,10 @@ #ifndef _ASMARM_TLBFLUSH_H #define _ASMARM_TLBFLUSH_H +#ifndef __ASSEMBLY__ +# include +#endif + #ifdef CONFIG_MMU #include @@ -644,9 +648,6 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, #elif defined(CONFIG_SMP) /* !CONFIG_MMU */ #ifndef __ASSEMBLY__ - -#include - static inline void local_flush_tlb_all(void) { } static inline void local_flush_tlb_mm(struct mm_struct *mm) { } static inline void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) { } diff --git a/arch/arm/kernel/suspend.c b/arch/arm/kernel/suspend.c index 9a2f882a0a2d..ef794c799cb6 100644 --- a/arch/arm/kernel/suspend.c +++ b/arch/arm/kernel/suspend.c @@ -1,5 +1,6 @@ #include #include +#include #include #include diff --git a/arch/arm/kernel/swp_emulate.c b/arch/arm/kernel/swp_emulate.c index 853221f81104..3bda08bee674 100644 --- a/arch/arm/kernel/swp_emulate.c +++ b/arch/arm/kernel/swp_emulate.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm/mm/idmap.c b/arch/arm/mm/idmap.c index c1a48f88764e..3e511bec69b8 100644 --- a/arch/arm/mm/idmap.c +++ b/arch/arm/mm/idmap.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include diff 
--git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 3c9f7d18a7e6..3257895a9b5e 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -26,6 +26,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index bdbd0c3febf4..e52be6aa44ee 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include diff --git a/arch/avr32/include/asm/mmu_context.h b/arch/avr32/include/asm/mmu_context.h index 27ff23407100..cd87abba8db7 100644 --- a/arch/avr32/include/asm/mmu_context.h +++ b/arch/avr32/include/asm/mmu_context.h @@ -12,6 +12,8 @@ #ifndef __ASM_AVR32_MMU_CONTEXT_H #define __ASM_AVR32_MMU_CONTEXT_H +#include + #include #include #include diff --git a/arch/blackfin/include/asm/mmu_context.h b/arch/blackfin/include/asm/mmu_context.h index 15b16d3e8de8..0ce6de873b27 100644 --- a/arch/blackfin/include/asm/mmu_context.h +++ b/arch/blackfin/include/asm/mmu_context.h @@ -9,6 +9,8 @@ #include #include +#include + #include #include #include diff --git a/arch/blackfin/kernel/flat.c b/arch/blackfin/kernel/flat.c index a88daddbf074..b5b658449616 100644 --- a/arch/blackfin/kernel/flat.c +++ b/arch/blackfin/kernel/flat.c @@ -6,6 +6,7 @@ #include #include +#include #include #define FLAT_BFIN_RELOC_TYPE_16_BIT 0 diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index b691ef875a40..89d5162d4ca6 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/blackfin/mm/sram-alloc.c b/arch/blackfin/mm/sram-alloc.c index 1f3b3ef3e103..d2a96c2c02a3 100644 --- a/arch/blackfin/mm/sram-alloc.c +++ b/arch/blackfin/mm/sram-alloc.c @@ -19,6 +19,8 @@ #include #include #include +#include + #include #include #include "blackfin_sram.h" diff --git a/arch/cris/arch-v10/mm/tlb.c b/arch/cris/arch-v10/mm/tlb.c index 21d78c599bab..3225d38bdaea 100644 --- a/arch/cris/arch-v10/mm/tlb.c +++ b/arch/cris/arch-v10/mm/tlb.c @@ -10,6 +10,8 @@ * */ +#include + #include #include #include diff --git a/arch/cris/arch-v32/mm/tlb.c b/arch/cris/arch-v32/mm/tlb.c index c030d020660a..bc3de5b5e27c 100644 --- a/arch/cris/arch-v32/mm/tlb.c +++ b/arch/cris/arch-v32/mm/tlb.c @@ -6,6 +6,7 @@ * Authors: Bjorn Wesen * Tobias Anderberg , CRISv32 port. 
*/ +#include #include #include diff --git a/arch/cris/include/asm/pgtable.h b/arch/cris/include/asm/pgtable.h index ceefc314d64d..2a3210ba4c72 100644 --- a/arch/cris/include/asm/pgtable.h +++ b/arch/cris/include/asm/pgtable.h @@ -9,7 +9,7 @@ #include #ifndef __ASSEMBLY__ -#include +#include #include #endif #include diff --git a/arch/cris/mm/tlb.c b/arch/cris/mm/tlb.c index b7f8de576777..8413741cfa0f 100644 --- a/arch/cris/mm/tlb.c +++ b/arch/cris/mm/tlb.c @@ -9,6 +9,8 @@ #include #include +#include + #include #define D(x) diff --git a/arch/h8300/kernel/traps.c b/arch/h8300/kernel/traps.c index ee7e3ce40d78..e47a9e0dc278 100644 --- a/arch/h8300/kernel/traps.c +++ b/arch/h8300/kernel/traps.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/hexagon/include/asm/mmu_context.h b/arch/hexagon/include/asm/mmu_context.h index d423d2e73c30..d8a071afdd1d 100644 --- a/arch/hexagon/include/asm/mmu_context.h +++ b/arch/hexagon/include/asm/mmu_context.h @@ -21,6 +21,8 @@ #ifndef _ASM_MMU_CONTEXT_H #define _ASM_MMU_CONTEXT_H +#include + #include #include #include diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c index c02a6455839e..1f63e91a353b 100644 --- a/arch/hexagon/kernel/smp.c +++ b/arch/hexagon/kernel/smp.c @@ -29,6 +29,7 @@ #include #include #include +#include #include /* timer_interrupt */ #include diff --git a/arch/ia64/include/asm/mmu_context.h b/arch/ia64/include/asm/mmu_context.h index 7f2a456603cb..9b99368633b5 100644 --- a/arch/ia64/include/asm/mmu_context.h +++ b/arch/ia64/include/asm/mmu_context.h @@ -26,6 +26,7 @@ #include #include #include +#include #include #include diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 9f3ed9ee8f13..384794e665fc 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -147,7 +147,7 @@ # ifndef __ASSEMBLY__ -#include /* for mm_struct */ +#include /* for mm_struct */ #include #include #include diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c index c98dc965fe82..b73b0ebf8214 100644 --- a/arch/ia64/sn/kernel/sn2/sn2_smp.c +++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/m32r/include/asm/mmu_context.h b/arch/m32r/include/asm/mmu_context.h index 9fc78fc44445..1230b7050d8e 100644 --- a/arch/m32r/include/asm/mmu_context.h +++ b/arch/m32r/include/asm/mmu_context.h @@ -12,6 +12,8 @@ #ifndef __ASSEMBLY__ #include +#include + #include #include #include diff --git a/arch/m68k/include/asm/a.out-core.h b/arch/m68k/include/asm/a.out-core.h index f6bfc1d63ff6..ae91ea6bb303 100644 --- a/arch/m68k/include/asm/a.out-core.h +++ b/arch/m68k/include/asm/a.out-core.h @@ -16,6 +16,7 @@ #include #include +#include /* * fill in the user structure for an a.out core dump diff --git a/arch/m68k/include/asm/mmu_context.h b/arch/m68k/include/asm/mmu_context.h index dc3be991d634..4a6ae6dffa34 100644 --- a/arch/m68k/include/asm/mmu_context.h +++ b/arch/m68k/include/asm/mmu_context.h @@ -2,6 +2,7 @@ #define __M68K_MMU_CONTEXT_H #include +#include static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { diff --git a/arch/metag/include/asm/mmu_context.h b/arch/metag/include/asm/mmu_context.h index ae2a71b5e0be..2e0312748197 100644 --- a/arch/metag/include/asm/mmu_context.h +++ b/arch/metag/include/asm/mmu_context.h @@ -9,6 +9,7 @@ #include #include +#include static inline void enter_lazy_tlb(struct mm_struct *mm, 
struct task_struct *tsk) diff --git a/arch/microblaze/include/asm/mmu_context_mm.h b/arch/microblaze/include/asm/mmu_context_mm.h index d68647746448..99472d2ca340 100644 --- a/arch/microblaze/include/asm/mmu_context_mm.h +++ b/arch/microblaze/include/asm/mmu_context_mm.h @@ -12,6 +12,8 @@ #define _ASM_MICROBLAZE_MMU_CONTEXT_H #include +#include + #include #include #include diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c index cc732fe357ad..4c0599239915 100644 --- a/arch/microblaze/mm/pgtable.c +++ b/arch/microblaze/mm/pgtable.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include diff --git a/arch/mips/include/asm/elf.h b/arch/mips/include/asm/elf.h index 7a6c466e5f2a..0eb1a75be105 100644 --- a/arch/mips/include/asm/elf.h +++ b/arch/mips/include/asm/elf.h @@ -10,6 +10,8 @@ #include #include +#include + #include #include diff --git a/arch/mips/include/asm/mmu_context.h b/arch/mips/include/asm/mmu_context.h index 2abf94f72c0a..da2004cef2d5 100644 --- a/arch/mips/include/asm/mmu_context.h +++ b/arch/mips/include/asm/mmu_context.h @@ -13,8 +13,10 @@ #include #include +#include #include #include + #include #include #include diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 8c60a296294c..6e71130549ea 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/math-emu/dsemul.c b/arch/mips/math-emu/dsemul.c index c4469ff4a996..6664908514b3 100644 --- a/arch/mips/math-emu/dsemul.c +++ b/arch/mips/math-emu/dsemul.c @@ -1,5 +1,6 @@ #include #include +#include #include #include diff --git a/arch/mips/mm/ioremap.c b/arch/mips/mm/ioremap.c index 1f189627440f..1986e09fb457 100644 --- a/arch/mips/mm/ioremap.c +++ b/arch/mips/mm/ioremap.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/mn10300/include/asm/mmu_context.h b/arch/mn10300/include/asm/mmu_context.h index 75dbe696f830..d2034f5e6eda 100644 --- a/arch/mn10300/include/asm/mmu_context.h +++ b/arch/mn10300/include/asm/mmu_context.h @@ -23,6 +23,8 @@ #define _ASM_MMU_CONTEXT_H #include +#include + #include #include #include diff --git a/arch/mn10300/kernel/smp.c b/arch/mn10300/kernel/smp.c index d924f7a79708..35d2c3fe6f76 100644 --- a/arch/mn10300/kernel/smp.c +++ b/arch/mn10300/kernel/smp.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mn10300/mm/tlb-smp.c b/arch/mn10300/mm/tlb-smp.c index 9a39ea9031d4..085f2bb691ac 100644 --- a/arch/mn10300/mm/tlb-smp.c +++ b/arch/mn10300/mm/tlb-smp.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/nios2/include/asm/mmu_context.h b/arch/nios2/include/asm/mmu_context.h index 294b4b1f81d4..78ab3dacf579 100644 --- a/arch/nios2/include/asm/mmu_context.h +++ b/arch/nios2/include/asm/mmu_context.h @@ -13,6 +13,8 @@ #ifndef _ASM_NIOS2_MMU_CONTEXT_H #define _ASM_NIOS2_MMU_CONTEXT_H +#include + #include extern void mmu_context_init(void); diff --git a/arch/nios2/kernel/process.c b/arch/nios2/kernel/process.c index 869a4d59de32..509e7855e8dc 100644 --- a/arch/nios2/kernel/process.c +++ b/arch/nios2/kernel/process.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c index 5f8613ceb97f..a582e0d42525 100644 --- a/arch/powerpc/kernel/io-workarounds.c +++ 
b/arch/powerpc/kernel/io-workarounds.c @@ -12,7 +12,7 @@ #undef DEBUG #include -#include /* for init_mm */ +#include /* for init_mm */ #include #include diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index b0333cc737dd..0fda4230f6c0 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index 043415f0bdb1..f3917705c686 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 12d679df50bd..c554768b1fa2 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index b798ff674fab..5fcb3dd74c13 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -8,6 +8,8 @@ */ #include +#include + #include #include diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index c23e286a6b8f..8b85a14b08ea 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -10,6 +10,8 @@ */ #include +#include + #include #include diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index feeda90cd06d..2a590a98e652 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -8,7 +8,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. 
*/ -#include +#include #include #include diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 48fc28bab544..5e01b2ece1d0 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -22,6 +22,8 @@ #include #include #include +#include + #include #include diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index 83aaefed2a7b..1d48880b3cc1 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -132,7 +132,7 @@ typedef s390_fp_regs compat_elf_fpregset_t; typedef s390_compat_regs compat_elf_gregset_t; #include -#include /* for task_struct */ +#include /* for task_struct */ #include #include diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 9b828c073176..6e31d87fb669 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -9,6 +9,7 @@ #include #include +#include #include #include diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index bc2b60dcb178..bf7854523831 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 4492c9363178..d55c829a5944 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -6,7 +6,9 @@ */ #include +#include #include + #include #include #include "kvm-s390.h" diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index fb4b494cde9b..64b6a309f2c4 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -15,6 +15,8 @@ #include #include #include +#include + #include #include #include diff --git a/arch/score/include/asm/mmu_context.h b/arch/score/include/asm/mmu_context.h index 2644577c96e8..073f95d350de 100644 --- a/arch/score/include/asm/mmu_context.h +++ b/arch/score/include/asm/mmu_context.h @@ -3,7 +3,9 @@ #include #include +#include #include + #include #include diff --git a/arch/score/kernel/traps.c b/arch/score/kernel/traps.c index 8a54d320fb41..fe68de6c746c 100644 --- a/arch/score/kernel/traps.c +++ b/arch/score/kernel/traps.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include diff --git a/arch/sh/include/asm/mmu_context.h b/arch/sh/include/asm/mmu_context.h index 35ffdd081d26..eb6ac3c10c44 100644 --- a/arch/sh/include/asm/mmu_context.h +++ b/arch/sh/include/asm/mmu_context.h @@ -11,6 +11,8 @@ #include #include #include +#include + #include #include diff --git a/arch/sparc/include/asm/mmu_context_64.h b/arch/sparc/include/asm/mmu_context_64.h index d0317993e947..22fede6eba11 100644 --- a/arch/sparc/include/asm/mmu_context_64.h +++ b/arch/sparc/include/asm/mmu_context_64.h @@ -6,6 +6,8 @@ #ifndef __ASSEMBLY__ #include +#include + #include #include diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 7932a4a37817..56e49c8f770d 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -878,6 +878,9 @@ static inline unsigned long pud_pfn(pud_t pud) #define pte_offset_map pte_index #define pte_unmap(pte) do { } while (0) +/* We cannot include at this point yet: */ +extern struct mm_struct init_mm; + /* Actual page table PTE updates. 
*/ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, pte_t *ptep, pte_t orig, int fullmm, diff --git a/arch/sparc/kernel/asm-offsets.c b/arch/sparc/kernel/asm-offsets.c index f76389a32342..3f09e1c83f58 100644 --- a/arch/sparc/kernel/asm-offsets.c +++ b/arch/sparc/kernel/asm-offsets.c @@ -11,6 +11,7 @@ */ #include +#include // #include #include diff --git a/arch/sparc/kernel/traps_32.c b/arch/sparc/kernel/traps_32.c index 26740a2f285d..2de72a49308a 100644 --- a/arch/sparc/kernel/traps_32.c +++ b/arch/sparc/kernel/traps_32.c @@ -11,6 +11,7 @@ #include /* for jiffies */ #include +#include #include #include #include diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c index 23479c3d39f0..0a04811f06b7 100644 --- a/arch/sparc/mm/tsb.c +++ b/arch/sparc/mm/tsb.c @@ -6,6 +6,8 @@ #include #include #include +#include + #include #include #include diff --git a/arch/tile/include/asm/mmu_context.h b/arch/tile/include/asm/mmu_context.h index f67753db1f78..45a4b4c424cf 100644 --- a/arch/tile/include/asm/mmu_context.h +++ b/arch/tile/include/asm/mmu_context.h @@ -16,6 +16,8 @@ #define _ASM_TILE_MMU_CONTEXT_H #include +#include + #include #include #include diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h index 1a60e1328e2f..94ac2739918c 100644 --- a/arch/um/include/asm/mmu_context.h +++ b/arch/um/include/asm/mmu_context.h @@ -7,6 +7,8 @@ #define __UM_MMU_CONTEXT_H #include +#include + #include extern void uml_setup_stubs(struct mm_struct *mm); diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c index 31968677a0d0..a43d42bf0a86 100644 --- a/arch/um/kernel/exec.c +++ b/arch/um/kernel/exec.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/um/kernel/reboot.c b/arch/um/kernel/reboot.c index 0bc921cee0b1..71f3e9217cf2 100644 --- a/arch/um/kernel/reboot.c +++ b/arch/um/kernel/reboot.c @@ -5,6 +5,7 @@ #include #include +#include #include #include #include diff --git a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c index ddecf326dab2..3a952a4f7965 100644 --- a/arch/um/kernel/skas/process.c +++ b/arch/um/kernel/skas/process.c @@ -4,8 +4,9 @@ */ #include -#include +#include #include + #include #include #include diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index df91fb393a01..ce1d7534fa53 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 2ecc0e97772b..349d4d17aa7f 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h index 7a15588e45d4..7d3ece8bfb61 100644 --- a/arch/x86/include/asm/a.out-core.h +++ b/arch/x86/include/asm/a.out-core.h @@ -17,6 +17,8 @@ #include #include +#include + #include /* diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h index 0b416d4cf73b..a0d662be4c5b 100644 --- a/arch/x86/include/asm/mpx.h +++ b/arch/x86/include/asm/mpx.h @@ -2,6 +2,8 @@ #define _ASM_X86_MPX_H #include +#include + #include #include diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index c98079684bdb..5126dfd52b18 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -7,6 +7,7 @@ */ #include #include +#include #include #include diff --git a/arch/x86/um/syscalls_64.c 
b/arch/x86/um/syscalls_64.c index e6552275320b..10d907098c26 100644 --- a/arch/x86/um/syscalls_64.c +++ b/arch/x86/um/syscalls_64.c @@ -6,6 +6,7 @@ */ #include +#include #include #include /* XXX This should get the constants from libc */ #include diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index f6740b5b1738..37cb5aad71de 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -38,7 +38,7 @@ * * Jeremy Fitzhardinge , XenSource Inc, 2007 */ -#include +#include #include #include #include diff --git a/arch/xtensa/include/asm/mmu_context.h b/arch/xtensa/include/asm/mmu_context.h index 04c8ebdc4517..f7e186dfc4e4 100644 --- a/arch/xtensa/include/asm/mmu_context.h +++ b/arch/xtensa/include/asm/mmu_context.h @@ -17,6 +17,7 @@ #include #include +#include #include diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c index d83de985e88c..6acc4313363e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c @@ -23,6 +23,8 @@ #include #include +#include + #include "kfd_priv.h" #include "kfd_mqd_manager.h" #include "cik_regs.h" diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c index fa32c32fa1c2..a9b9882a9a77 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c @@ -23,6 +23,8 @@ #include #include +#include + #include "kfd_priv.h" #include "kfd_mqd_manager.h" #include "vi_structs.h" diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 318ec5267bdf..86ecd3ea6a4b 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index d19662f635b1..5846c47c8d55 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index 36bd904946bd..0b5c43f7e020 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 9b4688ab1d8e..bee1a36bc2ec 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 35043a8c4529..8e4dc7ab584c 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include "kernfs-internal.h" diff --git a/include/drm/drm_mm.h b/include/drm/drm_mm.h index d81b0ba9921f..2ef16bf25826 100644 --- a/include/drm/drm_mm.h +++ b/include/drm/drm_mm.h @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #ifdef CONFIG_DRM_DEBUG_MM diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 452c9799318c..f6841c19c913 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -15,6 +15,7 @@ #include #include #include +#include #include diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 
1cf7941bb946..d32e3932b2e3 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -2,6 +2,7 @@ #define _LINUX_SCHED_MM_H #include +#include <linux/mm_types.h> #include #endif /* _LINUX_SCHED_MM_H */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 109adc0e9cb9..e865d8bfb881 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -11,7 +11,7 @@ */ #include -#include +#include #include #include #include diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 11e0ab57748a..3e88b35ac157 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -20,7 +20,7 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ -#include +#include #include #include diff --git a/kernel/signal.c b/kernel/signal.c index e6d470a5f75a..2008aa999976 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/lib/is_single_threaded.c b/lib/is_single_threaded.c index 9745cfffcb69..8d4678edcc0e 100644 --- a/lib/is_single_threaded.c +++ b/lib/is_single_threaded.c @@ -10,6 +10,7 @@ * 2 of the Licence, or (at your option) any later version. */ #include +#include <linux/sched/mm.h> /* * Returns true if the task does not share ->mm with another thread/process. -- cgit v1.2.3 From 9164bb4a18dfa592cd0aca455ea57abf89ca4526 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 4 Feb 2017 01:20:53 +0100 Subject: sched/headers: Prepare to move 'init_task' and 'init_thread_union' from <linux/sched.h> to <linux/sched/task.h> Update all usage sites first. Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/arm64/kernel/setup.c | 1 + arch/arm64/mm/kasan_init.c | 1 + arch/frv/mm/init.c | 1 + arch/metag/mm/init.c | 1 + arch/nios2/kernel/setup.c | 1 + arch/powerpc/kernel/paca.c | 1 + arch/um/kernel/skas/process.c | 1 + arch/um/kernel/um_arch.c | 2 ++ arch/x86/kernel/cpu/common.c | 1 + arch/x86/mm/kasan_init_64.c | 1 + fs/namespace.c | 2 ++ include/linux/sched/signal.h | 1 + init/init_task.c | 1 + kernel/delayacct.c | 1 + lib/is_single_threaded.c | 1 + mm/vmacache.c | 1 + 16 files changed, 18 insertions(+) (limited to 'kernel') diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 952e2c0dabd5..42274bda0ccb 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -42,6 +42,7 @@ #include #include #include +#include <linux/sched/task.h> #include #include diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 201d918e7575..55d1e9205543 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -13,6 +13,7 @@ #define pr_fmt(fmt) "kasan: " fmt #include #include +#include <linux/sched/task.h> #include #include #include diff --git a/arch/frv/mm/init.c b/arch/frv/mm/init.c index 88a159743528..328f0a292316 100644 --- a/arch/frv/mm/init.c +++ b/arch/frv/mm/init.c @@ -18,6 +18,7 @@ #include #include +#include <linux/sched/task.h> #include #include #include diff --git a/arch/metag/mm/init.c b/arch/metag/mm/init.c index c0ec116b3993..188d4d9fbed4 100644 --- a/arch/metag/mm/init.c +++ b/arch/metag/mm/init.c @@ -12,6 +12,7 @@ #include #include #include +#include <linux/sched/task.h> #include #include diff --git a/arch/nios2/kernel/setup.c b/arch/nios2/kernel/setup.c index a3fa80d1aacc..6e57ffa5db27 100644 --- a/arch/nios2/kernel/setup.c +++ b/arch/nios2/kernel/setup.c @@ -14,6 +14,7 @@ #include #include #include +#include <linux/sched/task.h> #include #include #include diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index fa20060ff7a5..dfc479df9634 100644 --- a/arch/powerpc/kernel/paca.c +++ 
b/arch/powerpc/kernel/paca.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include diff --git a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c index 3a952a4f7965..d4dbf08722d6 100644 --- a/arch/um/kernel/skas/process.c +++ b/arch/um/kernel/skas/process.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index e8175a8aa22c..4b85acd4020c 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -11,7 +11,9 @@ #include #include #include +#include #include + #include #include #include diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c7e2ca966386..f2fd8fefc589 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 0493c17b8a51..8d63d7a104c3 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include diff --git a/fs/namespace.c b/fs/namespace.c index 131cd7b94f47..cc1375eff88c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -25,6 +25,8 @@ #include #include #include +#include + #include "pnode.h" #include "internal.h" diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 7e10a7824523..320eca0446cd 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -5,5 +5,6 @@ #include #include #include +#include #endif /* _LINUX_SCHED_SIGNAL_H */ diff --git a/init/init_task.c b/init/init_task.c index 53d4ce942a88..66787e30a419 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 660549656991..c94135fc2698 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -14,6 +14,7 @@ */ #include +#include #include #include #include diff --git a/lib/is_single_threaded.c b/lib/is_single_threaded.c index 8d4678edcc0e..9c7d89df40ed 100644 --- a/lib/is_single_threaded.c +++ b/lib/is_single_threaded.c @@ -10,6 +10,7 @@ * 2 of the Licence, or (at your option) any later version. */ #include +#include #include /* diff --git a/mm/vmacache.c b/mm/vmacache.c index 4355d34c68a6..7ffa0ee341b5 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -2,6 +2,7 @@ * Copyright (C) 2014 Davidlohr Bueso. */ #include +#include #include #include -- cgit v1.2.3 From b2d091031075ac9a1598e3cc3a29c28f02e64c0d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 4 Feb 2017 01:27:20 +0100 Subject: sched/headers: Prepare to use instead of in We don't actually need the full rculist.h header in sched.h anymore, we will be able to include the smaller rcupdate.h header instead. But first update code that relied on the implicit header inclusion. 
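
(Illustration, not part of the patch: after this change, code that walks RCU-protected lists can no longer rely on <linux/sched.h> to pull in <linux/rculist.h> transitively, which is why the hunks below make headers such as pid.h, dmar.h and rhashtable.h include the RCU list header themselves. A minimal sketch of the kind of user affected; the structure and helper names are hypothetical.)

#include <linux/rculist.h>	/* list_add_rcu(), list_for_each_entry_rcu() */
#include <linux/rcupdate.h>	/* rcu_read_lock()/rcu_read_unlock() */

struct demo_node {
	int			value;
	struct list_head	link;
};

static LIST_HEAD(demo_list);

/* Readers traverse under rcu_read_lock(); writers need their own
 * exclusion (e.g. a spinlock), omitted here for brevity.
 */
static bool demo_find(int value)
{
	struct demo_node *n;
	bool found = false;

	rcu_read_lock();
	list_for_each_entry_rcu(n, &demo_list, link) {
		if (n->value == value) {
			found = true;
			break;
		}
	}
	rcu_read_unlock();
	return found;
}
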
Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/powerpc/kvm/book3s_mmu_hpte.c | 1 + arch/x86/kvm/irq_comm.c | 2 ++ arch/x86/kvm/page_track.c | 2 ++ drivers/md/bcache/btree.c | 2 ++ drivers/misc/vmw_vmci/vmci_event.c | 1 + drivers/nvme/target/admin-cmd.c | 2 ++ drivers/nvme/target/core.c | 2 ++ drivers/s390/cio/qdio_thinint.c | 2 ++ drivers/scsi/libfc/fc_disc.c | 2 ++ drivers/scsi/libfc/fc_rport.c | 2 ++ include/linux/dmar.h | 2 +- include/linux/pid.h | 2 +- include/linux/rhashtable.h | 2 +- include/linux/sched/signal.h | 1 + include/net/bluetooth/hci_core.h | 2 ++ kernel/trace/trace_events_hist.c | 1 + kernel/trace/trace_events_trigger.c | 1 + kernel/trace/trace_kprobe.c | 1 + kernel/trace/trace_uprobe.c | 1 + lib/bug.c | 1 + lib/rhashtable.c | 1 + net/mac80211/mesh_plink.c | 2 ++ net/mac802154/llsec.c | 2 ++ security/apparmor/policy.c | 1 + security/tomoyo/domain.c | 2 ++ security/tomoyo/group.c | 2 ++ security/tomoyo/util.c | 2 ++ 27 files changed, 41 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c index 5a1ab1250a05..905a934c1ef4 100644 --- a/arch/powerpc/kvm/book3s_mmu_hpte.c +++ b/arch/powerpc/kvm/book3s_mmu_hpte.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index b96d3893f121..6825cd36d13b 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -23,6 +23,8 @@ #include #include #include +#include + #include #include diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index 4a1c13eaa518..37942e419c32 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -14,6 +14,8 @@ */ #include +#include + #include #include diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 384559af310e..450d0e848ae4 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -33,6 +33,8 @@ #include #include #include +#include + #include /* diff --git a/drivers/misc/vmw_vmci/vmci_event.c b/drivers/misc/vmw_vmci/vmci_event.c index 8449516d6ac6..84258a48029d 100644 --- a/drivers/misc/vmw_vmci/vmci_event.c +++ b/drivers/misc/vmw_vmci/vmci_event.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "vmci_driver.h" #include "vmci_event.h" diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 94e524fea568..a7bcff45f437 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -13,6 +13,8 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include + #include #include #include "nvmet.h" diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 5267ce20c12d..11b0a0a5f661 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -14,6 +14,8 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include +#include + #include "nvmet.h" static struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX]; diff --git a/drivers/s390/cio/qdio_thinint.c b/drivers/s390/cio/qdio_thinint.c index 8ad98a902a91..c61164f4528e 100644 --- a/drivers/s390/cio/qdio_thinint.c +++ b/drivers/s390/cio/qdio_thinint.c @@ -8,6 +8,8 @@ #include #include #include +#include + #include #include #include diff --git a/drivers/scsi/libfc/fc_disc.c b/drivers/scsi/libfc/fc_disc.c index 6103231104da..fd501f8dbb11 100644 --- a/drivers/scsi/libfc/fc_disc.c +++ 
b/drivers/scsi/libfc/fc_disc.c @@ -36,6 +36,8 @@ #include #include #include +#include + #include #include diff --git a/drivers/scsi/libfc/fc_rport.c b/drivers/scsi/libfc/fc_rport.c index c991f3b822f8..b44c3136eb51 100644 --- a/drivers/scsi/libfc/fc_rport.c +++ b/drivers/scsi/libfc/fc_rport.c @@ -65,6 +65,8 @@ #include #include #include +#include + #include #include diff --git a/include/linux/dmar.h b/include/linux/dmar.h index e9bc9292bd3a..e8ffba1052d3 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -26,7 +26,7 @@ #include #include #include -#include +#include struct acpi_dmar_header; diff --git a/include/linux/pid.h b/include/linux/pid.h index 298ead5512e5..4d179316e431 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -1,7 +1,7 @@ #ifndef _LINUX_PID_H #define _LINUX_PID_H -#include +#include enum pid_type { diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index f2e12a845910..092292b6675e 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -25,7 +25,7 @@ #include #include #include -#include +#include /* * The end of the chain is marked with a special nulls marks which has diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 320eca0446cd..c6958a53fef3 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -1,6 +1,7 @@ #ifndef _LINUX_SCHED_SIGNAL_H #define _LINUX_SCHED_SIGNAL_H +#include #include #include #include diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 90708f68cc02..95ccc1eef558 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -26,6 +26,8 @@ #define __HCI_CORE_H #include +#include + #include #include diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f3a960ed75a1..1c21d0e2a145 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "tracing_map.h" #include "trace.h" diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 6721a1e89f39..f2ac9d44f6c4 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "trace.h" diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index eadd96ef772f..5f688cc724f0 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -20,6 +20,7 @@ #include #include +#include #include "trace_probe.h" diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index f4379e772171..a7581fec9681 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "trace_probe.h" diff --git a/lib/bug.c b/lib/bug.c index bc3656e944d2..06edbbef0623 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -45,6 +45,7 @@ #include #include #include +#include extern const struct bug_entry __start___bug_table[], __stop___bug_table[]; diff --git a/lib/rhashtable.c b/lib/rhashtable.c index c5b9b9351cec..f8635fd57442 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index fcba70e57073..953d71e784a9 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -9,6 +9,8 @@ #include #include #include +#include + #include "ieee80211_i.h" #include "rate.h" 
#include "mesh.h" diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c index 6a3e1c2181d3..1e1c9b20bab7 100644 --- a/net/mac802154/llsec.c +++ b/net/mac802154/llsec.c @@ -18,6 +18,8 @@ #include #include #include +#include + #include #include diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index 462c5d36b871..def1fbd6bdfd 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -77,6 +77,7 @@ #include #include #include +#include #include #include "include/apparmor.h" diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c index 838ffa78cfda..00d223e9fb37 100644 --- a/security/tomoyo/domain.c +++ b/security/tomoyo/domain.c @@ -5,8 +5,10 @@ */ #include "common.h" + #include #include +#include /* Variables definitions.*/ diff --git a/security/tomoyo/group.c b/security/tomoyo/group.c index 50092534ec54..944ad77d8fba 100644 --- a/security/tomoyo/group.c +++ b/security/tomoyo/group.c @@ -5,6 +5,8 @@ */ #include +#include + #include "common.h" /** diff --git a/security/tomoyo/util.c b/security/tomoyo/util.c index 5fe3679137ae..848317fea704 100644 --- a/security/tomoyo/util.c +++ b/security/tomoyo/util.c @@ -5,6 +5,8 @@ */ #include +#include + #include "common.h" /* Lock for protecting policy. */ -- cgit v1.2.3 From f719ff9bcee2a422647790f12d53d3755f47c727 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 6 Feb 2017 10:57:33 +0100 Subject: sched/headers: Prepare to move the task_lock()/unlock() APIs to But first update the code that uses these facilities with the new header. Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/mips/math-emu/dsemul.c | 1 + block/blk-ioc.c | 1 + block/ioprio.c | 1 + drivers/staging/android/ion/ion.c | 1 + drivers/staging/lustre/lustre/ptlrpc/sec.c | 1 + fs/proc/internal.h | 1 + fs/proc/proc_net.c | 1 + fs/proc_namespace.c | 2 ++ include/linux/cpuset.h | 1 + ipc/namespace.c | 1 + kernel/cgroup/cpuset.c | 1 + kernel/sched/debug.c | 1 + kernel/utsname.c | 1 + mm/mempolicy.c | 1 + mm/mmu_context.c | 1 + net/core/net_namespace.c | 2 ++ net/core/netclassid_cgroup.c | 2 ++ net/core/netprio_cgroup.c | 2 ++ 18 files changed, 22 insertions(+) (limited to 'kernel') diff --git a/arch/mips/math-emu/dsemul.c b/arch/mips/math-emu/dsemul.c index 6664908514b3..b6bfd3625369 100644 --- a/arch/mips/math-emu/dsemul.c +++ b/arch/mips/math-emu/dsemul.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include diff --git a/block/blk-ioc.c b/block/blk-ioc.c index b12f9c87b4c3..6bfa39675337 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "blk.h" diff --git a/block/ioprio.c b/block/ioprio.c index 4b9ab8367dda..0c47a00f92a8 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index 2c3ffbcbd621..f45115fce4eb 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "ion.h" #include "ion_priv.h" diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec.c b/drivers/staging/lustre/lustre/ptlrpc/sec.c index 49f34fd655c3..366f2ce20f5e 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/sec.c +++ b/drivers/staging/lustre/lustre/ptlrpc/sec.c @@ -40,6 +40,7 @@ #include #include #include +#include #include "../include/obd.h" 
#include "../include/obd_class.h" diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 550e8b183abe..26a6daf02185 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -15,6 +15,7 @@ #include #include #include +#include struct ctl_table_header; struct mempolicy; diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index ffd72a6c6e04..5cbc65d7a1e1 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 3f1190d18991..b5713fefb4c1 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -10,6 +10,8 @@ #include #include #include +#include + #include "proc/internal.h" /* only for get_proc_task() in ->open() */ #include "pnode.h" diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index c608c39cb161..611fce58d670 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/ipc/namespace.c b/ipc/namespace.c index 1f1d713ac19c..b4d80f9f7246 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "util.h" diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index a5c46db2855c..0f41292be0fb 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index e865d8bfb881..38f019324f1a 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include diff --git a/kernel/utsname.c b/kernel/utsname.c index 06585ad296ff..913fe4336d2b 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -17,6 +17,7 @@ #include #include #include +#include static struct ucounts *inc_uts_namespaces(struct user_namespace *ns) { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 18e0105810df..75b2745bac41 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -75,6 +75,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/mmu_context.c b/mm/mmu_context.c index 8888b124a386..3e612ae748e9 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 3c4bbec39713..652468ff65b7 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -16,6 +16,8 @@ #include #include #include +#include + #include #include #include diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 11fce17274f6..6ae56037bb13 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -12,6 +12,8 @@ #include #include #include +#include + #include #include diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 756637dc7a57..0f9275ee5595 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -20,6 +20,8 @@ #include #include #include +#include + #include #include #include -- cgit v1.2.3 From 32ef5517c298042ed58408545f475df43afe1f24 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 5 Feb 2017 11:48:36 +0100 Subject: sched/headers: Prepare to move cputime functionality from into Introduce a trivial, mostly empty header to prepare for the moving of cputime functionality out of sched.h. Update all code that relies on these facilities. 
Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/alpha/kernel/osf_sys.c | 1 + arch/ia64/kernel/time.c | 2 +- arch/powerpc/kernel/time.c | 2 +- arch/s390/kernel/idle.c | 2 +- arch/s390/kernel/vtime.c | 2 +- arch/x86/kernel/apm_32.c | 1 + arch/x86/kvm/hyperv.c | 2 ++ drivers/isdn/mISDN/stack.c | 1 + drivers/s390/cio/cio.c | 2 +- fs/binfmt_elf.c | 1 + fs/binfmt_elf_fdpic.c | 1 + fs/proc/array.c | 1 + fs/proc/stat.c | 2 +- include/linux/sched/cputime.h | 7 +++++++ kernel/acct.c | 2 ++ kernel/delayacct.c | 1 + kernel/exit.c | 1 + kernel/fork.c | 1 + kernel/sched/cputime.c | 2 +- kernel/sched/sched.h | 1 + kernel/signal.c | 1 + kernel/sys.c | 1 + kernel/time/itimer.c | 1 + kernel/time/posix-cpu-timers.c | 1 + kernel/tsacct.c | 1 + 25 files changed, 33 insertions(+), 7 deletions(-) create mode 100644 include/linux/sched/cputime.h (limited to 'kernel') diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 73446baa632e..0b961093ca5c 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 144f9db7a876..aa7be020a904 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 37833c5dc274..07b90725855e 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -58,7 +58,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index fb07a70820af..9340b2a07935 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include "entry.h" diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 31bd96e81167..c14fc9029912 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -6,7 +6,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index dc04b30cbd60..5a414545e8a3 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -219,6 +219,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index f701d4430727..ebae57ac5902 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -28,6 +28,8 @@ #include #include +#include + #include #include diff --git a/drivers/isdn/mISDN/stack.c b/drivers/isdn/mISDN/stack.c index 696f22fd5ab4..8b7faea2ddf8 100644 --- a/drivers/isdn/mISDN/stack.c +++ b/drivers/isdn/mISDN/stack.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "core.h" diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c index de6fccc13124..1b350665c823 100644 --- a/drivers/s390/cio/cio.c +++ b/drivers/s390/cio/cio.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 92c00a13b28b..5075fd5c62c8 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 6103a8149ccd..cf93a4fad012 100644 --- a/fs/binfmt_elf_fdpic.c 
+++ b/fs/binfmt_elf_fdpic.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/proc/array.c b/fs/proc/array.c index f3169b58af38..88c355574aa0 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/proc/stat.c b/fs/proc/stat.c index b95556e036bb..bd4e55f4aa20 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #ifndef arch_irq_stat_cpu diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h new file mode 100644 index 000000000000..6ed4fe43de28 --- /dev/null +++ b/include/linux/sched/cputime.h @@ -0,0 +1,7 @@ +#ifndef _LINUX_SCHED_CPUTIME_H +#define _LINUX_SCHED_CPUTIME_H + +#include +#include + +#endif /* _LINUX_SCHED_CPUTIME_H */ diff --git a/kernel/acct.c b/kernel/acct.c index ca9cb55b5855..5b1284370367 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -56,6 +56,8 @@ #include #include #include +#include + #include #include /* sector_div */ #include diff --git a/kernel/delayacct.c b/kernel/delayacct.c index c94135fc2698..4a1c33416b6a 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include diff --git a/kernel/exit.c b/kernel/exit.c index 771c33fc9952..e126ebf2400c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index c87b6f03c710..916e78004b8f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 54031fa681e2..f3778e2b46c8 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include "sched.h" #ifdef CONFIG_IRQ_TIME_ACCOUNTING diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b1f1c8443837..76e3af3f39f4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/kernel/signal.c b/kernel/signal.c index 2008aa999976..7e59ebc2c25e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sys.c b/kernel/sys.c index 7f1ecd2e41c1..7ff6d1b10cec 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index f6b961c5e58c..087d6a1279b8 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index a2475a9f57d8..4513ad16a253 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -3,6 +3,7 @@ */ #include +#include #include #include #include diff --git a/kernel/tsacct.c b/kernel/tsacct.c index d9a03b80b75d..370724b45391 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 3905f9ad455e0fd2ddb557566c5561b4a3027c07 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 5 Feb 2017 12:07:04 +0100 Subject: sched/headers: Prepare to move sched_info_on() and 
force_schedstat_enabled() from to But first update usage sites with the new header dependency. Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kvm/cpuid.c | 2 ++ arch/x86/kvm/x86.c | 2 ++ fs/proc/base.c | 1 + kernel/latencytop.c | 1 + kernel/profile.c | 2 ++ 5 files changed, 8 insertions(+) (limited to 'kernel') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 1d155cc56629..efde6cc50875 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -16,6 +16,8 @@ #include #include #include +#include + #include #include #include diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b2a4b11274b0..1faf620a6fdc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -54,6 +54,8 @@ #include #include #include +#include + #include #include diff --git a/fs/proc/base.c b/fs/proc/base.c index 5914fed3712d..2dae60075f6e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -89,6 +89,7 @@ #include #include #include +#include #include #include #ifdef CONFIG_HARDWALL diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 545839b838d6..96b4179cee6a 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include diff --git a/kernel/profile.c b/kernel/profile.c index f67ce0aa6bc4..9aa2a4445b0d 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -25,6 +25,8 @@ #include #include #include +#include + #include #include #include -- cgit v1.2.3 From 5c2c5c5514393e31859439ef83a34051d7c68332 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 5 Feb 2017 14:24:31 +0100 Subject: sched/headers, vfs/execve: Prepare to move the do_execve*() prototypes from to But first update the usage sites with the new header dependency. Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- init/main.c | 1 + kernel/kmod.c | 1 + 2 files changed, 2 insertions(+) (limited to 'kernel') diff --git a/init/main.c b/init/main.c index c2d337a136d1..ca9f53fb4125 100644 --- a/init/main.c +++ b/init/main.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/kmod.c b/kernel/kmod.c index ac5f5c2d098d..563f97e2be36 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 61855b6b03df9b6a15bd265c2c3ae7b5e23da312 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 5 Feb 2017 14:35:41 +0100 Subject: sched/headers: Prepare to move exit_files() and exit_itimers() from to But first update the usage site. Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/time/posix-timers.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 1e6623d76750..50a6a47020de 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From 1777e4635507265ba53d8dc4cd248e7d7c306fa0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 5 Feb 2017 14:47:12 +0100 Subject: sched/headers: Prepare to move _init() prototypes from to But first introduce a trivial header and update usage sites. 
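
(Illustration, not part of the patch: the new <linux/sched/init.h> created below follows the same prepare-then-move discipline as the other sched/*.h splits — introduce a stub header that still forwards to the old location, convert every usage site, and only then move the _init() prototypes, so the tree builds at every intermediate step. The single include target is stripped from this dump; <linux/sched.h> is the assumed forwardee.)

#ifndef _LINUX_SCHED_INIT_H
#define _LINUX_SCHED_INIT_H

#include <linux/sched.h>	/* assumed: still hosts the _init() prototypes for now */

#endif /* _LINUX_SCHED_INIT_H */
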
Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/ia64/kernel/setup.c | 1 + arch/m32r/kernel/traps.c | 1 + arch/s390/kernel/setup.c | 1 + include/linux/cpu.h | 2 ++ include/linux/sched/init.h | 6 ++++++ init/main.c | 1 + kernel/sched/sched.h | 1 + 7 files changed, 13 insertions(+) create mode 100644 include/linux/sched/init.h (limited to 'kernel') diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index b2b4f5216180..63bef6fc0f3e 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/m32r/kernel/traps.c b/arch/m32r/kernel/traps.c index 832bb2bb29bd..647dd94a0c39 100644 --- a/arch/m32r/kernel/traps.c +++ b/arch/m32r/kernel/traps.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 3de07004c02e..911dc0b49be0 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 21f9c74496e7..f92081234afd 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -30,6 +30,8 @@ struct cpu { extern void boot_cpu_init(void); extern void boot_cpu_state_init(void); +extern void cpu_init(void); +extern void trap_init(void); extern int register_cpu(struct cpu *cpu, int num); extern struct device *get_cpu_device(unsigned cpu); diff --git a/include/linux/sched/init.h b/include/linux/sched/init.h new file mode 100644 index 000000000000..15e46313e55e --- /dev/null +++ b/include/linux/sched/init.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_SCHED_INIT_H +#define _LINUX_SCHED_INIT_H + +#include + +#endif /* _LINUX_SCHED_INIT_H */ diff --git a/init/main.c b/init/main.c index ca9f53fb4125..eae2f15657c6 100644 --- a/init/main.c +++ b/init/main.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 76e3af3f39f4..5cbf92214ad8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From 0881e7bd341e2158b314596bcf2059e88e68f04e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 5 Feb 2017 15:30:50 +0100 Subject: sched/headers: Prepare to move the get_task_struct()/put_task_struct() and related APIs from to But first update usage sites with the new header dependency. 
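
(Illustration, not part of the patch: get_task_struct()/put_task_struct() pin and release a reference on a task_struct; any code that stashes a task pointer beyond the scope that guarantees its liveness must take a reference first, which is why the drivers below gain the new include. A minimal usage sketch; the helper names are hypothetical.)

#include <linux/sched/task.h>	/* provides get_task_struct()/put_task_struct() after the move */

static struct task_struct *stashed;

/* Pin @p so it cannot be freed while we hold the pointer. */
static void demo_stash_task(struct task_struct *p)
{
	get_task_struct(p);
	stashed = p;
}

/* Drop the reference taken in demo_stash_task(). */
static void demo_drop_task(void)
{
	if (stashed) {
		put_task_struct(stashed);
		stashed = NULL;
	}
}
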
Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- drivers/dma/dmatest.c | 1 + drivers/gpu/drm/etnaviv/etnaviv_gem.c | 1 + drivers/infiniband/core/umem_odp.c | 1 + drivers/infiniband/hw/mlx4/main.c | 1 + drivers/infiniband/hw/mlx5/main.c | 1 + drivers/md/persistent-data/dm-block-manager.c | 1 + drivers/misc/cxl/main.c | 2 ++ drivers/net/xen-netback/interface.c | 1 + drivers/oprofile/buffer_sync.c | 1 + drivers/tty/tty_ldsem.c | 1 + drivers/usb/usbip/usbip_common.h | 1 + kernel/irq/manage.c | 1 + 12 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index c9297605058c..54d581d407aa 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index ae2882b64b82..fd56f92f3469 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "etnaviv_drv.h" #include "etnaviv_gem.h" diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 7279f259494f..cb2742b548bb 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 56da8f0d71bb..fba94df28cf1 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 4b1ec3ff152a..4dc0a8785fe0 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -42,6 +42,7 @@ #endif #include #include +#include #include #include #include diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index 0863905dee02..8589e0a14068 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -13,6 +13,7 @@ #include #include #include +#include #define DM_MSG_PREFIX "block manager" diff --git a/drivers/misc/cxl/main.c b/drivers/misc/cxl/main.c index cc1706a92ace..b0b6ed31918e 100644 --- a/drivers/misc/cxl/main.c +++ b/drivers/misc/cxl/main.c @@ -19,6 +19,8 @@ #include #include #include +#include + #include #include diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index a2d326760a72..829b26cd4549 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -31,6 +31,7 @@ #include "common.h" #include +#include #include #include #include diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c index eb0c35994b54..ac27f3d3fbb4 100644 --- a/drivers/oprofile/buffer_sync.c +++ b/drivers/oprofile/buffer_sync.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include "oprofile_stats.h" diff --git a/drivers/tty/tty_ldsem.c b/drivers/tty/tty_ldsem.c index 4f6b1b10b537..52b7baef4f7a 100644 --- a/drivers/tty/tty_ldsem.c +++ b/drivers/tty/tty_ldsem.c @@ -33,6 +33,7 @@ #include #include #include +#include #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/drivers/usb/usbip/usbip_common.h b/drivers/usb/usbip/usbip_common.h 
index 9f490375ac92..f8573a52e41a 100644 --- a/drivers/usb/usbip/usbip_common.h +++ b/drivers/usb/usbip/usbip_common.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #define USBIP_VERSION "1.0.0" diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 09740952e4de..a4afe5cc5af1 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From c3edc4010e9d102eb7b8f17d15c2ebc425fed63c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 2 Feb 2017 08:35:14 +0100 Subject: sched/headers: Move task_struct::signal and task_struct::sighand types and accessors into MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit task_struct::signal and task_struct::sighand are pointers, which would normally make it straightforward to not define those types in sched.h. That is not so, because the types are accompanied by a myriad of APIs (macros and inline functions) that dereference them. Split the types and the APIs out of sched.h and move them into a new header, . With this change sched.h does not know about 'struct signal' and 'struct sighand' anymore, trying to put accessors into sched.h as a test fails the following way: ./include/linux/sched.h: In function ‘test_signal_types’: ./include/linux/sched.h:2461:18: error: dereferencing pointer to incomplete type ‘struct signal_struct’ ^ This reduces the size and complexity of sched.h significantly. Update all headers and .c code that relied on getting the signal handling functionality from to include . The list of affected files in the preparatory patch was partly generated by grepping for the APIs, and partly by doing coverage build testing, both all[yes|mod|def|no]config builds on 64-bit and 32-bit x86, and an array of cross-architecture builds. Nevertheless some (trivial) build breakage is still expected related to rare Kconfig combinations and in-flight patches to various kernel code, but most of it should be handled by this patch. Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/arm/nwfpe/fpmodule.c | 2 +- arch/sh/kernel/cpu/sh4/fpu.c | 3 +- drivers/net/tap.c | 2 +- include/linux/sched.h | 499 +----------------------------------------- include/linux/sched/signal.h | 502 +++++++++++++++++++++++++++++++++++++++++++ kernel/cgroup/cgroup-v1.c | 1 + mm/vmalloc.c | 2 +- net/smc/af_smc.c | 2 + net/smc/smc_clc.c | 2 + net/smc/smc_close.c | 2 + net/smc/smc_rx.c | 2 + net/smc/smc_tx.c | 2 + 12 files changed, 520 insertions(+), 501 deletions(-) (limited to 'kernel') diff --git a/arch/arm/nwfpe/fpmodule.c b/arch/arm/nwfpe/fpmodule.c index ec717c190e2c..1365e8650843 100644 --- a/arch/arm/nwfpe/fpmodule.c +++ b/arch/arm/nwfpe/fpmodule.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/sh/kernel/cpu/sh4/fpu.c b/arch/sh/kernel/cpu/sh4/fpu.c index 69ab4d3c8d41..95fd2dcb83da 100644 --- a/arch/sh/kernel/cpu/sh4/fpu.c +++ b/arch/sh/kernel/cpu/sh4/fpu.c @@ -10,8 +10,7 @@ * * FIXME! These routines have not been tested for big endian case. 
*/ -#include -#include +#include #include #include #include diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 35b55a2fa1a1..4d4173d25dd0 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/linux/sched.h b/include/linux/sched.h index 7934cd0acbc7..c1586104d4c0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -71,6 +71,9 @@ struct blk_plug; struct filename; struct nameidata; +struct signal_struct; +struct sighand_struct; + extern unsigned long total_forks; extern int nr_threads; DECLARE_PER_CPU(unsigned long, process_counts); @@ -361,13 +364,6 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} #endif -struct sighand_struct { - atomic_t count; - struct k_sigaction action[_NSIG]; - spinlock_t siglock; - wait_queue_head_t signalfd_wqh; -}; - struct pacct_struct { int ac_flag; long ac_exitcode; @@ -485,195 +481,6 @@ struct thread_group_cputimer { #include struct autogroup; -/* - * NOTE! "signal_struct" does not have its own - * locking, because a shared signal_struct always - * implies a shared sighand_struct, so locking - * sighand_struct is always a proper superset of - * the locking of signal_struct. - */ -struct signal_struct { - atomic_t sigcnt; - atomic_t live; - int nr_threads; - struct list_head thread_head; - - wait_queue_head_t wait_chldexit; /* for wait4() */ - - /* current thread group signal load-balancing target: */ - struct task_struct *curr_target; - - /* shared signal handling: */ - struct sigpending shared_pending; - - /* thread group exit support */ - int group_exit_code; - /* overloaded: - * - notify group_exit_task when ->count is equal to notify_count - * - everyone except group_exit_task is stopped during signal delivery - * of fatal signals, group_exit_task processes the signal. - */ - int notify_count; - struct task_struct *group_exit_task; - - /* thread group stop support, overloads group_exit_code too */ - int group_stop_count; - unsigned int flags; /* see SIGNAL_* flags below */ - - /* - * PR_SET_CHILD_SUBREAPER marks a process, like a service - * manager, to re-parent orphan (double-forking) child processes - * to this process instead of 'init'. The service manager is - * able to receive SIGCHLD signals and is able to investigate - * the process until it calls wait(). All children of this - * process will inherit a flag if they should look for a - * child_subreaper process at exit. - */ - unsigned int is_child_subreaper:1; - unsigned int has_child_subreaper:1; - -#ifdef CONFIG_POSIX_TIMERS - - /* POSIX.1b Interval Timers */ - int posix_timer_id; - struct list_head posix_timers; - - /* ITIMER_REAL timer for the process */ - struct hrtimer real_timer; - ktime_t it_real_incr; - - /* - * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use - * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these - * values are defined to 0 and 1 respectively - */ - struct cpu_itimer it[2]; - - /* - * Thread group totals for process CPU timers. - * See thread_group_cputimer(), et al, for details. - */ - struct thread_group_cputimer cputimer; - - /* Earliest-expiration cache. 
*/ - struct task_cputime cputime_expires; - - struct list_head cpu_timers[3]; - -#endif - - struct pid *leader_pid; - -#ifdef CONFIG_NO_HZ_FULL - atomic_t tick_dep_mask; -#endif - - struct pid *tty_old_pgrp; - - /* boolean value for session group leader */ - int leader; - - struct tty_struct *tty; /* NULL if no tty */ - -#ifdef CONFIG_SCHED_AUTOGROUP - struct autogroup *autogroup; -#endif - /* - * Cumulative resource counters for dead threads in the group, - * and for reaped dead child processes forked by this group. - * Live threads maintain their own counters and add to these - * in __exit_signal, except for the group leader. - */ - seqlock_t stats_lock; - u64 utime, stime, cutime, cstime; - u64 gtime; - u64 cgtime; - struct prev_cputime prev_cputime; - unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; - unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; - unsigned long inblock, oublock, cinblock, coublock; - unsigned long maxrss, cmaxrss; - struct task_io_accounting ioac; - - /* - * Cumulative ns of schedule CPU time fo dead threads in the - * group, not including a zombie group leader, (This only differs - * from jiffies_to_ns(utime + stime) if sched_clock uses something - * other than jiffies.) - */ - unsigned long long sum_sched_runtime; - - /* - * We don't bother to synchronize most readers of this at all, - * because there is no reader checking a limit that actually needs - * to get both rlim_cur and rlim_max atomically, and either one - * alone is a single word that can safely be read normally. - * getrlimit/setrlimit use task_lock(current->group_leader) to - * protect this instead of the siglock, because they really - * have no need to disable irqs. - */ - struct rlimit rlim[RLIM_NLIMITS]; - -#ifdef CONFIG_BSD_PROCESS_ACCT - struct pacct_struct pacct; /* per-process accounting information */ -#endif -#ifdef CONFIG_TASKSTATS - struct taskstats *stats; -#endif -#ifdef CONFIG_AUDIT - unsigned audit_tty; - struct tty_audit_buf *tty_audit_buf; -#endif - - /* - * Thread is the potential origin of an oom condition; kill first on - * oom - */ - bool oom_flag_origin; - short oom_score_adj; /* OOM kill score adjustment */ - short oom_score_adj_min; /* OOM kill score adjustment min value. - * Only settable by CAP_SYS_RESOURCE. */ - struct mm_struct *oom_mm; /* recorded mm when the thread group got - * killed by the oom killer */ - - struct mutex cred_guard_mutex; /* guard against foreign influences on - * credential calculations - * (notably. ptrace) */ -}; - -/* - * Bits in flags field of signal_struct. - */ -#define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ -#define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */ -#define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */ -#define SIGNAL_GROUP_COREDUMP 0x00000008 /* coredump in progress */ -/* - * Pending notifications to parent. 
- */ -#define SIGNAL_CLD_STOPPED 0x00000010 -#define SIGNAL_CLD_CONTINUED 0x00000020 -#define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED) - -#define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */ - -#define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \ - SIGNAL_STOP_CONTINUED) - -static inline void signal_set_stop_flags(struct signal_struct *sig, - unsigned int flags) -{ - WARN_ON(sig->flags & (SIGNAL_GROUP_EXIT|SIGNAL_GROUP_COREDUMP)); - sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags; -} - -/* If true, all threads except ->group_exit_task have pending SIGKILL */ -static inline int signal_group_exit(const struct signal_struct *sig) -{ - return (sig->flags & SIGNAL_GROUP_EXIT) || - (sig->group_exit_task != NULL); -} - /* * Some day this will be a full-fledged user tracking system.. */ @@ -2126,190 +1933,8 @@ extern int sched_fork(unsigned long clone_flags, struct task_struct *p); extern void sched_dead(struct task_struct *p); extern void proc_caches_init(void); -extern void flush_signals(struct task_struct *); -extern void ignore_signals(struct task_struct *); -extern void flush_signal_handlers(struct task_struct *, int force_default); -extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); - -static inline int kernel_dequeue_signal(siginfo_t *info) -{ - struct task_struct *tsk = current; - siginfo_t __info; - int ret; - - spin_lock_irq(&tsk->sighand->siglock); - ret = dequeue_signal(tsk, &tsk->blocked, info ?: &__info); - spin_unlock_irq(&tsk->sighand->siglock); - - return ret; -} - -static inline void kernel_signal_stop(void) -{ - spin_lock_irq(¤t->sighand->siglock); - if (current->jobctl & JOBCTL_STOP_DEQUEUED) - __set_current_state(TASK_STOPPED); - spin_unlock_irq(¤t->sighand->siglock); - - schedule(); -} extern void release_task(struct task_struct * p); -extern int send_sig_info(int, struct siginfo *, struct task_struct *); -extern int force_sigsegv(int, struct task_struct *); -extern int force_sig_info(int, struct siginfo *, struct task_struct *); -extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); -extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid); -extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *, - const struct cred *, u32); -extern int kill_pgrp(struct pid *pid, int sig, int priv); -extern int kill_pid(struct pid *pid, int sig, int priv); -extern int kill_proc_info(int, struct siginfo *, pid_t); -extern __must_check bool do_notify_parent(struct task_struct *, int); -extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); -extern void force_sig(int, struct task_struct *); -extern int send_sig(int, struct task_struct *, int); -extern int zap_other_threads(struct task_struct *p); -extern struct sigqueue *sigqueue_alloc(void); -extern void sigqueue_free(struct sigqueue *); -extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group); -extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); - -#ifdef TIF_RESTORE_SIGMASK -/* - * Legacy restore_sigmask accessors. These are inefficient on - * SMP architectures because they require atomic operations. - */ - -/** - * set_restore_sigmask() - make sure saved_sigmask processing gets done - * - * This sets TIF_RESTORE_SIGMASK and ensures that the arch signal code - * will run before returning to user mode, to process the flag. For - * all callers, TIF_SIGPENDING is already set or it's no harm to set - * it. 
TIF_RESTORE_SIGMASK need not be in the set of bits that the - * arch code will notice on return to user mode, in case those bits - * are scarce. We set TIF_SIGPENDING here to ensure that the arch - * signal code always gets run when TIF_RESTORE_SIGMASK is set. - */ -static inline void set_restore_sigmask(void) -{ - set_thread_flag(TIF_RESTORE_SIGMASK); - WARN_ON(!test_thread_flag(TIF_SIGPENDING)); -} -static inline void clear_restore_sigmask(void) -{ - clear_thread_flag(TIF_RESTORE_SIGMASK); -} -static inline bool test_restore_sigmask(void) -{ - return test_thread_flag(TIF_RESTORE_SIGMASK); -} -static inline bool test_and_clear_restore_sigmask(void) -{ - return test_and_clear_thread_flag(TIF_RESTORE_SIGMASK); -} - -#else /* TIF_RESTORE_SIGMASK */ - -/* Higher-quality implementation, used if TIF_RESTORE_SIGMASK doesn't exist. */ -static inline void set_restore_sigmask(void) -{ - current->restore_sigmask = true; - WARN_ON(!test_thread_flag(TIF_SIGPENDING)); -} -static inline void clear_restore_sigmask(void) -{ - current->restore_sigmask = false; -} -static inline bool test_restore_sigmask(void) -{ - return current->restore_sigmask; -} -static inline bool test_and_clear_restore_sigmask(void) -{ - if (!current->restore_sigmask) - return false; - current->restore_sigmask = false; - return true; -} -#endif - -static inline void restore_saved_sigmask(void) -{ - if (test_and_clear_restore_sigmask()) - __set_current_blocked(¤t->saved_sigmask); -} - -static inline sigset_t *sigmask_to_save(void) -{ - sigset_t *res = ¤t->blocked; - if (unlikely(test_restore_sigmask())) - res = ¤t->saved_sigmask; - return res; -} - -static inline int kill_cad_pid(int sig, int priv) -{ - return kill_pid(cad_pid, sig, priv); -} - -/* These can be the second arg to send_sig_info/send_group_sig_info. */ -#define SEND_SIG_NOINFO ((struct siginfo *) 0) -#define SEND_SIG_PRIV ((struct siginfo *) 1) -#define SEND_SIG_FORCED ((struct siginfo *) 2) - -/* - * True if we are on the alternate signal stack. - */ -static inline int on_sig_stack(unsigned long sp) -{ - /* - * If the signal stack is SS_AUTODISARM then, by construction, we - * can't be on the signal stack unless user code deliberately set - * SS_AUTODISARM when we were already on it. - * - * This improves reliability: if user state gets corrupted such that - * the stack pointer points very close to the end of the signal stack, - * then this check will enable the signal to be handled anyway. - */ - if (current->sas_ss_flags & SS_AUTODISARM) - return 0; - -#ifdef CONFIG_STACK_GROWSUP - return sp >= current->sas_ss_sp && - sp - current->sas_ss_sp < current->sas_ss_size; -#else - return sp > current->sas_ss_sp && - sp - current->sas_ss_sp <= current->sas_ss_size; -#endif -} - -static inline int sas_ss_flags(unsigned long sp) -{ - if (!current->sas_ss_size) - return SS_DISABLE; - - return on_sig_stack(sp) ? SS_ONSTACK : 0; -} - -static inline void sas_ss_reset(struct task_struct *p) -{ - p->sas_ss_sp = 0; - p->sas_ss_size = 0; - p->sas_ss_flags = SS_DISABLE; -} - -static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig) -{ - if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! 
sas_ss_flags(sp)) -#ifdef CONFIG_STACK_GROWSUP - return current->sas_ss_sp; -#else - return current->sas_ss_sp + current->sas_ss_size; -#endif - return sp; -} #ifdef CONFIG_HAVE_COPY_THREAD_TLS extern int copy_thread_tls(unsigned long, unsigned long, unsigned long, @@ -2338,10 +1963,8 @@ static inline void exit_thread(struct task_struct *tsk) #endif extern void exit_files(struct task_struct *); -extern void __cleanup_sighand(struct sighand_struct *); extern void exit_itimers(struct signal_struct *); -extern void flush_itimer_signals(void); extern void do_group_exit(int); @@ -2376,81 +1999,6 @@ static inline unsigned long wait_task_inactive(struct task_struct *p, } #endif -#define tasklist_empty() \ - list_empty(&init_task.tasks) - -#define next_task(p) \ - list_entry_rcu((p)->tasks.next, struct task_struct, tasks) - -#define for_each_process(p) \ - for (p = &init_task ; (p = next_task(p)) != &init_task ; ) - -extern bool current_is_single_threaded(void); - -/* - * Careful: do_each_thread/while_each_thread is a double loop so - * 'break' will not work as expected - use goto instead. - */ -#define do_each_thread(g, t) \ - for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do - -#define while_each_thread(g, t) \ - while ((t = next_thread(t)) != g) - -#define __for_each_thread(signal, t) \ - list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node) - -#define for_each_thread(p, t) \ - __for_each_thread((p)->signal, t) - -/* Careful: this is a double loop, 'break' won't work as expected. */ -#define for_each_process_thread(p, t) \ - for_each_process(p) for_each_thread(p, t) - -typedef int (*proc_visitor)(struct task_struct *p, void *data); -void walk_process_tree(struct task_struct *top, proc_visitor, void *); - -static inline int get_nr_threads(struct task_struct *tsk) -{ - return tsk->signal->nr_threads; -} - -static inline bool thread_group_leader(struct task_struct *p) -{ - return p->exit_signal >= 0; -} - -/* Do to the insanities of de_thread it is possible for a process - * to have the pid of the thread group leader without actually being - * the thread group leader. For iteration through the pids in proc - * all we care about is that we have a task with the appropriate - * pid, we don't actually care if we have the right task. - */ -static inline bool has_group_leader_pid(struct task_struct *p) -{ - return task_pid(p) == p->signal->leader_pid; -} - -static inline -bool same_thread_group(struct task_struct *p1, struct task_struct *p2) -{ - return p1->signal == p2->signal; -} - -static inline struct task_struct *next_thread(const struct task_struct *p) -{ - return list_entry_rcu(p->thread_group.next, - struct task_struct, thread_group); -} - -static inline int thread_group_empty(struct task_struct *p) -{ - return list_empty(&p->thread_group); -} - -#define delay_group_leader(p) \ - (thread_group_leader(p) && !thread_group_empty(p)) - /* * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. 
Also @@ -2471,25 +2019,6 @@ static inline void task_unlock(struct task_struct *p) spin_unlock(&p->alloc_lock); } -extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, - unsigned long *flags); - -static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk, - unsigned long *flags) -{ - struct sighand_struct *ret; - - ret = __lock_task_sighand(tsk, flags); - (void)__cond_lock(&tsk->sighand->siglock, ret); - return ret; -} - -static inline void unlock_task_sighand(struct task_struct *tsk, - unsigned long *flags) -{ - spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); -} - #ifdef CONFIG_THREAD_INFO_IN_TASK static inline struct thread_info *task_thread_info(struct task_struct *task) @@ -2862,28 +2391,6 @@ static inline void mm_update_next_owner(struct mm_struct *mm) } #endif /* CONFIG_MEMCG */ -static inline unsigned long task_rlimit(const struct task_struct *tsk, - unsigned int limit) -{ - return READ_ONCE(tsk->signal->rlim[limit].rlim_cur); -} - -static inline unsigned long task_rlimit_max(const struct task_struct *tsk, - unsigned int limit) -{ - return READ_ONCE(tsk->signal->rlim[limit].rlim_max); -} - -static inline unsigned long rlimit(unsigned int limit) -{ - return task_rlimit(current, limit); -} - -static inline unsigned long rlimit_max(unsigned int limit) -{ - return task_rlimit_max(current, limit); -} - #define SCHED_CPUFREQ_RT (1U << 0) #define SCHED_CPUFREQ_DL (1U << 1) #define SCHED_CPUFREQ_IOWAIT (1U << 2) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index c6958a53fef3..53fe5450f431 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -8,4 +8,506 @@ #include #include +/* + * Types defining task->signal and task->sighand and APIs using them: + */ + +struct sighand_struct { + atomic_t count; + struct k_sigaction action[_NSIG]; + spinlock_t siglock; + wait_queue_head_t signalfd_wqh; +}; + +/* + * NOTE! "signal_struct" does not have its own + * locking, because a shared signal_struct always + * implies a shared sighand_struct, so locking + * sighand_struct is always a proper superset of + * the locking of signal_struct. + */ +struct signal_struct { + atomic_t sigcnt; + atomic_t live; + int nr_threads; + struct list_head thread_head; + + wait_queue_head_t wait_chldexit; /* for wait4() */ + + /* current thread group signal load-balancing target: */ + struct task_struct *curr_target; + + /* shared signal handling: */ + struct sigpending shared_pending; + + /* thread group exit support */ + int group_exit_code; + /* overloaded: + * - notify group_exit_task when ->count is equal to notify_count + * - everyone except group_exit_task is stopped during signal delivery + * of fatal signals, group_exit_task processes the signal. + */ + int notify_count; + struct task_struct *group_exit_task; + + /* thread group stop support, overloads group_exit_code too */ + int group_stop_count; + unsigned int flags; /* see SIGNAL_* flags below */ + + /* + * PR_SET_CHILD_SUBREAPER marks a process, like a service + * manager, to re-parent orphan (double-forking) child processes + * to this process instead of 'init'. The service manager is + * able to receive SIGCHLD signals and is able to investigate + * the process until it calls wait(). All children of this + * process will inherit a flag if they should look for a + * child_subreaper process at exit. 
+ */ + unsigned int is_child_subreaper:1; + unsigned int has_child_subreaper:1; + +#ifdef CONFIG_POSIX_TIMERS + + /* POSIX.1b Interval Timers */ + int posix_timer_id; + struct list_head posix_timers; + + /* ITIMER_REAL timer for the process */ + struct hrtimer real_timer; + ktime_t it_real_incr; + + /* + * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use + * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these + * values are defined to 0 and 1 respectively + */ + struct cpu_itimer it[2]; + + /* + * Thread group totals for process CPU timers. + * See thread_group_cputimer(), et al, for details. + */ + struct thread_group_cputimer cputimer; + + /* Earliest-expiration cache. */ + struct task_cputime cputime_expires; + + struct list_head cpu_timers[3]; + +#endif + + struct pid *leader_pid; + +#ifdef CONFIG_NO_HZ_FULL + atomic_t tick_dep_mask; +#endif + + struct pid *tty_old_pgrp; + + /* boolean value for session group leader */ + int leader; + + struct tty_struct *tty; /* NULL if no tty */ + +#ifdef CONFIG_SCHED_AUTOGROUP + struct autogroup *autogroup; +#endif + /* + * Cumulative resource counters for dead threads in the group, + * and for reaped dead child processes forked by this group. + * Live threads maintain their own counters and add to these + * in __exit_signal, except for the group leader. + */ + seqlock_t stats_lock; + u64 utime, stime, cutime, cstime; + u64 gtime; + u64 cgtime; + struct prev_cputime prev_cputime; + unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; + unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; + unsigned long inblock, oublock, cinblock, coublock; + unsigned long maxrss, cmaxrss; + struct task_io_accounting ioac; + + /* + * Cumulative ns of schedule CPU time fo dead threads in the + * group, not including a zombie group leader, (This only differs + * from jiffies_to_ns(utime + stime) if sched_clock uses something + * other than jiffies.) + */ + unsigned long long sum_sched_runtime; + + /* + * We don't bother to synchronize most readers of this at all, + * because there is no reader checking a limit that actually needs + * to get both rlim_cur and rlim_max atomically, and either one + * alone is a single word that can safely be read normally. + * getrlimit/setrlimit use task_lock(current->group_leader) to + * protect this instead of the siglock, because they really + * have no need to disable irqs. + */ + struct rlimit rlim[RLIM_NLIMITS]; + +#ifdef CONFIG_BSD_PROCESS_ACCT + struct pacct_struct pacct; /* per-process accounting information */ +#endif +#ifdef CONFIG_TASKSTATS + struct taskstats *stats; +#endif +#ifdef CONFIG_AUDIT + unsigned audit_tty; + struct tty_audit_buf *tty_audit_buf; +#endif + + /* + * Thread is the potential origin of an oom condition; kill first on + * oom + */ + bool oom_flag_origin; + short oom_score_adj; /* OOM kill score adjustment */ + short oom_score_adj_min; /* OOM kill score adjustment min value. + * Only settable by CAP_SYS_RESOURCE. */ + struct mm_struct *oom_mm; /* recorded mm when the thread group got + * killed by the oom killer */ + + struct mutex cred_guard_mutex; /* guard against foreign influences on + * credential calculations + * (notably. ptrace) */ +}; + +/* + * Bits in flags field of signal_struct. 
+ */
+#define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */
+#define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */
+#define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */
+#define SIGNAL_GROUP_COREDUMP 0x00000008 /* coredump in progress */
+/*
+ * Pending notifications to parent.
+ */
+#define SIGNAL_CLD_STOPPED 0x00000010
+#define SIGNAL_CLD_CONTINUED 0x00000020
+#define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED)
+
+#define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */
+
+#define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \
+			  SIGNAL_STOP_CONTINUED)
+
+static inline void signal_set_stop_flags(struct signal_struct *sig,
+					 unsigned int flags)
+{
+	WARN_ON(sig->flags & (SIGNAL_GROUP_EXIT|SIGNAL_GROUP_COREDUMP));
+	sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags;
+}
+
+/* If true, all threads except ->group_exit_task have pending SIGKILL */
+static inline int signal_group_exit(const struct signal_struct *sig)
+{
+	return (sig->flags & SIGNAL_GROUP_EXIT) ||
+		(sig->group_exit_task != NULL);
+}
+
+extern void flush_signals(struct task_struct *);
+extern void ignore_signals(struct task_struct *);
+extern void flush_signal_handlers(struct task_struct *, int force_default);
+extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
+
+static inline int kernel_dequeue_signal(siginfo_t *info)
+{
+	struct task_struct *tsk = current;
+	siginfo_t __info;
+	int ret;
+
+	spin_lock_irq(&tsk->sighand->siglock);
+	ret = dequeue_signal(tsk, &tsk->blocked, info ?: &__info);
+	spin_unlock_irq(&tsk->sighand->siglock);
+
+	return ret;
+}
+
+static inline void kernel_signal_stop(void)
+{
+	spin_lock_irq(&current->sighand->siglock);
+	if (current->jobctl & JOBCTL_STOP_DEQUEUED)
+		__set_current_state(TASK_STOPPED);
+	spin_unlock_irq(&current->sighand->siglock);
+
+	schedule();
+}
+extern int send_sig_info(int, struct siginfo *, struct task_struct *);
+extern int force_sigsegv(int, struct task_struct *);
+extern int force_sig_info(int, struct siginfo *, struct task_struct *);
+extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
+extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid);
+extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *,
+				 const struct cred *, u32);
+extern int kill_pgrp(struct pid *pid, int sig, int priv);
+extern int kill_pid(struct pid *pid, int sig, int priv);
+extern int kill_proc_info(int, struct siginfo *, pid_t);
+extern __must_check bool do_notify_parent(struct task_struct *, int);
+extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
+extern void force_sig(int, struct task_struct *);
+extern int send_sig(int, struct task_struct *, int);
+extern int zap_other_threads(struct task_struct *p);
+extern struct sigqueue *sigqueue_alloc(void);
+extern void sigqueue_free(struct sigqueue *);
+extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group);
+extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
+
+#ifdef TIF_RESTORE_SIGMASK
+/*
+ * Legacy restore_sigmask accessors. These are inefficient on
+ * SMP architectures because they require atomic operations.
+ */
+
+/**
+ * set_restore_sigmask() - make sure saved_sigmask processing gets done
+ *
+ * This sets TIF_RESTORE_SIGMASK and ensures that the arch signal code
+ * will run before returning to user mode, to process the flag. For
+ * all callers, TIF_SIGPENDING is already set or it's no harm to set
+ * it. TIF_RESTORE_SIGMASK need not be in the set of bits that the
+ * arch code will notice on return to user mode, in case those bits
+ * are scarce. We set TIF_SIGPENDING here to ensure that the arch
+ * signal code always gets run when TIF_RESTORE_SIGMASK is set.
+ */
+static inline void set_restore_sigmask(void)
+{
+	set_thread_flag(TIF_RESTORE_SIGMASK);
+	WARN_ON(!test_thread_flag(TIF_SIGPENDING));
+}
+static inline void clear_restore_sigmask(void)
+{
+	clear_thread_flag(TIF_RESTORE_SIGMASK);
+}
+static inline bool test_restore_sigmask(void)
+{
+	return test_thread_flag(TIF_RESTORE_SIGMASK);
+}
+static inline bool test_and_clear_restore_sigmask(void)
+{
+	return test_and_clear_thread_flag(TIF_RESTORE_SIGMASK);
+}
+
+#else /* TIF_RESTORE_SIGMASK */
+
+/* Higher-quality implementation, used if TIF_RESTORE_SIGMASK doesn't exist. */
+static inline void set_restore_sigmask(void)
+{
+	current->restore_sigmask = true;
+	WARN_ON(!test_thread_flag(TIF_SIGPENDING));
+}
+static inline void clear_restore_sigmask(void)
+{
+	current->restore_sigmask = false;
+}
+static inline bool test_restore_sigmask(void)
+{
+	return current->restore_sigmask;
+}
+static inline bool test_and_clear_restore_sigmask(void)
+{
+	if (!current->restore_sigmask)
+		return false;
+	current->restore_sigmask = false;
+	return true;
+}
+#endif
+
+static inline void restore_saved_sigmask(void)
+{
+	if (test_and_clear_restore_sigmask())
+		__set_current_blocked(&current->saved_sigmask);
+}
+
+static inline sigset_t *sigmask_to_save(void)
+{
+	sigset_t *res = &current->blocked;
+	if (unlikely(test_restore_sigmask()))
+		res = &current->saved_sigmask;
+	return res;
+}
+
+static inline int kill_cad_pid(int sig, int priv)
+{
+	return kill_pid(cad_pid, sig, priv);
+}
+
+/* These can be the second arg to send_sig_info/send_group_sig_info. */
+#define SEND_SIG_NOINFO ((struct siginfo *) 0)
+#define SEND_SIG_PRIV ((struct siginfo *) 1)
+#define SEND_SIG_FORCED ((struct siginfo *) 2)
+
+/*
+ * True if we are on the alternate signal stack.
+ */
+static inline int on_sig_stack(unsigned long sp)
+{
+	/*
+	 * If the signal stack is SS_AUTODISARM then, by construction, we
+	 * can't be on the signal stack unless user code deliberately set
+	 * SS_AUTODISARM when we were already on it.
+	 *
+	 * This improves reliability: if user state gets corrupted such that
+	 * the stack pointer points very close to the end of the signal stack,
+	 * then this check will enable the signal to be handled anyway.
+	 */
+	if (current->sas_ss_flags & SS_AUTODISARM)
+		return 0;
+
+#ifdef CONFIG_STACK_GROWSUP
+	return sp >= current->sas_ss_sp &&
+		sp - current->sas_ss_sp < current->sas_ss_size;
+#else
+	return sp > current->sas_ss_sp &&
+		sp - current->sas_ss_sp <= current->sas_ss_size;
+#endif
+}
+
+static inline int sas_ss_flags(unsigned long sp)
+{
+	if (!current->sas_ss_size)
+		return SS_DISABLE;
+
+	return on_sig_stack(sp) ? SS_ONSTACK : 0;
+}
+
+static inline void sas_ss_reset(struct task_struct *p)
+{
+	p->sas_ss_sp = 0;
+	p->sas_ss_size = 0;
+	p->sas_ss_flags = SS_DISABLE;
+}
+
+static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig)
+{
+	if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && !
sas_ss_flags(sp)) +#ifdef CONFIG_STACK_GROWSUP + return current->sas_ss_sp; +#else + return current->sas_ss_sp + current->sas_ss_size; +#endif + return sp; +} + +extern void __cleanup_sighand(struct sighand_struct *); +extern void flush_itimer_signals(void); + +#define tasklist_empty() \ + list_empty(&init_task.tasks) + +#define next_task(p) \ + list_entry_rcu((p)->tasks.next, struct task_struct, tasks) + +#define for_each_process(p) \ + for (p = &init_task ; (p = next_task(p)) != &init_task ; ) + +extern bool current_is_single_threaded(void); + +/* + * Careful: do_each_thread/while_each_thread is a double loop so + * 'break' will not work as expected - use goto instead. + */ +#define do_each_thread(g, t) \ + for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do + +#define while_each_thread(g, t) \ + while ((t = next_thread(t)) != g) + +#define __for_each_thread(signal, t) \ + list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node) + +#define for_each_thread(p, t) \ + __for_each_thread((p)->signal, t) + +/* Careful: this is a double loop, 'break' won't work as expected. */ +#define for_each_process_thread(p, t) \ + for_each_process(p) for_each_thread(p, t) + +typedef int (*proc_visitor)(struct task_struct *p, void *data); +void walk_process_tree(struct task_struct *top, proc_visitor, void *); + +static inline int get_nr_threads(struct task_struct *tsk) +{ + return tsk->signal->nr_threads; +} + +static inline bool thread_group_leader(struct task_struct *p) +{ + return p->exit_signal >= 0; +} + +/* Do to the insanities of de_thread it is possible for a process + * to have the pid of the thread group leader without actually being + * the thread group leader. For iteration through the pids in proc + * all we care about is that we have a task with the appropriate + * pid, we don't actually care if we have the right task. 
+ */ +static inline bool has_group_leader_pid(struct task_struct *p) +{ + return task_pid(p) == p->signal->leader_pid; +} + +static inline +bool same_thread_group(struct task_struct *p1, struct task_struct *p2) +{ + return p1->signal == p2->signal; +} + +static inline struct task_struct *next_thread(const struct task_struct *p) +{ + return list_entry_rcu(p->thread_group.next, + struct task_struct, thread_group); +} + +static inline int thread_group_empty(struct task_struct *p) +{ + return list_empty(&p->thread_group); +} + +#define delay_group_leader(p) \ + (thread_group_leader(p) && !thread_group_empty(p)) + +extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, + unsigned long *flags); + +static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk, + unsigned long *flags) +{ + struct sighand_struct *ret; + + ret = __lock_task_sighand(tsk, flags); + (void)__cond_lock(&tsk->sighand->siglock, ret); + return ret; +} + +static inline void unlock_task_sighand(struct task_struct *tsk, + unsigned long *flags) +{ + spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); +} + +static inline unsigned long task_rlimit(const struct task_struct *tsk, + unsigned int limit) +{ + return READ_ONCE(tsk->signal->rlim[limit].rlim_cur); +} + +static inline unsigned long task_rlimit_max(const struct task_struct *tsk, + unsigned int limit) +{ + return READ_ONCE(tsk->signal->rlim[limit].rlim_max); +} + +static inline unsigned long rlimit(unsigned int limit) +{ + return task_rlimit(current, limit); +} + +static inline unsigned long rlimit_max(unsigned int limit) +{ + return task_rlimit_max(current, limit); +} + #endif /* _LINUX_SCHED_SIGNAL_H */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index fc34bcf2329f..08d2cb605101 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/vmalloc.c b/mm/vmalloc.c index be93949b4885..b4024d688f38 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5d4208ad029e..85837ab90e89 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -27,6 +27,8 @@ #include #include #include +#include + #include #include #include diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index cc6b6f8651eb..e41f594a1e1d 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -11,6 +11,8 @@ #include #include +#include + #include #include diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 03dfcc6b7661..67a71d170bed 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -9,6 +9,8 @@ */ #include +#include + #include #include "smc.h" diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 5d1878732f46..c4ef9a4ec569 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -11,6 +11,8 @@ #include #include +#include + #include #include "smc.h" diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 6e73b28915ea..69a0013dd25c 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -15,6 +15,8 @@ #include #include #include +#include + #include #include "smc.h" -- cgit v1.2.3 From 6bfbaa51ed47774492d83d182a86068cc35aa4c6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Feb 2017 21:37:49 +0100 Subject: sched/headers, RCU: Move rcu_copy_process() from to kernel/fork.c Move rcu_copy_process() into kernel/fork.c, which is the only user of this inline function. 
This simplifies to the level that does not have to be included in it anymore - which change is done in a subsequent patch.

Acked-by: Linus Torvalds
Cc: Mike Galbraith
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar
---
 include/linux/sched/task.h | 15 ---------------
 kernel/fork.c | 15 +++++++++++++++
 2 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index e93638a03515..20ed9108f261 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -25,21 +25,6 @@ extern asmlinkage void schedule_tail(struct task_struct *prev);
 extern void init_idle(struct task_struct *idle, int cpu);
 extern void init_idle_bootup_task(struct task_struct *idle);
 
-static inline void rcu_copy_process(struct task_struct *p)
-{
-#ifdef CONFIG_PREEMPT_RCU
-	p->rcu_read_lock_nesting = 0;
-	p->rcu_read_unlock_special.s = 0;
-	p->rcu_blocked_node = NULL;
-	INIT_LIST_HEAD(&p->rcu_node_entry);
-#endif /* #ifdef CONFIG_PREEMPT_RCU */
-#ifdef CONFIG_TASKS_RCU
-	p->rcu_tasks_holdout = false;
-	INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
-	p->rcu_tasks_idle_cpu = -1;
-#endif /* #ifdef CONFIG_TASKS_RCU */
-}
-
 extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
 extern void sched_dead(struct task_struct *p);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 916e78004b8f..6c463c80e93d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1465,6 +1465,21 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
 	task->pids[type].pid = pid;
 }
 
+static inline void rcu_copy_process(struct task_struct *p)
+{
+#ifdef CONFIG_PREEMPT_RCU
+	p->rcu_read_lock_nesting = 0;
+	p->rcu_read_unlock_special.s = 0;
+	p->rcu_blocked_node = NULL;
+	INIT_LIST_HEAD(&p->rcu_node_entry);
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
+#ifdef CONFIG_TASKS_RCU
+	p->rcu_tasks_holdout = false;
+	INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
+	p->rcu_tasks_idle_cpu = -1;
+#endif /* #ifdef CONFIG_TASKS_RCU */
+}
+
 /*
  * This creates a new process as a copy of the old one,
  * but does not actually start it yet.
-- cgit v1.2.3

From 56cd697366b6d6f67acb6c58ac7f3b185d11ef07 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Mon, 6 Feb 2017 10:57:33 +0100
Subject: sched/headers: Move the task_lock()/unlock() APIs to

The task_lock()/task_unlock() APIs are not related to core scheduling,
they are task lifetime APIs, i.e. they belong into . Move them.

Acked-by: Linus Torvalds
Cc: Mike Galbraith
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 20 --------------------
 include/linux/sched/task.h | 20 ++++++++++++++++++++
 kernel/cgroup/cgroup-v1.c | 1 +
 kernel/cgroup/namespace.c | 2 +-
 4 files changed, 22 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4ab105a2a8f9..d481c129a822 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1526,26 +1526,6 @@ static inline unsigned long wait_task_inactive(struct task_struct *p,
 }
 #endif
 
-/*
- * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
- * subscriptions and synchronises with wait4(). Also used in procfs. Also
- * pins the final release of task.io_context. Also protects ->cpuset and
- * ->cgroup.subsys[]. And ->vfork_done.
- *
- * Nests both inside and outside of read_lock(&tasklist_lock).
- * It must not be nested with write_lock_irq(&tasklist_lock), - * neither inside nor outside. - */ -static inline void task_lock(struct task_struct *p) -{ - spin_lock(&p->alloc_lock); -} - -static inline void task_unlock(struct task_struct *p) -{ - spin_unlock(&p->alloc_lock); -} - /* set thread flags in other task's structures * - see asm/thread_info.h for TIF_xxxx flags available */ diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 1be049a18d1b..2be9fde588a7 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -91,4 +91,24 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) } #endif +/* + * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring + * subscriptions and synchronises with wait4(). Also used in procfs. Also + * pins the final release of task.io_context. Also protects ->cpuset and + * ->cgroup.subsys[]. And ->vfork_done. + * + * Nests both inside and outside of read_lock(&tasklist_lock). + * It must not be nested with write_lock_irq(&tasklist_lock), + * neither inside nor outside. + */ +static inline void task_lock(struct task_struct *p) +{ + spin_lock(&p->alloc_lock); +} + +static inline void task_unlock(struct task_struct *p) +{ + spin_unlock(&p->alloc_lock); +} + #endif /* _LINUX_SCHED_TASK_H */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 08d2cb605101..abc585858685 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index cff7ea62c38f..96d38dab6fb2 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -1,6 +1,6 @@ #include "cgroup-internal.h" -#include +#include #include #include #include -- cgit v1.2.3 From 1050b27c52f615bc0e772b3119881e5304ccde6b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 5 Feb 2017 11:48:36 +0100 Subject: sched/headers: Move cputime functionality from and into Move cputime related functionality out of , as most code that includes does not use that functionality. Move data types that are not included in task_struct directly to the signal definitions, into . Also merge the (small) existing header into . 
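
(For context: the thread-group totals involved in this move live in atomic64 counters, struct task_cputime_atomic, precisely so that the account_group_*_time() helpers can run on every timer tick without taking the siglock. Below is a minimal userspace sketch of that lock-free accumulation pattern; it is an illustrative analogue, not kernel code, and every name in it is invented.)

/* Userspace analogue of task_cputime_atomic / account_group_user_time():
 * many threads add to shared group counters without any lock. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct group_cputime {
	atomic_uint_fast64_t utime;	/* cf. cputime_atomic.utime */
	atomic_uint_fast64_t stime;	/* cf. cputime_atomic.stime */
};

static struct group_cputime gc;		/* zeroed, cf. INIT_CPUTIME_ATOMIC */

/* Called from each simulated "tick", cf. account_group_user_time(). */
static void account_user(uint64_t delta_ns)
{
	atomic_fetch_add_explicit(&gc.utime, delta_ns, memory_order_relaxed);
}

static void *worker(void *arg)
{
	(void)arg;
	for (int i = 0; i < 100000; i++)
		account_user(1);	/* pretend every tick adds 1 ns */
	return NULL;
}

int main(void)
{
	pthread_t t[4];

	for (int i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, worker, NULL);
	for (int i = 0; i < 4; i++)
		pthread_join(t[i], NULL);

	/* Reader side is a plain atomic load per field, no locking. */
	printf("group utime = %llu ns\n",
	       (unsigned long long)atomic_load(&gc.utime));
	return 0;
}

(Compile with cc -pthread. Relaxed ordering suffices here because each counter is an independent, monotonically growing sum.)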
Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/cputime.h | 13 --- include/linux/sched.h | 89 --------------------- include/linux/sched/cputime.h | 182 +++++++++++++++++++++++++++++++++++++++++- include/linux/sched/signal.h | 33 ++++++++ kernel/sched/stats.h | 111 -------------------------- 5 files changed, 214 insertions(+), 214 deletions(-) delete mode 100644 include/linux/cputime.h (limited to 'kernel') diff --git a/include/linux/cputime.h b/include/linux/cputime.h deleted file mode 100644 index a691dc4ddc13..000000000000 --- a/include/linux/cputime.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef __LINUX_CPUTIME_H -#define __LINUX_CPUTIME_H - -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -#include - -#ifndef cputime_to_nsecs -# define cputime_to_nsecs(__ct) \ - (cputime_to_usecs(__ct) * NSEC_PER_USEC) -#endif - -#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ -#endif /* __LINUX_CPUTIME_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index d481c129a822..2b43f55e4956 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -219,14 +219,6 @@ struct prev_cputime { #endif }; -static inline void prev_cputime_init(struct prev_cputime *prev) -{ -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - prev->utime = prev->stime = 0; - raw_spin_lock_init(&prev->lock); -#endif -} - /** * struct task_cputime - collected CPU time counts * @utime: time spent in user mode, in nanoseconds @@ -248,40 +240,6 @@ struct task_cputime { #define prof_exp stime #define sched_exp sum_exec_runtime -/* - * This is the atomic variant of task_cputime, which can be used for - * storing and updating task_cputime statistics without locking. - */ -struct task_cputime_atomic { - atomic64_t utime; - atomic64_t stime; - atomic64_t sum_exec_runtime; -}; - -#define INIT_CPUTIME_ATOMIC \ - (struct task_cputime_atomic) { \ - .utime = ATOMIC64_INIT(0), \ - .stime = ATOMIC64_INIT(0), \ - .sum_exec_runtime = ATOMIC64_INIT(0), \ - } - -/** - * struct thread_group_cputimer - thread group interval timer counts - * @cputime_atomic: atomic thread group interval timers. - * @running: true when there are timers running and - * @cputime_atomic receives updates. - * @checking_timer: true when a thread in the group is in the - * process of checking for thread group timers. - * - * This structure contains the version of task_cputime, above, that is - * used for thread group CPU timer calculations. 
- */ -struct thread_group_cputimer { - struct task_cputime_atomic cputime_atomic; - bool running; - bool checking_timer; -}; - #include #ifdef CONFIG_SCHED_INFO @@ -1234,44 +1192,6 @@ static inline void put_task_struct(struct task_struct *t) struct task_struct *task_rcu_dereference(struct task_struct **ptask); struct task_struct *try_get_task_struct(struct task_struct **ptask); -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -extern void task_cputime(struct task_struct *t, - u64 *utime, u64 *stime); -extern u64 task_gtime(struct task_struct *t); -#else -static inline void task_cputime(struct task_struct *t, - u64 *utime, u64 *stime) -{ - *utime = t->utime; - *stime = t->stime; -} - -static inline u64 task_gtime(struct task_struct *t) -{ - return t->gtime; -} -#endif - -#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME -static inline void task_cputime_scaled(struct task_struct *t, - u64 *utimescaled, - u64 *stimescaled) -{ - *utimescaled = t->utimescaled; - *stimescaled = t->stimescaled; -} -#else -static inline void task_cputime_scaled(struct task_struct *t, - u64 *utimescaled, - u64 *stimescaled) -{ - task_cputime(t, utimescaled, stimescaled); -} -#endif - -extern void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st); -extern void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st); - /* * Per process flags */ @@ -1395,9 +1315,6 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, #define cpu_relax_yield() cpu_relax() #endif -extern unsigned long long -task_sched_runtime(struct task_struct *task); - /* sched_exec is called by processes performing an exec */ #ifdef CONFIG_SMP extern void sched_exec(void); @@ -1629,12 +1546,6 @@ static __always_inline bool need_resched(void) return unlikely(tif_need_resched()); } -/* - * Thread group CPU time accounting. - */ -void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); -void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); - /* * Wrappers for p->thread_info->cpu access. No-op on UP. */ diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h index 6ed4fe43de28..4c5b9735c1ae 100644 --- a/include/linux/sched/cputime.h +++ b/include/linux/sched/cputime.h @@ -2,6 +2,186 @@ #define _LINUX_SCHED_CPUTIME_H #include -#include + +/* + * cputime accounting APIs: + */ + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +#include + +#ifndef cputime_to_nsecs +# define cputime_to_nsecs(__ct) \ + (cputime_to_usecs(__ct) * NSEC_PER_USEC) +#endif +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN +extern void task_cputime(struct task_struct *t, + u64 *utime, u64 *stime); +extern u64 task_gtime(struct task_struct *t); +#else +static inline void task_cputime(struct task_struct *t, + u64 *utime, u64 *stime) +{ + *utime = t->utime; + *stime = t->stime; +} + +static inline u64 task_gtime(struct task_struct *t) +{ + return t->gtime; +} +#endif + +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME +static inline void task_cputime_scaled(struct task_struct *t, + u64 *utimescaled, + u64 *stimescaled) +{ + *utimescaled = t->utimescaled; + *stimescaled = t->stimescaled; +} +#else +static inline void task_cputime_scaled(struct task_struct *t, + u64 *utimescaled, + u64 *stimescaled) +{ + task_cputime(t, utimescaled, stimescaled); +} +#endif + +extern void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st); +extern void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st); + + +/* + * Thread group CPU time accounting. 
+ */ +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); + + +/* + * The following are functions that support scheduler-internal time accounting. + * These functions are generally called at the timer tick. None of this depends + * on CONFIG_SCHEDSTATS. + */ + +/** + * get_running_cputimer - return &tsk->signal->cputimer if cputimer is running + * + * @tsk: Pointer to target task. + */ +#ifdef CONFIG_POSIX_TIMERS +static inline +struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + + /* Check if cputimer isn't running. This is accessed without locking. */ + if (!READ_ONCE(cputimer->running)) + return NULL; + + /* + * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime + * in __exit_signal(), we won't account to the signal struct further + * cputime consumed by that task, even though the task can still be + * ticking after __exit_signal(). + * + * In order to keep a consistent behaviour between thread group cputime + * and thread group cputimer accounting, lets also ignore the cputime + * elapsing after __exit_signal() in any thread group timer running. + * + * This makes sure that POSIX CPU clocks and timers are synchronized, so + * that a POSIX CPU timer won't expire while the corresponding POSIX CPU + * clock delta is behind the expiring timer value. + */ + if (unlikely(!tsk->sighand)) + return NULL; + + return cputimer; +} +#else +static inline +struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk) +{ + return NULL; +} +#endif + +/** + * account_group_user_time - Maintain utime for a thread group. + * + * @tsk: Pointer to task structure. + * @cputime: Time value by which to increment the utime field of the + * thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the utime field there. + */ +static inline void account_group_user_time(struct task_struct *tsk, + u64 cputime) +{ + struct thread_group_cputimer *cputimer = get_running_cputimer(tsk); + + if (!cputimer) + return; + + atomic64_add(cputime, &cputimer->cputime_atomic.utime); +} + +/** + * account_group_system_time - Maintain stime for a thread group. + * + * @tsk: Pointer to task structure. + * @cputime: Time value by which to increment the stime field of the + * thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the stime field there. + */ +static inline void account_group_system_time(struct task_struct *tsk, + u64 cputime) +{ + struct thread_group_cputimer *cputimer = get_running_cputimer(tsk); + + if (!cputimer) + return; + + atomic64_add(cputime, &cputimer->cputime_atomic.stime); +} + +/** + * account_group_exec_runtime - Maintain exec runtime for a thread group. + * + * @tsk: Pointer to task structure. + * @ns: Time value by which to increment the sum_exec_runtime field + * of the thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the sum_exec_runtime field there. 
+ */ +static inline void account_group_exec_runtime(struct task_struct *tsk, + unsigned long long ns) +{ + struct thread_group_cputimer *cputimer = get_running_cputimer(tsk); + + if (!cputimer) + return; + + atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime); +} + +static inline void prev_cputime_init(struct prev_cputime *prev) +{ +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + prev->utime = prev->stime = 0; + raw_spin_lock_init(&prev->lock); +#endif +} + +extern unsigned long long +task_sched_runtime(struct task_struct *task); #endif /* _LINUX_SCHED_CPUTIME_H */ diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index b3545fb4333a..2cf446704cd4 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -35,6 +35,39 @@ struct cpu_itimer { u64 incr; }; +/* + * This is the atomic variant of task_cputime, which can be used for + * storing and updating task_cputime statistics without locking. + */ +struct task_cputime_atomic { + atomic64_t utime; + atomic64_t stime; + atomic64_t sum_exec_runtime; +}; + +#define INIT_CPUTIME_ATOMIC \ + (struct task_cputime_atomic) { \ + .utime = ATOMIC64_INIT(0), \ + .stime = ATOMIC64_INIT(0), \ + .sum_exec_runtime = ATOMIC64_INIT(0), \ + } +/** + * struct thread_group_cputimer - thread group interval timer counts + * @cputime_atomic: atomic thread group interval timers. + * @running: true when there are timers running and + * @cputime_atomic receives updates. + * @checking_timer: true when a thread in the group is in the + * process of checking for thread group timers. + * + * This structure contains the version of task_cputime, above, that is + * used for thread group CPU timer calculations. + */ +struct thread_group_cputimer { + struct task_cputime_atomic cputime_atomic; + bool running; + bool checking_timer; +}; + /* * NOTE! "signal_struct" does not have its own * locking, because a shared signal_struct always diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index bf0da0aa0a14..d5710651043b 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -164,114 +164,3 @@ sched_info_switch(struct rq *rq, #define sched_info_arrive(rq, next) do { } while (0) #define sched_info_switch(rq, t, next) do { } while (0) #endif /* CONFIG_SCHED_INFO */ - -/* - * The following are functions that support scheduler-internal time accounting. - * These functions are generally called at the timer tick. None of this depends - * on CONFIG_SCHEDSTATS. - */ - -/** - * get_running_cputimer - return &tsk->signal->cputimer if cputimer is running - * - * @tsk: Pointer to target task. - */ -#ifdef CONFIG_POSIX_TIMERS -static inline -struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk) -{ - struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - - /* Check if cputimer isn't running. This is accessed without locking. */ - if (!READ_ONCE(cputimer->running)) - return NULL; - - /* - * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime - * in __exit_signal(), we won't account to the signal struct further - * cputime consumed by that task, even though the task can still be - * ticking after __exit_signal(). - * - * In order to keep a consistent behaviour between thread group cputime - * and thread group cputimer accounting, lets also ignore the cputime - * elapsing after __exit_signal() in any thread group timer running. 
- * - * This makes sure that POSIX CPU clocks and timers are synchronized, so - * that a POSIX CPU timer won't expire while the corresponding POSIX CPU - * clock delta is behind the expiring timer value. - */ - if (unlikely(!tsk->sighand)) - return NULL; - - return cputimer; -} -#else -static inline -struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk) -{ - return NULL; -} -#endif - -/** - * account_group_user_time - Maintain utime for a thread group. - * - * @tsk: Pointer to task structure. - * @cputime: Time value by which to increment the utime field of the - * thread_group_cputime structure. - * - * If thread group time is being maintained, get the structure for the - * running CPU and update the utime field there. - */ -static inline void account_group_user_time(struct task_struct *tsk, - u64 cputime) -{ - struct thread_group_cputimer *cputimer = get_running_cputimer(tsk); - - if (!cputimer) - return; - - atomic64_add(cputime, &cputimer->cputime_atomic.utime); -} - -/** - * account_group_system_time - Maintain stime for a thread group. - * - * @tsk: Pointer to task structure. - * @cputime: Time value by which to increment the stime field of the - * thread_group_cputime structure. - * - * If thread group time is being maintained, get the structure for the - * running CPU and update the stime field there. - */ -static inline void account_group_system_time(struct task_struct *tsk, - u64 cputime) -{ - struct thread_group_cputimer *cputimer = get_running_cputimer(tsk); - - if (!cputimer) - return; - - atomic64_add(cputime, &cputimer->cputime_atomic.stime); -} - -/** - * account_group_exec_runtime - Maintain exec runtime for a thread group. - * - * @tsk: Pointer to task structure. - * @ns: Time value by which to increment the sum_exec_runtime field - * of the thread_group_cputime structure. - * - * If thread group time is being maintained, get the structure for the - * running CPU and update the sum_exec_runtime field there. - */ -static inline void account_group_exec_runtime(struct task_struct *tsk, - unsigned long long ns) -{ - struct thread_group_cputimer *cputimer = get_running_cputimer(tsk); - - if (!cputimer) - return; - - atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime); -} -- cgit v1.2.3 From cd9c513be34ceaae8bf211474b91b6897574efdd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Feb 2017 18:51:58 +0100 Subject: sched/headers: Remove from This is a stray header that is not needed by anything in sched.h, so remove it. Update files that relied on the stray inclusion. This reduces the size of the header dependency graph. 
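
(For context: the rule driving this and the neighbouring cleanups is "include what you use": a translation unit must not depend on headers that only arrive transitively. A hypothetical two-file sketch, with all file and symbol names invented, of why the dependent files need updating alongside the header:)

/* shared.h: previously pulled in <string.h> only as a side effect.
 * Removing that stray include shrinks the dependency graph ... */
#ifndef SHARED_H
#define SHARED_H
struct item {
	char name[32];
};
#endif

/* consumer.c: ... but any file that silently relied on the transitive
 * <string.h> now fails to build until it names its own dependency. */
#include <string.h>	/* the fix: include what you use */
#include "shared.h"

void set_name(struct item *it, const char *s)
{
	strncpy(it->name, s, sizeof(it->name) - 1);
	it->name[sizeof(it->name) - 1] = '\0';
}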
Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 -- include/linux/user_namespace.h | 1 + kernel/utsname_sysctl.c | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index ac98255d00fb..b361f881fe44 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -226,8 +226,6 @@ struct task_cputime { #define prof_exp stime #define sched_exp sum_exec_runtime -#include - #ifdef CONFIG_SCHED_INFO struct sched_info { /* cumulative counters */ diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 08264641b502..faa9bfb827da 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index c8eac43267e9..233cd8fc6910 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -14,6 +14,7 @@ #include #include #include +#include #ifdef CONFIG_PROC_SYSCTL -- cgit v1.2.3 From 50ff9d130014f7b19541dbf607a615a72b75b778 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 5 Feb 2017 16:03:58 +0100 Subject: sched/headers: Remove from It's not used by any of the scheduler methods, but needs it to pick up STACK_END_MAGIC. Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/cgroup/cgroup-v1.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index bd0111a06aa2..559be4f1aee5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -22,7 +22,6 @@ #include #include #include -#include #include diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index abc585858685..56eba9caa632 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3
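
(Usage note on the task_lock()/task_unlock() pair moved above: callers bracket accesses to the fields the moved comment lists, such as ->comm or ->mm. The sketch below is kernel-style and illustrative only: it is not buildable standalone, and snapshot_task_comm() is an invented helper; the kernel's real accessor for ->comm is get_task_comm().)

/* Sketch: snapshot a field that alloc_lock protects. Illustrative only. */
static void snapshot_task_comm(struct task_struct *p, char *buf, size_t len)
{
	task_lock(p);			/* spin_lock(&p->alloc_lock) */
	strncpy(buf, p->comm, len - 1);
	buf[len - 1] = '\0';
	task_unlock(p);			/* spin_unlock(&p->alloc_lock) */
}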