summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile7
-rw-r--r--kernel/cgroup.c37
-rw-r--r--kernel/cpu.c44
-rw-r--r--kernel/cpu_pm.c16
-rw-r--r--kernel/cred.c9
-rw-r--r--kernel/exit.c13
-rw-r--r--kernel/fork.c28
-rw-r--r--kernel/irq/chip.c8
-rw-r--r--kernel/irq/internals.h3
-rw-r--r--kernel/irq/manage.c119
-rw-r--r--kernel/irq/migration.c13
-rw-r--r--kernel/kallsyms.c32
-rw-r--r--kernel/kcmp.c196
-rw-r--r--kernel/kmod.c30
-rw-r--r--kernel/lglock.c89
-rw-r--r--kernel/pid_namespace.c13
-rw-r--r--kernel/res_counter.c10
-rw-r--r--kernel/resource.c4
-rw-r--r--kernel/sched/core.c68
-rw-r--r--kernel/sched/fair.c42
-rw-r--r--kernel/sched/rt.c51
-rw-r--r--kernel/signal.c59
-rw-r--r--kernel/smpboot.c17
-rw-r--r--kernel/sys.c213
-rw-r--r--kernel/sys_ni.c3
-rw-r--r--kernel/task_work.c84
-rw-r--r--kernel/time/clockevents.c3
-rw-r--r--kernel/time/tick-sched.c19
-rw-r--r--kernel/trace/ring_buffer.c5
29 files changed, 949 insertions, 286 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c07f30fa9b7..c0cc67ad764c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -5,12 +5,12 @@
obj-y = fork.o exec_domain.o panic.o printk.o \
cpu.o exit.o itimer.o time.o softirq.o resource.o \
sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
- signal.o sys.o kmod.o workqueue.o pid.o \
+ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
notifier.o ksysfs.o cred.o \
- async.o range.o groups.o
+ async.o range.o groups.o lglock.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace debug files and internal ftrace files
@@ -25,6 +25,9 @@ endif
obj-y += sched/
obj-y += power/
+ifeq ($(CONFIG_CHECKPOINT_RESTORE),y)
+obj-$(CONFIG_X86) += kcmp.o
+endif
obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a0c6af34d500..72fcd3069a90 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -896,10 +896,13 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
mutex_unlock(&cgroup_mutex);
/*
- * Drop the active superblock reference that we took when we
- * created the cgroup
+ * We want to drop the active superblock reference from the
+ * cgroup creation after all the dentry refs are gone -
+ * kill_sb gets mighty unhappy otherwise. Mark
+ * dentry->d_fsdata with cgroup_diput() to tell
+ * cgroup_d_release() to call deactivate_super().
*/
- deactivate_super(cgrp->root->sb);
+ dentry->d_fsdata = cgroup_diput;
/*
* if we're getting rid of the cgroup, refcount should ensure
@@ -925,6 +928,13 @@ static int cgroup_delete(const struct dentry *d)
return 1;
}
+static void cgroup_d_release(struct dentry *dentry)
+{
+ /* did cgroup_diput() tell me to deactivate super? */
+ if (dentry->d_fsdata == cgroup_diput)
+ deactivate_super(dentry->d_sb);
+}
+
static void remove_dir(struct dentry *d)
{
struct dentry *parent = dget(d->d_parent);
@@ -1532,6 +1542,7 @@ static int cgroup_get_rootdir(struct super_block *sb)
static const struct dentry_operations cgroup_dops = {
.d_iput = cgroup_diput,
.d_delete = cgroup_delete,
+ .d_release = cgroup_d_release,
};
struct inode *inode =
@@ -5132,7 +5143,7 @@ EXPORT_SYMBOL_GPL(css_depth);
* @root: the css supporsed to be an ancestor of the child.
*
* Returns true if "root" is an ancestor of "child" in its hierarchy. Because
- * this function reads css->id, this use rcu_dereference() and rcu_read_lock().
+ * this function reads css->id, the caller must hold rcu_read_lock().
* But, considering usual usage, the csses should be valid objects after test.
* Assuming that the caller will do some action to the child if this returns
* returns true, the caller must take "child";s reference count.
@@ -5144,18 +5155,18 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
{
struct css_id *child_id;
struct css_id *root_id;
- bool ret = true;
- rcu_read_lock();
child_id = rcu_dereference(child->id);
+ if (!child_id)
+ return false;
root_id = rcu_dereference(root->id);
- if (!child_id
- || !root_id
- || (child_id->depth < root_id->depth)
- || (child_id->stack[root_id->depth] != root_id->id))
- ret = false;
- rcu_read_unlock();
- return ret;
+ if (!root_id)
+ return false;
+ if (child_id->depth < root_id->depth)
+ return false;
+ if (child_id->stack[root_id->depth] != root_id->id)
+ return false;
+ return true;
}
void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 0e6353cf147a..a4eb5227a19e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -10,7 +10,10 @@
#include <linux/sched.h>
#include <linux/unistd.h>
#include <linux/cpu.h>
+#include <linux/oom.h>
+#include <linux/rcupdate.h>
#include <linux/export.h>
+#include <linux/bug.h>
#include <linux/kthread.h>
#include <linux/stop_machine.h>
#include <linux/mutex.h>
@@ -173,6 +176,47 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_cpu_notifier);
+/**
+ * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
+ * @cpu: a CPU id
+ *
+ * This function walks all processes, finds a valid mm struct for each one and
+ * then clears a corresponding bit in mm's cpumask. While this all sounds
+ * trivial, there are various non-obvious corner cases, which this function
+ * tries to solve in a safe manner.
+ *
+ * Also note that the function uses a somewhat relaxed locking scheme, so it may
+ * be called only for an already offlined CPU.
+ */
+void clear_tasks_mm_cpumask(int cpu)
+{
+ struct task_struct *p;
+
+ /*
+ * This function is called after the cpu is taken down and marked
+ * offline, so its not like new tasks will ever get this cpu set in
+ * their mm mask. -- Peter Zijlstra
+ * Thus, we may use rcu_read_lock() here, instead of grabbing
+ * full-fledged tasklist_lock.
+ */
+ WARN_ON(cpu_online(cpu));
+ rcu_read_lock();
+ for_each_process(p) {
+ struct task_struct *t;
+
+ /*
+ * Main thread might exit, but other threads may still have
+ * a valid mm. Find one.
+ */
+ t = find_lock_task_mm(p);
+ if (!t)
+ continue;
+ cpumask_clear_cpu(cpu, mm_cpumask(t->mm));
+ task_unlock(t);
+ }
+ rcu_read_unlock();
+}
+
static inline void check_for_tasks(int cpu)
{
struct task_struct *p;
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 249152e15308..9656a3c36503 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -81,7 +81,7 @@ int cpu_pm_unregister_notifier(struct notifier_block *nb)
EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
/**
- * cpm_pm_enter - CPU low power entry notifier
+ * cpu_pm_enter - CPU low power entry notifier
*
* Notifies listeners that a single CPU is entering a low power state that may
* cause some blocks in the same power domain as the cpu to reset.
@@ -89,7 +89,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
* Must be called on the affected CPU with interrupts disabled. Platform is
* responsible for ensuring that cpu_pm_enter is not called twice on the same
* CPU before cpu_pm_exit is called. Notified drivers can include VFP
- * co-processor, interrupt controller and it's PM extensions, local CPU
+ * co-processor, interrupt controller and its PM extensions, local CPU
* timers context save/restore which shouldn't be interrupted. Hence it
* must be called with interrupts disabled.
*
@@ -115,13 +115,13 @@ int cpu_pm_enter(void)
EXPORT_SYMBOL_GPL(cpu_pm_enter);
/**
- * cpm_pm_exit - CPU low power exit notifier
+ * cpu_pm_exit - CPU low power exit notifier
*
* Notifies listeners that a single CPU is exiting a low power state that may
* have caused some blocks in the same power domain as the cpu to reset.
*
* Notified drivers can include VFP co-processor, interrupt controller
- * and it's PM extensions, local CPU timers context save/restore which
+ * and its PM extensions, local CPU timers context save/restore which
* shouldn't be interrupted. Hence it must be called with interrupts disabled.
*
* Return conditions are same as __raw_notifier_call_chain.
@@ -139,7 +139,7 @@ int cpu_pm_exit(void)
EXPORT_SYMBOL_GPL(cpu_pm_exit);
/**
- * cpm_cluster_pm_enter - CPU cluster low power entry notifier
+ * cpu_cluster_pm_enter - CPU cluster low power entry notifier
*
* Notifies listeners that all cpus in a power domain are entering a low power
* state that may cause some blocks in the same power domain to reset.
@@ -147,7 +147,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit);
* Must be called after cpu_pm_enter has been called on all cpus in the power
* domain, and before cpu_pm_exit has been called on any cpu in the power
* domain. Notified drivers can include VFP co-processor, interrupt controller
- * and it's PM extensions, local CPU timers context save/restore which
+ * and its PM extensions, local CPU timers context save/restore which
* shouldn't be interrupted. Hence it must be called with interrupts disabled.
*
* Must be called with interrupts disabled.
@@ -174,7 +174,7 @@ int cpu_cluster_pm_enter(void)
EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
/**
- * cpm_cluster_pm_exit - CPU cluster low power exit notifier
+ * cpu_cluster_pm_exit - CPU cluster low power exit notifier
*
* Notifies listeners that all cpus in a power domain are exiting form a
* low power state that may have caused some blocks in the same power domain
@@ -183,7 +183,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
* Must be called after cpu_pm_exit has been called on all cpus in the power
* domain, and before cpu_pm_exit has been called on any cpu in the power
* domain. Notified drivers can include VFP co-processor, interrupt controller
- * and it's PM extensions, local CPU timers context save/restore which
+ * and its PM extensions, local CPU timers context save/restore which
* shouldn't be interrupted. Hence it must be called with interrupts disabled.
*
* Return conditions are same as __raw_notifier_call_chain.
diff --git a/kernel/cred.c b/kernel/cred.c
index 430557ea488f..de728ac50d82 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -207,13 +207,6 @@ void exit_creds(struct task_struct *tsk)
validate_creds(cred);
alter_cred_subscribers(cred, -1);
put_cred(cred);
-
- cred = (struct cred *) tsk->replacement_session_keyring;
- if (cred) {
- tsk->replacement_session_keyring = NULL;
- validate_creds(cred);
- put_cred(cred);
- }
}
/**
@@ -396,8 +389,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
struct cred *new;
int ret;
- p->replacement_session_keyring = NULL;
-
if (
#ifdef CONFIG_KEYS
!p->cred->thread_keyring &&
diff --git a/kernel/exit.c b/kernel/exit.c
index 910a0716e17a..34867cc5b42a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -884,9 +884,9 @@ static void check_stack_usage(void)
spin_lock(&low_water_lock);
if (free < lowest_to_date) {
- printk(KERN_WARNING "%s used greatest stack depth: %lu bytes "
- "left\n",
- current->comm, free);
+ printk(KERN_WARNING "%s (%d) used greatest stack depth: "
+ "%lu bytes left\n",
+ current->comm, task_pid_nr(current), free);
lowest_to_date = free;
}
spin_unlock(&low_water_lock);
@@ -946,12 +946,13 @@ void do_exit(long code)
exit_signals(tsk); /* sets PF_EXITING */
/*
* tsk->flags are checked in the futex code to protect against
- * an exiting task cleaning up the robust pi futexes.
+ * an exiting task cleaning up the robust pi futexes, and in
+ * task_work_add() to avoid the race with exit_task_work().
*/
smp_mb();
raw_spin_unlock_wait(&tsk->pi_lock);
- exit_irq_thread();
+ exit_task_work(tsk);
if (unlikely(in_atomic()))
printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
@@ -1214,7 +1215,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
unsigned long state;
int retval, status, traced;
pid_t pid = task_pid_vnr(p);
- uid_t uid = from_kuid_munged(current_user_ns(), __task_cred(p)->uid);
+ uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
struct siginfo __user *infop;
if (!likely(wo->wo_flags & WEXITED))
diff --git a/kernel/fork.c b/kernel/fork.c
index 47b4e4f379f9..ab5211b9e622 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -386,7 +386,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
}
charge = 0;
if (mpnt->vm_flags & VM_ACCOUNT) {
- unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
+ unsigned long len;
+ len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
goto fail_nomem;
charge = len;
@@ -614,7 +615,6 @@ void mmput(struct mm_struct *mm)
list_del(&mm->mmlist);
spin_unlock(&mmlist_lock);
}
- put_swap_token(mm);
if (mm->binfmt)
module_put(mm->binfmt->module);
mmdrop(mm);
@@ -787,9 +787,6 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
/* Get rid of any cached register state */
deactivate_mm(tsk, mm);
- if (tsk->vfork_done)
- complete_vfork_done(tsk);
-
/*
* If we're exiting normally, clear a user-space tid field if
* requested. We leave this alone when dying by signal, to leave
@@ -810,6 +807,13 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
}
tsk->clear_child_tid = NULL;
}
+
+ /*
+ * All done, finally we can wake up parent and return this mm to him.
+ * Also kthread_stop() uses this completion for synchronization.
+ */
+ if (tsk->vfork_done)
+ complete_vfork_done(tsk);
}
/*
@@ -831,10 +835,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
memcpy(mm, oldmm, sizeof(*mm));
mm_init_cpumask(mm);
- /* Initializing for Swap token stuff */
- mm->token_priority = 0;
- mm->last_interval = 0;
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
mm->pmd_huge_pte = NULL;
#endif
@@ -913,10 +913,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
goto fail_nomem;
good_mm:
- /* Initializing for Swap token stuff */
- mm->token_priority = 0;
- mm->last_interval = 0;
-
tsk->mm = mm;
tsk->active_mm = mm;
return 0;
@@ -984,9 +980,8 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
* Share io context with parent, if CLONE_IO is set
*/
if (clone_flags & CLONE_IO) {
- tsk->io_context = ioc_task_link(ioc);
- if (unlikely(!tsk->io_context))
- return -ENOMEM;
+ ioc_task_link(ioc);
+ tsk->io_context = ioc;
} else if (ioprio_valid(ioc->ioprio)) {
new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
if (unlikely(!new_ioc))
@@ -1420,6 +1415,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
*/
p->group_leader = p;
INIT_LIST_HEAD(&p->thread_group);
+ INIT_HLIST_HEAD(&p->task_works);
/* Now that the task is set up, run cgroup callbacks if
* necessary. We need to run them before the task is visible
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index fc275e4f629b..eebd6d5cfb44 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -275,8 +275,10 @@ void handle_nested_irq(unsigned int irq)
kstat_incr_irqs_this_cpu(irq, desc);
action = desc->action;
- if (unlikely(!action || irqd_irq_disabled(&desc->irq_data)))
+ if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
goto out_unlock;
+ }
irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
raw_spin_unlock_irq(&desc->lock);
@@ -324,8 +326,10 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
goto out_unlock;
+ }
handle_irq_event(desc);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 8e5c56b3b7d9..001fa5bab490 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -101,6 +101,9 @@ extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
extern void irq_set_thread_affinity(struct irq_desc *desc);
+extern int irq_do_set_affinity(struct irq_data *data,
+ const struct cpumask *dest, bool force);
+
/* Inline functions for support of irq chips on slow busses */
static inline void chip_bus_lock(struct irq_desc *desc)
{
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index bb32326afe87..8c548232ba39 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -7,6 +7,8 @@
* This file contains driver APIs to the irq subsystem.
*/
+#define pr_fmt(fmt) "genirq: " fmt
+
#include <linux/irq.h>
#include <linux/kthread.h>
#include <linux/module.h>
@@ -14,6 +16,7 @@
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/task_work.h>
#include "internals.h"
@@ -139,6 +142,25 @@ static inline void
irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
#endif
+int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
+{
+ struct irq_desc *desc = irq_data_to_desc(data);
+ struct irq_chip *chip = irq_data_get_irq_chip(data);
+ int ret;
+
+ ret = chip->irq_set_affinity(data, mask, false);
+ switch (ret) {
+ case IRQ_SET_MASK_OK:
+ cpumask_copy(data->affinity, mask);
+ case IRQ_SET_MASK_OK_NOCOPY:
+ irq_set_thread_affinity(desc);
+ ret = 0;
+ }
+
+ return ret;
+}
+
int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
{
struct irq_chip *chip = irq_data_get_irq_chip(data);
@@ -149,14 +171,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
return -EINVAL;
if (irq_can_move_pcntxt(data)) {
- ret = chip->irq_set_affinity(data, mask, false);
- switch (ret) {
- case IRQ_SET_MASK_OK:
- cpumask_copy(data->affinity, mask);
- case IRQ_SET_MASK_OK_NOCOPY:
- irq_set_thread_affinity(desc);
- ret = 0;
- }
+ ret = irq_do_set_affinity(data, mask, false);
} else {
irqd_set_move_pending(data);
irq_copy_pending(desc, mask);
@@ -280,9 +295,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
static int
setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
{
- struct irq_chip *chip = irq_desc_get_chip(desc);
struct cpumask *set = irq_default_affinity;
- int ret, node = desc->irq_data.node;
+ int node = desc->irq_data.node;
/* Excludes PER_CPU and NO_BALANCE interrupts */
if (!irq_can_set_affinity(irq))
@@ -308,13 +322,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
if (cpumask_intersects(mask, nodemask))
cpumask_and(mask, mask, nodemask);
}
- ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
- switch (ret) {
- case IRQ_SET_MASK_OK:
- cpumask_copy(desc->irq_data.affinity, mask);
- case IRQ_SET_MASK_OK_NOCOPY:
- irq_set_thread_affinity(desc);
- }
+ irq_do_set_affinity(&desc->irq_data, mask, false);
return 0;
}
#else
@@ -565,7 +573,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
* IRQF_TRIGGER_* but the PIC does not support multiple
* flow-types?
*/
- pr_debug("genirq: No set_type function for IRQ %d (%s)\n", irq,
+ pr_debug("No set_type function for IRQ %d (%s)\n", irq,
chip ? (chip->name ? : "unknown") : "unknown");
return 0;
}
@@ -600,7 +608,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
ret = 0;
break;
default:
- pr_err("genirq: Setting trigger mode %lu for irq %u failed (%pF)\n",
+ pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n",
flags, irq, chip->irq_set_type);
}
if (unmask)
@@ -773,11 +781,39 @@ static void wake_threads_waitq(struct irq_desc *desc)
wake_up(&desc->wait_for_threads);
}
+static void irq_thread_dtor(struct task_work *unused)
+{
+ struct task_struct *tsk = current;
+ struct irq_desc *desc;
+ struct irqaction *action;
+
+ if (WARN_ON_ONCE(!(current->flags & PF_EXITING)))
+ return;
+
+ action = kthread_data(tsk);
+
+ pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
+ tsk->comm ? tsk->comm : "", tsk->pid, action->irq);
+
+
+ desc = irq_to_desc(action->irq);
+ /*
+ * If IRQTF_RUNTHREAD is set, we need to decrement
+ * desc->threads_active and wake possible waiters.
+ */
+ if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags))
+ wake_threads_waitq(desc);
+
+ /* Prevent a stale desc->threads_oneshot */
+ irq_finalize_oneshot(desc, action);
+}
+
/*
* Interrupt handler thread
*/
static int irq_thread(void *data)
{
+ struct task_work on_exit_work;
static const struct sched_param param = {
.sched_priority = MAX_USER_RT_PRIO/2,
};
@@ -793,7 +829,9 @@ static int irq_thread(void *data)
handler_fn = irq_thread_fn;
sched_setscheduler(current, SCHED_FIFO, &param);
- current->irq_thread = 1;
+
+ init_task_work(&on_exit_work, irq_thread_dtor, NULL);
+ task_work_add(current, &on_exit_work, false);
while (!irq_wait_for_interrupt(action)) {
irqreturn_t action_ret;
@@ -815,44 +853,11 @@ static int irq_thread(void *data)
* cannot touch the oneshot mask at this point anymore as
* __setup_irq() might have given out currents thread_mask
* again.
- *
- * Clear irq_thread. Otherwise exit_irq_thread() would make
- * fuzz about an active irq thread going into nirvana.
*/
- current->irq_thread = 0;
+ task_work_cancel(current, irq_thread_dtor);
return 0;
}
-/*
- * Called from do_exit()
- */
-void exit_irq_thread(void)
-{
- struct task_struct *tsk = current;
- struct irq_desc *desc;
- struct irqaction *action;
-
- if (!tsk->irq_thread)
- return;
-
- action = kthread_data(tsk);
-
- pr_err("genirq: exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
- tsk->comm ? tsk->comm : "", tsk->pid, action->irq);
-
- desc = irq_to_desc(action->irq);
-
- /*
- * If IRQTF_RUNTHREAD is set, we need to decrement
- * desc->threads_active and wake possible waiters.
- */
- if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags))
- wake_threads_waitq(desc);
-
- /* Prevent a stale desc->threads_oneshot */
- irq_finalize_oneshot(desc, action);
-}
-
static void irq_setup_forced_threading(struct irqaction *new)
{
if (!force_irqthreads)
@@ -1044,7 +1049,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* has. The type flags are unreliable as the
* underlying chip implementation can override them.
*/
- pr_err("genirq: Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n",
+ pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n",
irq);
ret = -EINVAL;
goto out_mask;
@@ -1095,7 +1100,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
if (nmsk != omsk)
/* hope the handler works with current trigger mode */
- pr_warning("genirq: irq %d uses trigger mode %u; requested %u\n",
+ pr_warning("irq %d uses trigger mode %u; requested %u\n",
irq, nmsk, omsk);
}
@@ -1133,7 +1138,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
mismatch:
if (!(new->flags & IRQF_PROBE_SHARED)) {
- pr_err("genirq: Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n",
+ pr_err("Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n",
irq, new->flags, new->name, old->flags, old->name);
#ifdef CONFIG_DEBUG_SHIRQ
dump_stack();
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index c3c89751b327..ca3f4aaff707 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -42,17 +42,8 @@ void irq_move_masked_irq(struct irq_data *idata)
* For correct operation this depends on the caller
* masking the irqs.
*/
- if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
- < nr_cpu_ids)) {
- int ret = chip->irq_set_affinity(&desc->irq_data,
- desc->pending_mask, false);
- switch (ret) {
- case IRQ_SET_MASK_OK:
- cpumask_copy(desc->irq_data.affinity, desc->pending_mask);
- case IRQ_SET_MASK_OK_NOCOPY:
- irq_set_thread_affinity(desc);
- }
- }
+ if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids)
+ irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false);
cpumask_clear(desc->pending_mask);
}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 079f1d39a8b8..2169feeba529 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -343,7 +343,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
/* Look up a kernel symbol and return it in a text buffer. */
static int __sprint_symbol(char *buffer, unsigned long address,
- int symbol_offset)
+ int symbol_offset, int add_offset)
{
char *modname;
const char *name;
@@ -358,13 +358,13 @@ static int __sprint_symbol(char *buffer, unsigned long address,
if (name != buffer)
strcpy(buffer, name);
len = strlen(buffer);
- buffer += len;
offset -= symbol_offset;
+ if (add_offset)
+ len += sprintf(buffer + len, "+%#lx/%#lx", offset, size);
+
if (modname)
- len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname);
- else
- len += sprintf(buffer, "+%#lx/%#lx", offset, size);
+ len += sprintf(buffer + len, " [%s]", modname);
return len;
}
@@ -382,12 +382,28 @@ static int __sprint_symbol(char *buffer, unsigned long address,
*/
int sprint_symbol(char *buffer, unsigned long address)
{
- return __sprint_symbol(buffer, address, 0);
+ return __sprint_symbol(buffer, address, 0, 1);
}
-
EXPORT_SYMBOL_GPL(sprint_symbol);
/**
+ * sprint_symbol_no_offset - Look up a kernel symbol and return it in a text buffer
+ * @buffer: buffer to be stored
+ * @address: address to lookup
+ *
+ * This function looks up a kernel symbol with @address and stores its name
+ * and module name to @buffer if possible. If no symbol was found, just saves
+ * its @address as is.
+ *
+ * This function returns the number of bytes stored in @buffer.
+ */
+int sprint_symbol_no_offset(char *buffer, unsigned long address)
+{
+ return __sprint_symbol(buffer, address, 0, 0);
+}
+EXPORT_SYMBOL_GPL(sprint_symbol_no_offset);
+
+/**
* sprint_backtrace - Look up a backtrace symbol and return it in a text buffer
* @buffer: buffer to be stored
* @address: address to lookup
@@ -403,7 +419,7 @@ EXPORT_SYMBOL_GPL(sprint_symbol);
*/
int sprint_backtrace(char *buffer, unsigned long address)
{
- return __sprint_symbol(buffer, address, -1);
+ return __sprint_symbol(buffer, address, -1, 1);
}
/* Look up a kernel symbol and print it to the kernel messages. */
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
new file mode 100644
index 000000000000..30b7b225306c
--- /dev/null
+++ b/kernel/kcmp.c
@@ -0,0 +1,196 @@
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
+#include <linux/fdtable.h>
+#include <linux/string.h>
+#include <linux/random.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/cache.h>
+#include <linux/bug.h>
+#include <linux/err.h>
+#include <linux/kcmp.h>
+
+#include <asm/unistd.h>
+
+/*
+ * We don't expose the real in-memory order of objects for security reasons.
+ * But still the comparison results should be suitable for sorting. So we
+ * obfuscate kernel pointers values and compare the production instead.
+ *
+ * The obfuscation is done in two steps. First we xor the kernel pointer with
+ * a random value, which puts pointer into a new position in a reordered space.
+ * Secondly we multiply the xor production with a large odd random number to
+ * permute its bits even more (the odd multiplier guarantees that the product
+ * is unique ever after the high bits are truncated, since any odd number is
+ * relative prime to 2^n).
+ *
+ * Note also that the obfuscation itself is invisible to userspace and if needed
+ * it can be changed to an alternate scheme.
+ */
+static unsigned long cookies[KCMP_TYPES][2] __read_mostly;
+
+static long kptr_obfuscate(long v, int type)
+{
+ return (v ^ cookies[type][0]) * cookies[type][1];
+}
+
+/*
+ * 0 - equal, i.e. v1 = v2
+ * 1 - less than, i.e. v1 < v2
+ * 2 - greater than, i.e. v1 > v2
+ * 3 - not equal but ordering unavailable (reserved for future)
+ */
+static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type)
+{
+ long ret;
+
+ ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type);
+
+ return (ret < 0) | ((ret > 0) << 1);
+}
+
+/* The caller must have pinned the task */
+static struct file *
+get_file_raw_ptr(struct task_struct *task, unsigned int idx)
+{
+ struct file *file = NULL;
+
+ task_lock(task);
+ rcu_read_lock();
+
+ if (task->files)
+ file = fcheck_files(task->files, idx);
+
+ rcu_read_unlock();
+ task_unlock(task);
+
+ return file;
+}
+
+static void kcmp_unlock(struct mutex *m1, struct mutex *m2)
+{
+ if (likely(m2 != m1))
+ mutex_unlock(m2);
+ mutex_unlock(m1);
+}
+
+static int kcmp_lock(struct mutex *m1, struct mutex *m2)
+{
+ int err;
+
+ if (m2 > m1)
+ swap(m1, m2);
+
+ err = mutex_lock_killable(m1);
+ if (!err && likely(m1 != m2)) {
+ err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING);
+ if (err)
+ mutex_unlock(m1);
+ }
+
+ return err;
+}
+
+SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
+ unsigned long, idx1, unsigned long, idx2)
+{
+ struct task_struct *task1, *task2;
+ int ret;
+
+ rcu_read_lock();
+
+ /*
+ * Tasks are looked up in caller's PID namespace only.
+ */
+ task1 = find_task_by_vpid(pid1);
+ task2 = find_task_by_vpid(pid2);
+ if (!task1 || !task2)
+ goto err_no_task;
+
+ get_task_struct(task1);
+ get_task_struct(task2);
+
+ rcu_read_unlock();
+
+ /*
+ * One should have enough rights to inspect task details.
+ */
+ ret = kcmp_lock(&task1->signal->cred_guard_mutex,
+ &task2->signal->cred_guard_mutex);
+ if (ret)
+ goto err;
+ if (!ptrace_may_access(task1, PTRACE_MODE_READ) ||
+ !ptrace_may_access(task2, PTRACE_MODE_READ)) {
+ ret = -EPERM;
+ goto err_unlock;
+ }
+
+ switch (type) {
+ case KCMP_FILE: {
+ struct file *filp1, *filp2;
+
+ filp1 = get_file_raw_ptr(task1, idx1);
+ filp2 = get_file_raw_ptr(task2, idx2);
+
+ if (filp1 && filp2)
+ ret = kcmp_ptr(filp1, filp2, KCMP_FILE);
+ else
+ ret = -EBADF;
+ break;
+ }
+ case KCMP_VM:
+ ret = kcmp_ptr(task1->mm, task2->mm, KCMP_VM);
+ break;
+ case KCMP_FILES:
+ ret = kcmp_ptr(task1->files, task2->files, KCMP_FILES);
+ break;
+ case KCMP_FS:
+ ret = kcmp_ptr(task1->fs, task2->fs, KCMP_FS);
+ break;
+ case KCMP_SIGHAND:
+ ret = kcmp_ptr(task1->sighand, task2->sighand, KCMP_SIGHAND);
+ break;
+ case KCMP_IO:
+ ret = kcmp_ptr(task1->io_context, task2->io_context, KCMP_IO);
+ break;
+ case KCMP_SYSVSEM:
+#ifdef CONFIG_SYSVIPC
+ ret = kcmp_ptr(task1->sysvsem.undo_list,
+ task2->sysvsem.undo_list,
+ KCMP_SYSVSEM);
+#else
+ ret = -EOPNOTSUPP;
+#endif
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+err_unlock:
+ kcmp_unlock(&task1->signal->cred_guard_mutex,
+ &task2->signal->cred_guard_mutex);
+err:
+ put_task_struct(task1);
+ put_task_struct(task2);
+
+ return ret;
+
+err_no_task:
+ rcu_read_unlock();
+ return -ESRCH;
+}
+
+static __init int kcmp_cookies_init(void)
+{
+ int i;
+
+ get_random_bytes(cookies, sizeof(cookies));
+
+ for (i = 0; i < KCMP_TYPES; i++)
+ cookies[i][1] |= (~(~0UL >> 1) | 1);
+
+ return 0;
+}
+arch_initcall(kcmp_cookies_init);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 05698a7415fe..ff2c7cb86d77 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -221,13 +221,12 @@ fail:
return 0;
}
-void call_usermodehelper_freeinfo(struct subprocess_info *info)
+static void call_usermodehelper_freeinfo(struct subprocess_info *info)
{
if (info->cleanup)
(*info->cleanup)(info);
kfree(info);
}
-EXPORT_SYMBOL(call_usermodehelper_freeinfo);
static void umh_complete(struct subprocess_info *sub_info)
{
@@ -410,7 +409,7 @@ EXPORT_SYMBOL_GPL(usermodehelper_read_unlock);
/**
* __usermodehelper_set_disable_depth - Modify usermodehelper_disabled.
- * depth: New value to assign to usermodehelper_disabled.
+ * @depth: New value to assign to usermodehelper_disabled.
*
* Change the value of usermodehelper_disabled (under umhelper_sem locked for
* writing) and wakeup tasks waiting for it to change.
@@ -479,6 +478,7 @@ static void helper_unlock(void)
* structure. This should be passed to call_usermodehelper_exec to
* exec the process and free the structure.
*/
+static
struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
char **envp, gfp_t gfp_mask)
{
@@ -494,7 +494,6 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
out:
return sub_info;
}
-EXPORT_SYMBOL(call_usermodehelper_setup);
/**
* call_usermodehelper_setfns - set a cleanup/init function
@@ -512,6 +511,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
* Function must be runnable in either a process context or the
* context in which call_usermodehelper_exec is called.
*/
+static
void call_usermodehelper_setfns(struct subprocess_info *info,
int (*init)(struct subprocess_info *info, struct cred *new),
void (*cleanup)(struct subprocess_info *info),
@@ -521,7 +521,6 @@ void call_usermodehelper_setfns(struct subprocess_info *info,
info->init = init;
info->data = data;
}
-EXPORT_SYMBOL(call_usermodehelper_setfns);
/**
* call_usermodehelper_exec - start a usermode application
@@ -535,6 +534,7 @@ EXPORT_SYMBOL(call_usermodehelper_setfns);
* asynchronously if wait is not set, and runs as a child of keventd.
* (ie. it runs with full root capabilities).
*/
+static
int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
{
DECLARE_COMPLETION_ONSTACK(done);
@@ -576,7 +576,25 @@ unlock:
helper_unlock();
return retval;
}
-EXPORT_SYMBOL(call_usermodehelper_exec);
+
+int call_usermodehelper_fns(
+ char *path, char **argv, char **envp, int wait,
+ int (*init)(struct subprocess_info *info, struct cred *new),
+ void (*cleanup)(struct subprocess_info *), void *data)
+{
+ struct subprocess_info *info;
+ gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
+
+ info = call_usermodehelper_setup(path, argv, envp, gfp_mask);
+
+ if (info == NULL)
+ return -ENOMEM;
+
+ call_usermodehelper_setfns(info, init, cleanup, data);
+
+ return call_usermodehelper_exec(info, wait);
+}
+EXPORT_SYMBOL(call_usermodehelper_fns);
static int proc_cap_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
diff --git a/kernel/lglock.c b/kernel/lglock.c
new file mode 100644
index 000000000000..6535a667a5a7
--- /dev/null
+++ b/kernel/lglock.c
@@ -0,0 +1,89 @@
+/* See include/linux/lglock.h for description */
+#include <linux/module.h>
+#include <linux/lglock.h>
+#include <linux/cpu.h>
+#include <linux/string.h>
+
+/*
+ * Note there is no uninit, so lglocks cannot be defined in
+ * modules (but it's fine to use them from there)
+ * Could be added though, just undo lg_lock_init
+ */
+
+void lg_lock_init(struct lglock *lg, char *name)
+{
+ LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
+}
+EXPORT_SYMBOL(lg_lock_init);
+
+void lg_local_lock(struct lglock *lg)
+{
+ arch_spinlock_t *lock;
+
+ preempt_disable();
+ rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
+ lock = this_cpu_ptr(lg->lock);
+ arch_spin_lock(lock);
+}
+EXPORT_SYMBOL(lg_local_lock);
+
+void lg_local_unlock(struct lglock *lg)
+{
+ arch_spinlock_t *lock;
+
+ rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+ lock = this_cpu_ptr(lg->lock);
+ arch_spin_unlock(lock);
+ preempt_enable();
+}
+EXPORT_SYMBOL(lg_local_unlock);
+
+void lg_local_lock_cpu(struct lglock *lg, int cpu)
+{
+ arch_spinlock_t *lock;
+
+ preempt_disable();
+ rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
+ lock = per_cpu_ptr(lg->lock, cpu);
+ arch_spin_lock(lock);
+}
+EXPORT_SYMBOL(lg_local_lock_cpu);
+
+void lg_local_unlock_cpu(struct lglock *lg, int cpu)
+{
+ arch_spinlock_t *lock;
+
+ rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+ lock = per_cpu_ptr(lg->lock, cpu);
+ arch_spin_unlock(lock);
+ preempt_enable();
+}
+EXPORT_SYMBOL(lg_local_unlock_cpu);
+
+void lg_global_lock(struct lglock *lg)
+{
+ int i;
+
+ preempt_disable();
+ rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_);
+ for_each_possible_cpu(i) {
+ arch_spinlock_t *lock;
+ lock = per_cpu_ptr(lg->lock, i);
+ arch_spin_lock(lock);
+ }
+}
+EXPORT_SYMBOL(lg_global_lock);
+
+void lg_global_unlock(struct lglock *lg)
+{
+ int i;
+
+ rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+ for_each_possible_cpu(i) {
+ arch_spinlock_t *lock;
+ lock = per_cpu_ptr(lg->lock, i);
+ arch_spin_unlock(lock);
+ }
+ preempt_enable();
+}
+EXPORT_SYMBOL(lg_global_unlock);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 57bc1fd35b3c..16b20e38c4a1 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -149,7 +149,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
{
int nr;
int rc;
- struct task_struct *task;
+ struct task_struct *task, *me = current;
+
+ /* Ignore SIGCHLD causing any terminated children to autoreap */
+ spin_lock_irq(&me->sighand->siglock);
+ me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
+ spin_unlock_irq(&me->sighand->siglock);
/*
* The last thread in the cgroup-init thread group is terminating.
@@ -191,6 +196,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
return;
}
+#ifdef CONFIG_CHECKPOINT_RESTORE
static int pid_ns_ctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -218,8 +224,8 @@ static struct ctl_table pid_ns_ctl_table[] = {
},
{ }
};
-
static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
+#endif /* CONFIG_CHECKPOINT_RESTORE */
int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
{
@@ -253,7 +259,10 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
static __init int pid_namespaces_init(void)
{
pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
register_sysctl_paths(kern_path, pid_ns_ctl_table);
+#endif
return 0;
}
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index bebe2b170d49..ad581aa2369a 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -94,13 +94,15 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
counter->usage -= val;
}
-void res_counter_uncharge(struct res_counter *counter, unsigned long val)
+void res_counter_uncharge_until(struct res_counter *counter,
+ struct res_counter *top,
+ unsigned long val)
{
unsigned long flags;
struct res_counter *c;
local_irq_save(flags);
- for (c = counter; c != NULL; c = c->parent) {
+ for (c = counter; c != top; c = c->parent) {
spin_lock(&c->lock);
res_counter_uncharge_locked(c, val);
spin_unlock(&c->lock);
@@ -108,6 +110,10 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val)
local_irq_restore(flags);
}
+void res_counter_uncharge(struct res_counter *counter, unsigned long val)
+{
+ res_counter_uncharge_until(counter, NULL, val);
+}
static inline unsigned long long *
res_counter_member(struct res_counter *counter, int member)
diff --git a/kernel/resource.c b/kernel/resource.c
index 7e8ea66a8c01..e1d2b8ee76d5 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -515,8 +515,8 @@ out:
* @root: root resource descriptor
* @new: resource descriptor desired by caller
* @size: requested resource region size
- * @min: minimum size to allocate
- * @max: maximum size to allocate
+ * @min: minimum boundary to allocate
+ * @max: maximum boundary to allocate
* @align: alignment requested, in bytes
* @alignf: alignment function, optional, called if not NULL
* @alignf_data: arbitrary data to pass to the @alignf function
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 39eb6011bc38..c46958e26121 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -142,9 +142,8 @@ const_debug unsigned int sysctl_sched_features =
#define SCHED_FEAT(name, enabled) \
#name ,
-static __read_mostly char *sched_feat_names[] = {
+static const char * const sched_feat_names[] = {
#include "features.h"
- NULL
};
#undef SCHED_FEAT
@@ -2517,25 +2516,32 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
sched_avg_update(this_rq);
}
+#ifdef CONFIG_NO_HZ
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we cannot use the delta approach from the regular tick since that
+ * would seriously skew the load calculation. However we'll make do for those
+ * updates happening while idle (nohz_idle_balance) or coming out of idle
+ * (tick_nohz_idle_exit).
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
/*
* Called from nohz_idle_balance() to update the load ratings before doing the
* idle balance.
*/
void update_idle_cpu_load(struct rq *this_rq)
{
- unsigned long curr_jiffies = jiffies;
+ unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
unsigned long load = this_rq->load.weight;
unsigned long pending_updates;
/*
- * Bloody broken means of dealing with nohz, but better than nothing..
- * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
- * update and see 0 difference the one time and 2 the next, even though
- * we ticked at roughtly the same rate.
- *
- * Hence we only use this from nohz_idle_balance() and skip this
- * nonsense when called from the scheduler_tick() since that's
- * guaranteed a stable rate.
+ * bail if there's load or we're actually up-to-date.
*/
if (load || curr_jiffies == this_rq->last_load_update_tick)
return;
@@ -2547,12 +2553,38 @@ void update_idle_cpu_load(struct rq *this_rq)
}
/*
+ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ */
+void update_cpu_load_nohz(void)
+{
+ struct rq *this_rq = this_rq();
+ unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+ unsigned long pending_updates;
+
+ if (curr_jiffies == this_rq->last_load_update_tick)
+ return;
+
+ raw_spin_lock(&this_rq->lock);
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ if (pending_updates) {
+ this_rq->last_load_update_tick = curr_jiffies;
+ /*
+ * We were idle, this means load 0, the current load might be
+ * !0 due to remote wakeups and the sort.
+ */
+ __update_cpu_load(this_rq, 0, pending_updates);
+ }
+ raw_spin_unlock(&this_rq->lock);
+}
+#endif /* CONFIG_NO_HZ */
+
+/*
* Called from scheduler_tick()
*/
static void update_cpu_load_active(struct rq *this_rq)
{
/*
- * See the mess in update_idle_cpu_load().
+ * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
*/
this_rq->last_load_update_tick = jiffies;
__update_cpu_load(this_rq, this_rq->load.weight, 1);
@@ -4982,7 +5014,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
p->sched_class->set_cpus_allowed(p, new_mask);
cpumask_copy(&p->cpus_allowed, new_mask);
- p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
}
/*
@@ -5997,11 +6029,14 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
cpumask_or(covered, covered, sg_span);
- sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
+ sg->sgp = *per_cpu_ptr(sdd->sgp, i);
atomic_inc(&sg->sgp->ref);
- if (cpumask_test_cpu(cpu, sg_span))
+ if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
+ cpumask_first(sg_span) == cpu) {
+ WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span));
groups = sg;
+ }
if (!first)
first = sg;
@@ -6403,7 +6438,7 @@ static void sched_init_numa(void)
return;
for (j = 0; j < nr_node_ids; j++) {
- struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
+ struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
if (!mask)
return;
@@ -6691,7 +6726,6 @@ static int init_sched_domains(const struct cpumask *cpu_map)
if (!doms_cur)
doms_cur = &fallback_doms;
cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
- dattr_cur = NULL;
err = build_sched_domains(doms_cur[0], NULL);
register_sched_domain_sysctl();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 940e6d17cf96..b2a2d236f27b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
int want_sd = 1;
int sync = wake_flags & WF_SYNC;
- if (p->rt.nr_cpus_allowed == 1)
+ if (p->nr_cpus_allowed == 1)
return prev_cpu;
if (sd_flag & SD_BALANCE_WAKE) {
@@ -3503,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
unsigned long scale_rt_power(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- u64 total, available;
+ u64 total, available, age_stamp, avg;
- total = sched_avg_period() + (rq->clock - rq->age_stamp);
+ /*
+ * Since we're reading these variables without serialization make sure
+ * we read them once before doing sanity checks on them.
+ */
+ age_stamp = ACCESS_ONCE(rq->age_stamp);
+ avg = ACCESS_ONCE(rq->rt_avg);
+
+ total = sched_avg_period() + (rq->clock - age_stamp);
- if (unlikely(total < rq->rt_avg)) {
+ if (unlikely(total < avg)) {
/* Ensures that power won't end up being negative */
available = 0;
} else {
- available = total - rq->rt_avg;
+ available = total - avg;
}
if (unlikely((s64)total < SCHED_POWER_SCALE))
@@ -3574,11 +3581,26 @@ void update_group_power(struct sched_domain *sd, int cpu)
power = 0;
- group = child->groups;
- do {
- power += group->sgp->power;
- group = group->next;
- } while (group != child->groups);
+ if (child->flags & SD_OVERLAP) {
+ /*
+ * SD_OVERLAP domains cannot assume that child groups
+ * span the current group.
+ */
+
+ for_each_cpu(cpu, sched_group_cpus(sdg))
+ power += power_of(cpu);
+ } else {
+ /*
+ * !SD_OVERLAP domains can assume that child groups
+ * span the current group.
+ */
+
+ group = child->groups;
+ do {
+ power += group->sgp->power;
+ group = group->next;
+ } while (group != child->groups);
+ }
sdg->sgp->power = power;
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c5565c3c515f..2a4e8dffbd6b 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq)
static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
+ struct task_struct *p;
+
if (!rt_entity_is_task(rt_se))
return;
+ p = rt_task_of(rt_se);
rt_rq = &rq_of_rt_rq(rt_rq)->rt;
rt_rq->rt_nr_total++;
- if (rt_se->nr_cpus_allowed > 1)
+ if (p->nr_cpus_allowed > 1)
rt_rq->rt_nr_migratory++;
update_rt_migration(rt_rq);
@@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
+ struct task_struct *p;
+
if (!rt_entity_is_task(rt_se))
return;
+ p = rt_task_of(rt_se);
rt_rq = &rq_of_rt_rq(rt_rq)->rt;
rt_rq->rt_nr_total--;
- if (rt_se->nr_cpus_allowed > 1)
+ if (p->nr_cpus_allowed > 1)
rt_rq->rt_nr_migratory--;
update_rt_migration(rt_rq);
@@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
- if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
+ if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
inc_nr_running(rq);
@@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
cpu = task_cpu(p);
- if (p->rt.nr_cpus_allowed == 1)
+ if (p->nr_cpus_allowed == 1)
goto out;
/* For anything but wake ups, just return the task_cpu */
@@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
* will have to sort it out.
*/
if (curr && unlikely(rt_task(curr)) &&
- (curr->rt.nr_cpus_allowed < 2 ||
+ (curr->nr_cpus_allowed < 2 ||
curr->prio <= p->prio) &&
- (p->rt.nr_cpus_allowed > 1)) {
+ (p->nr_cpus_allowed > 1)) {
int target = find_lowest_rq(p);
if (target != -1)
@@ -1276,10 +1282,10 @@ out:
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
- if (rq->curr->rt.nr_cpus_allowed == 1)
+ if (rq->curr->nr_cpus_allowed == 1)
return;
- if (p->rt.nr_cpus_allowed != 1
+ if (p->nr_cpus_allowed != 1
&& cpupri_find(&rq->rd->cpupri, p, NULL))
return;
@@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
* The previous task needs to be made eligible for pushing
* if it is still active
*/
- if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
+ if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
}
@@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
(cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
- (p->rt.nr_cpus_allowed > 1))
+ (p->nr_cpus_allowed > 1))
return 1;
return 0;
}
@@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task)
if (unlikely(!lowest_mask))
return -1;
- if (task->rt.nr_cpus_allowed == 1)
+ if (task->nr_cpus_allowed == 1)
return -1; /* No other targets possible */
if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
@@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
BUG_ON(rq->cpu != task_cpu(p));
BUG_ON(task_current(rq, p));
- BUG_ON(p->rt.nr_cpus_allowed <= 1);
+ BUG_ON(p->nr_cpus_allowed <= 1);
BUG_ON(!p->on_rq);
BUG_ON(!rt_task(p));
@@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
has_pushable_tasks(rq) &&
- p->rt.nr_cpus_allowed > 1 &&
+ p->nr_cpus_allowed > 1 &&
rt_task(rq->curr) &&
- (rq->curr->rt.nr_cpus_allowed < 2 ||
+ (rq->curr->nr_cpus_allowed < 2 ||
rq->curr->prio <= p->prio))
push_rt_tasks(rq);
}
@@ -1817,7 +1823,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
* Only update if the process changes its state from whether it
* can migrate or not.
*/
- if ((p->rt.nr_cpus_allowed > 1) == (weight > 1))
+ if ((p->nr_cpus_allowed > 1) == (weight > 1))
return;
rq = task_rq(p);
@@ -1979,6 +1985,8 @@ static void watchdog(struct rq *rq, struct task_struct *p)
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
+ struct sched_rt_entity *rt_se = &p->rt;
+
update_curr_rt(rq);
watchdog(rq, p);
@@ -1996,12 +2004,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
p->rt.time_slice = RR_TIMESLICE;
/*
- * Requeue to the end of queue if we are not the only element
- * on the queue:
+ * Requeue to the end of queue if we (and all of our ancestors) are the
+ * only element on the queue
*/
- if (p->rt.run_list.prev != p->rt.run_list.next) {
- requeue_task_rt(rq, p, 0);
- set_tsk_need_resched(p);
+ for_each_sched_rt_entity(rt_se) {
+ if (rt_se->run_list.prev != rt_se->run_list.next) {
+ requeue_task_rt(rq, p, 0);
+ set_tsk_need_resched(p);
+ return;
+ }
}
}
diff --git a/kernel/signal.c b/kernel/signal.c
index f7b418217633..677102789cf2 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1656,19 +1656,18 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
info.si_signo = sig;
info.si_errno = 0;
/*
- * we are under tasklist_lock here so our parent is tied to
- * us and cannot exit and release its namespace.
+ * We are under tasklist_lock here so our parent is tied to
+ * us and cannot change.
*
- * the only it can is to switch its nsproxy with sys_unshare,
- * bu uncharing pid namespaces is not allowed, so we'll always
- * see relevant namespace
+ * task_active_pid_ns will always return the same pid namespace
+ * until a task passes through release_task.
*
* write_lock() currently calls preempt_disable() which is the
* same as rcu_read_lock(), but according to Oleg, this is not
* correct to rely on this
*/
rcu_read_lock();
- info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
+ info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(tsk->parent));
info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns),
task_uid(tsk));
rcu_read_unlock();
@@ -2369,24 +2368,34 @@ relock:
}
/**
- * block_sigmask - add @ka's signal mask to current->blocked
- * @ka: action for @signr
- * @signr: signal that has been successfully delivered
+ * signal_delivered -
+ * @sig: number of signal being delivered
+ * @info: siginfo_t of signal being delivered
+ * @ka: sigaction setting that chose the handler
+ * @regs: user register state
+ * @stepping: nonzero if debugger single-step or block-step in use
*
* This function should be called when a signal has succesfully been
- * delivered. It adds the mask of signals for @ka to current->blocked
- * so that they are blocked during the execution of the signal
- * handler. In addition, @signr will be blocked unless %SA_NODEFER is
- * set in @ka->sa.sa_flags.
+ * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask
+ * is always blocked, and the signal itself is blocked unless %SA_NODEFER
+ * is set in @ka->sa.sa_flags. Tracing is notified.
*/
-void block_sigmask(struct k_sigaction *ka, int signr)
+void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,
+ struct pt_regs *regs, int stepping)
{
sigset_t blocked;
+ /* A signal was successfully delivered, and the
+ saved sigmask was stored on the signal frame,
+ and will be restored by sigreturn. So we can
+ simply clear the restore sigmask flag. */
+ clear_restore_sigmask();
+
sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
if (!(ka->sa.sa_flags & SA_NODEFER))
- sigaddset(&blocked, signr);
+ sigaddset(&blocked, sig);
set_current_blocked(&blocked);
+ tracehook_signal_handler(sig, info, ka, regs, stepping);
}
/*
@@ -2519,7 +2528,16 @@ static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
* It is wrong to change ->blocked directly, this helper should be used
* to ensure the process can't miss a shared signal we are going to block.
*/
-void set_current_blocked(const sigset_t *newset)
+void set_current_blocked(sigset_t *newset)
+{
+ struct task_struct *tsk = current;
+ sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP));
+ spin_lock_irq(&tsk->sighand->siglock);
+ __set_task_blocked(tsk, newset);
+ spin_unlock_irq(&tsk->sighand->siglock);
+}
+
+void __set_current_blocked(const sigset_t *newset)
{
struct task_struct *tsk = current;
@@ -2559,7 +2577,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
return -EINVAL;
}
- set_current_blocked(&newset);
+ __set_current_blocked(&newset);
return 0;
}
@@ -3133,7 +3151,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
return -EINVAL;
}
- set_current_blocked(&new_blocked);
+ __set_current_blocked(&new_blocked);
}
if (oset) {
@@ -3197,7 +3215,6 @@ SYSCALL_DEFINE1(ssetmask, int, newmask)
int old = current->blocked.sig[0];
sigset_t newset;
- siginitset(&newset, newmask & ~(sigmask(SIGKILL) | sigmask(SIGSTOP)));
set_current_blocked(&newset);
return old;
@@ -3236,11 +3253,8 @@ SYSCALL_DEFINE0(pause)
#endif
-#ifdef HAVE_SET_RESTORE_SIGMASK
int sigsuspend(sigset_t *set)
{
- sigdelsetmask(set, sigmask(SIGKILL)|sigmask(SIGSTOP));
-
current->saved_sigmask = current->blocked;
set_current_blocked(set);
@@ -3249,7 +3263,6 @@ int sigsuspend(sigset_t *set)
set_restore_sigmask();
return -ERESTARTNOHAND;
}
-#endif
#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
/**
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index e1a797e028a3..98f60c5caa1b 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -31,6 +31,12 @@ void __init idle_thread_set_boot_cpu(void)
per_cpu(idle_threads, smp_processor_id()) = current;
}
+/**
+ * idle_init - Initialize the idle thread for a cpu
+ * @cpu: The cpu for which the idle thread should be initialized
+ *
+ * Creates the thread if it does not exist.
+ */
static inline void idle_init(unsigned int cpu)
{
struct task_struct *tsk = per_cpu(idle_threads, cpu);
@@ -45,17 +51,16 @@ static inline void idle_init(unsigned int cpu)
}
/**
- * idle_thread_init - Initialize the idle thread for a cpu
- * @cpu: The cpu for which the idle thread should be initialized
- *
- * Creates the thread if it does not exist.
+ * idle_threads_init - Initialize idle threads for all cpus
*/
void __init idle_threads_init(void)
{
- unsigned int cpu;
+ unsigned int cpu, boot_cpu;
+
+ boot_cpu = smp_processor_id();
for_each_possible_cpu(cpu) {
- if (cpu != smp_processor_id())
+ if (cpu != boot_cpu)
idle_init(cpu);
}
}
diff --git a/kernel/sys.c b/kernel/sys.c
index 6df42624e454..9ff89cb9657a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -36,6 +36,8 @@
#include <linux/personality.h>
#include <linux/ptrace.h>
#include <linux/fs_struct.h>
+#include <linux/file.h>
+#include <linux/mount.h>
#include <linux/gfp.h>
#include <linux/syscore_ops.h>
#include <linux/version.h>
@@ -1378,8 +1380,8 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
memcpy(u->nodename, tmp, len);
memset(u->nodename + len, 0, sizeof(u->nodename) - len);
errno = 0;
+ uts_proc_notify(UTS_PROC_HOSTNAME);
}
- uts_proc_notify(UTS_PROC_HOSTNAME);
up_write(&uts_sem);
return errno;
}
@@ -1429,8 +1431,8 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
memcpy(u->domainname, tmp, len);
memset(u->domainname + len, 0, sizeof(u->domainname) - len);
errno = 0;
+ uts_proc_notify(UTS_PROC_DOMAINNAME);
}
- uts_proc_notify(UTS_PROC_DOMAINNAME);
up_write(&uts_sem);
return errno;
}
@@ -1784,77 +1786,102 @@ SYSCALL_DEFINE1(umask, int, mask)
}
#ifdef CONFIG_CHECKPOINT_RESTORE
+static bool vma_flags_mismatch(struct vm_area_struct *vma,
+ unsigned long required,
+ unsigned long banned)
+{
+ return (vma->vm_flags & required) != required ||
+ (vma->vm_flags & banned);
+}
+
+static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
+{
+ struct file *exe_file;
+ struct dentry *dentry;
+ int err;
+
+ /*
+ * Setting new mm::exe_file is only allowed when no VM_EXECUTABLE vma's
+ * remain. So perform a quick test first.
+ */
+ if (mm->num_exe_file_vmas)
+ return -EBUSY;
+
+ exe_file = fget(fd);
+ if (!exe_file)
+ return -EBADF;
+
+ dentry = exe_file->f_path.dentry;
+
+ /*
+ * Because the original mm->exe_file points to executable file, make
+ * sure that this one is executable as well, to avoid breaking an
+ * overall picture.
+ */
+ err = -EACCES;
+ if (!S_ISREG(dentry->d_inode->i_mode) ||
+ exe_file->f_path.mnt->mnt_flags & MNT_NOEXEC)
+ goto exit;
+
+ err = inode_permission(dentry->d_inode, MAY_EXEC);
+ if (err)
+ goto exit;
+
+ /*
+ * The symlink can be changed only once, just to disallow arbitrary
+ * transitions malicious software might bring in. This means one
+ * could make a snapshot over all processes running and monitor
+ * /proc/pid/exe changes to notice unusual activity if needed.
+ */
+ down_write(&mm->mmap_sem);
+ if (likely(!mm->exe_file))
+ set_mm_exe_file(mm, exe_file);
+ else
+ err = -EBUSY;
+ up_write(&mm->mmap_sem);
+
+exit:
+ fput(exe_file);
+ return err;
+}
+
static int prctl_set_mm(int opt, unsigned long addr,
unsigned long arg4, unsigned long arg5)
{
unsigned long rlim = rlimit(RLIMIT_DATA);
- unsigned long vm_req_flags;
- unsigned long vm_bad_flags;
- struct vm_area_struct *vma;
- int error = 0;
struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ int error;
- if (arg4 | arg5)
+ if (arg5 || (arg4 && opt != PR_SET_MM_AUXV))
return -EINVAL;
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
+ if (opt == PR_SET_MM_EXE_FILE)
+ return prctl_set_mm_exe_file(mm, (unsigned int)addr);
+
if (addr >= TASK_SIZE)
return -EINVAL;
+ error = -EINVAL;
+
down_read(&mm->mmap_sem);
vma = find_vma(mm, addr);
- if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) {
- /* It must be existing VMA */
- if (!vma || vma->vm_start > addr)
- goto out;
- }
-
- error = -EINVAL;
switch (opt) {
case PR_SET_MM_START_CODE:
+ mm->start_code = addr;
+ break;
case PR_SET_MM_END_CODE:
- vm_req_flags = VM_READ | VM_EXEC;
- vm_bad_flags = VM_WRITE | VM_MAYSHARE;
-
- if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
- (vma->vm_flags & vm_bad_flags))
- goto out;
-
- if (opt == PR_SET_MM_START_CODE)
- mm->start_code = addr;
- else
- mm->end_code = addr;
+ mm->end_code = addr;
break;
-
case PR_SET_MM_START_DATA:
- case PR_SET_MM_END_DATA:
- vm_req_flags = VM_READ | VM_WRITE;
- vm_bad_flags = VM_EXEC | VM_MAYSHARE;
-
- if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
- (vma->vm_flags & vm_bad_flags))
- goto out;
-
- if (opt == PR_SET_MM_START_DATA)
- mm->start_data = addr;
- else
- mm->end_data = addr;
+ mm->start_data = addr;
break;
-
- case PR_SET_MM_START_STACK:
-
-#ifdef CONFIG_STACK_GROWSUP
- vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP;
-#else
- vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN;
-#endif
- if ((vma->vm_flags & vm_req_flags) != vm_req_flags)
- goto out;
-
- mm->start_stack = addr;
+ case PR_SET_MM_END_DATA:
+ mm->end_data = addr;
break;
case PR_SET_MM_START_BRK:
@@ -1881,16 +1908,77 @@ static int prctl_set_mm(int opt, unsigned long addr,
mm->brk = addr;
break;
+ /*
+ * If command line arguments and environment
+ * are placed somewhere else on stack, we can
+ * set them up here, ARG_START/END to setup
+ * command line argumets and ENV_START/END
+ * for environment.
+ */
+ case PR_SET_MM_START_STACK:
+ case PR_SET_MM_ARG_START:
+ case PR_SET_MM_ARG_END:
+ case PR_SET_MM_ENV_START:
+ case PR_SET_MM_ENV_END:
+ if (!vma) {
+ error = -EFAULT;
+ goto out;
+ }
+#ifdef CONFIG_STACK_GROWSUP
+ if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSUP, 0))
+#else
+ if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSDOWN, 0))
+#endif
+ goto out;
+ if (opt == PR_SET_MM_START_STACK)
+ mm->start_stack = addr;
+ else if (opt == PR_SET_MM_ARG_START)
+ mm->arg_start = addr;
+ else if (opt == PR_SET_MM_ARG_END)
+ mm->arg_end = addr;
+ else if (opt == PR_SET_MM_ENV_START)
+ mm->env_start = addr;
+ else if (opt == PR_SET_MM_ENV_END)
+ mm->env_end = addr;
+ break;
+
+ /*
+ * This doesn't move auxiliary vector itself
+ * since it's pinned to mm_struct, but allow
+ * to fill vector with new values. It's up
+ * to a caller to provide sane values here
+ * otherwise user space tools which use this
+ * vector might be unhappy.
+ */
+ case PR_SET_MM_AUXV: {
+ unsigned long user_auxv[AT_VECTOR_SIZE];
+
+ if (arg4 > sizeof(user_auxv))
+ goto out;
+ up_read(&mm->mmap_sem);
+
+ if (copy_from_user(user_auxv, (const void __user *)addr, arg4))
+ return -EFAULT;
+
+ /* Make sure the last entry is always AT_NULL */
+ user_auxv[AT_VECTOR_SIZE - 2] = 0;
+ user_auxv[AT_VECTOR_SIZE - 1] = 0;
+
+ BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
+
+ task_lock(current);
+ memcpy(mm->saved_auxv, user_auxv, arg4);
+ task_unlock(current);
+
+ return 0;
+ }
default:
- error = -EINVAL;
goto out;
}
error = 0;
-
out:
up_read(&mm->mmap_sem);
-
return error;
}
#else /* CONFIG_CHECKPOINT_RESTORE */
@@ -2114,7 +2202,6 @@ int orderly_poweroff(bool force)
NULL
};
int ret = -ENOMEM;
- struct subprocess_info *info;
if (argv == NULL) {
printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
@@ -2122,18 +2209,16 @@ int orderly_poweroff(bool force)
goto out;
}
- info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC);
- if (info == NULL) {
- argv_free(argv);
- goto out;
- }
-
- call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL);
+ ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT,
+ NULL, argv_cleanup, NULL);
+out:
+ if (likely(!ret))
+ return 0;
- ret = call_usermodehelper_exec(info, UMH_NO_WAIT);
+ if (ret == -ENOMEM)
+ argv_free(argv);
- out:
- if (ret && force) {
+ if (force) {
printk(KERN_WARNING "Failed to start orderly shutdown: "
"forcing the issue\n");
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 47bfa16430d7..dbff751e4086 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -203,3 +203,6 @@ cond_syscall(sys_fanotify_mark);
cond_syscall(sys_name_to_handle_at);
cond_syscall(sys_open_by_handle_at);
cond_syscall(compat_sys_open_by_handle_at);
+
+/* compare kernel pointers */
+cond_syscall(sys_kcmp);
diff --git a/kernel/task_work.c b/kernel/task_work.c
new file mode 100644
index 000000000000..82d1c794066d
--- /dev/null
+++ b/kernel/task_work.c
@@ -0,0 +1,84 @@
+#include <linux/spinlock.h>
+#include <linux/task_work.h>
+#include <linux/tracehook.h>
+
+int
+task_work_add(struct task_struct *task, struct task_work *twork, bool notify)
+{
+ unsigned long flags;
+ int err = -ESRCH;
+
+#ifndef TIF_NOTIFY_RESUME
+ if (notify)
+ return -ENOTSUPP;
+#endif
+ /*
+ * We must not insert the new work if the task has already passed
+ * exit_task_work(). We rely on do_exit()->raw_spin_unlock_wait()
+ * and check PF_EXITING under pi_lock.
+ */
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+ if (likely(!(task->flags & PF_EXITING))) {
+ hlist_add_head(&twork->hlist, &task->task_works);
+ err = 0;
+ }
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+
+ /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */
+ if (likely(!err) && notify)
+ set_notify_resume(task);
+ return err;
+}
+
+struct task_work *
+task_work_cancel(struct task_struct *task, task_work_func_t func)
+{
+ unsigned long flags;
+ struct task_work *twork;
+ struct hlist_node *pos;
+
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+ hlist_for_each_entry(twork, pos, &task->task_works, hlist) {
+ if (twork->func == func) {
+ hlist_del(&twork->hlist);
+ goto found;
+ }
+ }
+ twork = NULL;
+ found:
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+
+ return twork;
+}
+
+void task_work_run(void)
+{
+ struct task_struct *task = current;
+ struct hlist_head task_works;
+ struct hlist_node *pos;
+
+ raw_spin_lock_irq(&task->pi_lock);
+ hlist_move_list(&task->task_works, &task_works);
+ raw_spin_unlock_irq(&task->pi_lock);
+
+ if (unlikely(hlist_empty(&task_works)))
+ return;
+ /*
+ * We use hlist to save the space in task_struct, but we want fifo.
+ * Find the last entry, the list should be short, then process them
+ * in reverse order.
+ */
+ for (pos = task_works.first; pos->next; pos = pos->next)
+ ;
+
+ for (;;) {
+ struct hlist_node **pprev = pos->pprev;
+ struct task_work *twork = container_of(pos, struct task_work,
+ hlist);
+ twork->func(twork);
+
+ if (pprev == &task_works.first)
+ break;
+ pos = container_of(pprev, struct hlist_node, next);
+ }
+}
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 9cd928f7a7c6..7e1ce012a851 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -297,8 +297,7 @@ void clockevents_register_device(struct clock_event_device *dev)
}
EXPORT_SYMBOL_GPL(clockevents_register_device);
-static void clockevents_config(struct clock_event_device *dev,
- u32 freq)
+void clockevents_config(struct clock_event_device *dev, u32 freq)
{
u64 sec;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6a3a5b9ff561..da70c6db496c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -576,6 +576,7 @@ void tick_nohz_idle_exit(void)
/* Update jiffies first */
select_nohz_load_balancer(0);
tick_do_update_jiffies64(now);
+ update_cpu_load_nohz();
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
/*
@@ -814,6 +815,16 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
return HRTIMER_RESTART;
}
+static int sched_skew_tick;
+
+static int __init skew_tick(char *str)
+{
+ get_option(&str, &sched_skew_tick);
+
+ return 0;
+}
+early_param("skew_tick", skew_tick);
+
/**
* tick_setup_sched_timer - setup the tick emulation timer
*/
@@ -831,6 +842,14 @@ void tick_setup_sched_timer(void)
/* Get the next period (per cpu) */
hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
+ /* Offset the tick to avert xtime_lock contention. */
+ if (sched_skew_tick) {
+ u64 offset = ktime_to_ns(tick_period) >> 1;
+ do_div(offset, num_possible_cpus());
+ offset *= smp_processor_id();
+ hrtimer_add_expires_ns(&ts->sched_timer, offset);
+ }
+
for (;;) {
hrtimer_forward(&ts->sched_timer, now, tick_period);
hrtimer_start_expires(&ts->sched_timer,
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6420cda62336..1d0f6a8a0e5e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1486,6 +1486,11 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
if (!buffer)
return size;
+ /* Make sure the requested buffer exists */
+ if (cpu_id != RING_BUFFER_ALL_CPUS &&
+ !cpumask_test_cpu(cpu_id, buffer->cpumask))
+ return size;
+
size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
size *= BUF_PAGE_SIZE;
OpenPOWER on IntegriCloud