Diffstat (limited to 'kernel')
-rw-r--r--   kernel/Makefile              |   7
-rw-r--r--   kernel/cgroup.c              |  37
-rw-r--r--   kernel/cpu.c                 |  44
-rw-r--r--   kernel/cpu_pm.c              |  16
-rw-r--r--   kernel/cred.c                |   9
-rw-r--r--   kernel/exit.c                |  13
-rw-r--r--   kernel/fork.c                |  28
-rw-r--r--   kernel/irq/chip.c            |   8
-rw-r--r--   kernel/irq/internals.h       |   3
-rw-r--r--   kernel/irq/manage.c          | 119
-rw-r--r--   kernel/irq/migration.c       |  13
-rw-r--r--   kernel/kallsyms.c            |  32
-rw-r--r--   kernel/kcmp.c                | 196
-rw-r--r--   kernel/kmod.c                |  30
-rw-r--r--   kernel/lglock.c              |  89
-rw-r--r--   kernel/pid_namespace.c       |  13
-rw-r--r--   kernel/res_counter.c         |  10
-rw-r--r--   kernel/resource.c            |   4
-rw-r--r--   kernel/sched/core.c          |  68
-rw-r--r--   kernel/sched/fair.c          |  42
-rw-r--r--   kernel/sched/rt.c            |  51
-rw-r--r--   kernel/signal.c              |  59
-rw-r--r--   kernel/smpboot.c             |  17
-rw-r--r--   kernel/sys.c                 | 213
-rw-r--r--   kernel/sys_ni.c              |   3
-rw-r--r--   kernel/task_work.c           |  84
-rw-r--r--   kernel/time/clockevents.c    |   3
-rw-r--r--   kernel/time/tick-sched.c     |  19
-rw-r--r--   kernel/trace/ring_buffer.c   |   5
29 files changed, 949 insertions, 286 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 6c07f30fa9b7..c0cc67ad764c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,12 +5,12 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ cpu.o exit.o itimer.o time.o softirq.o resource.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ - signal.o sys.o kmod.o workqueue.o pid.o \ + signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o cred.o \ - async.o range.o groups.o + async.o range.o groups.o lglock.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files @@ -25,6 +25,9 @@ endif obj-y += sched/ obj-y += power/ +ifeq ($(CONFIG_CHECKPOINT_RESTORE),y) +obj-$(CONFIG_X86) += kcmp.o +endif obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_STACKTRACE) += stacktrace.o diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a0c6af34d500..72fcd3069a90 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -896,10 +896,13 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) mutex_unlock(&cgroup_mutex); /* - * Drop the active superblock reference that we took when we - * created the cgroup + * We want to drop the active superblock reference from the + * cgroup creation after all the dentry refs are gone - + * kill_sb gets mighty unhappy otherwise. Mark + * dentry->d_fsdata with cgroup_diput() to tell + * cgroup_d_release() to call deactivate_super(). */ - deactivate_super(cgrp->root->sb); + dentry->d_fsdata = cgroup_diput; /* * if we're getting rid of the cgroup, refcount should ensure @@ -925,6 +928,13 @@ static int cgroup_delete(const struct dentry *d) return 1; } +static void cgroup_d_release(struct dentry *dentry) +{ + /* did cgroup_diput() tell me to deactivate super? */ + if (dentry->d_fsdata == cgroup_diput) + deactivate_super(dentry->d_sb); +} + static void remove_dir(struct dentry *d) { struct dentry *parent = dget(d->d_parent); @@ -1532,6 +1542,7 @@ static int cgroup_get_rootdir(struct super_block *sb) static const struct dentry_operations cgroup_dops = { .d_iput = cgroup_diput, .d_delete = cgroup_delete, + .d_release = cgroup_d_release, }; struct inode *inode = @@ -5132,7 +5143,7 @@ EXPORT_SYMBOL_GPL(css_depth); * @root: the css supporsed to be an ancestor of the child. * * Returns true if "root" is an ancestor of "child" in its hierarchy. Because - * this function reads css->id, this use rcu_dereference() and rcu_read_lock(). + * this function reads css->id, the caller must hold rcu_read_lock(). * But, considering usual usage, the csses should be valid objects after test. * Assuming that the caller will do some action to the child if this returns * returns true, the caller must take "child";s reference count. 
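The kernel-doc update above, together with the rework of css_is_ancestor() in the following hunk, moves the rcu_read_lock() out of the helper and onto the caller. A minimal sketch of what a call site looks like under that rule is shown here; the css_in_subtree() wrapper is a hypothetical name, not something introduced by this patch.

#include <linux/cgroup.h>
#include <linux/rcupdate.h>

/*
 * Hypothetical caller sketch: css_is_ancestor() now only does
 * rcu_dereference() internally, so rcu_read_lock() moves to the
 * call site. The wrapper name is illustrative only.
 */
static bool css_in_subtree(struct cgroup_subsys_state *child,
			   struct cgroup_subsys_state *root)
{
	bool ret;

	rcu_read_lock();
	ret = css_is_ancestor(child, root);
	rcu_read_unlock();

	return ret;
}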
@@ -5144,18 +5155,18 @@ bool css_is_ancestor(struct cgroup_subsys_state *child, { struct css_id *child_id; struct css_id *root_id; - bool ret = true; - rcu_read_lock(); child_id = rcu_dereference(child->id); + if (!child_id) + return false; root_id = rcu_dereference(root->id); - if (!child_id - || !root_id - || (child_id->depth < root_id->depth) - || (child_id->stack[root_id->depth] != root_id->id)) - ret = false; - rcu_read_unlock(); - return ret; + if (!root_id) + return false; + if (child_id->depth < root_id->depth) + return false; + if (child_id->stack[root_id->depth] != root_id->id) + return false; + return true; } void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) diff --git a/kernel/cpu.c b/kernel/cpu.c index 0e6353cf147a..a4eb5227a19e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -10,7 +10,10 @@ #include <linux/sched.h> #include <linux/unistd.h> #include <linux/cpu.h> +#include <linux/oom.h> +#include <linux/rcupdate.h> #include <linux/export.h> +#include <linux/bug.h> #include <linux/kthread.h> #include <linux/stop_machine.h> #include <linux/mutex.h> @@ -173,6 +176,47 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_cpu_notifier); +/** + * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU + * @cpu: a CPU id + * + * This function walks all processes, finds a valid mm struct for each one and + * then clears a corresponding bit in mm's cpumask. While this all sounds + * trivial, there are various non-obvious corner cases, which this function + * tries to solve in a safe manner. + * + * Also note that the function uses a somewhat relaxed locking scheme, so it may + * be called only for an already offlined CPU. + */ +void clear_tasks_mm_cpumask(int cpu) +{ + struct task_struct *p; + + /* + * This function is called after the cpu is taken down and marked + * offline, so its not like new tasks will ever get this cpu set in + * their mm mask. -- Peter Zijlstra + * Thus, we may use rcu_read_lock() here, instead of grabbing + * full-fledged tasklist_lock. + */ + WARN_ON(cpu_online(cpu)); + rcu_read_lock(); + for_each_process(p) { + struct task_struct *t; + + /* + * Main thread might exit, but other threads may still have + * a valid mm. Find one. + */ + t = find_lock_task_mm(p); + if (!t) + continue; + cpumask_clear_cpu(cpu, mm_cpumask(t->mm)); + task_unlock(t); + } + rcu_read_unlock(); +} + static inline void check_for_tasks(int cpu) { struct task_struct *p; diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index 249152e15308..9656a3c36503 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -81,7 +81,7 @@ int cpu_pm_unregister_notifier(struct notifier_block *nb) EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); /** - * cpm_pm_enter - CPU low power entry notifier + * cpu_pm_enter - CPU low power entry notifier * * Notifies listeners that a single CPU is entering a low power state that may * cause some blocks in the same power domain as the cpu to reset. @@ -89,7 +89,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); * Must be called on the affected CPU with interrupts disabled. Platform is * responsible for ensuring that cpu_pm_enter is not called twice on the same * CPU before cpu_pm_exit is called. Notified drivers can include VFP - * co-processor, interrupt controller and it's PM extensions, local CPU + * co-processor, interrupt controller and its PM extensions, local CPU * timers context save/restore which shouldn't be interrupted. Hence it * must be called with interrupts disabled. 
* @@ -115,13 +115,13 @@ int cpu_pm_enter(void) EXPORT_SYMBOL_GPL(cpu_pm_enter); /** - * cpm_pm_exit - CPU low power exit notifier + * cpu_pm_exit - CPU low power exit notifier * * Notifies listeners that a single CPU is exiting a low power state that may * have caused some blocks in the same power domain as the cpu to reset. * * Notified drivers can include VFP co-processor, interrupt controller - * and it's PM extensions, local CPU timers context save/restore which + * and its PM extensions, local CPU timers context save/restore which * shouldn't be interrupted. Hence it must be called with interrupts disabled. * * Return conditions are same as __raw_notifier_call_chain. @@ -139,7 +139,7 @@ int cpu_pm_exit(void) EXPORT_SYMBOL_GPL(cpu_pm_exit); /** - * cpm_cluster_pm_enter - CPU cluster low power entry notifier + * cpu_cluster_pm_enter - CPU cluster low power entry notifier * * Notifies listeners that all cpus in a power domain are entering a low power * state that may cause some blocks in the same power domain to reset. @@ -147,7 +147,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit); * Must be called after cpu_pm_enter has been called on all cpus in the power * domain, and before cpu_pm_exit has been called on any cpu in the power * domain. Notified drivers can include VFP co-processor, interrupt controller - * and it's PM extensions, local CPU timers context save/restore which + * and its PM extensions, local CPU timers context save/restore which * shouldn't be interrupted. Hence it must be called with interrupts disabled. * * Must be called with interrupts disabled. @@ -174,7 +174,7 @@ int cpu_cluster_pm_enter(void) EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); /** - * cpm_cluster_pm_exit - CPU cluster low power exit notifier + * cpu_cluster_pm_exit - CPU cluster low power exit notifier * * Notifies listeners that all cpus in a power domain are exiting form a * low power state that may have caused some blocks in the same power domain @@ -183,7 +183,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); * Must be called after cpu_pm_exit has been called on all cpus in the power * domain, and before cpu_pm_exit has been called on any cpu in the power * domain. Notified drivers can include VFP co-processor, interrupt controller - * and it's PM extensions, local CPU timers context save/restore which + * and its PM extensions, local CPU timers context save/restore which * shouldn't be interrupted. Hence it must be called with interrupts disabled. * * Return conditions are same as __raw_notifier_call_chain. 
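The kernel-doc fixes above also restate the usage contract for these notifiers: cpu_pm_enter() and cpu_pm_exit() bracket a single CPU's low power entry with interrupts disabled, while the cluster variants bracket the power-domain-wide transition. A sketch of how a platform idle path might use the per-CPU pair follows; platform_enter_idle() is a placeholder name, not an API from this patch.

#include <linux/cpu_pm.h>

/*
 * Illustrative sketch only: bracket a deep CPU idle state with the
 * CPU PM notifiers. Interrupts are assumed to be disabled already,
 * as the kernel-doc above requires.
 */
static void platform_enter_idle(void)
{
	if (cpu_pm_enter())
		return;		/* a notifier refused the transition */

	/* hypothetical: power down the CPU here, e.g. via firmware */

	cpu_pm_exit();		/* listeners restore VFP, GIC, timer state */
}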
diff --git a/kernel/cred.c b/kernel/cred.c index 430557ea488f..de728ac50d82 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -207,13 +207,6 @@ void exit_creds(struct task_struct *tsk) validate_creds(cred); alter_cred_subscribers(cred, -1); put_cred(cred); - - cred = (struct cred *) tsk->replacement_session_keyring; - if (cred) { - tsk->replacement_session_keyring = NULL; - validate_creds(cred); - put_cred(cred); - } } /** @@ -396,8 +389,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) struct cred *new; int ret; - p->replacement_session_keyring = NULL; - if ( #ifdef CONFIG_KEYS !p->cred->thread_keyring && diff --git a/kernel/exit.c b/kernel/exit.c index 910a0716e17a..34867cc5b42a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -884,9 +884,9 @@ static void check_stack_usage(void) spin_lock(&low_water_lock); if (free < lowest_to_date) { - printk(KERN_WARNING "%s used greatest stack depth: %lu bytes " - "left\n", - current->comm, free); + printk(KERN_WARNING "%s (%d) used greatest stack depth: " + "%lu bytes left\n", + current->comm, task_pid_nr(current), free); lowest_to_date = free; } spin_unlock(&low_water_lock); @@ -946,12 +946,13 @@ void do_exit(long code) exit_signals(tsk); /* sets PF_EXITING */ /* * tsk->flags are checked in the futex code to protect against - * an exiting task cleaning up the robust pi futexes. + * an exiting task cleaning up the robust pi futexes, and in + * task_work_add() to avoid the race with exit_task_work(). */ smp_mb(); raw_spin_unlock_wait(&tsk->pi_lock); - exit_irq_thread(); + exit_task_work(tsk); if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", @@ -1214,7 +1215,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) unsigned long state; int retval, status, traced; pid_t pid = task_pid_vnr(p); - uid_t uid = from_kuid_munged(current_user_ns(), __task_cred(p)->uid); + uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); struct siginfo __user *infop; if (!likely(wo->wo_flags & WEXITED)) diff --git a/kernel/fork.c b/kernel/fork.c index 47b4e4f379f9..ab5211b9e622 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -386,7 +386,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) } charge = 0; if (mpnt->vm_flags & VM_ACCOUNT) { - unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; + unsigned long len; + len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ goto fail_nomem; charge = len; @@ -614,7 +615,6 @@ void mmput(struct mm_struct *mm) list_del(&mm->mmlist); spin_unlock(&mmlist_lock); } - put_swap_token(mm); if (mm->binfmt) module_put(mm->binfmt->module); mmdrop(mm); @@ -787,9 +787,6 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) /* Get rid of any cached register state */ deactivate_mm(tsk, mm); - if (tsk->vfork_done) - complete_vfork_done(tsk); - /* * If we're exiting normally, clear a user-space tid field if * requested. We leave this alone when dying by signal, to leave @@ -810,6 +807,13 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) } tsk->clear_child_tid = NULL; } + + /* + * All done, finally we can wake up parent and return this mm to him. + * Also kthread_stop() uses this completion for synchronization. 
+ */ + if (tsk->vfork_done) + complete_vfork_done(tsk); } /* @@ -831,10 +835,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk) memcpy(mm, oldmm, sizeof(*mm)); mm_init_cpumask(mm); - /* Initializing for Swap token stuff */ - mm->token_priority = 0; - mm->last_interval = 0; - #ifdef CONFIG_TRANSPARENT_HUGEPAGE mm->pmd_huge_pte = NULL; #endif @@ -913,10 +913,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) goto fail_nomem; good_mm: - /* Initializing for Swap token stuff */ - mm->token_priority = 0; - mm->last_interval = 0; - tsk->mm = mm; tsk->active_mm = mm; return 0; @@ -984,9 +980,8 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) * Share io context with parent, if CLONE_IO is set */ if (clone_flags & CLONE_IO) { - tsk->io_context = ioc_task_link(ioc); - if (unlikely(!tsk->io_context)) - return -ENOMEM; + ioc_task_link(ioc); + tsk->io_context = ioc; } else if (ioprio_valid(ioc->ioprio)) { new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); if (unlikely(!new_ioc)) @@ -1420,6 +1415,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, */ p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); + INIT_HLIST_HEAD(&p->task_works); /* Now that the task is set up, run cgroup callbacks if * necessary. We need to run them before the task is visible diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index fc275e4f629b..eebd6d5cfb44 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -275,8 +275,10 @@ void handle_nested_irq(unsigned int irq) kstat_incr_irqs_this_cpu(irq, desc); action = desc->action; - if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) + if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; goto out_unlock; + } irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock_irq(&desc->lock); @@ -324,8 +326,10 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; goto out_unlock; + } handle_irq_event(desc); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 8e5c56b3b7d9..001fa5bab490 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -101,6 +101,9 @@ extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); extern void irq_set_thread_affinity(struct irq_desc *desc); +extern int irq_do_set_affinity(struct irq_data *data, + const struct cpumask *dest, bool force); + /* Inline functions for support of irq chips on slow busses */ static inline void chip_bus_lock(struct irq_desc *desc) { diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index bb32326afe87..8c548232ba39 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -7,6 +7,8 @@ * This file contains driver APIs to the irq subsystem. 
*/ +#define pr_fmt(fmt) "genirq: " fmt + #include <linux/irq.h> #include <linux/kthread.h> #include <linux/module.h> @@ -14,6 +16,7 @@ #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/task_work.h> #include "internals.h" @@ -139,6 +142,25 @@ static inline void irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } #endif +int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + struct irq_desc *desc = irq_data_to_desc(data); + struct irq_chip *chip = irq_data_get_irq_chip(data); + int ret; + + ret = chip->irq_set_affinity(data, mask, false); + switch (ret) { + case IRQ_SET_MASK_OK: + cpumask_copy(data->affinity, mask); + case IRQ_SET_MASK_OK_NOCOPY: + irq_set_thread_affinity(desc); + ret = 0; + } + + return ret; +} + int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) { struct irq_chip *chip = irq_data_get_irq_chip(data); @@ -149,14 +171,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) return -EINVAL; if (irq_can_move_pcntxt(data)) { - ret = chip->irq_set_affinity(data, mask, false); - switch (ret) { - case IRQ_SET_MASK_OK: - cpumask_copy(data->affinity, mask); - case IRQ_SET_MASK_OK_NOCOPY: - irq_set_thread_affinity(desc); - ret = 0; - } + ret = irq_do_set_affinity(data, mask, false); } else { irqd_set_move_pending(data); irq_copy_pending(desc, mask); @@ -280,9 +295,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); static int setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) { - struct irq_chip *chip = irq_desc_get_chip(desc); struct cpumask *set = irq_default_affinity; - int ret, node = desc->irq_data.node; + int node = desc->irq_data.node; /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!irq_can_set_affinity(irq)) @@ -308,13 +322,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) if (cpumask_intersects(mask, nodemask)) cpumask_and(mask, mask, nodemask); } - ret = chip->irq_set_affinity(&desc->irq_data, mask, false); - switch (ret) { - case IRQ_SET_MASK_OK: - cpumask_copy(desc->irq_data.affinity, mask); - case IRQ_SET_MASK_OK_NOCOPY: - irq_set_thread_affinity(desc); - } + irq_do_set_affinity(&desc->irq_data, mask, false); return 0; } #else @@ -565,7 +573,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, * IRQF_TRIGGER_* but the PIC does not support multiple * flow-types? */ - pr_debug("genirq: No set_type function for IRQ %d (%s)\n", irq, + pr_debug("No set_type function for IRQ %d (%s)\n", irq, chip ? (chip->name ? : "unknown") : "unknown"); return 0; } @@ -600,7 +608,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, ret = 0; break; default: - pr_err("genirq: Setting trigger mode %lu for irq %u failed (%pF)\n", + pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", flags, irq, chip->irq_set_type); } if (unmask) @@ -773,11 +781,39 @@ static void wake_threads_waitq(struct irq_desc *desc) wake_up(&desc->wait_for_threads); } +static void irq_thread_dtor(struct task_work *unused) +{ + struct task_struct *tsk = current; + struct irq_desc *desc; + struct irqaction *action; + + if (WARN_ON_ONCE(!(current->flags & PF_EXITING))) + return; + + action = kthread_data(tsk); + + pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", + tsk->comm ? 
tsk->comm : "", tsk->pid, action->irq); + + + desc = irq_to_desc(action->irq); + /* + * If IRQTF_RUNTHREAD is set, we need to decrement + * desc->threads_active and wake possible waiters. + */ + if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + wake_threads_waitq(desc); + + /* Prevent a stale desc->threads_oneshot */ + irq_finalize_oneshot(desc, action); +} + /* * Interrupt handler thread */ static int irq_thread(void *data) { + struct task_work on_exit_work; static const struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; @@ -793,7 +829,9 @@ static int irq_thread(void *data) handler_fn = irq_thread_fn; sched_setscheduler(current, SCHED_FIFO, ¶m); - current->irq_thread = 1; + + init_task_work(&on_exit_work, irq_thread_dtor, NULL); + task_work_add(current, &on_exit_work, false); while (!irq_wait_for_interrupt(action)) { irqreturn_t action_ret; @@ -815,44 +853,11 @@ static int irq_thread(void *data) * cannot touch the oneshot mask at this point anymore as * __setup_irq() might have given out currents thread_mask * again. - * - * Clear irq_thread. Otherwise exit_irq_thread() would make - * fuzz about an active irq thread going into nirvana. */ - current->irq_thread = 0; + task_work_cancel(current, irq_thread_dtor); return 0; } -/* - * Called from do_exit() - */ -void exit_irq_thread(void) -{ - struct task_struct *tsk = current; - struct irq_desc *desc; - struct irqaction *action; - - if (!tsk->irq_thread) - return; - - action = kthread_data(tsk); - - pr_err("genirq: exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", - tsk->comm ? tsk->comm : "", tsk->pid, action->irq); - - desc = irq_to_desc(action->irq); - - /* - * If IRQTF_RUNTHREAD is set, we need to decrement - * desc->threads_active and wake possible waiters. - */ - if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) - wake_threads_waitq(desc); - - /* Prevent a stale desc->threads_oneshot */ - irq_finalize_oneshot(desc, action); -} - static void irq_setup_forced_threading(struct irqaction *new) { if (!force_irqthreads) @@ -1044,7 +1049,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * has. The type flags are unreliable as the * underlying chip implementation can override them. */ - pr_err("genirq: Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n", + pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n", irq); ret = -EINVAL; goto out_mask; @@ -1095,7 +1100,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (nmsk != omsk) /* hope the handler works with current trigger mode */ - pr_warning("genirq: irq %d uses trigger mode %u; requested %u\n", + pr_warning("irq %d uses trigger mode %u; requested %u\n", irq, nmsk, omsk); } @@ -1133,7 +1138,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) mismatch: if (!(new->flags & IRQF_PROBE_SHARED)) { - pr_err("genirq: Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n", + pr_err("Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n", irq, new->flags, new->name, old->flags, old->name); #ifdef CONFIG_DEBUG_SHIRQ dump_stack(); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index c3c89751b327..ca3f4aaff707 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -42,17 +42,8 @@ void irq_move_masked_irq(struct irq_data *idata) * For correct operation this depends on the caller * masking the irqs. 
*/ - if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) - < nr_cpu_ids)) { - int ret = chip->irq_set_affinity(&desc->irq_data, - desc->pending_mask, false); - switch (ret) { - case IRQ_SET_MASK_OK: - cpumask_copy(desc->irq_data.affinity, desc->pending_mask); - case IRQ_SET_MASK_OK_NOCOPY: - irq_set_thread_affinity(desc); - } - } + if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) + irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false); cpumask_clear(desc->pending_mask); } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 079f1d39a8b8..2169feeba529 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -343,7 +343,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, /* Look up a kernel symbol and return it in a text buffer. */ static int __sprint_symbol(char *buffer, unsigned long address, - int symbol_offset) + int symbol_offset, int add_offset) { char *modname; const char *name; @@ -358,13 +358,13 @@ static int __sprint_symbol(char *buffer, unsigned long address, if (name != buffer) strcpy(buffer, name); len = strlen(buffer); - buffer += len; offset -= symbol_offset; + if (add_offset) + len += sprintf(buffer + len, "+%#lx/%#lx", offset, size); + if (modname) - len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname); - else - len += sprintf(buffer, "+%#lx/%#lx", offset, size); + len += sprintf(buffer + len, " [%s]", modname); return len; } @@ -382,12 +382,28 @@ static int __sprint_symbol(char *buffer, unsigned long address, */ int sprint_symbol(char *buffer, unsigned long address) { - return __sprint_symbol(buffer, address, 0); + return __sprint_symbol(buffer, address, 0, 1); } - EXPORT_SYMBOL_GPL(sprint_symbol); /** + * sprint_symbol_no_offset - Look up a kernel symbol and return it in a text buffer + * @buffer: buffer to be stored + * @address: address to lookup + * + * This function looks up a kernel symbol with @address and stores its name + * and module name to @buffer if possible. If no symbol was found, just saves + * its @address as is. + * + * This function returns the number of bytes stored in @buffer. + */ +int sprint_symbol_no_offset(char *buffer, unsigned long address) +{ + return __sprint_symbol(buffer, address, 0, 0); +} +EXPORT_SYMBOL_GPL(sprint_symbol_no_offset); + +/** * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer * @buffer: buffer to be stored * @address: address to lookup @@ -403,7 +419,7 @@ EXPORT_SYMBOL_GPL(sprint_symbol); */ int sprint_backtrace(char *buffer, unsigned long address) { - return __sprint_symbol(buffer, address, -1); + return __sprint_symbol(buffer, address, -1, 1); } /* Look up a kernel symbol and print it to the kernel messages. */ diff --git a/kernel/kcmp.c b/kernel/kcmp.c new file mode 100644 index 000000000000..30b7b225306c --- /dev/null +++ b/kernel/kcmp.c @@ -0,0 +1,196 @@ +#include <linux/kernel.h> +#include <linux/syscalls.h> +#include <linux/fdtable.h> +#include <linux/string.h> +#include <linux/random.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/cache.h> +#include <linux/bug.h> +#include <linux/err.h> +#include <linux/kcmp.h> + +#include <asm/unistd.h> + +/* + * We don't expose the real in-memory order of objects for security reasons. + * But still the comparison results should be suitable for sorting. So we + * obfuscate kernel pointers values and compare the production instead. + * + * The obfuscation is done in two steps. 
First we xor the kernel pointer with + * a random value, which puts pointer into a new position in a reordered space. + * Secondly we multiply the xor production with a large odd random number to + * permute its bits even more (the odd multiplier guarantees that the product + * is unique ever after the high bits are truncated, since any odd number is + * relative prime to 2^n). + * + * Note also that the obfuscation itself is invisible to userspace and if needed + * it can be changed to an alternate scheme. + */ +static unsigned long cookies[KCMP_TYPES][2] __read_mostly; + +static long kptr_obfuscate(long v, int type) +{ + return (v ^ cookies[type][0]) * cookies[type][1]; +} + +/* + * 0 - equal, i.e. v1 = v2 + * 1 - less than, i.e. v1 < v2 + * 2 - greater than, i.e. v1 > v2 + * 3 - not equal but ordering unavailable (reserved for future) + */ +static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type) +{ + long ret; + + ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type); + + return (ret < 0) | ((ret > 0) << 1); +} + +/* The caller must have pinned the task */ +static struct file * +get_file_raw_ptr(struct task_struct *task, unsigned int idx) +{ + struct file *file = NULL; + + task_lock(task); + rcu_read_lock(); + + if (task->files) + file = fcheck_files(task->files, idx); + + rcu_read_unlock(); + task_unlock(task); + + return file; +} + +static void kcmp_unlock(struct mutex *m1, struct mutex *m2) +{ + if (likely(m2 != m1)) + mutex_unlock(m2); + mutex_unlock(m1); +} + +static int kcmp_lock(struct mutex *m1, struct mutex *m2) +{ + int err; + + if (m2 > m1) + swap(m1, m2); + + err = mutex_lock_killable(m1); + if (!err && likely(m1 != m2)) { + err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING); + if (err) + mutex_unlock(m1); + } + + return err; +} + +SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, + unsigned long, idx1, unsigned long, idx2) +{ + struct task_struct *task1, *task2; + int ret; + + rcu_read_lock(); + + /* + * Tasks are looked up in caller's PID namespace only. + */ + task1 = find_task_by_vpid(pid1); + task2 = find_task_by_vpid(pid2); + if (!task1 || !task2) + goto err_no_task; + + get_task_struct(task1); + get_task_struct(task2); + + rcu_read_unlock(); + + /* + * One should have enough rights to inspect task details. 
+ */ + ret = kcmp_lock(&task1->signal->cred_guard_mutex, + &task2->signal->cred_guard_mutex); + if (ret) + goto err; + if (!ptrace_may_access(task1, PTRACE_MODE_READ) || + !ptrace_may_access(task2, PTRACE_MODE_READ)) { + ret = -EPERM; + goto err_unlock; + } + + switch (type) { + case KCMP_FILE: { + struct file *filp1, *filp2; + + filp1 = get_file_raw_ptr(task1, idx1); + filp2 = get_file_raw_ptr(task2, idx2); + + if (filp1 && filp2) + ret = kcmp_ptr(filp1, filp2, KCMP_FILE); + else + ret = -EBADF; + break; + } + case KCMP_VM: + ret = kcmp_ptr(task1->mm, task2->mm, KCMP_VM); + break; + case KCMP_FILES: + ret = kcmp_ptr(task1->files, task2->files, KCMP_FILES); + break; + case KCMP_FS: + ret = kcmp_ptr(task1->fs, task2->fs, KCMP_FS); + break; + case KCMP_SIGHAND: + ret = kcmp_ptr(task1->sighand, task2->sighand, KCMP_SIGHAND); + break; + case KCMP_IO: + ret = kcmp_ptr(task1->io_context, task2->io_context, KCMP_IO); + break; + case KCMP_SYSVSEM: +#ifdef CONFIG_SYSVIPC + ret = kcmp_ptr(task1->sysvsem.undo_list, + task2->sysvsem.undo_list, + KCMP_SYSVSEM); +#else + ret = -EOPNOTSUPP; +#endif + break; + default: + ret = -EINVAL; + break; + } + +err_unlock: + kcmp_unlock(&task1->signal->cred_guard_mutex, + &task2->signal->cred_guard_mutex); +err: + put_task_struct(task1); + put_task_struct(task2); + + return ret; + +err_no_task: + rcu_read_unlock(); + return -ESRCH; +} + +static __init int kcmp_cookies_init(void) +{ + int i; + + get_random_bytes(cookies, sizeof(cookies)); + + for (i = 0; i < KCMP_TYPES; i++) + cookies[i][1] |= (~(~0UL >> 1) | 1); + + return 0; +} +arch_initcall(kcmp_cookies_init); diff --git a/kernel/kmod.c b/kernel/kmod.c index 05698a7415fe..ff2c7cb86d77 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -221,13 +221,12 @@ fail: return 0; } -void call_usermodehelper_freeinfo(struct subprocess_info *info) +static void call_usermodehelper_freeinfo(struct subprocess_info *info) { if (info->cleanup) (*info->cleanup)(info); kfree(info); } -EXPORT_SYMBOL(call_usermodehelper_freeinfo); static void umh_complete(struct subprocess_info *sub_info) { @@ -410,7 +409,7 @@ EXPORT_SYMBOL_GPL(usermodehelper_read_unlock); /** * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. - * depth: New value to assign to usermodehelper_disabled. + * @depth: New value to assign to usermodehelper_disabled. * * Change the value of usermodehelper_disabled (under umhelper_sem locked for * writing) and wakeup tasks waiting for it to change. @@ -479,6 +478,7 @@ static void helper_unlock(void) * structure. This should be passed to call_usermodehelper_exec to * exec the process and free the structure. */ +static struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, char **envp, gfp_t gfp_mask) { @@ -494,7 +494,6 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, out: return sub_info; } -EXPORT_SYMBOL(call_usermodehelper_setup); /** * call_usermodehelper_setfns - set a cleanup/init function @@ -512,6 +511,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup); * Function must be runnable in either a process context or the * context in which call_usermodehelper_exec is called. 
*/ +static void call_usermodehelper_setfns(struct subprocess_info *info, int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *info), @@ -521,7 +521,6 @@ void call_usermodehelper_setfns(struct subprocess_info *info, info->init = init; info->data = data; } -EXPORT_SYMBOL(call_usermodehelper_setfns); /** * call_usermodehelper_exec - start a usermode application @@ -535,6 +534,7 @@ EXPORT_SYMBOL(call_usermodehelper_setfns); * asynchronously if wait is not set, and runs as a child of keventd. * (ie. it runs with full root capabilities). */ +static int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { DECLARE_COMPLETION_ONSTACK(done); @@ -576,7 +576,25 @@ unlock: helper_unlock(); return retval; } -EXPORT_SYMBOL(call_usermodehelper_exec); + +int call_usermodehelper_fns( + char *path, char **argv, char **envp, int wait, + int (*init)(struct subprocess_info *info, struct cred *new), + void (*cleanup)(struct subprocess_info *), void *data) +{ + struct subprocess_info *info; + gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; + + info = call_usermodehelper_setup(path, argv, envp, gfp_mask); + + if (info == NULL) + return -ENOMEM; + + call_usermodehelper_setfns(info, init, cleanup, data); + + return call_usermodehelper_exec(info, wait); +} +EXPORT_SYMBOL(call_usermodehelper_fns); static int proc_cap_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) diff --git a/kernel/lglock.c b/kernel/lglock.c new file mode 100644 index 000000000000..6535a667a5a7 --- /dev/null +++ b/kernel/lglock.c @@ -0,0 +1,89 @@ +/* See include/linux/lglock.h for description */ +#include <linux/module.h> +#include <linux/lglock.h> +#include <linux/cpu.h> +#include <linux/string.h> + +/* + * Note there is no uninit, so lglocks cannot be defined in + * modules (but it's fine to use them from there) + * Could be added though, just undo lg_lock_init + */ + +void lg_lock_init(struct lglock *lg, char *name) +{ + LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0); +} +EXPORT_SYMBOL(lg_lock_init); + +void lg_local_lock(struct lglock *lg) +{ + arch_spinlock_t *lock; + + preempt_disable(); + rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock = this_cpu_ptr(lg->lock); + arch_spin_lock(lock); +} +EXPORT_SYMBOL(lg_local_lock); + +void lg_local_unlock(struct lglock *lg) +{ + arch_spinlock_t *lock; + + rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock = this_cpu_ptr(lg->lock); + arch_spin_unlock(lock); + preempt_enable(); +} +EXPORT_SYMBOL(lg_local_unlock); + +void lg_local_lock_cpu(struct lglock *lg, int cpu) +{ + arch_spinlock_t *lock; + + preempt_disable(); + rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock = per_cpu_ptr(lg->lock, cpu); + arch_spin_lock(lock); +} +EXPORT_SYMBOL(lg_local_lock_cpu); + +void lg_local_unlock_cpu(struct lglock *lg, int cpu) +{ + arch_spinlock_t *lock; + + rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock = per_cpu_ptr(lg->lock, cpu); + arch_spin_unlock(lock); + preempt_enable(); +} +EXPORT_SYMBOL(lg_local_unlock_cpu); + +void lg_global_lock(struct lglock *lg) +{ + int i; + + preempt_disable(); + rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_); + for_each_possible_cpu(i) { + arch_spinlock_t *lock; + lock = per_cpu_ptr(lg->lock, i); + arch_spin_lock(lock); + } +} +EXPORT_SYMBOL(lg_global_lock); + +void lg_global_unlock(struct lglock *lg) +{ + int i; + + rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + for_each_possible_cpu(i) { + 
arch_spinlock_t *lock; + lock = per_cpu_ptr(lg->lock, i); + arch_spin_unlock(lock); + } + preempt_enable(); +} +EXPORT_SYMBOL(lg_global_unlock); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 57bc1fd35b3c..16b20e38c4a1 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -149,7 +149,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) { int nr; int rc; - struct task_struct *task; + struct task_struct *task, *me = current; + + /* Ignore SIGCHLD causing any terminated children to autoreap */ + spin_lock_irq(&me->sighand->siglock); + me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; + spin_unlock_irq(&me->sighand->siglock); /* * The last thread in the cgroup-init thread group is terminating. @@ -191,6 +196,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) return; } +#ifdef CONFIG_CHECKPOINT_RESTORE static int pid_ns_ctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -218,8 +224,8 @@ static struct ctl_table pid_ns_ctl_table[] = { }, { } }; - static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; +#endif /* CONFIG_CHECKPOINT_RESTORE */ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) { @@ -253,7 +259,10 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); + +#ifdef CONFIG_CHECKPOINT_RESTORE register_sysctl_paths(kern_path, pid_ns_ctl_table); +#endif return 0; } diff --git a/kernel/res_counter.c b/kernel/res_counter.c index bebe2b170d49..ad581aa2369a 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -94,13 +94,15 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) counter->usage -= val; } -void res_counter_uncharge(struct res_counter *counter, unsigned long val) +void res_counter_uncharge_until(struct res_counter *counter, + struct res_counter *top, + unsigned long val) { unsigned long flags; struct res_counter *c; local_irq_save(flags); - for (c = counter; c != NULL; c = c->parent) { + for (c = counter; c != top; c = c->parent) { spin_lock(&c->lock); res_counter_uncharge_locked(c, val); spin_unlock(&c->lock); @@ -108,6 +110,10 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val) local_irq_restore(flags); } +void res_counter_uncharge(struct res_counter *counter, unsigned long val) +{ + res_counter_uncharge_until(counter, NULL, val); +} static inline unsigned long long * res_counter_member(struct res_counter *counter, int member) diff --git a/kernel/resource.c b/kernel/resource.c index 7e8ea66a8c01..e1d2b8ee76d5 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -515,8 +515,8 @@ out: * @root: root resource descriptor * @new: resource descriptor desired by caller * @size: requested resource region size - * @min: minimum size to allocate - * @max: maximum size to allocate + * @min: minimum boundary to allocate + * @max: maximum boundary to allocate * @align: alignment requested, in bytes * @alignf: alignment function, optional, called if not NULL * @alignf_data: arbitrary data to pass to the @alignf function diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 39eb6011bc38..c46958e26121 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -142,9 +142,8 @@ const_debug unsigned int sysctl_sched_features = #define SCHED_FEAT(name, enabled) \ #name , -static __read_mostly char *sched_feat_names[] = { +static const char * const sched_feat_names[] = { #include 
"features.h" - NULL }; #undef SCHED_FEAT @@ -2517,25 +2516,32 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, sched_avg_update(this_rq); } +#ifdef CONFIG_NO_HZ +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we cannot use the delta approach from the regular tick since that + * would seriously skew the load calculation. However we'll make do for those + * updates happening while idle (nohz_idle_balance) or coming out of idle + * (tick_nohz_idle_exit). + * + * This means we might still be one tick off for nohz periods. + */ + /* * Called from nohz_idle_balance() to update the load ratings before doing the * idle balance. */ void update_idle_cpu_load(struct rq *this_rq) { - unsigned long curr_jiffies = jiffies; + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); unsigned long load = this_rq->load.weight; unsigned long pending_updates; /* - * Bloody broken means of dealing with nohz, but better than nothing.. - * jiffies is updated by one cpu, another cpu can drift wrt the jiffy - * update and see 0 difference the one time and 2 the next, even though - * we ticked at roughtly the same rate. - * - * Hence we only use this from nohz_idle_balance() and skip this - * nonsense when called from the scheduler_tick() since that's - * guaranteed a stable rate. + * bail if there's load or we're actually up-to-date. */ if (load || curr_jiffies == this_rq->last_load_update_tick) return; @@ -2547,12 +2553,38 @@ void update_idle_cpu_load(struct rq *this_rq) } /* + * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + */ +void update_cpu_load_nohz(void) +{ + struct rq *this_rq = this_rq(); + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long pending_updates; + + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + raw_spin_lock(&this_rq->lock); + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * We were idle, this means load 0, the current load might be + * !0 due to remote wakeups and the sort. + */ + __update_cpu_load(this_rq, 0, pending_updates); + } + raw_spin_unlock(&this_rq->lock); +} +#endif /* CONFIG_NO_HZ */ + +/* * Called from scheduler_tick() */ static void update_cpu_load_active(struct rq *this_rq) { /* - * See the mess in update_idle_cpu_load(). + * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). 
*/ this_rq->last_load_update_tick = jiffies; __update_cpu_load(this_rq, this_rq->load.weight, 1); @@ -4982,7 +5014,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) p->sched_class->set_cpus_allowed(p, new_mask); cpumask_copy(&p->cpus_allowed, new_mask); - p->rt.nr_cpus_allowed = cpumask_weight(new_mask); + p->nr_cpus_allowed = cpumask_weight(new_mask); } /* @@ -5997,11 +6029,14 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) cpumask_or(covered, covered, sg_span); - sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); + sg->sgp = *per_cpu_ptr(sdd->sgp, i); atomic_inc(&sg->sgp->ref); - if (cpumask_test_cpu(cpu, sg_span)) + if ((!groups && cpumask_test_cpu(cpu, sg_span)) || + cpumask_first(sg_span) == cpu) { + WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span)); groups = sg; + } if (!first) first = sg; @@ -6403,7 +6438,7 @@ static void sched_init_numa(void) return; for (j = 0; j < nr_node_ids; j++) { - struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j); + struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); if (!mask) return; @@ -6691,7 +6726,6 @@ static int init_sched_domains(const struct cpumask *cpu_map) if (!doms_cur) doms_cur = &fallback_doms; cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); - dattr_cur = NULL; err = build_sched_domains(doms_cur[0], NULL); register_sched_domain_sysctl(); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 940e6d17cf96..b2a2d236f27b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) int want_sd = 1; int sync = wake_flags & WF_SYNC; - if (p->rt.nr_cpus_allowed == 1) + if (p->nr_cpus_allowed == 1) return prev_cpu; if (sd_flag & SD_BALANCE_WAKE) { @@ -3503,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) unsigned long scale_rt_power(int cpu) { struct rq *rq = cpu_rq(cpu); - u64 total, available; + u64 total, available, age_stamp, avg; - total = sched_avg_period() + (rq->clock - rq->age_stamp); + /* + * Since we're reading these variables without serialization make sure + * we read them once before doing sanity checks on them. + */ + age_stamp = ACCESS_ONCE(rq->age_stamp); + avg = ACCESS_ONCE(rq->rt_avg); + + total = sched_avg_period() + (rq->clock - age_stamp); - if (unlikely(total < rq->rt_avg)) { + if (unlikely(total < avg)) { /* Ensures that power won't end up being negative */ available = 0; } else { - available = total - rq->rt_avg; + available = total - avg; } if (unlikely((s64)total < SCHED_POWER_SCALE)) @@ -3574,11 +3581,26 @@ void update_group_power(struct sched_domain *sd, int cpu) power = 0; - group = child->groups; - do { - power += group->sgp->power; - group = group->next; - } while (group != child->groups); + if (child->flags & SD_OVERLAP) { + /* + * SD_OVERLAP domains cannot assume that child groups + * span the current group. + */ + + for_each_cpu(cpu, sched_group_cpus(sdg)) + power += power_of(cpu); + } else { + /* + * !SD_OVERLAP domains can assume that child groups + * span the current group. 
+ */ + + group = child->groups; + do { + power += group->sgp->power; + group = group->next; + } while (group != child->groups); + } sdg->sgp->power = power; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index c5565c3c515f..2a4e8dffbd6b 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq) static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { + struct task_struct *p; + if (!rt_entity_is_task(rt_se)) return; + p = rt_task_of(rt_se); rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total++; - if (rt_se->nr_cpus_allowed > 1) + if (p->nr_cpus_allowed > 1) rt_rq->rt_nr_migratory++; update_rt_migration(rt_rq); @@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { + struct task_struct *p; + if (!rt_entity_is_task(rt_se)) return; + p = rt_task_of(rt_se); rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total--; - if (rt_se->nr_cpus_allowed > 1) + if (p->nr_cpus_allowed > 1) rt_rq->rt_nr_migratory--; update_rt_migration(rt_rq); @@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); - if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) + if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); inc_nr_running(rq); @@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) cpu = task_cpu(p); - if (p->rt.nr_cpus_allowed == 1) + if (p->nr_cpus_allowed == 1) goto out; /* For anything but wake ups, just return the task_cpu */ @@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) * will have to sort it out. 
*/ if (curr && unlikely(rt_task(curr)) && - (curr->rt.nr_cpus_allowed < 2 || + (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio) && - (p->rt.nr_cpus_allowed > 1)) { + (p->nr_cpus_allowed > 1)) { int target = find_lowest_rq(p); if (target != -1) @@ -1276,10 +1282,10 @@ out: static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - if (rq->curr->rt.nr_cpus_allowed == 1) + if (rq->curr->nr_cpus_allowed == 1) return; - if (p->rt.nr_cpus_allowed != 1 + if (p->nr_cpus_allowed != 1 && cpupri_find(&rq->rd->cpupri, p, NULL)) return; @@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) * The previous task needs to be made eligible for pushing * if it is still active */ - if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) + if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } @@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && - (p->rt.nr_cpus_allowed > 1)) + (p->nr_cpus_allowed > 1)) return 1; return 0; } @@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task) if (unlikely(!lowest_mask)) return -1; - if (task->rt.nr_cpus_allowed == 1) + if (task->nr_cpus_allowed == 1) return -1; /* No other targets possible */ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) @@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); - BUG_ON(p->rt.nr_cpus_allowed <= 1); + BUG_ON(p->nr_cpus_allowed <= 1); BUG_ON(!p->on_rq); BUG_ON(!rt_task(p)); @@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && has_pushable_tasks(rq) && - p->rt.nr_cpus_allowed > 1 && + p->nr_cpus_allowed > 1 && rt_task(rq->curr) && - (rq->curr->rt.nr_cpus_allowed < 2 || + (rq->curr->nr_cpus_allowed < 2 || rq->curr->prio <= p->prio)) push_rt_tasks(rq); } @@ -1817,7 +1823,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, * Only update if the process changes its state from whether it * can migrate or not. */ - if ((p->rt.nr_cpus_allowed > 1) == (weight > 1)) + if ((p->nr_cpus_allowed > 1) == (weight > 1)) return; rq = task_rq(p); @@ -1979,6 +1985,8 @@ static void watchdog(struct rq *rq, struct task_struct *p) static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) { + struct sched_rt_entity *rt_se = &p->rt; + update_curr_rt(rq); watchdog(rq, p); @@ -1996,12 +2004,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) p->rt.time_slice = RR_TIMESLICE; /* - * Requeue to the end of queue if we are not the only element - * on the queue: + * Requeue to the end of queue if we (and all of our ancestors) are the + * only element on the queue */ - if (p->rt.run_list.prev != p->rt.run_list.next) { - requeue_task_rt(rq, p, 0); - set_tsk_need_resched(p); + for_each_sched_rt_entity(rt_se) { + if (rt_se->run_list.prev != rt_se->run_list.next) { + requeue_task_rt(rq, p, 0); + set_tsk_need_resched(p); + return; + } } } diff --git a/kernel/signal.c b/kernel/signal.c index f7b418217633..677102789cf2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1656,19 +1656,18 @@ bool do_notify_parent(struct task_struct *tsk, int sig) info.si_signo = sig; info.si_errno = 0; /* - * we are under tasklist_lock here so our parent is tied to - * us and cannot exit and release its namespace. 
+ * We are under tasklist_lock here so our parent is tied to + * us and cannot change. * - * the only it can is to switch its nsproxy with sys_unshare, - * bu uncharing pid namespaces is not allowed, so we'll always - * see relevant namespace + * task_active_pid_ns will always return the same pid namespace + * until a task passes through release_task. * * write_lock() currently calls preempt_disable() which is the * same as rcu_read_lock(), but according to Oleg, this is not * correct to rely on this */ rcu_read_lock(); - info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); + info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(tsk->parent)); info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns), task_uid(tsk)); rcu_read_unlock(); @@ -2369,24 +2368,34 @@ relock: } /** - * block_sigmask - add @ka's signal mask to current->blocked - * @ka: action for @signr - * @signr: signal that has been successfully delivered + * signal_delivered - + * @sig: number of signal being delivered + * @info: siginfo_t of signal being delivered + * @ka: sigaction setting that chose the handler + * @regs: user register state + * @stepping: nonzero if debugger single-step or block-step in use * * This function should be called when a signal has succesfully been - * delivered. It adds the mask of signals for @ka to current->blocked - * so that they are blocked during the execution of the signal - * handler. In addition, @signr will be blocked unless %SA_NODEFER is - * set in @ka->sa.sa_flags. + * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask + * is always blocked, and the signal itself is blocked unless %SA_NODEFER + * is set in @ka->sa.sa_flags. Tracing is notified. */ -void block_sigmask(struct k_sigaction *ka, int signr) +void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, + struct pt_regs *regs, int stepping) { sigset_t blocked; + /* A signal was successfully delivered, and the + saved sigmask was stored on the signal frame, + and will be restored by sigreturn. So we can + simply clear the restore sigmask flag. */ + clear_restore_sigmask(); + sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(&blocked, signr); + sigaddset(&blocked, sig); set_current_blocked(&blocked); + tracehook_signal_handler(sig, info, ka, regs, stepping); } /* @@ -2519,7 +2528,16 @@ static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) * It is wrong to change ->blocked directly, this helper should be used * to ensure the process can't miss a shared signal we are going to block. 
*/ -void set_current_blocked(const sigset_t *newset) +void set_current_blocked(sigset_t *newset) +{ + struct task_struct *tsk = current; + sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP)); + spin_lock_irq(&tsk->sighand->siglock); + __set_task_blocked(tsk, newset); + spin_unlock_irq(&tsk->sighand->siglock); +} + +void __set_current_blocked(const sigset_t *newset) { struct task_struct *tsk = current; @@ -2559,7 +2577,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) return -EINVAL; } - set_current_blocked(&newset); + __set_current_blocked(&newset); return 0; } @@ -3133,7 +3151,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, return -EINVAL; } - set_current_blocked(&new_blocked); + __set_current_blocked(&new_blocked); } if (oset) { @@ -3197,7 +3215,6 @@ SYSCALL_DEFINE1(ssetmask, int, newmask) int old = current->blocked.sig[0]; sigset_t newset; - siginitset(&newset, newmask & ~(sigmask(SIGKILL) | sigmask(SIGSTOP))); set_current_blocked(&newset); return old; @@ -3236,11 +3253,8 @@ SYSCALL_DEFINE0(pause) #endif -#ifdef HAVE_SET_RESTORE_SIGMASK int sigsuspend(sigset_t *set) { - sigdelsetmask(set, sigmask(SIGKILL)|sigmask(SIGSTOP)); - current->saved_sigmask = current->blocked; set_current_blocked(set); @@ -3249,7 +3263,6 @@ int sigsuspend(sigset_t *set) set_restore_sigmask(); return -ERESTARTNOHAND; } -#endif #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND /** diff --git a/kernel/smpboot.c b/kernel/smpboot.c index e1a797e028a3..98f60c5caa1b 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -31,6 +31,12 @@ void __init idle_thread_set_boot_cpu(void) per_cpu(idle_threads, smp_processor_id()) = current; } +/** + * idle_init - Initialize the idle thread for a cpu + * @cpu: The cpu for which the idle thread should be initialized + * + * Creates the thread if it does not exist. + */ static inline void idle_init(unsigned int cpu) { struct task_struct *tsk = per_cpu(idle_threads, cpu); @@ -45,17 +51,16 @@ static inline void idle_init(unsigned int cpu) } /** - * idle_thread_init - Initialize the idle thread for a cpu - * @cpu: The cpu for which the idle thread should be initialized - * - * Creates the thread if it does not exist. 
+ * idle_threads_init - Initialize idle threads for all cpus */ void __init idle_threads_init(void) { - unsigned int cpu; + unsigned int cpu, boot_cpu; + + boot_cpu = smp_processor_id(); for_each_possible_cpu(cpu) { - if (cpu != smp_processor_id()) + if (cpu != boot_cpu) idle_init(cpu); } } diff --git a/kernel/sys.c b/kernel/sys.c index 6df42624e454..9ff89cb9657a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -36,6 +36,8 @@ #include <linux/personality.h> #include <linux/ptrace.h> #include <linux/fs_struct.h> +#include <linux/file.h> +#include <linux/mount.h> #include <linux/gfp.h> #include <linux/syscore_ops.h> #include <linux/version.h> @@ -1378,8 +1380,8 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) memcpy(u->nodename, tmp, len); memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; + uts_proc_notify(UTS_PROC_HOSTNAME); } - uts_proc_notify(UTS_PROC_HOSTNAME); up_write(&uts_sem); return errno; } @@ -1429,8 +1431,8 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) memcpy(u->domainname, tmp, len); memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; + uts_proc_notify(UTS_PROC_DOMAINNAME); } - uts_proc_notify(UTS_PROC_DOMAINNAME); up_write(&uts_sem); return errno; } @@ -1784,77 +1786,102 @@ SYSCALL_DEFINE1(umask, int, mask) } #ifdef CONFIG_CHECKPOINT_RESTORE +static bool vma_flags_mismatch(struct vm_area_struct *vma, + unsigned long required, + unsigned long banned) +{ + return (vma->vm_flags & required) != required || + (vma->vm_flags & banned); +} + +static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) +{ + struct file *exe_file; + struct dentry *dentry; + int err; + + /* + * Setting new mm::exe_file is only allowed when no VM_EXECUTABLE vma's + * remain. So perform a quick test first. + */ + if (mm->num_exe_file_vmas) + return -EBUSY; + + exe_file = fget(fd); + if (!exe_file) + return -EBADF; + + dentry = exe_file->f_path.dentry; + + /* + * Because the original mm->exe_file points to executable file, make + * sure that this one is executable as well, to avoid breaking an + * overall picture. + */ + err = -EACCES; + if (!S_ISREG(dentry->d_inode->i_mode) || + exe_file->f_path.mnt->mnt_flags & MNT_NOEXEC) + goto exit; + + err = inode_permission(dentry->d_inode, MAY_EXEC); + if (err) + goto exit; + + /* + * The symlink can be changed only once, just to disallow arbitrary + * transitions malicious software might bring in. This means one + * could make a snapshot over all processes running and monitor + * /proc/pid/exe changes to notice unusual activity if needed. 
+ */ + down_write(&mm->mmap_sem); + if (likely(!mm->exe_file)) + set_mm_exe_file(mm, exe_file); + else + err = -EBUSY; + up_write(&mm->mmap_sem); + +exit: + fput(exe_file); + return err; +} + static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { unsigned long rlim = rlimit(RLIMIT_DATA); - unsigned long vm_req_flags; - unsigned long vm_bad_flags; - struct vm_area_struct *vma; - int error = 0; struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int error; - if (arg4 | arg5) + if (arg5 || (arg4 && opt != PR_SET_MM_AUXV)) return -EINVAL; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; + if (opt == PR_SET_MM_EXE_FILE) + return prctl_set_mm_exe_file(mm, (unsigned int)addr); + if (addr >= TASK_SIZE) return -EINVAL; + error = -EINVAL; + down_read(&mm->mmap_sem); vma = find_vma(mm, addr); - if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) { - /* It must be existing VMA */ - if (!vma || vma->vm_start > addr) - goto out; - } - - error = -EINVAL; switch (opt) { case PR_SET_MM_START_CODE: + mm->start_code = addr; + break; case PR_SET_MM_END_CODE: - vm_req_flags = VM_READ | VM_EXEC; - vm_bad_flags = VM_WRITE | VM_MAYSHARE; - - if ((vma->vm_flags & vm_req_flags) != vm_req_flags || - (vma->vm_flags & vm_bad_flags)) - goto out; - - if (opt == PR_SET_MM_START_CODE) - mm->start_code = addr; - else - mm->end_code = addr; + mm->end_code = addr; break; - case PR_SET_MM_START_DATA: - case PR_SET_MM_END_DATA: - vm_req_flags = VM_READ | VM_WRITE; - vm_bad_flags = VM_EXEC | VM_MAYSHARE; - - if ((vma->vm_flags & vm_req_flags) != vm_req_flags || - (vma->vm_flags & vm_bad_flags)) - goto out; - - if (opt == PR_SET_MM_START_DATA) - mm->start_data = addr; - else - mm->end_data = addr; + mm->start_data = addr; break; - - case PR_SET_MM_START_STACK: - -#ifdef CONFIG_STACK_GROWSUP - vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP; -#else - vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN; -#endif - if ((vma->vm_flags & vm_req_flags) != vm_req_flags) - goto out; - - mm->start_stack = addr; + case PR_SET_MM_END_DATA: + mm->end_data = addr; break; case PR_SET_MM_START_BRK: @@ -1881,16 +1908,77 @@ static int prctl_set_mm(int opt, unsigned long addr, mm->brk = addr; break; + /* + * If command line arguments and environment + * are placed somewhere else on stack, we can + * set them up here, ARG_START/END to setup + * command line argumets and ENV_START/END + * for environment. + */ + case PR_SET_MM_START_STACK: + case PR_SET_MM_ARG_START: + case PR_SET_MM_ARG_END: + case PR_SET_MM_ENV_START: + case PR_SET_MM_ENV_END: + if (!vma) { + error = -EFAULT; + goto out; + } +#ifdef CONFIG_STACK_GROWSUP + if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSUP, 0)) +#else + if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSDOWN, 0)) +#endif + goto out; + if (opt == PR_SET_MM_START_STACK) + mm->start_stack = addr; + else if (opt == PR_SET_MM_ARG_START) + mm->arg_start = addr; + else if (opt == PR_SET_MM_ARG_END) + mm->arg_end = addr; + else if (opt == PR_SET_MM_ENV_START) + mm->env_start = addr; + else if (opt == PR_SET_MM_ENV_END) + mm->env_end = addr; + break; + + /* + * This doesn't move auxiliary vector itself + * since it's pinned to mm_struct, but allow + * to fill vector with new values. It's up + * to a caller to provide sane values here + * otherwise user space tools which use this + * vector might be unhappy. 
+ */ + case PR_SET_MM_AUXV: { + unsigned long user_auxv[AT_VECTOR_SIZE]; + + if (arg4 > sizeof(user_auxv)) + goto out; + up_read(&mm->mmap_sem); + + if (copy_from_user(user_auxv, (const void __user *)addr, arg4)) + return -EFAULT; + + /* Make sure the last entry is always AT_NULL */ + user_auxv[AT_VECTOR_SIZE - 2] = 0; + user_auxv[AT_VECTOR_SIZE - 1] = 0; + + BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); + + task_lock(current); + memcpy(mm->saved_auxv, user_auxv, arg4); + task_unlock(current); + + return 0; + } default: - error = -EINVAL; goto out; } error = 0; - out: up_read(&mm->mmap_sem); - return error; } #else /* CONFIG_CHECKPOINT_RESTORE */ @@ -2114,7 +2202,6 @@ int orderly_poweroff(bool force) NULL }; int ret = -ENOMEM; - struct subprocess_info *info; if (argv == NULL) { printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", @@ -2122,18 +2209,16 @@ int orderly_poweroff(bool force) goto out; } - info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC); - if (info == NULL) { - argv_free(argv); - goto out; - } - - call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL); + ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, + NULL, argv_cleanup, NULL); +out: + if (likely(!ret)) + return 0; - ret = call_usermodehelper_exec(info, UMH_NO_WAIT); + if (ret == -ENOMEM) + argv_free(argv); - out: - if (ret && force) { + if (force) { printk(KERN_WARNING "Failed to start orderly shutdown: " "forcing the issue\n"); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 47bfa16430d7..dbff751e4086 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -203,3 +203,6 @@ cond_syscall(sys_fanotify_mark); cond_syscall(sys_name_to_handle_at); cond_syscall(sys_open_by_handle_at); cond_syscall(compat_sys_open_by_handle_at); + +/* compare kernel pointers */ +cond_syscall(sys_kcmp); diff --git a/kernel/task_work.c b/kernel/task_work.c new file mode 100644 index 000000000000..82d1c794066d --- /dev/null +++ b/kernel/task_work.c @@ -0,0 +1,84 @@ +#include <linux/spinlock.h> +#include <linux/task_work.h> +#include <linux/tracehook.h> + +int +task_work_add(struct task_struct *task, struct task_work *twork, bool notify) +{ + unsigned long flags; + int err = -ESRCH; + +#ifndef TIF_NOTIFY_RESUME + if (notify) + return -ENOTSUPP; +#endif + /* + * We must not insert the new work if the task has already passed + * exit_task_work(). We rely on do_exit()->raw_spin_unlock_wait() + * and check PF_EXITING under pi_lock. + */ + raw_spin_lock_irqsave(&task->pi_lock, flags); + if (likely(!(task->flags & PF_EXITING))) { + hlist_add_head(&twork->hlist, &task->task_works); + err = 0; + } + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). 
*/ + if (likely(!err) && notify) + set_notify_resume(task); + return err; +} + +struct task_work * +task_work_cancel(struct task_struct *task, task_work_func_t func) +{ + unsigned long flags; + struct task_work *twork; + struct hlist_node *pos; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + hlist_for_each_entry(twork, pos, &task->task_works, hlist) { + if (twork->func == func) { + hlist_del(&twork->hlist); + goto found; + } + } + twork = NULL; + found: + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + return twork; +} + +void task_work_run(void) +{ + struct task_struct *task = current; + struct hlist_head task_works; + struct hlist_node *pos; + + raw_spin_lock_irq(&task->pi_lock); + hlist_move_list(&task->task_works, &task_works); + raw_spin_unlock_irq(&task->pi_lock); + + if (unlikely(hlist_empty(&task_works))) + return; + /* + * We use hlist to save the space in task_struct, but we want fifo. + * Find the last entry, the list should be short, then process them + * in reverse order. + */ + for (pos = task_works.first; pos->next; pos = pos->next) + ; + + for (;;) { + struct hlist_node **pprev = pos->pprev; + struct task_work *twork = container_of(pos, struct task_work, + hlist); + twork->func(twork); + + if (pprev == &task_works.first) + break; + pos = container_of(pprev, struct hlist_node, next); + } +} diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 9cd928f7a7c6..7e1ce012a851 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -297,8 +297,7 @@ void clockevents_register_device(struct clock_event_device *dev) } EXPORT_SYMBOL_GPL(clockevents_register_device); -static void clockevents_config(struct clock_event_device *dev, - u32 freq) +void clockevents_config(struct clock_event_device *dev, u32 freq) { u64 sec; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6a3a5b9ff561..da70c6db496c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -576,6 +576,7 @@ void tick_nohz_idle_exit(void) /* Update jiffies first */ select_nohz_load_balancer(0); tick_do_update_jiffies64(now); + update_cpu_load_nohz(); #ifndef CONFIG_VIRT_CPU_ACCOUNTING /* @@ -814,6 +815,16 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) return HRTIMER_RESTART; } +static int sched_skew_tick; + +static int __init skew_tick(char *str) +{ + get_option(&str, &sched_skew_tick); + + return 0; +} +early_param("skew_tick", skew_tick); + /** * tick_setup_sched_timer - setup the tick emulation timer */ @@ -831,6 +842,14 @@ void tick_setup_sched_timer(void) /* Get the next period (per cpu) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); + /* Offset the tick to avert xtime_lock contention. */ + if (sched_skew_tick) { + u64 offset = ktime_to_ns(tick_period) >> 1; + do_div(offset, num_possible_cpus()); + offset *= smp_processor_id(); + hrtimer_add_expires_ns(&ts->sched_timer, offset); + } + for (;;) { hrtimer_forward(&ts->sched_timer, now, tick_period); hrtimer_start_expires(&ts->sched_timer, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6420cda62336..1d0f6a8a0e5e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1486,6 +1486,11 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, if (!buffer) return size; + /* Make sure the requested buffer exists */ + if (cpu_id != RING_BUFFER_ALL_CPUS && + !cpumask_test_cpu(cpu_id, buffer->cpumask)) + return size; + size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); size *= BUF_PAGE_SIZE; |
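
For reference, a minimal userspace sketch of how the new PR_SET_MM options wired up in kernel/sys.c above might be driven by a checkpoint/restore tool. This is not part of the patch; it assumes the PR_SET_MM_* constants from this series are visible through <sys/prctl.h>/<linux/prctl.h>, and the restore_mm_state() helper and its arguments are made up for illustration. Error handling is trimmed to the essentials.

#include <sys/prctl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <elf.h>
#include <stdio.h>

static int restore_mm_state(const char *exe_path,
			    unsigned long arg_start, unsigned long arg_end,
			    Elf64_auxv_t *auxv, size_t auxv_bytes)
{
	int fd = open(exe_path, O_RDONLY);
	if (fd < 0)
		return -1;

	/*
	 * Needs CAP_SYS_RESOURCE; the exe symlink may be set only once per
	 * mm and only while no VM_EXECUTABLE vmas remain (-EBUSY otherwise).
	 */
	if (prctl(PR_SET_MM, PR_SET_MM_EXE_FILE, fd, 0, 0))
		perror("PR_SET_MM_EXE_FILE");
	close(fd);

	/* These addresses are validated against the stack vma's flags. */
	prctl(PR_SET_MM, PR_SET_MM_ARG_START, arg_start, 0, 0);
	prctl(PR_SET_MM, PR_SET_MM_ARG_END, arg_end, 0, 0);

	/*
	 * arg4 carries the vector size in bytes (at most saved_auxv's size);
	 * the kernel re-terminates the copy with AT_NULL itself.
	 */
	if (prctl(PR_SET_MM, PR_SET_MM_AUXV, (unsigned long)auxv, auxv_bytes, 0))
		perror("PR_SET_MM_AUXV");

	return 0;
}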
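
The new kernel/task_work.c above only shows the implementation; the following is a hypothetical in-kernel caller sketching how the API is meant to be used. Everything named my_* is invented for illustration, and only task_work_add(), the callback signature, and the struct task_work fields visible in the patch (hlist, func) are taken from the series; the embedding-plus-container_of pattern is an assumption about typical usage, not something this patch mandates.

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/task_work.h>

struct my_deferred {
	struct task_work twork;	/* must stay alive until the callback runs */
	int payload;
};

/* Runs in the target task's own context, on its way back to user mode. */
static void my_deferred_func(struct task_work *twork)
{
	struct my_deferred *d = container_of(twork, struct my_deferred, twork);

	pr_info("deferred work in %s, payload=%d\n", current->comm, d->payload);
	kfree(d);
}

static int my_queue_deferred(struct task_struct *task, int payload)
{
	struct my_deferred *d = kzalloc(sizeof(*d), GFP_KERNEL);

	if (!d)
		return -ENOMEM;
	d->twork.func = my_deferred_func;
	d->payload = payload;

	/* notify=true sets TIF_NOTIFY_RESUME so the work runs promptly. */
	if (task_work_add(task, &d->twork, true)) {
		kfree(d);	/* task has already passed exit_task_work() */
		return -ESRCH;
	}
	return 0;
}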
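
The skew_tick= handling added to kernel/time/tick-sched.c offsets each CPU's sched tick by cpu_id * (tick_period / 2 / num_possible_cpus) so the per-CPU tick timers do not all expire on the same edge and pile up on xtime_lock. The snippet below is a standalone userspace illustration of that arithmetic only, using assumed example values (HZ=1000, 4 possible CPUs); it is not code from the patch.

#include <stdio.h>

int main(void)
{
	unsigned long long tick_period_ns = 1000000ULL;	/* HZ == 1000 */
	unsigned int num_possible_cpus = 4;
	unsigned int cpu;

	for (cpu = 0; cpu < num_possible_cpus; cpu++) {
		/* Same steps as the kernel code: half a period, divided
		 * across the possible CPUs, scaled by the CPU number. */
		unsigned long long offset = tick_period_ns >> 1;

		offset /= num_possible_cpus;
		offset *= cpu;
		printf("cpu%u: first tick skewed by %llu ns\n", cpu, offset);
	}
	return 0;	/* prints 0, 125000, 250000, 375000 ns */
}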