summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks16
-rw-r--r--kernel/Makefile33
-rw-r--r--kernel/acct.c498
-rw-r--r--kernel/audit.c111
-rw-r--r--kernel/audit.h6
-rw-r--r--kernel/auditfilter.c37
-rw-r--r--kernel/auditsc.c160
-rw-r--r--kernel/backtracetest.c18
-rw-r--r--kernel/bounds.c2
-rw-r--r--kernel/bpf/Makefile1
-rw-r--r--kernel/bpf/core.c534
-rw-r--r--kernel/capability.c59
-rw-r--r--kernel/cgroup.c5130
-rw-r--r--kernel/cgroup_freezer.c156
-rw-r--r--kernel/compat.c244
-rw-r--r--kernel/configs/tiny.config4
-rw-r--r--kernel/context_tracking.c5
-rw-r--r--kernel/cpu.c111
-rw-r--r--kernel/cpu/Makefile1
-rw-r--r--kernel/cpu/idle.c144
-rw-r--r--kernel/cpuset.c831
-rw-r--r--kernel/debug/debug_core.c20
-rw-r--r--kernel/debug/kdb/kdb_bt.c2
-rw-r--r--kernel/debug/kdb/kdb_io.c2
-rw-r--r--kernel/debug/kdb/kdb_main.c4
-rw-r--r--kernel/delayacct.c62
-rw-r--r--kernel/events/core.c505
-rw-r--r--kernel/events/uprobes.c128
-rw-r--r--kernel/exec_domain.c14
-rw-r--r--kernel/exit.c223
-rw-r--r--kernel/extable.c2
-rw-r--r--kernel/fork.c183
-rw-r--r--kernel/futex.c635
-rw-r--r--kernel/futex_compat.c2
-rw-r--r--kernel/gcov/base.c6
-rw-r--r--kernel/gcov/fs.c3
-rw-r--r--kernel/gcov/gcc_4_7.c5
-rw-r--r--kernel/groups.c14
-rw-r--r--kernel/hung_task.c7
-rw-r--r--kernel/irq/Kconfig9
-rw-r--r--kernel/irq/chip.c54
-rw-r--r--kernel/irq/generic-chip.c5
-rw-r--r--kernel/irq/handle.c5
-rw-r--r--kernel/irq/internals.h17
-rw-r--r--kernel/irq/irqdesc.c101
-rw-r--r--kernel/irq/irqdomain.c8
-rw-r--r--kernel/irq/manage.c150
-rw-r--r--kernel/irq/proc.c8
-rw-r--r--kernel/irq/spurious.c106
-rw-r--r--kernel/irq_work.c116
-rw-r--r--kernel/kallsyms.c13
-rw-r--r--kernel/kcmp.c7
-rw-r--r--kernel/kexec.c1395
-rw-r--r--kernel/kmod.c7
-rw-r--r--kernel/kprobes.c409
-rw-r--r--kernel/ksysfs.c12
-rw-r--r--kernel/kthread.c10
-rw-r--r--kernel/latencytop.c5
-rw-r--r--kernel/locking/Makefile5
-rw-r--r--kernel/locking/lockdep.c25
-rw-r--r--kernel/locking/lockdep_internals.h6
-rw-r--r--kernel/locking/locktorture.c454
-rw-r--r--kernel/locking/mcs_spinlock.c208
-rw-r--r--kernel/locking/mcs_spinlock.h130
-rw-r--r--kernel/locking/mutex-debug.c23
-rw-r--r--kernel/locking/mutex.c143
-rw-r--r--kernel/locking/qrwlock.c132
-rw-r--r--kernel/locking/rtmutex-debug.c5
-rw-r--r--kernel/locking/rtmutex-debug.h12
-rw-r--r--kernel/locking/rtmutex.c823
-rw-r--r--kernel/locking/rtmutex.h12
-rw-r--r--kernel/locking/rtmutex_common.h22
-rw-r--r--kernel/locking/rwsem-spinlock.c28
-rw-r--r--kernel/locking/rwsem-xadd.c278
-rw-r--r--kernel/locking/rwsem.c31
-rw-r--r--kernel/module.c93
-rw-r--r--kernel/notifier.c24
-rw-r--r--kernel/nsproxy.c15
-rw-r--r--kernel/panic.c43
-rw-r--r--kernel/params.c43
-rw-r--r--kernel/pid_namespace.c4
-rw-r--r--kernel/power/Kconfig6
-rw-r--r--kernel/power/hibernate.c95
-rw-r--r--kernel/power/main.c44
-rw-r--r--kernel/power/power.h10
-rw-r--r--kernel/power/process.c4
-rw-r--r--kernel/power/qos.c18
-rw-r--r--kernel/power/snapshot.c472
-rw-r--r--kernel/power/suspend.c231
-rw-r--r--kernel/power/suspend_test.c39
-rw-r--r--kernel/power/swap.c4
-rw-r--r--kernel/power/user.c3
-rw-r--r--kernel/power/wakelock.c2
-rw-r--r--kernel/printk/printk.c504
-rw-r--r--kernel/profile.c42
-rw-r--r--kernel/ptrace.c12
-rw-r--r--kernel/rcu/Makefile2
-rw-r--r--kernel/rcu/rcu.h15
-rw-r--r--kernel/rcu/rcutorture.c (renamed from kernel/rcu/torture.c)1217
-rw-r--r--kernel/rcu/srcu.c15
-rw-r--r--kernel/rcu/tiny.c8
-rw-r--r--kernel/rcu/tiny_plugin.h12
-rw-r--r--kernel/rcu/tree.c596
-rw-r--r--kernel/rcu/tree.h57
-rw-r--r--kernel/rcu/tree_plugin.h457
-rw-r--r--kernel/rcu/tree_trace.c6
-rw-r--r--kernel/rcu/update.c24
-rw-r--r--kernel/reboot.c21
-rw-r--r--kernel/relay.c6
-rw-r--r--kernel/res_counter.c30
-rw-r--r--kernel/resource.c119
-rw-r--r--kernel/sched/Makefile2
-rw-r--r--kernel/sched/auto_group.c2
-rw-r--r--kernel/sched/clock.c3
-rw-r--r--kernel/sched/core.c1001
-rw-r--r--kernel/sched/cpuacct.c10
-rw-r--r--kernel/sched/cpudeadline.c37
-rw-r--r--kernel/sched/cpudeadline.h6
-rw-r--r--kernel/sched/cpupri.c16
-rw-r--r--kernel/sched/cpupri.h2
-rw-r--r--kernel/sched/cputime.c52
-rw-r--r--kernel/sched/deadline.c113
-rw-r--r--kernel/sched/debug.c12
-rw-r--r--kernel/sched/fair.c1423
-rw-r--r--kernel/sched/features.h8
-rw-r--r--kernel/sched/idle.c269
-rw-r--r--kernel/sched/idle_task.c27
-rw-r--r--kernel/sched/proc.c7
-rw-r--r--kernel/sched/rt.c233
-rw-r--r--kernel/sched/sched.h144
-rw-r--r--kernel/sched/stats.c2
-rw-r--r--kernel/sched/stop_task.c19
-rw-r--r--kernel/sched/wait.c32
-rw-r--r--kernel/seccomp.c632
-rw-r--r--kernel/signal.c151
-rw-r--r--kernel/smp.c215
-rw-r--r--kernel/softirq.c14
-rw-r--r--kernel/stop_machine.c1
-rw-r--r--kernel/sys.c33
-rw-r--r--kernel/sys_ni.c12
-rw-r--r--kernel/sysctl.c163
-rw-r--r--kernel/sysctl_binary.c1
-rw-r--r--kernel/system_keyring.c1
-rw-r--r--kernel/test_kprobes.c87
-rw-r--r--kernel/time/Kconfig11
-rw-r--r--kernel/time/Makefile24
-rw-r--r--kernel/time/alarmtimer.c56
-rw-r--r--kernel/time/clockevents.c50
-rw-r--r--kernel/time/clocksource.c12
-rw-r--r--kernel/time/hrtimer.c (renamed from kernel/hrtimer.c)171
-rw-r--r--kernel/time/itimer.c (renamed from kernel/itimer.c)0
-rw-r--r--kernel/time/ntp.c52
-rw-r--r--kernel/time/ntp_internal.h2
-rw-r--r--kernel/time/posix-cpu-timers.c (renamed from kernel/posix-cpu-timers.c)0
-rw-r--r--kernel/time/posix-timers.c (renamed from kernel/posix-timers.c)2
-rw-r--r--kernel/time/sched_clock.c17
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c106
-rw-r--r--kernel/time/tick-broadcast.c85
-rw-r--r--kernel/time/tick-common.c18
-rw-r--r--kernel/time/tick-internal.h13
-rw-r--r--kernel/time/tick-sched.c37
-rw-r--r--kernel/time/time.c (renamed from kernel/time.c)118
-rw-r--r--kernel/time/timeconst.bc (renamed from kernel/timeconst.bc)0
-rw-r--r--kernel/time/timekeeping.c1162
-rw-r--r--kernel/time/timekeeping.h20
-rw-r--r--kernel/time/timekeeping_debug.c4
-rw-r--r--kernel/time/timekeeping_internal.h17
-rw-r--r--kernel/time/timer.c (renamed from kernel/timer.c)79
-rw-r--r--kernel/time/udelay_test.c168
-rw-r--r--kernel/torture.c733
-rw-r--r--kernel/trace/Kconfig36
-rw-r--r--kernel/trace/Makefile4
-rw-r--r--kernel/trace/blktrace.c23
-rw-r--r--kernel/trace/ftrace.c1053
-rw-r--r--kernel/trace/ring_buffer.c101
-rw-r--r--kernel/trace/ring_buffer_benchmark.c6
-rw-r--r--kernel/trace/trace.c805
-rw-r--r--kernel/trace/trace.h89
-rw-r--r--kernel/trace/trace_benchmark.c198
-rw-r--r--kernel/trace/trace_benchmark.h41
-rw-r--r--kernel/trace/trace_clock.c9
-rw-r--r--kernel/trace/trace_event_perf.c39
-rw-r--r--kernel/trace/trace_events.c153
-rw-r--r--kernel/trace/trace_events_filter.c73
-rw-r--r--kernel/trace/trace_events_trigger.c4
-rw-r--r--kernel/trace/trace_export.c13
-rw-r--r--kernel/trace/trace_functions.c155
-rw-r--r--kernel/trace/trace_functions_graph.c65
-rw-r--r--kernel/trace/trace_irqsoff.c85
-rw-r--r--kernel/trace/trace_kprobe.c112
-rw-r--r--kernel/trace/trace_nop.c6
-rw-r--r--kernel/trace/trace_output.c298
-rw-r--r--kernel/trace/trace_output.h4
-rw-r--r--kernel/trace/trace_probe.c65
-rw-r--r--kernel/trace/trace_probe.h32
-rw-r--r--kernel/trace/trace_sched_wakeup.c80
-rw-r--r--kernel/trace/trace_selftest.c69
-rw-r--r--kernel/trace/trace_seq.c428
-rw-r--r--kernel/trace/trace_stack.c41
-rw-r--r--kernel/trace/trace_uprobe.c298
-rw-r--r--kernel/tracepoint.c714
-rw-r--r--kernel/tsacct.c19
-rw-r--r--kernel/up.c6
-rw-r--r--kernel/user.c4
-rw-r--r--kernel/user_namespace.c52
-rw-r--r--kernel/utsname.c6
-rw-r--r--kernel/utsname_sysctl.c10
-rw-r--r--kernel/watchdog.c77
-rw-r--r--kernel/workqueue.c686
-rw-r--r--kernel/workqueue_internal.h2
210 files changed, 20718 insertions, 11915 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index d2b32ac27a39..76768ee812b2 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -220,6 +220,20 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE
endif
+config ARCH_SUPPORTS_ATOMIC_RMW
+ bool
+
config MUTEX_SPIN_ON_OWNER
def_bool y
- depends on SMP && !DEBUG_MUTEXES
+ depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
+
+config RWSEM_SPIN_ON_OWNER
+ def_bool y
+ depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
+
+config ARCH_USE_QUEUE_RWLOCK
+ bool
+
+config QUEUE_RWLOCK
+ def_bool y if ARCH_USE_QUEUE_RWLOCK
+ depends on SMP
diff --git a/kernel/Makefile b/kernel/Makefile
index bc010ee272b6..dc5c77544fd6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -3,12 +3,11 @@
#
obj-y = fork.o exec_domain.o panic.o \
- cpu.o exit.o itimer.o time.o softirq.o resource.o \
- sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
+ cpu.o exit.o softirq.o resource.o \
+ sysctl.o sysctl_binary.o capability.o ptrace.o user.o \
signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
- extable.o params.o posix-timers.o \
- kthread.o sys_ni.o posix-cpu-timers.o \
- hrtimer.o nsproxy.o \
+ extable.o params.o \
+ kthread.o sys_ni.o nsproxy.o \
notifier.o ksysfs.o cred.o reboot.o \
async.o range.o groups.o smpboot.o
@@ -18,11 +17,13 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg
CFLAGS_REMOVE_irq_work.o = -pg
endif
+# cond_syscall is currently not LTO compatible
+CFLAGS_sys_ni.o = $(DISABLE_LTO)
+
obj-y += sched/
obj-y += locking/
obj-y += power/
obj-y += printk/
-obj-y += cpu/
obj-y += irq/
obj-y += rcu/
@@ -85,6 +86,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_TRACEPOINTS) += trace/
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_CPU_PM) += cpu_pm.o
+obj-$(CONFIG_NET) += bpf/
obj-$(CONFIG_PERF_EVENTS) += events/
@@ -93,6 +95,7 @@ obj-$(CONFIG_PADATA) += padata.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
+obj-$(CONFIG_TORTURE_TEST) += torture.o
$(obj)/configs.o: $(obj)/config_data.h
@@ -102,27 +105,11 @@ targets += config_data.gz
$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
$(call if_changed,gzip)
- filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;")
+ filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;")
targets += config_data.h
$(obj)/config_data.h: $(obj)/config_data.gz FORCE
$(call filechk,ikconfiggz)
-$(obj)/time.o: $(obj)/timeconst.h
-
-quiet_cmd_hzfile = HZFILE $@
- cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
-
-targets += hz.bc
-$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
- $(call if_changed,hzfile)
-
-quiet_cmd_bc = BC $@
- cmd_bc = bc -q $(filter-out FORCE,$^) > $@
-
-targets += timeconst.h
-$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
- $(call if_changed,bc)
-
###############################################################################
#
# Roll all the X.509 certificates that we can find together and pull them into
diff --git a/kernel/acct.c b/kernel/acct.c
index 8d6e145138bb..b4c667d22e79 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -55,10 +55,11 @@
#include <linux/times.h>
#include <linux/syscalls.h>
#include <linux/mount.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/div64.h>
#include <linux/blkdev.h> /* sector_div */
#include <linux/pid_namespace.h>
+#include <linux/fs_pin.h>
/*
* These constants control the amount of freespace that suspend and
@@ -75,172 +76,190 @@ int acct_parm[3] = {4, 2, 30};
/*
* External references and all of the globals.
*/
-static void do_acct_process(struct bsd_acct_struct *acct,
- struct pid_namespace *ns, struct file *);
+static void do_acct_process(struct bsd_acct_struct *acct);
-/*
- * This structure is used so that all the data protected by lock
- * can be placed in the same cache line as the lock. This primes
- * the cache line to have the data after getting the lock.
- */
struct bsd_acct_struct {
+ struct fs_pin pin;
+ struct mutex lock;
int active;
unsigned long needcheck;
struct file *file;
struct pid_namespace *ns;
- struct list_head list;
+ struct work_struct work;
+ struct completion done;
};
-static DEFINE_SPINLOCK(acct_lock);
-static LIST_HEAD(acct_list);
-
/*
* Check the amount of free space and suspend/resume accordingly.
*/
-static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
+static int check_free_space(struct bsd_acct_struct *acct)
{
struct kstatfs sbuf;
- int res;
- int act;
- u64 resume;
- u64 suspend;
-
- spin_lock(&acct_lock);
- res = acct->active;
- if (!file || time_is_before_jiffies(acct->needcheck))
+
+ if (time_is_before_jiffies(acct->needcheck))
goto out;
- spin_unlock(&acct_lock);
/* May block */
- if (vfs_statfs(&file->f_path, &sbuf))
- return res;
- suspend = sbuf.f_blocks * SUSPEND;
- resume = sbuf.f_blocks * RESUME;
-
- do_div(suspend, 100);
- do_div(resume, 100);
-
- if (sbuf.f_bavail <= suspend)
- act = -1;
- else if (sbuf.f_bavail >= resume)
- act = 1;
- else
- act = 0;
-
- /*
- * If some joker switched acct->file under us we'ld better be
- * silent and _not_ touch anything.
- */
- spin_lock(&acct_lock);
- if (file != acct->file) {
- if (act)
- res = act>0;
+ if (vfs_statfs(&acct->file->f_path, &sbuf))
goto out;
- }
if (acct->active) {
- if (act < 0) {
+ u64 suspend = sbuf.f_blocks * SUSPEND;
+ do_div(suspend, 100);
+ if (sbuf.f_bavail <= suspend) {
acct->active = 0;
- printk(KERN_INFO "Process accounting paused\n");
+ pr_info("Process accounting paused\n");
}
} else {
- if (act > 0) {
+ u64 resume = sbuf.f_blocks * RESUME;
+ do_div(resume, 100);
+ if (sbuf.f_bavail >= resume) {
acct->active = 1;
- printk(KERN_INFO "Process accounting resumed\n");
+ pr_info("Process accounting resumed\n");
}
}
acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
- res = acct->active;
out:
- spin_unlock(&acct_lock);
+ return acct->active;
+}
+
+static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
+{
+ struct bsd_acct_struct *res;
+again:
+ smp_rmb();
+ rcu_read_lock();
+ res = ACCESS_ONCE(ns->bacct);
+ if (!res) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ if (!atomic_long_inc_not_zero(&res->pin.count)) {
+ rcu_read_unlock();
+ cpu_relax();
+ goto again;
+ }
+ rcu_read_unlock();
+ mutex_lock(&res->lock);
+ if (!res->ns) {
+ mutex_unlock(&res->lock);
+ pin_put(&res->pin);
+ goto again;
+ }
return res;
}
-/*
- * Close the old accounting file (if currently open) and then replace
- * it with file (if non-NULL).
- *
- * NOTE: acct_lock MUST be held on entry and exit.
- */
-static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
- struct pid_namespace *ns)
+static void close_work(struct work_struct *work)
+{
+ struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
+ struct file *file = acct->file;
+ if (file->f_op->flush)
+ file->f_op->flush(file, NULL);
+ __fput_sync(file);
+ complete(&acct->done);
+}
+
+static void acct_kill(struct bsd_acct_struct *acct,
+ struct bsd_acct_struct *new)
{
- struct file *old_acct = NULL;
- struct pid_namespace *old_ns = NULL;
-
- if (acct->file) {
- old_acct = acct->file;
- old_ns = acct->ns;
- acct->active = 0;
- acct->file = NULL;
+ if (acct) {
+ struct pid_namespace *ns = acct->ns;
+ do_acct_process(acct);
+ INIT_WORK(&acct->work, close_work);
+ init_completion(&acct->done);
+ schedule_work(&acct->work);
+ wait_for_completion(&acct->done);
+ pin_remove(&acct->pin);
+ ns->bacct = new;
acct->ns = NULL;
- list_del(&acct->list);
+ atomic_long_dec(&acct->pin.count);
+ mutex_unlock(&acct->lock);
+ pin_put(&acct->pin);
}
- if (file) {
- acct->file = file;
- acct->ns = ns;
- acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
- acct->active = 1;
- list_add(&acct->list, &acct_list);
- }
- if (old_acct) {
- mnt_unpin(old_acct->f_path.mnt);
- spin_unlock(&acct_lock);
- do_acct_process(acct, old_ns, old_acct);
- filp_close(old_acct, NULL);
- spin_lock(&acct_lock);
+}
+
+static void acct_pin_kill(struct fs_pin *pin)
+{
+ struct bsd_acct_struct *acct;
+ acct = container_of(pin, struct bsd_acct_struct, pin);
+ mutex_lock(&acct->lock);
+ if (!acct->ns) {
+ mutex_unlock(&acct->lock);
+ pin_put(pin);
+ acct = NULL;
}
+ acct_kill(acct, NULL);
}
static int acct_on(struct filename *pathname)
{
struct file *file;
- struct vfsmount *mnt;
- struct pid_namespace *ns;
- struct bsd_acct_struct *acct = NULL;
+ struct vfsmount *mnt, *internal;
+ struct pid_namespace *ns = task_active_pid_ns(current);
+ struct bsd_acct_struct *acct, *old;
+ int err;
+
+ acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
+ if (!acct)
+ return -ENOMEM;
/* Difference from BSD - they don't do O_APPEND */
file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
- if (IS_ERR(file))
+ if (IS_ERR(file)) {
+ kfree(acct);
return PTR_ERR(file);
+ }
if (!S_ISREG(file_inode(file)->i_mode)) {
+ kfree(acct);
filp_close(file, NULL);
return -EACCES;
}
if (!file->f_op->write) {
+ kfree(acct);
filp_close(file, NULL);
return -EIO;
}
-
- ns = task_active_pid_ns(current);
- if (ns->bacct == NULL) {
- acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
- if (acct == NULL) {
- filp_close(file, NULL);
- return -ENOMEM;
- }
+ internal = mnt_clone_internal(&file->f_path);
+ if (IS_ERR(internal)) {
+ kfree(acct);
+ filp_close(file, NULL);
+ return PTR_ERR(internal);
}
-
- spin_lock(&acct_lock);
- if (ns->bacct == NULL) {
- ns->bacct = acct;
- acct = NULL;
+ err = mnt_want_write(internal);
+ if (err) {
+ mntput(internal);
+ kfree(acct);
+ filp_close(file, NULL);
+ return err;
}
-
mnt = file->f_path.mnt;
- mnt_pin(mnt);
- acct_file_reopen(ns->bacct, file, ns);
- spin_unlock(&acct_lock);
-
- mntput(mnt); /* it's pinned, now give up active reference */
- kfree(acct);
-
+ file->f_path.mnt = internal;
+
+ atomic_long_set(&acct->pin.count, 1);
+ acct->pin.kill = acct_pin_kill;
+ acct->file = file;
+ acct->needcheck = jiffies;
+ acct->ns = ns;
+ mutex_init(&acct->lock);
+ mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */
+ pin_insert(&acct->pin, mnt);
+
+ old = acct_get(ns);
+ if (old)
+ acct_kill(old, acct);
+ else
+ ns->bacct = acct;
+ mutex_unlock(&acct->lock);
+ mnt_drop_write(mnt);
+ mntput(mnt);
return 0;
}
+static DEFINE_MUTEX(acct_on_mutex);
+
/**
* sys_acct - enable/disable process accounting
* @name: file name for accounting records or NULL to shutdown accounting
@@ -261,80 +280,23 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
if (name) {
struct filename *tmp = getname(name);
+
if (IS_ERR(tmp))
- return (PTR_ERR(tmp));
+ return PTR_ERR(tmp);
+ mutex_lock(&acct_on_mutex);
error = acct_on(tmp);
+ mutex_unlock(&acct_on_mutex);
putname(tmp);
} else {
- struct bsd_acct_struct *acct;
-
- acct = task_active_pid_ns(current)->bacct;
- if (acct == NULL)
- return 0;
-
- spin_lock(&acct_lock);
- acct_file_reopen(acct, NULL, NULL);
- spin_unlock(&acct_lock);
+ acct_kill(acct_get(task_active_pid_ns(current)), NULL);
}
return error;
}
-/**
- * acct_auto_close - turn off a filesystem's accounting if it is on
- * @m: vfsmount being shut down
- *
- * If the accounting is turned on for a file in the subtree pointed to
- * to by m, turn accounting off. Done when m is about to die.
- */
-void acct_auto_close_mnt(struct vfsmount *m)
-{
- struct bsd_acct_struct *acct;
-
- spin_lock(&acct_lock);
-restart:
- list_for_each_entry(acct, &acct_list, list)
- if (acct->file && acct->file->f_path.mnt == m) {
- acct_file_reopen(acct, NULL, NULL);
- goto restart;
- }
- spin_unlock(&acct_lock);
-}
-
-/**
- * acct_auto_close - turn off a filesystem's accounting if it is on
- * @sb: super block for the filesystem
- *
- * If the accounting is turned on for a file in the filesystem pointed
- * to by sb, turn accounting off.
- */
-void acct_auto_close(struct super_block *sb)
-{
- struct bsd_acct_struct *acct;
-
- spin_lock(&acct_lock);
-restart:
- list_for_each_entry(acct, &acct_list, list)
- if (acct->file && acct->file->f_path.dentry->d_sb == sb) {
- acct_file_reopen(acct, NULL, NULL);
- goto restart;
- }
- spin_unlock(&acct_lock);
-}
-
void acct_exit_ns(struct pid_namespace *ns)
{
- struct bsd_acct_struct *acct = ns->bacct;
-
- if (acct == NULL)
- return;
-
- spin_lock(&acct_lock);
- if (acct->file != NULL)
- acct_file_reopen(acct, NULL, NULL);
- spin_unlock(&acct_lock);
-
- kfree(acct);
+ acct_kill(acct_get(ns), NULL);
}
/*
@@ -376,7 +338,7 @@ static comp_t encode_comp_t(unsigned long value)
return exp;
}
-#if ACCT_VERSION==1 || ACCT_VERSION==2
+#if ACCT_VERSION == 1 || ACCT_VERSION == 2
/*
* encode an u64 into a comp2_t (24 bits)
*
@@ -389,7 +351,7 @@ static comp_t encode_comp_t(unsigned long value)
#define MANTSIZE2 20 /* 20 bit mantissa. */
#define EXPSIZE2 5 /* 5 bit base 2 exponent. */
#define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
-#define MAXEXP2 ((1 <<EXPSIZE2) - 1) /* Maximum exponent. */
+#define MAXEXP2 ((1 << EXPSIZE2) - 1) /* Maximum exponent. */
static comp2_t encode_comp2_t(u64 value)
{
@@ -420,7 +382,7 @@ static comp2_t encode_comp2_t(u64 value)
}
#endif
-#if ACCT_VERSION==3
+#if ACCT_VERSION == 3
/*
* encode an u64 into a 32 bit IEEE float
*/
@@ -429,8 +391,9 @@ static u32 encode_float(u64 value)
unsigned exp = 190;
unsigned u;
- if (value==0) return 0;
- while ((s64)value > 0){
+ if (value == 0)
+ return 0;
+ while ((s64)value > 0) {
value <<= 1;
exp--;
}
@@ -448,120 +411,112 @@ static u32 encode_float(u64 value)
* do_exit() or when switching to a different output file.
*/
-/*
- * do_acct_process does all actual work. Caller holds the reference to file.
- */
-static void do_acct_process(struct bsd_acct_struct *acct,
- struct pid_namespace *ns, struct file *file)
+static void fill_ac(acct_t *ac)
{
struct pacct_struct *pacct = &current->signal->pacct;
- acct_t ac;
- mm_segment_t fs;
- unsigned long flim;
- u64 elapsed;
- u64 run_time;
- struct timespec uptime;
+ u64 elapsed, run_time;
struct tty_struct *tty;
- const struct cred *orig_cred;
-
- /* Perform file operations on behalf of whoever enabled accounting */
- orig_cred = override_creds(file->f_cred);
-
- /*
- * First check to see if there is enough free_space to continue
- * the process accounting system.
- */
- if (!check_free_space(acct, file))
- goto out;
/*
* Fill the accounting struct with the needed info as recorded
* by the different kernel functions.
*/
- memset(&ac, 0, sizeof(acct_t));
+ memset(ac, 0, sizeof(acct_t));
- ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
- strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
+ ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
+ strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
/* calculate run_time in nsec*/
- do_posix_clock_monotonic_gettime(&uptime);
- run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
- run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC
- + current->group_leader->start_time.tv_nsec;
+ run_time = ktime_get_ns();
+ run_time -= current->group_leader->start_time;
/* convert nsec -> AHZ */
elapsed = nsec_to_AHZ(run_time);
-#if ACCT_VERSION==3
- ac.ac_etime = encode_float(elapsed);
+#if ACCT_VERSION == 3
+ ac->ac_etime = encode_float(elapsed);
#else
- ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
- (unsigned long) elapsed : (unsigned long) -1l);
+ ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
+ (unsigned long) elapsed : (unsigned long) -1l);
#endif
-#if ACCT_VERSION==1 || ACCT_VERSION==2
+#if ACCT_VERSION == 1 || ACCT_VERSION == 2
{
/* new enlarged etime field */
comp2_t etime = encode_comp2_t(elapsed);
- ac.ac_etime_hi = etime >> 16;
- ac.ac_etime_lo = (u16) etime;
+
+ ac->ac_etime_hi = etime >> 16;
+ ac->ac_etime_lo = (u16) etime;
}
#endif
do_div(elapsed, AHZ);
- ac.ac_btime = get_seconds() - elapsed;
+ ac->ac_btime = get_seconds() - elapsed;
+#if ACCT_VERSION==2
+ ac->ac_ahz = AHZ;
+#endif
+
+ spin_lock_irq(&current->sighand->siglock);
+ tty = current->signal->tty; /* Safe as we hold the siglock */
+ ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
+ ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
+ ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
+ ac->ac_flag = pacct->ac_flag;
+ ac->ac_mem = encode_comp_t(pacct->ac_mem);
+ ac->ac_minflt = encode_comp_t(pacct->ac_minflt);
+ ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
+ ac->ac_exitcode = pacct->ac_exitcode;
+ spin_unlock_irq(&current->sighand->siglock);
+}
+/*
+ * do_acct_process does all actual work. Caller holds the reference to file.
+ */
+static void do_acct_process(struct bsd_acct_struct *acct)
+{
+ acct_t ac;
+ unsigned long flim;
+ const struct cred *orig_cred;
+ struct pid_namespace *ns = acct->ns;
+ struct file *file = acct->file;
+
+ /*
+ * Accounting records are not subject to resource limits.
+ */
+ flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+ /* Perform file operations on behalf of whoever enabled accounting */
+ orig_cred = override_creds(file->f_cred);
+
+ /*
+ * First check to see if there is enough free_space to continue
+ * the process accounting system.
+ */
+ if (!check_free_space(acct))
+ goto out;
+
+ fill_ac(&ac);
/* we really need to bite the bullet and change layout */
ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
-#if ACCT_VERSION==2
- ac.ac_ahz = AHZ;
-#endif
-#if ACCT_VERSION==1 || ACCT_VERSION==2
+#if ACCT_VERSION == 1 || ACCT_VERSION == 2
/* backward-compatible 16 bit fields */
ac.ac_uid16 = ac.ac_uid;
ac.ac_gid16 = ac.ac_gid;
#endif
-#if ACCT_VERSION==3
+#if ACCT_VERSION == 3
ac.ac_pid = task_tgid_nr_ns(current, ns);
rcu_read_lock();
ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
rcu_read_unlock();
#endif
-
- spin_lock_irq(&current->sighand->siglock);
- tty = current->signal->tty; /* Safe as we hold the siglock */
- ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
- ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
- ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
- ac.ac_flag = pacct->ac_flag;
- ac.ac_mem = encode_comp_t(pacct->ac_mem);
- ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
- ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
- ac.ac_exitcode = pacct->ac_exitcode;
- spin_unlock_irq(&current->sighand->siglock);
- ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
- ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
- ac.ac_swaps = encode_comp_t(0);
-
/*
* Get freeze protection. If the fs is frozen, just skip the write
* as we could deadlock the system otherwise.
*/
- if (!file_start_write_trylock(file))
- goto out;
- /*
- * Kernel segment override to datasegment and write it
- * to the accounting file.
- */
- fs = get_fs();
- set_fs(KERNEL_DS);
- /*
- * Accounting records are not subject to resource limits.
- */
- flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
- file->f_op->write(file, (char *)&ac,
- sizeof(acct_t), &file->f_pos);
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
- set_fs(fs);
- file_end_write(file);
+ if (file_start_write_trylock(file)) {
+ /* it's been opened O_APPEND, so position is irrelevant */
+ loff_t pos = 0;
+ __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos);
+ file_end_write(file);
+ }
out:
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
revert_creds(orig_cred);
}
@@ -578,6 +533,7 @@ void acct_collect(long exitcode, int group_dead)
if (group_dead && current->mm) {
struct vm_area_struct *vma;
+
down_read(&current->mm->mmap_sem);
vma = current->mm->mmap;
while (vma) {
@@ -609,34 +565,20 @@ void acct_collect(long exitcode, int group_dead)
spin_unlock_irq(&current->sighand->siglock);
}
-static void acct_process_in_ns(struct pid_namespace *ns)
+static void slow_acct_process(struct pid_namespace *ns)
{
- struct file *file = NULL;
- struct bsd_acct_struct *acct;
-
- acct = ns->bacct;
- /*
- * accelerate the common fastpath:
- */
- if (!acct || !acct->file)
- return;
-
- spin_lock(&acct_lock);
- file = acct->file;
- if (unlikely(!file)) {
- spin_unlock(&acct_lock);
- return;
+ for ( ; ns; ns = ns->parent) {
+ struct bsd_acct_struct *acct = acct_get(ns);
+ if (acct) {
+ do_acct_process(acct);
+ mutex_unlock(&acct->lock);
+ pin_put(&acct->pin);
+ }
}
- get_file(file);
- spin_unlock(&acct_lock);
-
- do_acct_process(acct, ns, file);
- fput(file);
}
/**
- * acct_process - now just a wrapper around acct_process_in_ns,
- * which in turn is a wrapper around do_acct_process.
+ * acct_process
*
* handles process accounting for an exiting task
*/
@@ -649,6 +591,10 @@ void acct_process(void)
* alive and holds its namespace, which in turn holds
* its parent.
*/
- for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent)
- acct_process_in_ns(ns);
+ for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) {
+ if (ns->bacct)
+ break;
+ }
+ if (unlikely(ns))
+ slow_acct_process(ns);
}
diff --git a/kernel/audit.c b/kernel/audit.c
index 3392d3e0254a..ba2ff5a5c600 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -44,7 +44,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/init.h>
-#include <asm/types.h>
+#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/mm.h>
#include <linux/export.h>
@@ -182,7 +182,7 @@ struct audit_buffer {
struct audit_reply {
__u32 portid;
- struct net *net;
+ struct net *net;
struct sk_buff *skb;
};
@@ -396,7 +396,7 @@ static void audit_printk_skb(struct sk_buff *skb)
if (printk_ratelimit())
pr_notice("type=%d %s\n", nlh->nlmsg_type, data);
else
- audit_log_lost("printk limit exceeded\n");
+ audit_log_lost("printk limit exceeded");
}
audit_hold_skb(skb);
@@ -412,7 +412,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
if (audit_pid) {
pr_err("*NO* daemon at audit_pid=%d\n", audit_pid);
- audit_log_lost("auditd disappeared\n");
+ audit_log_lost("auditd disappeared");
audit_pid = 0;
audit_sock = NULL;
}
@@ -424,6 +424,38 @@ static void kauditd_send_skb(struct sk_buff *skb)
}
/*
+ * kauditd_send_multicast_skb - send the skb to multicast userspace listeners
+ *
+ * This function doesn't consume an skb as might be expected since it has to
+ * copy it anyways.
+ */
+static void kauditd_send_multicast_skb(struct sk_buff *skb)
+{
+ struct sk_buff *copy;
+ struct audit_net *aunet = net_generic(&init_net, audit_net_id);
+ struct sock *sock = aunet->nlsk;
+
+ if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG))
+ return;
+
+ /*
+ * The seemingly wasteful skb_copy() rather than bumping the refcount
+ * using skb_get() is necessary because non-standard mods are made to
+ * the skb by the original kaudit unicast socket send routine. The
+ * existing auditd daemon assumes this breakage. Fixing this would
+ * require co-ordinating a change in the established protocol between
+ * the kaudit kernel subsystem and the auditd userspace code. There is
+ * no reason for new multicast clients to continue with this
+ * non-compliance.
+ */
+ copy = skb_copy(skb, GFP_KERNEL);
+ if (!copy)
+ return;
+
+ nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL);
+}
+
+/*
* flush_hold_queue - empty the hold queue if auditd appears
*
* If auditd just started, drain the queue of messages already
@@ -607,10 +639,19 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
{
int err = 0;
- /* Only support the initial namespaces for now. */
- if ((current_user_ns() != &init_user_ns) ||
- (task_active_pid_ns(current) != &init_pid_ns))
- return -EPERM;
+ /* Only support initial user namespace for now. */
+ /*
+ * We return ECONNREFUSED because it tricks userspace into thinking
+ * that audit was not configured into the kernel. Lots of users
+ * configure their PAM stack (because that's what the distro does)
+ * to reject login if unable to send messages to audit. If we return
+ * ECONNREFUSED the PAM stack thinks the kernel does not have audit
+ * configured in and will let login proceed. If we return EPERM
+ * userspace will reject all logins. This should be removed when we
+ * support non init namespaces!!
+ */
+ if (current_user_ns() != &init_user_ns)
+ return -ECONNREFUSED;
switch (msg_type) {
case AUDIT_LIST:
@@ -629,13 +670,18 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
case AUDIT_TTY_SET:
case AUDIT_TRIM:
case AUDIT_MAKE_EQUIV:
- if (!capable(CAP_AUDIT_CONTROL))
+ /* Only support auditd and auditctl in initial pid namespace
+ * for now. */
+ if ((task_active_pid_ns(current) != &init_pid_ns))
+ return -EPERM;
+
+ if (!netlink_capable(skb, CAP_AUDIT_CONTROL))
err = -EPERM;
break;
case AUDIT_USER:
case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
- if (!capable(CAP_AUDIT_WRITE))
+ if (!netlink_capable(skb, CAP_AUDIT_WRITE))
err = -EPERM;
break;
default: /* bad msg */
@@ -649,6 +695,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
{
int rc = 0;
uid_t uid = from_kuid(&init_user_ns, current_uid());
+ pid_t pid = task_tgid_nr(current);
if (!audit_enabled && msg_type != AUDIT_USER_AVC) {
*ab = NULL;
@@ -658,7 +705,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
*ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
if (unlikely(!*ab))
return rc;
- audit_log_format(*ab, "pid=%d uid=%u", task_tgid_vnr(current), uid);
+ audit_log_format(*ab, "pid=%d uid=%u", pid, uid);
audit_log_session_info(*ab);
audit_log_task_context(*ab);
@@ -1061,10 +1108,22 @@ static void audit_receive(struct sk_buff *skb)
mutex_unlock(&audit_cmd_mutex);
}
+/* Run custom bind function on netlink socket group connect or bind requests. */
+static int audit_bind(int group)
+{
+ if (!capable(CAP_AUDIT_READ))
+ return -EPERM;
+
+ return 0;
+}
+
static int __net_init audit_net_init(struct net *net)
{
struct netlink_kernel_cfg cfg = {
.input = audit_receive,
+ .bind = audit_bind,
+ .flags = NL_CFG_F_NONROOT_RECV,
+ .groups = AUDIT_NLGRP_MAX,
};
struct audit_net *aunet = net_generic(net, audit_net_id);
@@ -1087,7 +1146,7 @@ static void __net_exit audit_net_exit(struct net *net)
audit_sock = NULL;
}
- rcu_assign_pointer(aunet->nlsk, NULL);
+ RCU_INIT_POINTER(aunet->nlsk, NULL);
synchronize_net();
netlink_kernel_release(sock);
}
@@ -1618,7 +1677,7 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
audit_log_format(ab, " %s=", prefix);
CAP_FOR_EACH_U32(i) {
audit_log_format(ab, "%08x",
- cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]);
+ cap->cap[CAP_LAST_U32 - i]);
}
}
@@ -1819,11 +1878,11 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
spin_unlock_irq(&tsk->sighand->siglock);
audit_log_format(ab,
- " ppid=%ld pid=%d auid=%u uid=%u gid=%u"
+ " ppid=%d pid=%d auid=%u uid=%u gid=%u"
" euid=%u suid=%u fsuid=%u"
" egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
- sys_getppid(),
- tsk->pid,
+ task_ppid_nr(tsk),
+ task_pid_nr(tsk),
from_kuid(&init_user_ns, audit_get_loginuid(tsk)),
from_kuid(&init_user_ns, cred->uid),
from_kgid(&init_user_ns, cred->gid),
@@ -1886,10 +1945,10 @@ out:
* audit_log_end - end one audit record
* @ab: the audit_buffer
*
- * The netlink_* functions cannot be called inside an irq context, so
- * the audit buffer is placed on a queue and a tasklet is scheduled to
- * remove them from the queue outside the irq context. May be called in
- * any context.
+ * netlink_unicast() cannot be called inside an irq context because it blocks
+ * (last arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed
+ * on a queue and a tasklet is scheduled to remove them from the queue outside
+ * the irq context. May be called in any context.
*/
void audit_log_end(struct audit_buffer *ab)
{
@@ -1899,6 +1958,18 @@ void audit_log_end(struct audit_buffer *ab)
audit_log_lost("rate limit exceeded");
} else {
struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
+
+ kauditd_send_multicast_skb(ab->skb);
+
+ /*
+ * The original kaudit unicast socket sends up messages with
+ * nlmsg_len set to the payload length rather than the entire
+ * message length. This breaks the standard set by netlink.
+ * The existing auditd daemon assumes this breakage. Fixing
+ * this would require co-ordinating a change in the established
+ * protocol between the kaudit kernel subsystem and the auditd
+ * userspace code.
+ */
nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN;
if (audit_pid) {
diff --git a/kernel/audit.h b/kernel/audit.h
index 8df132214606..7bb65730c890 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -106,6 +106,11 @@ struct audit_names {
bool should_free;
};
+struct audit_proctitle {
+ int len; /* length of the cmdline field. */
+ char *value; /* the cmdline field */
+};
+
/* The per-task audit context. */
struct audit_context {
int dummy; /* must be the first element */
@@ -202,6 +207,7 @@ struct audit_context {
} execve;
};
int fds[2];
+ struct audit_proctitle proctitle;
#if AUDIT_DEBUG
int put_count;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 92062fd6cc8c..c447cd9848d1 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -19,6 +19,8 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/audit.h>
#include <linux/kthread.h>
@@ -104,7 +106,7 @@ static inline struct audit_entry *audit_init_entry(u32 field_count)
if (unlikely(!entry))
return NULL;
- fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL);
+ fields = kcalloc(field_count, sizeof(*fields), GFP_KERNEL);
if (unlikely(!fields)) {
kfree(entry);
return NULL;
@@ -158,7 +160,7 @@ static __u32 *classes[AUDIT_SYSCALL_CLASSES];
int __init audit_register_class(int class, unsigned *list)
{
- __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL);
+ __u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL);
if (!p)
return -ENOMEM;
while (*list != ~0U) {
@@ -226,7 +228,7 @@ static int audit_match_signal(struct audit_entry *entry)
#endif
/* Common user-space to kernel rule translation. */
-static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
+static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *rule)
{
unsigned listnr;
struct audit_entry *entry;
@@ -249,7 +251,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
;
}
if (unlikely(rule->action == AUDIT_POSSIBLE)) {
- printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n");
+ pr_err("AUDIT_POSSIBLE is deprecated\n");
goto exit_err;
}
if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS)
@@ -403,7 +405,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
int i;
char *str;
- entry = audit_to_entry_common((struct audit_rule *)data);
+ entry = audit_to_entry_common(data);
if (IS_ERR(entry))
goto exit_nofree;
@@ -431,6 +433,19 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
f->val = 0;
}
+ if ((f->type == AUDIT_PID) || (f->type == AUDIT_PPID)) {
+ struct pid *pid;
+ rcu_read_lock();
+ pid = find_vpid(f->val);
+ if (!pid) {
+ rcu_read_unlock();
+ err = -ESRCH;
+ goto exit_free;
+ }
+ f->val = pid_nr(pid);
+ rcu_read_unlock();
+ }
+
err = audit_field_valid(entry, f);
if (err)
goto exit_free;
@@ -479,8 +494,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
/* Keep currently invalid fields around in case they
* become valid after a policy reload. */
if (err == -EINVAL) {
- printk(KERN_WARNING "audit rule for LSM "
- "\'%s\' is invalid\n", str);
+ pr_warn("audit rule for LSM \'%s\' is invalid\n",
+ str);
err = 0;
}
if (err) {
@@ -709,8 +724,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
/* Keep currently invalid fields around in case they
* become valid after a policy reload. */
if (ret == -EINVAL) {
- printk(KERN_WARNING "audit rule for LSM \'%s\' is "
- "invalid\n", df->lsm_str);
+ pr_warn("audit rule for LSM \'%s\' is invalid\n",
+ df->lsm_str);
ret = 0;
}
@@ -1240,12 +1255,14 @@ static int audit_filter_user_rules(struct audit_krule *rule, int type,
for (i = 0; i < rule->field_count; i++) {
struct audit_field *f = &rule->fields[i];
+ pid_t pid;
int result = 0;
u32 sid;
switch (f->type) {
case AUDIT_PID:
- result = audit_comparator(task_pid_vnr(current), f->op, f->val);
+ pid = task_pid_nr(current);
+ result = audit_comparator(pid, f->op, f->val);
break;
case AUDIT_UID:
result = audit_uid_comparator(current_uid(), f->op, f->uid);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7aef2f4b6c64..21eae3c05ec0 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -42,6 +42,8 @@
* and <dustin.kirkland@us.ibm.com> for LSPP certification compliance.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/init.h>
#include <asm/types.h>
#include <linux/atomic.h>
@@ -68,6 +70,7 @@
#include <linux/capability.h>
#include <linux/fs_struct.h>
#include <linux/compat.h>
+#include <linux/ctype.h>
#include "audit.h"
@@ -79,6 +82,9 @@
/* no execve audit message should be longer than this (userspace limits) */
#define MAX_EXECVE_AUDIT_LEN 7500
+/* max length to print of cmdline/proctitle value during audit */
+#define MAX_PROCTITLE_AUDIT_LEN 128
+
/* number of audit rules */
int audit_n_rules;
@@ -451,15 +457,17 @@ static int audit_filter_rules(struct task_struct *tsk,
struct audit_field *f = &rule->fields[i];
struct audit_names *n;
int result = 0;
+ pid_t pid;
switch (f->type) {
case AUDIT_PID:
- result = audit_comparator(tsk->pid, f->op, f->val);
+ pid = task_pid_nr(tsk);
+ result = audit_comparator(pid, f->op, f->val);
break;
case AUDIT_PPID:
if (ctx) {
if (!ctx->ppid)
- ctx->ppid = sys_getppid();
+ ctx->ppid = task_ppid_nr(tsk);
result = audit_comparator(ctx->ppid, f->op, f->val);
}
break;
@@ -720,6 +728,22 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
return AUDIT_BUILD_CONTEXT;
}
+static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
+{
+ int word, bit;
+
+ if (val > 0xffffffff)
+ return false;
+
+ word = AUDIT_WORD(val);
+ if (word >= AUDIT_BITMASK_SIZE)
+ return false;
+
+ bit = AUDIT_BIT(val);
+
+ return rule->mask[word] & bit;
+}
+
/* At syscall entry and exit time, this filter is called if the
* audit_state is not low enough that auditing cannot take place, but is
* also not high enough that we already know we have to write an audit
@@ -737,11 +761,8 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
rcu_read_lock();
if (!list_empty(list)) {
- int word = AUDIT_WORD(ctx->major);
- int bit = AUDIT_BIT(ctx->major);
-
list_for_each_entry_rcu(e, list, list) {
- if ((e->rule.mask[word] & bit) == bit &&
+ if (audit_in_mask(&e->rule, ctx->major) &&
audit_filter_rules(tsk, &e->rule, ctx, NULL,
&state, false)) {
rcu_read_unlock();
@@ -761,20 +782,16 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
static int audit_filter_inode_name(struct task_struct *tsk,
struct audit_names *n,
struct audit_context *ctx) {
- int word, bit;
int h = audit_hash_ino((u32)n->ino);
struct list_head *list = &audit_inode_hash[h];
struct audit_entry *e;
enum audit_state state;
- word = AUDIT_WORD(ctx->major);
- bit = AUDIT_BIT(ctx->major);
-
if (list_empty(list))
return 0;
list_for_each_entry_rcu(e, list, list) {
- if ((e->rule.mask[word] & bit) == bit &&
+ if (audit_in_mask(&e->rule, ctx->major) &&
audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
ctx->current_state = state;
return 1;
@@ -805,7 +822,8 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
rcu_read_unlock();
}
-static inline struct audit_context *audit_get_context(struct task_struct *tsk,
+/* Transfer the audit context pointer to the caller, clearing it in the tsk's struct */
+static inline struct audit_context *audit_take_context(struct task_struct *tsk,
int return_valid,
long return_code)
{
@@ -842,6 +860,13 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
return context;
}
+static inline void audit_proctitle_free(struct audit_context *context)
+{
+ kfree(context->proctitle.value);
+ context->proctitle.value = NULL;
+ context->proctitle.len = 0;
+}
+
static inline void audit_free_names(struct audit_context *context)
{
struct audit_names *n, *next;
@@ -850,16 +875,15 @@ static inline void audit_free_names(struct audit_context *context)
if (context->put_count + context->ino_count != context->name_count) {
int i = 0;
- printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d"
- " name_count=%d put_count=%d"
- " ino_count=%d [NOT freeing]\n",
- __FILE__, __LINE__,
+ pr_err("%s:%d(:%d): major=%d in_syscall=%d"
+ " name_count=%d put_count=%d ino_count=%d"
+ " [NOT freeing]\n", __FILE__, __LINE__,
context->serial, context->major, context->in_syscall,
context->name_count, context->put_count,
context->ino_count);
list_for_each_entry(n, &context->names_list, list) {
- printk(KERN_ERR "names[%d] = %p = %s\n", i++,
- n->name, n->name->name ?: "(null)");
+ pr_err("names[%d] = %p = %s\n", i++, n->name,
+ n->name->name ?: "(null)");
}
dump_stack();
return;
@@ -955,6 +979,7 @@ static inline void audit_free_context(struct audit_context *context)
audit_free_aux(context);
kfree(context->filterkey);
kfree(context->sockaddr);
+ audit_proctitle_free(context);
kfree(context);
}
@@ -1157,7 +1182,7 @@ static void audit_log_execve_info(struct audit_context *context,
*/
buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
if (!buf) {
- audit_panic("out of memory for argv string\n");
+ audit_panic("out of memory for argv string");
return;
}
@@ -1271,6 +1296,59 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_end(ab);
}
+static inline int audit_proctitle_rtrim(char *proctitle, int len)
+{
+ char *end = proctitle + len - 1;
+ while (end > proctitle && !isprint(*end))
+ end--;
+
+ /* catch the case where proctitle is only 1 non-print character */
+ len = end - proctitle + 1;
+ len -= isprint(proctitle[len-1]) == 0;
+ return len;
+}
+
+static void audit_log_proctitle(struct task_struct *tsk,
+ struct audit_context *context)
+{
+ int res;
+ char *buf;
+ char *msg = "(null)";
+ int len = strlen(msg);
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE);
+ if (!ab)
+ return; /* audit_panic or being filtered */
+
+ audit_log_format(ab, "proctitle=");
+
+ /* Not cached */
+ if (!context->proctitle.value) {
+ buf = kmalloc(MAX_PROCTITLE_AUDIT_LEN, GFP_KERNEL);
+ if (!buf)
+ goto out;
+ /* Historically called this from procfs naming */
+ res = get_cmdline(tsk, buf, MAX_PROCTITLE_AUDIT_LEN);
+ if (res == 0) {
+ kfree(buf);
+ goto out;
+ }
+ res = audit_proctitle_rtrim(buf, res);
+ if (res == 0) {
+ kfree(buf);
+ goto out;
+ }
+ context->proctitle.value = buf;
+ context->proctitle.len = res;
+ }
+ msg = context->proctitle.value;
+ len = context->proctitle.len;
+out:
+ audit_log_n_untrustedstring(ab, msg, len);
+ audit_log_end(ab);
+}
+
static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
{
int i, call_panic = 0;
@@ -1388,6 +1466,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
audit_log_name(context, n, NULL, i++, &call_panic);
}
+ audit_log_proctitle(tsk, context);
+
/* Send end of event record to help user space know we are finished */
ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
if (ab)
@@ -1406,7 +1486,7 @@ void __audit_free(struct task_struct *tsk)
{
struct audit_context *context;
- context = audit_get_context(tsk, 0, 0);
+ context = audit_take_context(tsk, 0, 0);
if (!context)
return;
@@ -1500,7 +1580,7 @@ void __audit_syscall_exit(int success, long return_code)
else
success = AUDITSC_FAILURE;
- context = audit_get_context(tsk, success, return_code);
+ context = audit_take_context(tsk, success, return_code);
if (!context)
return;
@@ -1550,7 +1630,7 @@ static inline void handle_one(const struct inode *inode)
if (likely(put_tree_ref(context, chunk)))
return;
if (unlikely(!grow_tree_refs(context))) {
- printk(KERN_WARNING "out of memory, audit has lost a tree reference\n");
+ pr_warn("out of memory, audit has lost a tree reference\n");
audit_set_auditable(context);
audit_put_chunk(chunk);
unroll_tree_refs(context, p, count);
@@ -1609,8 +1689,7 @@ retry:
goto retry;
}
/* too bad */
- printk(KERN_WARNING
- "out of memory, audit has lost a tree reference\n");
+ pr_warn("out of memory, audit has lost a tree reference\n");
unroll_tree_refs(context, p, count);
audit_set_auditable(context);
return;
@@ -1682,7 +1761,7 @@ void __audit_getname(struct filename *name)
if (!context->in_syscall) {
#if AUDIT_DEBUG == 2
- printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n",
+ pr_err("%s:%d(:%d): ignoring getname(%p)\n",
__FILE__, __LINE__, context->serial, name);
dump_stack();
#endif
@@ -1721,15 +1800,15 @@ void audit_putname(struct filename *name)
BUG_ON(!context);
if (!name->aname || !context->in_syscall) {
#if AUDIT_DEBUG == 2
- printk(KERN_ERR "%s:%d(:%d): final_putname(%p)\n",
+ pr_err("%s:%d(:%d): final_putname(%p)\n",
__FILE__, __LINE__, context->serial, name);
if (context->name_count) {
struct audit_names *n;
int i = 0;
list_for_each_entry(n, &context->names_list, list)
- printk(KERN_ERR "name[%d] = %p = %s\n", i++,
- n->name, n->name->name ?: "(null)");
+ pr_err("name[%d] = %p = %s\n", i++, n->name,
+ n->name->name ?: "(null)");
}
#endif
final_putname(name);
@@ -1738,9 +1817,8 @@ void audit_putname(struct filename *name)
else {
++context->put_count;
if (context->put_count > context->name_count) {
- printk(KERN_ERR "%s:%d(:%d): major=%d"
- " in_syscall=%d putname(%p) name_count=%d"
- " put_count=%d\n",
+ pr_err("%s:%d(:%d): major=%d in_syscall=%d putname(%p)"
+ " name_count=%d put_count=%d\n",
__FILE__, __LINE__,
context->serial, context->major,
context->in_syscall, name->name,
@@ -1981,12 +2059,10 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
if (!ab)
return;
- audit_log_format(ab, "pid=%d uid=%u"
- " old-auid=%u new-auid=%u old-ses=%u new-ses=%u"
- " res=%d",
- current->pid, uid,
- oldloginuid, loginuid, oldsessionid, sessionid,
- !rc);
+ audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid);
+ audit_log_task_context(ab);
+ audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d",
+ oldloginuid, loginuid, oldsessionid, sessionid, !rc);
audit_log_end(ab);
}
@@ -2208,7 +2284,7 @@ void __audit_ptrace(struct task_struct *t)
{
struct audit_context *context = current->audit_context;
- context->target_pid = t->pid;
+ context->target_pid = task_pid_nr(t);
context->target_auid = audit_get_loginuid(t);
context->target_uid = task_uid(t);
context->target_sessionid = audit_get_sessionid(t);
@@ -2233,7 +2309,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
if (audit_pid && t->tgid == audit_pid) {
if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
- audit_sig_pid = tsk->pid;
+ audit_sig_pid = task_pid_nr(tsk);
if (uid_valid(tsk->loginuid))
audit_sig_uid = tsk->loginuid;
else
@@ -2247,7 +2323,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
/* optimize the common case by putting first signal recipient directly
* in audit_context */
if (!ctx->target_pid) {
- ctx->target_pid = t->tgid;
+ ctx->target_pid = task_tgid_nr(t);
ctx->target_auid = audit_get_loginuid(t);
ctx->target_uid = t_uid;
ctx->target_sessionid = audit_get_sessionid(t);
@@ -2268,7 +2344,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
}
BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS);
- axp->target_pid[axp->pid_count] = t->tgid;
+ axp->target_pid[axp->pid_count] = task_tgid_nr(t);
axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
axp->target_uid[axp->pid_count] = t_uid;
axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
@@ -2368,7 +2444,7 @@ static void audit_log_task(struct audit_buffer *ab)
from_kgid(&init_user_ns, gid),
sessionid);
audit_log_task_context(ab);
- audit_log_format(ab, " pid=%d comm=", current->pid);
+ audit_log_format(ab, " pid=%d comm=", task_pid_nr(current));
audit_log_untrustedstring(ab, current->comm);
if (mm) {
down_read(&mm->mmap_sem);
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index a5e026bc45c4..1323360d90e3 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -19,8 +19,8 @@
static void backtrace_test_normal(void)
{
- printk("Testing a backtrace from process context.\n");
- printk("The following trace is a kernel self test and not a bug!\n");
+ pr_info("Testing a backtrace from process context.\n");
+ pr_info("The following trace is a kernel self test and not a bug!\n");
dump_stack();
}
@@ -37,8 +37,8 @@ static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0);
static void backtrace_test_irq(void)
{
- printk("Testing a backtrace from irq context.\n");
- printk("The following trace is a kernel self test and not a bug!\n");
+ pr_info("Testing a backtrace from irq context.\n");
+ pr_info("The following trace is a kernel self test and not a bug!\n");
init_completion(&backtrace_work);
tasklet_schedule(&backtrace_tasklet);
@@ -51,8 +51,8 @@ static void backtrace_test_saved(void)
struct stack_trace trace;
unsigned long entries[8];
- printk("Testing a saved backtrace.\n");
- printk("The following trace is a kernel self test and not a bug!\n");
+ pr_info("Testing a saved backtrace.\n");
+ pr_info("The following trace is a kernel self test and not a bug!\n");
trace.nr_entries = 0;
trace.max_entries = ARRAY_SIZE(entries);
@@ -65,19 +65,19 @@ static void backtrace_test_saved(void)
#else
static void backtrace_test_saved(void)
{
- printk("Saved backtrace test skipped.\n");
+ pr_info("Saved backtrace test skipped.\n");
}
#endif
static int backtrace_regression_test(void)
{
- printk("====[ backtrace testing ]===========\n");
+ pr_info("====[ backtrace testing ]===========\n");
backtrace_test_normal();
backtrace_test_irq();
backtrace_test_saved();
- printk("====[ end of backtrace testing ]====\n");
+ pr_info("====[ end of backtrace testing ]====\n");
return 0;
}
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 9fd4246b04b8..e1d1d1952bfa 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -9,7 +9,6 @@
#include <linux/page-flags.h>
#include <linux/mmzone.h>
#include <linux/kbuild.h>
-#include <linux/page_cgroup.h>
#include <linux/log2.h>
#include <linux/spinlock_types.h>
@@ -18,7 +17,6 @@ void foo(void)
/* The enum constants to put into include/generated/bounds.h */
DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
- DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
#ifdef CONFIG_SMP
DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
#endif
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
new file mode 100644
index 000000000000..6a71145e2769
--- /dev/null
+++ b/kernel/bpf/Makefile
@@ -0,0 +1 @@
+obj-y := core.o
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
new file mode 100644
index 000000000000..7f0dbcbb34af
--- /dev/null
+++ b/kernel/bpf/core.c
@@ -0,0 +1,534 @@
+/*
+ * Linux Socket Filter - Kernel level socket filtering
+ *
+ * Based on the design of the Berkeley Packet Filter. The new
+ * internal format has been designed by PLUMgrid:
+ *
+ * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
+ *
+ * Authors:
+ *
+ * Jay Schulist <jschlst@samba.org>
+ * Alexei Starovoitov <ast@plumgrid.com>
+ * Daniel Borkmann <dborkman@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Andi Kleen - Fix a few bad bugs and races.
+ * Kris Katterjohn - Added many additional checks in bpf_check_classic()
+ */
+#include <linux/filter.h>
+#include <linux/skbuff.h>
+#include <asm/unaligned.h>
+
+/* Registers */
+#define BPF_R0 regs[BPF_REG_0]
+#define BPF_R1 regs[BPF_REG_1]
+#define BPF_R2 regs[BPF_REG_2]
+#define BPF_R3 regs[BPF_REG_3]
+#define BPF_R4 regs[BPF_REG_4]
+#define BPF_R5 regs[BPF_REG_5]
+#define BPF_R6 regs[BPF_REG_6]
+#define BPF_R7 regs[BPF_REG_7]
+#define BPF_R8 regs[BPF_REG_8]
+#define BPF_R9 regs[BPF_REG_9]
+#define BPF_R10 regs[BPF_REG_10]
+
+/* Named registers */
+#define DST regs[insn->dst_reg]
+#define SRC regs[insn->src_reg]
+#define FP regs[BPF_REG_FP]
+#define ARG1 regs[BPF_REG_ARG1]
+#define CTX regs[BPF_REG_CTX]
+#define IMM insn->imm
+
+/* No hurry in this branch
+ *
+ * Exported for the bpf jit load helper.
+ */
+void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
+{
+ u8 *ptr = NULL;
+
+ if (k >= SKF_NET_OFF)
+ ptr = skb_network_header(skb) + k - SKF_NET_OFF;
+ else if (k >= SKF_LL_OFF)
+ ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
+ if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
+ return ptr;
+
+ return NULL;
+}
+
+/* Base function for offset calculation. Needs to go into .text section,
+ * therefore keeping it non-static as well; will also be used by JITs
+ * anyway later on, so do not let the compiler omit it.
+ */
+noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return 0;
+}
+
+/**
+ * __bpf_prog_run - run eBPF program on a given context
+ * @ctx: is the data we are operating on
+ * @insn: is the array of eBPF instructions
+ *
+ * Decode and execute eBPF instructions.
+ */
+static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
+{
+ u64 stack[MAX_BPF_STACK / sizeof(u64)];
+ u64 regs[MAX_BPF_REG], tmp;
+ static const void *jumptable[256] = {
+ [0 ... 255] = &&default_label,
+ /* Now overwrite non-defaults ... */
+ /* 32 bit ALU operations */
+ [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X,
+ [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K,
+ [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X,
+ [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K,
+ [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X,
+ [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K,
+ [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X,
+ [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K,
+ [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X,
+ [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K,
+ [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X,
+ [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K,
+ [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X,
+ [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K,
+ [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X,
+ [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K,
+ [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X,
+ [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K,
+ [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X,
+ [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K,
+ [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X,
+ [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K,
+ [BPF_ALU | BPF_NEG] = &&ALU_NEG,
+ [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE,
+ [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE,
+ /* 64 bit ALU operations */
+ [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X,
+ [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K,
+ [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X,
+ [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K,
+ [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X,
+ [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K,
+ [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X,
+ [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K,
+ [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X,
+ [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K,
+ [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X,
+ [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K,
+ [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X,
+ [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K,
+ [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X,
+ [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K,
+ [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X,
+ [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K,
+ [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X,
+ [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K,
+ [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X,
+ [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K,
+ [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X,
+ [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K,
+ [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
+ /* Call instruction */
+ [BPF_JMP | BPF_CALL] = &&JMP_CALL,
+ /* Jumps */
+ [BPF_JMP | BPF_JA] = &&JMP_JA,
+ [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
+ [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K,
+ [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X,
+ [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
+ [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
+ [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
+ [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
+ [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
+ [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
+ [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
+ [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
+ [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
+ [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
+ [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
+ /* Program return */
+ [BPF_JMP | BPF_EXIT] = &&JMP_EXIT,
+ /* Store instructions */
+ [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B,
+ [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H,
+ [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W,
+ [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW,
+ [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W,
+ [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW,
+ [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B,
+ [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H,
+ [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W,
+ [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW,
+ /* Load instructions */
+ [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B,
+ [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H,
+ [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W,
+ [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW,
+ [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W,
+ [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H,
+ [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B,
+ [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
+ [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
+ [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
+ };
+ void *ptr;
+ int off;
+
+#define CONT ({ insn++; goto select_insn; })
+#define CONT_JMP ({ insn++; goto select_insn; })
+
+ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
+ ARG1 = (u64) (unsigned long) ctx;
+
+ /* Registers used in classic BPF programs need to be reset first. */
+ regs[BPF_REG_A] = 0;
+ regs[BPF_REG_X] = 0;
+
+select_insn:
+ goto *jumptable[insn->code];
+
+ /* ALU */
+#define ALU(OPCODE, OP) \
+ ALU64_##OPCODE##_X: \
+ DST = DST OP SRC; \
+ CONT; \
+ ALU_##OPCODE##_X: \
+ DST = (u32) DST OP (u32) SRC; \
+ CONT; \
+ ALU64_##OPCODE##_K: \
+ DST = DST OP IMM; \
+ CONT; \
+ ALU_##OPCODE##_K: \
+ DST = (u32) DST OP (u32) IMM; \
+ CONT;
+
+ ALU(ADD, +)
+ ALU(SUB, -)
+ ALU(AND, &)
+ ALU(OR, |)
+ ALU(LSH, <<)
+ ALU(RSH, >>)
+ ALU(XOR, ^)
+ ALU(MUL, *)
+#undef ALU
+ ALU_NEG:
+ DST = (u32) -DST;
+ CONT;
+ ALU64_NEG:
+ DST = -DST;
+ CONT;
+ ALU_MOV_X:
+ DST = (u32) SRC;
+ CONT;
+ ALU_MOV_K:
+ DST = (u32) IMM;
+ CONT;
+ ALU64_MOV_X:
+ DST = SRC;
+ CONT;
+ ALU64_MOV_K:
+ DST = IMM;
+ CONT;
+ ALU64_ARSH_X:
+ (*(s64 *) &DST) >>= SRC;
+ CONT;
+ ALU64_ARSH_K:
+ (*(s64 *) &DST) >>= IMM;
+ CONT;
+ ALU64_MOD_X:
+ if (unlikely(SRC == 0))
+ return 0;
+ tmp = DST;
+ DST = do_div(tmp, SRC);
+ CONT;
+ ALU_MOD_X:
+ if (unlikely(SRC == 0))
+ return 0;
+ tmp = (u32) DST;
+ DST = do_div(tmp, (u32) SRC);
+ CONT;
+ ALU64_MOD_K:
+ tmp = DST;
+ DST = do_div(tmp, IMM);
+ CONT;
+ ALU_MOD_K:
+ tmp = (u32) DST;
+ DST = do_div(tmp, (u32) IMM);
+ CONT;
+ ALU64_DIV_X:
+ if (unlikely(SRC == 0))
+ return 0;
+ do_div(DST, SRC);
+ CONT;
+ ALU_DIV_X:
+ if (unlikely(SRC == 0))
+ return 0;
+ tmp = (u32) DST;
+ do_div(tmp, (u32) SRC);
+ DST = (u32) tmp;
+ CONT;
+ ALU64_DIV_K:
+ do_div(DST, IMM);
+ CONT;
+ ALU_DIV_K:
+ tmp = (u32) DST;
+ do_div(tmp, (u32) IMM);
+ DST = (u32) tmp;
+ CONT;
+ ALU_END_TO_BE:
+ switch (IMM) {
+ case 16:
+ DST = (__force u16) cpu_to_be16(DST);
+ break;
+ case 32:
+ DST = (__force u32) cpu_to_be32(DST);
+ break;
+ case 64:
+ DST = (__force u64) cpu_to_be64(DST);
+ break;
+ }
+ CONT;
+ ALU_END_TO_LE:
+ switch (IMM) {
+ case 16:
+ DST = (__force u16) cpu_to_le16(DST);
+ break;
+ case 32:
+ DST = (__force u32) cpu_to_le32(DST);
+ break;
+ case 64:
+ DST = (__force u64) cpu_to_le64(DST);
+ break;
+ }
+ CONT;
+
+ /* CALL */
+ JMP_CALL:
+ /* Function call scratches BPF_R1-BPF_R5 registers,
+ * preserves BPF_R6-BPF_R9, and stores return value
+ * into BPF_R0.
+ */
+ BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
+ BPF_R4, BPF_R5);
+ CONT;
+
+ /* JMP */
+ JMP_JA:
+ insn += insn->off;
+ CONT;
+ JMP_JEQ_X:
+ if (DST == SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JEQ_K:
+ if (DST == IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JNE_X:
+ if (DST != SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JNE_K:
+ if (DST != IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGT_X:
+ if (DST > SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGT_K:
+ if (DST > IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGE_X:
+ if (DST >= SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGE_K:
+ if (DST >= IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGT_X:
+ if (((s64) DST) > ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGT_K:
+ if (((s64) DST) > ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGE_X:
+ if (((s64) DST) >= ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGE_K:
+ if (((s64) DST) >= ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSET_X:
+ if (DST & SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSET_K:
+ if (DST & IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_EXIT:
+ return BPF_R0;
+
+ /* STX and ST and LDX*/
+#define LDST(SIZEOP, SIZE) \
+ STX_MEM_##SIZEOP: \
+ *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
+ CONT; \
+ ST_MEM_##SIZEOP: \
+ *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \
+ CONT; \
+ LDX_MEM_##SIZEOP: \
+ DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
+ CONT;
+
+ LDST(B, u8)
+ LDST(H, u16)
+ LDST(W, u32)
+ LDST(DW, u64)
+#undef LDST
+ STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
+ atomic_add((u32) SRC, (atomic_t *)(unsigned long)
+ (DST + insn->off));
+ CONT;
+ STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
+ atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
+ (DST + insn->off));
+ CONT;
+ LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
+ off = IMM;
+load_word:
+ /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are
+ * only appearing in the programs where ctx ==
+ * skb. All programs keep 'ctx' in regs[BPF_REG_CTX]
+ * == BPF_R6, bpf_convert_filter() saves it in BPF_R6,
+ * internal BPF verifier will check that BPF_R6 ==
+ * ctx.
+ *
+ * BPF_ABS and BPF_IND are wrappers of function calls,
+ * so they scratch BPF_R1-BPF_R5 registers, preserve
+ * BPF_R6-BPF_R9, and store return value into BPF_R0.
+ *
+ * Implicit input:
+ * ctx == skb == BPF_R6 == CTX
+ *
+ * Explicit input:
+ * SRC == any register
+ * IMM == 32-bit immediate
+ *
+ * Output:
+ * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness
+ */
+
+ ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp);
+ if (likely(ptr != NULL)) {
+ BPF_R0 = get_unaligned_be32(ptr);
+ CONT;
+ }
+
+ return 0;
+ LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */
+ off = IMM;
+load_half:
+ ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp);
+ if (likely(ptr != NULL)) {
+ BPF_R0 = get_unaligned_be16(ptr);
+ CONT;
+ }
+
+ return 0;
+ LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */
+ off = IMM;
+load_byte:
+ ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp);
+ if (likely(ptr != NULL)) {
+ BPF_R0 = *(u8 *)ptr;
+ CONT;
+ }
+
+ return 0;
+ LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */
+ off = IMM + SRC;
+ goto load_word;
+ LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */
+ off = IMM + SRC;
+ goto load_half;
+ LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */
+ off = IMM + SRC;
+ goto load_byte;
+
+ default_label:
+ /* If we ever reach this, we have a bug somewhere. */
+ WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
+ return 0;
+}
+
+void __weak bpf_int_jit_compile(struct bpf_prog *prog)
+{
+}
+
+/**
+ * bpf_prog_select_runtime - select execution runtime for BPF program
+ * @fp: bpf_prog populated with internal BPF program
+ *
+ * try to JIT internal BPF program, if JIT is not available select interpreter
+ * BPF program will be executed via BPF_PROG_RUN() macro
+ */
+void bpf_prog_select_runtime(struct bpf_prog *fp)
+{
+ fp->bpf_func = (void *) __bpf_prog_run;
+
+ /* Probe if internal BPF can be JITed */
+ bpf_int_jit_compile(fp);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
+
+/* free internal BPF program */
+void bpf_prog_free(struct bpf_prog *fp)
+{
+ bpf_jit_free(fp);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_free);
diff --git a/kernel/capability.c b/kernel/capability.c
index 34019c57888d..989f5bfc57dc 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -7,6 +7,8 @@
* 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/mm.h>
@@ -22,7 +24,6 @@
*/
const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
-
EXPORT_SYMBOL(__cap_empty_set);
int file_caps_enabled = 1;
@@ -42,15 +43,10 @@ __setup("no_file_caps", file_caps_disable);
static void warn_legacy_capability_use(void)
{
- static int warned;
- if (!warned) {
- char name[sizeof(current->comm)];
-
- printk(KERN_INFO "warning: `%s' uses 32-bit capabilities"
- " (legacy support in use)\n",
- get_task_comm(name, current));
- warned = 1;
- }
+ char name[sizeof(current->comm)];
+
+ pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n",
+ get_task_comm(name, current));
}
/*
@@ -71,16 +67,10 @@ static void warn_legacy_capability_use(void)
static void warn_deprecated_v2(void)
{
- static int warned;
-
- if (!warned) {
- char name[sizeof(current->comm)];
+ char name[sizeof(current->comm)];
- printk(KERN_INFO "warning: `%s' uses deprecated v2"
- " capabilities in a way that may be insecure.\n",
- get_task_comm(name, current));
- warned = 1;
- }
+ pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n",
+ get_task_comm(name, current));
}
/*
@@ -198,7 +188,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
*
* An alternative would be to return an error here
* (-ERANGE), but that causes legacy applications to
- * unexpectidly fail; the capget/modify/capset aborts
+ * unexpectedly fail; the capget/modify/capset aborts
* before modification is attempted and the application
* fails.
*/
@@ -268,6 +258,10 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
i++;
}
+ effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+ permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+ inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+
new = prepare_creds();
if (!new)
return -ENOMEM;
@@ -380,7 +374,7 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
bool ns_capable(struct user_namespace *ns, int cap)
{
if (unlikely(!cap_valid(cap))) {
- printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
+ pr_crit("capable() called with invalid cap=%u\n", cap);
BUG();
}
@@ -404,7 +398,8 @@ EXPORT_SYMBOL(ns_capable);
* This does not set PF_SUPERPRIV because the caller may not
* actually be privileged.
*/
-bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap)
+bool file_ns_capable(const struct file *file, struct user_namespace *ns,
+ int cap)
{
if (WARN_ON_ONCE(!cap_valid(cap)))
return false;
@@ -433,23 +428,19 @@ bool capable(int cap)
EXPORT_SYMBOL(capable);
/**
- * inode_capable - Check superior capability over inode
+ * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
* @inode: The inode in question
* @cap: The capability in question
*
- * Return true if the current task has the given superior capability
- * targeted at it's own user namespace and that the given inode is owned
- * by the current user namespace or a child namespace.
- *
- * Currently we check to see if an inode is owned by the current
- * user namespace by seeing if the inode's owner maps into the
- * current user namespace.
- *
+ * Return true if the current task has the given capability targeted at
+ * its own user namespace and that the given inode's uid and gid are
+ * mapped into the current user namespace.
*/
-bool inode_capable(const struct inode *inode, int cap)
+bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
{
struct user_namespace *ns = current_user_ns();
- return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid);
+ return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) &&
+ kgid_has_mapping(ns, inode->i_gid);
}
-EXPORT_SYMBOL(inode_capable);
+EXPORT_SYMBOL(capable_wrt_inode_uidgid);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 105f273b6f86..3a73f995a81e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -26,6 +26,8 @@
* distribution for more details.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/cgroup.h>
#include <linux/cred.h>
#include <linux/ctype.h>
@@ -33,6 +35,7 @@
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/list.h>
+#include <linux/magic.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/mount.h>
@@ -40,23 +43,20 @@
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
-#include <linux/backing-dev.h>
#include <linux/slab.h>
-#include <linux/magic.h>
#include <linux/spinlock.h>
+#include <linux/rwsem.h>
#include <linux/string.h>
#include <linux/sort.h>
#include <linux/kmod.h>
-#include <linux/module.h>
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
#include <linux/hashtable.h>
-#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/idr.h>
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
-#include <linux/flex_array.h> /* used in cgroup_attach_task */
#include <linux/kthread.h>
+#include <linux/delay.h>
#include <linux/atomic.h>
@@ -68,44 +68,46 @@
*/
#define CGROUP_PIDLIST_DESTROY_DELAY HZ
+#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
+ MAX_CFTYPE_NAME + 2)
+
/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
*
- * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
- * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
- * release_agent_path and so on. Modifying requires both cgroup_mutex and
- * cgroup_root_mutex. Readers can acquire either of the two. This is to
- * break the following locking order cycle.
+ * css_set_rwsem protects task->cgroups pointer, the list of css_set
+ * objects, and the chain of tasks off each css_set.
*
- * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
- * B. namespace_sem -> cgroup_mutex
- *
- * B happens only through cgroup_show_options() and using cgroup_root_mutex
- * breaks it.
+ * These locks are exported if CONFIG_PROVE_RCU so that accessors in
+ * cgroup.h can use them for lockdep annotations.
*/
#ifdef CONFIG_PROVE_RCU
DEFINE_MUTEX(cgroup_mutex);
-EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */
+DECLARE_RWSEM(css_set_rwsem);
+EXPORT_SYMBOL_GPL(cgroup_mutex);
+EXPORT_SYMBOL_GPL(css_set_rwsem);
#else
static DEFINE_MUTEX(cgroup_mutex);
+static DECLARE_RWSEM(css_set_rwsem);
#endif
-static DEFINE_MUTEX(cgroup_root_mutex);
+/*
+ * Protects cgroup_idr and css_idr so that IDs can be released without
+ * grabbing cgroup_mutex.
+ */
+static DEFINE_SPINLOCK(cgroup_idr_lock);
+
+/*
+ * Protects cgroup_subsys->release_agent_path. Modifying it also requires
+ * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
+ */
+static DEFINE_SPINLOCK(release_agent_path_lock);
#define cgroup_assert_mutex_or_rcu_locked() \
rcu_lockdep_assert(rcu_read_lock_held() || \
lockdep_is_held(&cgroup_mutex), \
"cgroup_mutex or RCU read lock required");
-#ifdef CONFIG_LOCKDEP
-#define cgroup_assert_mutex_or_root_locked() \
- WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \
- !lockdep_is_held(&cgroup_root_mutex)))
-#else
-#define cgroup_assert_mutex_or_root_locked() do { } while (0)
-#endif
-
/*
* cgroup destruction makes heavy use of work items and there can be a lot
* of concurrent destructions. Use a separate workqueue so that cgroup
@@ -120,51 +122,58 @@ static struct workqueue_struct *cgroup_destroy_wq;
*/
static struct workqueue_struct *cgroup_pidlist_destroy_wq;
-/*
- * Generate an array of cgroup subsystem pointers. At boot time, this is
- * populated with the built in subsystems, and modular subsystems are
- * registered after that. The mutable section of this array is protected by
- * cgroup_mutex.
- */
-#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
-#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
-static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = {
+/* generate an array of cgroup subsystem pointers */
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
+static struct cgroup_subsys *cgroup_subsys[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
+/* array of cgroup subsystem names */
+#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
+static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
+#undef SUBSYS
/*
- * The dummy hierarchy, reserved for the subsystems that are otherwise
+ * The default hierarchy, reserved for the subsystems that are otherwise
* unattached - it never has more than a single cgroup, and all tasks are
* part of that cgroup.
*/
-static struct cgroupfs_root cgroup_dummy_root;
+struct cgroup_root cgrp_dfl_root;
+
+/*
+ * The default hierarchy always exists but is hidden until mounted for the
+ * first time. This is for backward compatibility.
+ */
+static bool cgrp_dfl_root_visible;
-/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
-static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
+/*
+ * Set by the boot param of the same name and makes subsystems with NULL
+ * ->dfl_files to use ->legacy_files on the default hierarchy.
+ */
+static bool cgroup_legacy_files_on_dfl;
+
+/* some controllers are not supported in the default hierarchy */
+static unsigned int cgrp_dfl_root_inhibit_ss_mask;
/* The list of hierarchy roots */
static LIST_HEAD(cgroup_roots);
static int cgroup_root_count;
-/*
- * Hierarchy ID allocation and mapping. It follows the same exclusion
- * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for
- * writes, either for reads.
- */
+/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
static DEFINE_IDR(cgroup_hierarchy_idr);
-static struct cgroup_name root_cgroup_name = { .name = "/" };
-
/*
- * Assign a monotonically increasing serial number to cgroups. It
- * guarantees cgroups with bigger numbers are newer than those with smaller
- * numbers. Also, as cgroups are always appended to the parent's
- * ->children list, it guarantees that sibling cgroups are always sorted in
- * the ascending serial number order on the list. Protected by
- * cgroup_mutex.
+ * Assign a monotonically increasing serial number to csses. It guarantees
+ * cgroups with bigger numbers are newer than those with smaller numbers.
+ * Also, as csses are always appended to the parent's ->children list, it
+ * guarantees that sibling csses are always sorted in the ascending serial
+ * number order on the list. Protected by cgroup_mutex.
*/
-static u64 cgroup_serial_nr_next = 1;
+static u64 css_serial_nr_next = 1;
/* This flag indicates whether tasks in the fork and exit paths should
* check for fork/exit handlers to call. This avoids us having to do
@@ -173,19 +182,65 @@ static u64 cgroup_serial_nr_next = 1;
*/
static int need_forkexit_callback __read_mostly;
-static struct cftype cgroup_base_files[];
+static struct cftype cgroup_dfl_base_files[];
+static struct cftype cgroup_legacy_base_files[];
-static void cgroup_destroy_css_killed(struct cgroup *cgrp);
+static void cgroup_put(struct cgroup *cgrp);
+static int rebind_subsystems(struct cgroup_root *dst_root,
+ unsigned int ss_mask);
static int cgroup_destroy_locked(struct cgroup *cgrp);
+static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
+ bool visible);
+static void css_release(struct percpu_ref *ref);
+static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
-static int cgroup_file_release(struct inode *inode, struct file *file);
static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
+/* IDR wrappers which synchronize using cgroup_idr_lock */
+static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
+ gfp_t gfp_mask)
+{
+ int ret;
+
+ idr_preload(gfp_mask);
+ spin_lock_bh(&cgroup_idr_lock);
+ ret = idr_alloc(idr, ptr, start, end, gfp_mask);
+ spin_unlock_bh(&cgroup_idr_lock);
+ idr_preload_end();
+ return ret;
+}
+
+static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
+{
+ void *ret;
+
+ spin_lock_bh(&cgroup_idr_lock);
+ ret = idr_replace(idr, ptr, id);
+ spin_unlock_bh(&cgroup_idr_lock);
+ return ret;
+}
+
+static void cgroup_idr_remove(struct idr *idr, int id)
+{
+ spin_lock_bh(&cgroup_idr_lock);
+ idr_remove(idr, id);
+ spin_unlock_bh(&cgroup_idr_lock);
+}
+
+static struct cgroup *cgroup_parent(struct cgroup *cgrp)
+{
+ struct cgroup_subsys_state *parent_css = cgrp->self.parent;
+
+ if (parent_css)
+ return container_of(parent_css, struct cgroup, self);
+ return NULL;
+}
+
/**
* cgroup_css - obtain a cgroup's css for the specified subsystem
* @cgrp: the cgroup of interest
- * @ss: the subsystem of interest (%NULL returns the dummy_css)
+ * @ss: the subsystem of interest (%NULL returns @cgrp->self)
*
* Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
* function must be called either under cgroup_mutex or rcu_read_lock() and
@@ -197,17 +252,65 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
if (ss)
- return rcu_dereference_check(cgrp->subsys[ss->subsys_id],
- lockdep_is_held(&cgroup_mutex));
+ return rcu_dereference_check(cgrp->subsys[ss->id],
+ lockdep_is_held(&cgroup_mutex));
else
- return &cgrp->dummy_css;
+ return &cgrp->self;
+}
+
+/**
+ * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest (%NULL returns @cgrp->self)
+ *
+ * Similar to cgroup_css() but returns the effctive css, which is defined
+ * as the matching css of the nearest ancestor including self which has @ss
+ * enabled. If @ss is associated with the hierarchy @cgrp is on, this
+ * function is guaranteed to return non-NULL css.
+ */
+static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!ss)
+ return &cgrp->self;
+
+ if (!(cgrp->root->subsys_mask & (1 << ss->id)))
+ return NULL;
+
+ while (cgroup_parent(cgrp) &&
+ !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
+ cgrp = cgroup_parent(cgrp);
+
+ return cgroup_css(cgrp, ss);
}
/* convenient tests for these bits */
static inline bool cgroup_is_dead(const struct cgroup *cgrp)
{
- return test_bit(CGRP_DEAD, &cgrp->flags);
+ return !(cgrp->self.flags & CSS_ONLINE);
+}
+
+struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
+{
+ struct cgroup *cgrp = of->kn->parent->priv;
+ struct cftype *cft = of_cft(of);
+
+ /*
+ * This is open and unprotected implementation of cgroup_css().
+ * seq_css() is only called from a kernfs file operation which has
+ * an active reference on the file. Because all the subsystem
+ * files are drained before a css is disassociated with a cgroup,
+ * the matching css from the cgroup's subsys table is guaranteed to
+ * be and stay valid until the enclosing operation is complete.
+ */
+ if (cft->ss)
+ return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
+ else
+ return &cgrp->self;
}
+EXPORT_SYMBOL_GPL(of_css);
/**
* cgroup_is_descendant - test ancestry
@@ -223,11 +326,10 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
while (cgrp) {
if (cgrp == ancestor)
return true;
- cgrp = cgrp->parent;
+ cgrp = cgroup_parent(cgrp);
}
return false;
}
-EXPORT_SYMBOL_GPL(cgroup_is_descendant);
static int cgroup_is_releasable(const struct cgroup *cgrp)
{
@@ -248,7 +350,7 @@ static int notify_on_release(const struct cgroup *cgrp)
* @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
* @cgrp: the target cgroup to iterate css's of
*
- * Should be called under cgroup_mutex.
+ * Should be called under cgroup_[tree_]mutex.
*/
#define for_each_css(css, ssid, cgrp) \
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
@@ -258,66 +360,39 @@ static int notify_on_release(const struct cgroup *cgrp)
else
/**
- * for_each_subsys - iterate all loaded cgroup subsystems
- * @ss: the iteration cursor
- * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ * for_each_e_css - iterate all effective css's of a cgroup
+ * @css: the iteration cursor
+ * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
+ * @cgrp: the target cgroup to iterate css's of
*
- * Iterates through all loaded subsystems. Should be called under
- * cgroup_mutex or cgroup_root_mutex.
+ * Should be called under cgroup_[tree_]mutex.
*/
-#define for_each_subsys(ss, ssid) \
- for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \
- (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
- if (!((ss) = cgroup_subsys[(ssid)])) { } \
+#define for_each_e_css(css, ssid, cgrp) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
+ if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
+ ; \
else
/**
- * for_each_builtin_subsys - iterate all built-in cgroup subsystems
+ * for_each_subsys - iterate all enabled cgroup subsystems
* @ss: the iteration cursor
- * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end
- *
- * Bulit-in subsystems are always present and iteration itself doesn't
- * require any synchronization.
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
*/
-#define for_each_builtin_subsys(ss, i) \
- for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \
- (((ss) = cgroup_subsys[i]) || true); (i)++)
+#define for_each_subsys(ss, ssid) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
+ (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
-/* iterate across the active hierarchies */
-#define for_each_active_root(root) \
+/* iterate across the hierarchies */
+#define for_each_root(root) \
list_for_each_entry((root), &cgroup_roots, root_list)
-static inline struct cgroup *__d_cgrp(struct dentry *dentry)
-{
- return dentry->d_fsdata;
-}
-
-static inline struct cfent *__d_cfe(struct dentry *dentry)
-{
- return dentry->d_fsdata;
-}
-
-static inline struct cftype *__d_cft(struct dentry *dentry)
-{
- return __d_cfe(dentry)->type;
-}
-
-/**
- * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
- * @cgrp: the cgroup to be checked for liveness
- *
- * On success, returns true; the mutex should be later unlocked. On
- * failure returns false with no lock held.
- */
-static bool cgroup_lock_live_group(struct cgroup *cgrp)
-{
- mutex_lock(&cgroup_mutex);
- if (cgroup_is_dead(cgrp)) {
- mutex_unlock(&cgroup_mutex);
- return false;
- }
- return true;
-}
+/* iterate over child cgrps, lock should be held throughout iteration */
+#define cgroup_for_each_live_child(child, cgrp) \
+ list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
+ if (({ lockdep_assert_held(&cgroup_mutex); \
+ cgroup_is_dead(child); })) \
+ ; \
+ else
/* the list of cgroups eligible for automatic release. Protected by
* release_list_lock */
@@ -347,23 +422,60 @@ struct cgrp_cset_link {
struct list_head cgrp_link;
};
-/* The default css_set - used by init and its children prior to any
+/*
+ * The default css_set - used by init and its children prior to any
* hierarchies being mounted. It contains a pointer to the root state
* for each subsystem. Also used to anchor the list of css_sets. Not
* reference-counted, to improve performance when child cgroups
* haven't been created.
*/
+struct css_set init_css_set = {
+ .refcount = ATOMIC_INIT(1),
+ .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
+ .tasks = LIST_HEAD_INIT(init_css_set.tasks),
+ .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
+ .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
+ .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
+};
-static struct css_set init_css_set;
-static struct cgrp_cset_link init_cgrp_cset_link;
+static int css_set_count = 1; /* 1 for init_css_set */
-/*
- * css_set_lock protects the list of css_set objects, and the chain of
- * tasks off each css_set. Nests outside task->alloc_lock due to
- * css_task_iter_start().
+/**
+ * cgroup_update_populated - updated populated count of a cgroup
+ * @cgrp: the target cgroup
+ * @populated: inc or dec populated count
+ *
+ * @cgrp is either getting the first task (css_set) or losing the last.
+ * Update @cgrp->populated_cnt accordingly. The count is propagated
+ * towards root so that a given cgroup's populated_cnt is zero iff the
+ * cgroup and all its descendants are empty.
+ *
+ * @cgrp's interface file "cgroup.populated" is zero if
+ * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
+ * changes from or to zero, userland is notified that the content of the
+ * interface file has changed. This can be used to detect when @cgrp and
+ * its descendants become populated or empty.
*/
-static DEFINE_RWLOCK(css_set_lock);
-static int css_set_count;
+static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
+{
+ lockdep_assert_held(&css_set_rwsem);
+
+ do {
+ bool trigger;
+
+ if (populated)
+ trigger = !cgrp->populated_cnt++;
+ else
+ trigger = !--cgrp->populated_cnt;
+
+ if (!trigger)
+ break;
+
+ if (cgrp->populated_kn)
+ kernfs_notify(cgrp->populated_kn);
+ cgrp = cgroup_parent(cgrp);
+ } while (cgrp);
+}
/*
* hash table for cgroup groups. This improves the performance to find
@@ -386,32 +498,20 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
return key;
}
-/*
- * We don't maintain the lists running through each css_set to its task
- * until after the first call to css_task_iter_start(). This reduces the
- * fork()/exit() overhead for people who have cgroups compiled into their
- * kernel but not actually in use.
- */
-static int use_task_css_set_links __read_mostly;
-
-static void __put_css_set(struct css_set *cset, int taskexit)
+static void put_css_set_locked(struct css_set *cset, bool taskexit)
{
struct cgrp_cset_link *link, *tmp_link;
+ struct cgroup_subsys *ss;
+ int ssid;
- /*
- * Ensure that the refcount doesn't hit zero while any readers
- * can see it. Similar to atomic_dec_and_lock(), but for an
- * rwlock
- */
- if (atomic_add_unless(&cset->refcount, -1, 1))
- return;
- write_lock(&css_set_lock);
- if (!atomic_dec_and_test(&cset->refcount)) {
- write_unlock(&css_set_lock);
+ lockdep_assert_held(&css_set_rwsem);
+
+ if (!atomic_dec_and_test(&cset->refcount))
return;
- }
/* This css_set is dead. unlink it and release cgroup refcounts */
+ for_each_subsys(ss, ssid)
+ list_del(&cset->e_cset_node[ssid]);
hash_del(&cset->hlist);
css_set_count--;
@@ -421,20 +521,37 @@ static void __put_css_set(struct css_set *cset, int taskexit)
list_del(&link->cset_link);
list_del(&link->cgrp_link);
- /* @cgrp can't go away while we're holding css_set_lock */
- if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
- if (taskexit)
- set_bit(CGRP_RELEASABLE, &cgrp->flags);
- check_for_release(cgrp);
+ /* @cgrp can't go away while we're holding css_set_rwsem */
+ if (list_empty(&cgrp->cset_links)) {
+ cgroup_update_populated(cgrp, false);
+ if (notify_on_release(cgrp)) {
+ if (taskexit)
+ set_bit(CGRP_RELEASABLE, &cgrp->flags);
+ check_for_release(cgrp);
+ }
}
kfree(link);
}
- write_unlock(&css_set_lock);
kfree_rcu(cset, rcu_head);
}
+static void put_css_set(struct css_set *cset, bool taskexit)
+{
+ /*
+ * Ensure that the refcount doesn't hit zero while any readers
+ * can see it. Similar to atomic_dec_and_lock(), but for an
+ * rwlock
+ */
+ if (atomic_add_unless(&cset->refcount, -1, 1))
+ return;
+
+ down_write(&css_set_rwsem);
+ put_css_set_locked(cset, taskexit);
+ up_write(&css_set_rwsem);
+}
+
/*
* refcounted get/put for css_set objects
*/
@@ -443,16 +560,6 @@ static inline void get_css_set(struct css_set *cset)
atomic_inc(&cset->refcount);
}
-static inline void put_css_set(struct css_set *cset)
-{
- __put_css_set(cset, 0);
-}
-
-static inline void put_css_set_taskexit(struct css_set *cset)
-{
- __put_css_set(cset, 1);
-}
-
/**
* compare_css_sets - helper function for find_existing_css_set().
* @cset: candidate css_set being tested
@@ -470,20 +577,20 @@ static bool compare_css_sets(struct css_set *cset,
{
struct list_head *l1, *l2;
- if (memcmp(template, cset->subsys, sizeof(cset->subsys))) {
- /* Not all subsystems matched */
+ /*
+ * On the default hierarchy, there can be csets which are
+ * associated with the same set of cgroups but different csses.
+ * Let's first ensure that csses match.
+ */
+ if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
return false;
- }
/*
* Compare cgroup pointers in order to distinguish between
- * different cgroups in heirarchies with no subsystems. We
- * could get by with just this check alone (and skip the
- * memcmp above) but on most setups the memcmp check will
- * avoid the need for this more expensive check on almost all
- * candidates.
+ * different cgroups in hierarchies. As different cgroups may
+ * share the same effective css, this comparison is always
+ * necessary.
*/
-
l1 = &cset->cgrp_links;
l2 = &old_cset->cgrp_links;
while (1) {
@@ -535,7 +642,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
struct cgroup *cgrp,
struct cgroup_subsys_state *template[])
{
- struct cgroupfs_root *root = cgrp->root;
+ struct cgroup_root *root = cgrp->root;
struct cgroup_subsys *ss;
struct css_set *cset;
unsigned long key;
@@ -548,13 +655,16 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
*/
for_each_subsys(ss, i) {
if (root->subsys_mask & (1UL << i)) {
- /* Subsystem is in this hierarchy. So we want
- * the subsystem state from the new
- * cgroup */
- template[i] = cgroup_css(cgrp, ss);
+ /*
+ * @ss is in this hierarchy, so we want the
+ * effective css from @cgrp.
+ */
+ template[i] = cgroup_e_css(cgrp, ss);
} else {
- /* Subsystem is not in this hierarchy, so we
- * don't want to change the subsystem state */
+ /*
+ * @ss is not in this hierarchy, so we don't want
+ * to change the css.
+ */
template[i] = old_cset->subsys[i];
}
}
@@ -620,10 +730,18 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
struct cgrp_cset_link *link;
BUG_ON(list_empty(tmp_links));
+
+ if (cgroup_on_dfl(cgrp))
+ cset->dfl_cgrp = cgrp;
+
link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
link->cset = cset;
link->cgrp = cgrp;
+
+ if (list_empty(&cgrp->cset_links))
+ cgroup_update_populated(cgrp, true);
list_move(&link->cset_link, &cgrp->cset_links);
+
/*
* Always add links to the tail of the list so that the list
* is sorted by order of hierarchy creation
@@ -646,17 +764,19 @@ static struct css_set *find_css_set(struct css_set *old_cset,
struct css_set *cset;
struct list_head tmp_links;
struct cgrp_cset_link *link;
+ struct cgroup_subsys *ss;
unsigned long key;
+ int ssid;
lockdep_assert_held(&cgroup_mutex);
/* First see if we already have a cgroup group that matches
* the desired set */
- read_lock(&css_set_lock);
+ down_read(&css_set_rwsem);
cset = find_existing_css_set(old_cset, cgrp, template);
if (cset)
get_css_set(cset);
- read_unlock(&css_set_lock);
+ up_read(&css_set_rwsem);
if (cset)
return cset;
@@ -674,13 +794,16 @@ static struct css_set *find_css_set(struct css_set *old_cset,
atomic_set(&cset->refcount, 1);
INIT_LIST_HEAD(&cset->cgrp_links);
INIT_LIST_HEAD(&cset->tasks);
+ INIT_LIST_HEAD(&cset->mg_tasks);
+ INIT_LIST_HEAD(&cset->mg_preload_node);
+ INIT_LIST_HEAD(&cset->mg_node);
INIT_HLIST_NODE(&cset->hlist);
/* Copy the set of subsystem state objects generated in
* find_existing_css_set() */
memcpy(cset->subsys, template, sizeof(cset->subsys));
- write_lock(&css_set_lock);
+ down_write(&css_set_rwsem);
/* Add reference counts and links from the new css_set. */
list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
@@ -694,35 +817,111 @@ static struct css_set *find_css_set(struct css_set *old_cset,
css_set_count++;
- /* Add this cgroup group to the hash table */
+ /* Add @cset to the hash table */
key = css_set_hash(cset->subsys);
hash_add(css_set_table, &cset->hlist, key);
- write_unlock(&css_set_lock);
+ for_each_subsys(ss, ssid)
+ list_add_tail(&cset->e_cset_node[ssid],
+ &cset->subsys[ssid]->cgroup->e_csets[ssid]);
+
+ up_write(&css_set_rwsem);
return cset;
}
-/*
- * Return the cgroup for "task" from the given hierarchy. Must be
- * called with cgroup_mutex held.
- */
-static struct cgroup *task_cgroup_from_root(struct task_struct *task,
- struct cgroupfs_root *root)
+static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
- struct css_set *cset;
- struct cgroup *res = NULL;
+ struct cgroup *root_cgrp = kf_root->kn->priv;
+
+ return root_cgrp->root;
+}
+
+static int cgroup_init_root_id(struct cgroup_root *root)
+{
+ int id;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
+ if (id < 0)
+ return id;
+
+ root->hierarchy_id = id;
+ return 0;
+}
+
+static void cgroup_exit_root_id(struct cgroup_root *root)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (root->hierarchy_id) {
+ idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
+ root->hierarchy_id = 0;
+ }
+}
+
+static void cgroup_free_root(struct cgroup_root *root)
+{
+ if (root) {
+ /* hierarhcy ID shoulid already have been released */
+ WARN_ON_ONCE(root->hierarchy_id);
+
+ idr_destroy(&root->cgroup_idr);
+ kfree(root);
+ }
+}
+
+static void cgroup_destroy_root(struct cgroup_root *root)
+{
+ struct cgroup *cgrp = &root->cgrp;
+ struct cgrp_cset_link *link, *tmp_link;
+
+ mutex_lock(&cgroup_mutex);
+
+ BUG_ON(atomic_read(&root->nr_cgrps));
+ BUG_ON(!list_empty(&cgrp->self.children));
+
+ /* Rebind all subsystems back to the default hierarchy */
+ rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
- BUG_ON(!mutex_is_locked(&cgroup_mutex));
- read_lock(&css_set_lock);
/*
- * No need to lock the task - since we hold cgroup_mutex the
- * task can't change groups, so the only thing that can happen
- * is that it exits and its css is set back to init_css_set.
+ * Release all the links from cset_links to this hierarchy's
+ * root cgroup
*/
- cset = task_css_set(task);
+ down_write(&css_set_rwsem);
+
+ list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
+ list_del(&link->cset_link);
+ list_del(&link->cgrp_link);
+ kfree(link);
+ }
+ up_write(&css_set_rwsem);
+
+ if (!list_empty(&root->root_list)) {
+ list_del(&root->root_list);
+ cgroup_root_count--;
+ }
+
+ cgroup_exit_root_id(root);
+
+ mutex_unlock(&cgroup_mutex);
+
+ kernfs_destroy_root(root->kf_root);
+ cgroup_free_root(root);
+}
+
+/* look up cgroup associated with given css_set on the specified hierarchy */
+static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
+ struct cgroup_root *root)
+{
+ struct cgroup *res = NULL;
+
+ lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&css_set_rwsem);
+
if (cset == &init_css_set) {
- res = &root->top_cgroup;
+ res = &root->cgrp;
} else {
struct cgrp_cset_link *link;
@@ -735,16 +934,27 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
}
}
}
- read_unlock(&css_set_lock);
+
BUG_ON(!res);
return res;
}
/*
- * There is one global cgroup mutex. We also require taking
- * task_lock() when dereferencing a task's cgroup subsys pointers.
- * See "The task_lock() exception", at the end of this comment.
- *
+ * Return the cgroup for "task" from the given hierarchy. Must be
+ * called with cgroup_mutex and css_set_rwsem held.
+ */
+static struct cgroup *task_cgroup_from_root(struct task_struct *task,
+ struct cgroup_root *root)
+{
+ /*
+ * No need to lock the task - since we hold cgroup_mutex the
+ * task can't change groups, so the only thing that can happen
+ * is that it exits and its css is set back to init_css_set.
+ */
+ return cset_cgroup_from_root(task_css_set(task), root);
+}
+
+/*
* A task must hold cgroup_mutex to modify cgroups.
*
* Any task can increment and decrement the count field without lock.
@@ -770,175 +980,197 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
* A cgroup can only be deleted if both its 'count' of using tasks
* is zero, and its list of 'children' cgroups is empty. Since all
* tasks in the system use _some_ cgroup, and since there is always at
- * least one task in the system (init, pid == 1), therefore, top_cgroup
+ * least one task in the system (init, pid == 1), therefore, root cgroup
* always has either children cgroups and/or using tasks. So we don't
- * need a special hack to ensure that top_cgroup cannot be deleted.
- *
- * The task_lock() exception
- *
- * The need for this exception arises from the action of
- * cgroup_attach_task(), which overwrites one task's cgroup pointer with
- * another. It does so using cgroup_mutex, however there are
- * several performance critical places that need to reference
- * task->cgroup without the expense of grabbing a system global
- * mutex. Therefore except as noted below, when dereferencing or, as
- * in cgroup_attach_task(), modifying a task's cgroup pointer we use
- * task_lock(), which acts on a spinlock (task->alloc_lock) already in
- * the task_struct routinely used for such matters.
+ * need a special hack to ensure that root cgroup cannot be deleted.
*
* P.S. One more locking exception. RCU is used to guard the
* update of a tasks cgroup pointer by cgroup_attach_task()
*/
-/*
- * A couple of forward declarations required, due to cyclic reference loop:
- * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
- * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
- * -> cgroup_mkdir.
- */
-
-static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
-static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
-static const struct inode_operations cgroup_dir_inode_operations;
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
+static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
static const struct file_operations proc_cgroupstats_operations;
-static struct backing_dev_info cgroup_backing_dev_info = {
- .name = "cgroup",
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
-
-static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
+static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
+ char *buf)
{
- struct inode *inode = new_inode(sb);
-
- if (inode) {
- inode->i_ino = get_next_ino();
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- inode->i_gid = current_fsgid();
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
- }
- return inode;
-}
-
-static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
-{
- struct cgroup_name *name;
-
- name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL);
- if (!name)
- return NULL;
- strcpy(name->name, dentry->d_name.name);
- return name;
+ if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
+ !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
+ snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
+ cft->ss->name, cft->name);
+ else
+ strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+ return buf;
}
-static void cgroup_free_fn(struct work_struct *work)
+/**
+ * cgroup_file_mode - deduce file mode of a control file
+ * @cft: the control file in question
+ *
+ * returns cft->mode if ->mode is not 0
+ * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
+ * returns S_IRUGO if it has only a read handler
+ * returns S_IWUSR if it has only a write hander
+ */
+static umode_t cgroup_file_mode(const struct cftype *cft)
{
- struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
-
- mutex_lock(&cgroup_mutex);
- cgrp->root->number_of_cgroups--;
- mutex_unlock(&cgroup_mutex);
+ umode_t mode = 0;
- /*
- * We get a ref to the parent's dentry, and put the ref when
- * this cgroup is being freed, so it's guaranteed that the
- * parent won't be destroyed before its children.
- */
- dput(cgrp->parent->dentry);
+ if (cft->mode)
+ return cft->mode;
- /*
- * Drop the active superblock reference that we took when we
- * created the cgroup. This will free cgrp->root, if we are
- * holding the last reference to @sb.
- */
- deactivate_super(cgrp->root->sb);
+ if (cft->read_u64 || cft->read_s64 || cft->seq_show)
+ mode |= S_IRUGO;
- cgroup_pidlist_destroy_all(cgrp);
+ if (cft->write_u64 || cft->write_s64 || cft->write)
+ mode |= S_IWUSR;
- simple_xattrs_free(&cgrp->xattrs);
+ return mode;
+}
- kfree(rcu_dereference_raw(cgrp->name));
- kfree(cgrp);
+static void cgroup_get(struct cgroup *cgrp)
+{
+ WARN_ON_ONCE(cgroup_is_dead(cgrp));
+ css_get(&cgrp->self);
}
-static void cgroup_free_rcu(struct rcu_head *head)
+static bool cgroup_tryget(struct cgroup *cgrp)
{
- struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
+ return css_tryget(&cgrp->self);
+}
- INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
- queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
+static void cgroup_put(struct cgroup *cgrp)
+{
+ css_put(&cgrp->self);
}
-static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+/**
+ * cgroup_refresh_child_subsys_mask - update child_subsys_mask
+ * @cgrp: the target cgroup
+ *
+ * On the default hierarchy, a subsystem may request other subsystems to be
+ * enabled together through its ->depends_on mask. In such cases, more
+ * subsystems than specified in "cgroup.subtree_control" may be enabled.
+ *
+ * This function determines which subsystems need to be enabled given the
+ * current @cgrp->subtree_control and records it in
+ * @cgrp->child_subsys_mask. The resulting mask is always a superset of
+ * @cgrp->subtree_control and follows the usual hierarchy rules.
+ */
+static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
{
- /* is dentry a directory ? if so, kfree() associated cgroup */
- if (S_ISDIR(inode->i_mode)) {
- struct cgroup *cgrp = dentry->d_fsdata;
+ struct cgroup *parent = cgroup_parent(cgrp);
+ unsigned int cur_ss_mask = cgrp->subtree_control;
+ struct cgroup_subsys *ss;
+ int ssid;
- BUG_ON(!(cgroup_is_dead(cgrp)));
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!cgroup_on_dfl(cgrp)) {
+ cgrp->child_subsys_mask = cur_ss_mask;
+ return;
+ }
+
+ while (true) {
+ unsigned int new_ss_mask = cur_ss_mask;
+
+ for_each_subsys(ss, ssid)
+ if (cur_ss_mask & (1 << ssid))
+ new_ss_mask |= ss->depends_on;
/*
- * XXX: cgrp->id is only used to look up css's. As cgroup
- * and css's lifetimes will be decoupled, it should be made
- * per-subsystem and moved to css->id so that lookups are
- * successful until the target css is released.
+ * Mask out subsystems which aren't available. This can
+ * happen only if some depended-upon subsystems were bound
+ * to non-default hierarchies.
*/
- mutex_lock(&cgroup_mutex);
- idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
- mutex_unlock(&cgroup_mutex);
- cgrp->id = -1;
+ if (parent)
+ new_ss_mask &= parent->child_subsys_mask;
+ else
+ new_ss_mask &= cgrp->root->subsys_mask;
- call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
- } else {
- struct cfent *cfe = __d_cfe(dentry);
- struct cgroup *cgrp = dentry->d_parent->d_fsdata;
-
- WARN_ONCE(!list_empty(&cfe->node) &&
- cgrp != &cgrp->root->top_cgroup,
- "cfe still linked for %s\n", cfe->type->name);
- simple_xattrs_free(&cfe->xattrs);
- kfree(cfe);
+ if (new_ss_mask == cur_ss_mask)
+ break;
+ cur_ss_mask = new_ss_mask;
}
- iput(inode);
+
+ cgrp->child_subsys_mask = cur_ss_mask;
}
-static void remove_dir(struct dentry *d)
+/**
+ * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
+ * @kn: the kernfs_node being serviced
+ *
+ * This helper undoes cgroup_kn_lock_live() and should be invoked before
+ * the method finishes if locking succeeded. Note that once this function
+ * returns the cgroup returned by cgroup_kn_lock_live() may become
+ * inaccessible any time. If the caller intends to continue to access the
+ * cgroup, it should pin it before invoking this function.
+ */
+static void cgroup_kn_unlock(struct kernfs_node *kn)
{
- struct dentry *parent = dget(d->d_parent);
+ struct cgroup *cgrp;
- d_delete(d);
- simple_rmdir(parent->d_inode, d);
- dput(parent);
+ if (kernfs_type(kn) == KERNFS_DIR)
+ cgrp = kn->priv;
+ else
+ cgrp = kn->parent->priv;
+
+ mutex_unlock(&cgroup_mutex);
+
+ kernfs_unbreak_active_protection(kn);
+ cgroup_put(cgrp);
}
-static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
+/**
+ * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
+ * @kn: the kernfs_node being serviced
+ *
+ * This helper is to be used by a cgroup kernfs method currently servicing
+ * @kn. It breaks the active protection, performs cgroup locking and
+ * verifies that the associated cgroup is alive. Returns the cgroup if
+ * alive; otherwise, %NULL. A successful return should be undone by a
+ * matching cgroup_kn_unlock() invocation.
+ *
+ * Any cgroup kernfs method implementation which requires locking the
+ * associated cgroup should use this helper. It avoids nesting cgroup
+ * locking under kernfs active protection and allows all kernfs operations
+ * including self-removal.
+ */
+static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
{
- struct cfent *cfe;
+ struct cgroup *cgrp;
- lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
- lockdep_assert_held(&cgroup_mutex);
+ if (kernfs_type(kn) == KERNFS_DIR)
+ cgrp = kn->priv;
+ else
+ cgrp = kn->parent->priv;
/*
- * If we're doing cleanup due to failure of cgroup_create(),
- * the corresponding @cfe may not exist.
+ * We're gonna grab cgroup_mutex which nests outside kernfs
+ * active_ref. cgroup liveliness check alone provides enough
+ * protection against removal. Ensure @cgrp stays accessible and
+ * break the active_ref protection.
*/
- list_for_each_entry(cfe, &cgrp->files, node) {
- struct dentry *d = cfe->dentry;
+ if (!cgroup_tryget(cgrp))
+ return NULL;
+ kernfs_break_active_protection(kn);
- if (cft && cfe->type != cft)
- continue;
+ mutex_lock(&cgroup_mutex);
- dget(d);
- d_delete(d);
- simple_unlink(cgrp->dentry->d_inode, d);
- list_del_init(&cfe->node);
- dput(d);
+ if (!cgroup_is_dead(cgrp))
+ return cgrp;
- break;
- }
+ cgroup_kn_unlock(kn);
+ return NULL;
+}
+
+static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
+{
+ char name[CGROUP_FILE_NAME_MAX];
+
+ lockdep_assert_held(&cgroup_mutex);
+ kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
}
/**
@@ -946,203 +1178,171 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
* @cgrp: target cgroup
* @subsys_mask: mask of the subsystem ids whose files should be removed
*/
-static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
+static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
{
struct cgroup_subsys *ss;
int i;
for_each_subsys(ss, i) {
- struct cftype_set *set;
+ struct cftype *cfts;
- if (!test_bit(i, &subsys_mask))
+ if (!(subsys_mask & (1 << i)))
continue;
- list_for_each_entry(set, &ss->cftsets, node)
- cgroup_addrm_files(cgrp, set->cfts, false);
+ list_for_each_entry(cfts, &ss->cfts, node)
+ cgroup_addrm_files(cgrp, cfts, false);
}
}
-/*
- * NOTE : the dentry must have been dget()'ed
- */
-static void cgroup_d_remove_dir(struct dentry *dentry)
-{
- struct dentry *parent;
-
- parent = dentry->d_parent;
- spin_lock(&parent->d_lock);
- spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- list_del_init(&dentry->d_u.d_child);
- spin_unlock(&dentry->d_lock);
- spin_unlock(&parent->d_lock);
- remove_dir(dentry);
-}
-
-/*
- * Call with cgroup_mutex held. Drops reference counts on modules, including
- * any duplicate ones that parse_cgroupfs_options took. If this function
- * returns an error, no reference counts are touched.
- */
-static int rebind_subsystems(struct cgroupfs_root *root,
- unsigned long added_mask, unsigned removed_mask)
+static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
{
- struct cgroup *cgrp = &root->top_cgroup;
struct cgroup_subsys *ss;
- unsigned long pinned = 0;
- int i, ret;
+ unsigned int tmp_ss_mask;
+ int ssid, i, ret;
- BUG_ON(!mutex_is_locked(&cgroup_mutex));
- BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
+ lockdep_assert_held(&cgroup_mutex);
- /* Check that any added subsystems are currently free */
- for_each_subsys(ss, i) {
- if (!(added_mask & (1 << i)))
+ for_each_subsys(ss, ssid) {
+ if (!(ss_mask & (1 << ssid)))
continue;
- /* is the subsystem mounted elsewhere? */
- if (ss->root != &cgroup_dummy_root) {
- ret = -EBUSY;
- goto out_put;
- }
+ /* if @ss has non-root csses attached to it, can't move */
+ if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
+ return -EBUSY;
- /* pin the module */
- if (!try_module_get(ss->module)) {
- ret = -ENOENT;
- goto out_put;
- }
- pinned |= 1 << i;
+ /* can't move between two non-dummy roots either */
+ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
+ return -EBUSY;
}
- /* subsys could be missing if unloaded between parsing and here */
- if (added_mask != pinned) {
- ret = -ENOENT;
- goto out_put;
- }
+ /* skip creating root files on dfl_root for inhibited subsystems */
+ tmp_ss_mask = ss_mask;
+ if (dst_root == &cgrp_dfl_root)
+ tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
- ret = cgroup_populate_dir(cgrp, added_mask);
- if (ret)
- goto out_put;
+ ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask);
+ if (ret) {
+ if (dst_root != &cgrp_dfl_root)
+ return ret;
+
+ /*
+ * Rebinding back to the default root is not allowed to
+ * fail. Using both default and non-default roots should
+ * be rare. Moving subsystems back and forth even more so.
+ * Just warn about it and continue.
+ */
+ if (cgrp_dfl_root_visible) {
+ pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
+ ret, ss_mask);
+ pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
+ }
+ }
/*
* Nothing can fail from this point on. Remove files for the
* removed subsystems and rebind each subsystem.
*/
- cgroup_clear_dir(cgrp, removed_mask);
-
- for_each_subsys(ss, i) {
- unsigned long bit = 1UL << i;
-
- if (bit & added_mask) {
- /* We're binding this subsystem to this hierarchy */
- BUG_ON(cgroup_css(cgrp, ss));
- BUG_ON(!cgroup_css(cgroup_dummy_top, ss));
- BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top);
-
- rcu_assign_pointer(cgrp->subsys[i],
- cgroup_css(cgroup_dummy_top, ss));
- cgroup_css(cgrp, ss)->cgroup = cgrp;
+ for_each_subsys(ss, ssid)
+ if (ss_mask & (1 << ssid))
+ cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
- ss->root = root;
- if (ss->bind)
- ss->bind(cgroup_css(cgrp, ss));
+ for_each_subsys(ss, ssid) {
+ struct cgroup_root *src_root;
+ struct cgroup_subsys_state *css;
+ struct css_set *cset;
- /* refcount was already taken, and we're keeping it */
- root->subsys_mask |= bit;
- } else if (bit & removed_mask) {
- /* We're removing this subsystem */
- BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss));
- BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp);
+ if (!(ss_mask & (1 << ssid)))
+ continue;
- if (ss->bind)
- ss->bind(cgroup_css(cgroup_dummy_top, ss));
+ src_root = ss->root;
+ css = cgroup_css(&src_root->cgrp, ss);
- cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top;
- RCU_INIT_POINTER(cgrp->subsys[i], NULL);
+ WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
- cgroup_subsys[i]->root = &cgroup_dummy_root;
+ RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
+ rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
+ ss->root = dst_root;
+ css->cgroup = &dst_root->cgrp;
- /* subsystem is now free - drop reference on module */
- module_put(ss->module);
- root->subsys_mask &= ~bit;
+ down_write(&css_set_rwsem);
+ hash_for_each(css_set_table, i, cset, hlist)
+ list_move_tail(&cset->e_cset_node[ss->id],
+ &dst_root->cgrp.e_csets[ss->id]);
+ up_write(&css_set_rwsem);
+
+ src_root->subsys_mask &= ~(1 << ssid);
+ src_root->cgrp.subtree_control &= ~(1 << ssid);
+ cgroup_refresh_child_subsys_mask(&src_root->cgrp);
+
+ /* default hierarchy doesn't enable controllers by default */
+ dst_root->subsys_mask |= 1 << ssid;
+ if (dst_root != &cgrp_dfl_root) {
+ dst_root->cgrp.subtree_control |= 1 << ssid;
+ cgroup_refresh_child_subsys_mask(&dst_root->cgrp);
}
- }
- /*
- * Mark @root has finished binding subsystems. @root->subsys_mask
- * now matches the bound subsystems.
- */
- root->flags |= CGRP_ROOT_SUBSYS_BOUND;
+ if (ss->bind)
+ ss->bind(css);
+ }
+ kernfs_activate(dst_root->cgrp.kn);
return 0;
-
-out_put:
- for_each_subsys(ss, i)
- if (pinned & (1 << i))
- module_put(ss->module);
- return ret;
}
-static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
+static int cgroup_show_options(struct seq_file *seq,
+ struct kernfs_root *kf_root)
{
- struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_subsys *ss;
int ssid;
- mutex_lock(&cgroup_root_mutex);
for_each_subsys(ss, ssid)
if (root->subsys_mask & (1 << ssid))
seq_printf(seq, ",%s", ss->name);
- if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
- seq_puts(seq, ",sane_behavior");
if (root->flags & CGRP_ROOT_NOPREFIX)
seq_puts(seq, ",noprefix");
if (root->flags & CGRP_ROOT_XATTR)
seq_puts(seq, ",xattr");
+
+ spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
seq_printf(seq, ",release_agent=%s", root->release_agent_path);
- if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
+ spin_unlock(&release_agent_path_lock);
+
+ if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
seq_puts(seq, ",clone_children");
if (strlen(root->name))
seq_printf(seq, ",name=%s", root->name);
- mutex_unlock(&cgroup_root_mutex);
return 0;
}
struct cgroup_sb_opts {
- unsigned long subsys_mask;
- unsigned long flags;
+ unsigned int subsys_mask;
+ unsigned int flags;
char *release_agent;
bool cpuset_clone_children;
char *name;
/* User explicitly requested empty subsystem */
bool none;
-
- struct cgroupfs_root *new_root;
-
};
-/*
- * Convert a hierarchy specifier into a bitmask of subsystems and
- * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
- * array. This function takes refcounts on subsystems to be used, unless it
- * returns error, in which case no refcounts are taken.
- */
static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
{
char *token, *o = data;
bool all_ss = false, one_ss = false;
- unsigned long mask = (unsigned long)-1;
+ unsigned int mask = -1U;
struct cgroup_subsys *ss;
+ int nr_opts = 0;
int i;
- BUG_ON(!mutex_is_locked(&cgroup_mutex));
-
#ifdef CONFIG_CPUSETS
- mask = ~(1UL << cpuset_subsys_id);
+ mask = ~(1U << cpuset_cgrp_id);
#endif
memset(opts, 0, sizeof(*opts));
while ((token = strsep(&o, ",")) != NULL) {
+ nr_opts++;
+
if (!*token)
return -EINVAL;
if (!strcmp(token, "none")) {
@@ -1218,7 +1418,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
/* Mutually exclusive option 'all' + subsystem name */
if (all_ss)
return -EINVAL;
- set_bit(i, &opts->subsys_mask);
+ opts->subsys_mask |= (1 << i);
one_ss = true;
break;
@@ -1227,31 +1427,31 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
return -ENOENT;
}
+ if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
+ pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
+ if (nr_opts != 1) {
+ pr_err("sane_behavior: no other mount options allowed\n");
+ return -EINVAL;
+ }
+ return 0;
+ }
+
/*
* If the 'all' option was specified select all the subsystems,
- * otherwise if 'none', 'name=' and a subsystem name options
- * were not specified, let's default to 'all'
+ * otherwise if 'none', 'name=' and a subsystem name options were
+ * not specified, let's default to 'all'
*/
if (all_ss || (!one_ss && !opts->none && !opts->name))
for_each_subsys(ss, i)
if (!ss->disabled)
- set_bit(i, &opts->subsys_mask);
+ opts->subsys_mask |= (1 << i);
- /* Consistency checks */
-
- if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
- pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
-
- if (opts->flags & CGRP_ROOT_NOPREFIX) {
- pr_err("cgroup: sane_behavior: noprefix is not allowed\n");
- return -EINVAL;
- }
-
- if (opts->cpuset_clone_children) {
- pr_err("cgroup: sane_behavior: clone_children is not allowed\n");
- return -EINVAL;
- }
- }
+ /*
+ * We either have to specify by name or by subsystems. (So all
+ * empty hierarchies must have a name).
+ */
+ if (!opts->subsys_mask && !opts->name)
+ return -EINVAL;
/*
* Option noprefix was introduced just for backward compatibility
@@ -1261,37 +1461,26 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
return -EINVAL;
-
/* Can't specify "none" and some subsystems */
if (opts->subsys_mask && opts->none)
return -EINVAL;
- /*
- * We either have to specify by name or by subsystems. (So all
- * empty hierarchies must have a name).
- */
- if (!opts->subsys_mask && !opts->name)
- return -EINVAL;
-
return 0;
}
-static int cgroup_remount(struct super_block *sb, int *flags, char *data)
+static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
{
int ret = 0;
- struct cgroupfs_root *root = sb->s_fs_info;
- struct cgroup *cgrp = &root->top_cgroup;
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_sb_opts opts;
- unsigned long added_mask, removed_mask;
+ unsigned int added_mask, removed_mask;
- if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
- pr_err("cgroup: sane_behavior: remount is not allowed\n");
+ if (root == &cgrp_dfl_root) {
+ pr_err("remount is not allowed\n");
return -EINVAL;
}
- mutex_lock(&cgrp->dentry->d_inode->i_mutex);
mutex_lock(&cgroup_mutex);
- mutex_lock(&cgroup_root_mutex);
/* See what subsystems are wanted */
ret = parse_cgroupfs_options(data, &opts);
@@ -1299,439 +1488,411 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
goto out_unlock;
if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
- pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
- task_tgid_nr(current), current->comm);
+ pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
+ task_tgid_nr(current), current->comm);
added_mask = opts.subsys_mask & ~root->subsys_mask;
removed_mask = root->subsys_mask & ~opts.subsys_mask;
/* Don't allow flags or name to change at remount */
- if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
+ if ((opts.flags ^ root->flags) ||
(opts.name && strcmp(opts.name, root->name))) {
- pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n",
- opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
- root->flags & CGRP_ROOT_OPTION_MASK, root->name);
+ pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
+ opts.flags, opts.name ?: "", root->flags, root->name);
ret = -EINVAL;
goto out_unlock;
}
/* remounting is not allowed for populated hierarchies */
- if (root->number_of_cgroups > 1) {
+ if (!list_empty(&root->cgrp.self.children)) {
ret = -EBUSY;
goto out_unlock;
}
- ret = rebind_subsystems(root, added_mask, removed_mask);
+ ret = rebind_subsystems(root, added_mask);
if (ret)
goto out_unlock;
- if (opts.release_agent)
+ rebind_subsystems(&cgrp_dfl_root, removed_mask);
+
+ if (opts.release_agent) {
+ spin_lock(&release_agent_path_lock);
strcpy(root->release_agent_path, opts.release_agent);
+ spin_unlock(&release_agent_path_lock);
+ }
out_unlock:
kfree(opts.release_agent);
kfree(opts.name);
- mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
- mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
return ret;
}
-static const struct super_operations cgroup_ops = {
- .statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
- .show_options = cgroup_show_options,
- .remount_fs = cgroup_remount,
-};
-
-static void init_cgroup_housekeeping(struct cgroup *cgrp)
-{
- INIT_LIST_HEAD(&cgrp->sibling);
- INIT_LIST_HEAD(&cgrp->children);
- INIT_LIST_HEAD(&cgrp->files);
- INIT_LIST_HEAD(&cgrp->cset_links);
- INIT_LIST_HEAD(&cgrp->release_list);
- INIT_LIST_HEAD(&cgrp->pidlists);
- mutex_init(&cgrp->pidlist_mutex);
- cgrp->dummy_css.cgroup = cgrp;
- simple_xattrs_init(&cgrp->xattrs);
-}
+/*
+ * To reduce the fork() overhead for systems that are not actually using
+ * their cgroups capability, we don't maintain the lists running through
+ * each css_set to its tasks until we see the list actually used - in other
+ * words after the first mount.
+ */
+static bool use_task_css_set_links __read_mostly;
-static void init_cgroup_root(struct cgroupfs_root *root)
+static void cgroup_enable_task_cg_lists(void)
{
- struct cgroup *cgrp = &root->top_cgroup;
-
- INIT_LIST_HEAD(&root->root_list);
- root->number_of_cgroups = 1;
- cgrp->root = root;
- RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
- init_cgroup_housekeeping(cgrp);
- idr_init(&root->cgroup_idr);
-}
+ struct task_struct *p, *g;
-static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
-{
- int id;
+ down_write(&css_set_rwsem);
- lockdep_assert_held(&cgroup_mutex);
- lockdep_assert_held(&cgroup_root_mutex);
+ if (use_task_css_set_links)
+ goto out_unlock;
- id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end,
- GFP_KERNEL);
- if (id < 0)
- return id;
+ use_task_css_set_links = true;
- root->hierarchy_id = id;
- return 0;
-}
+ /*
+ * We need tasklist_lock because RCU is not safe against
+ * while_each_thread(). Besides, a forking task that has passed
+ * cgroup_post_fork() without seeing use_task_css_set_links = 1
+ * is not guaranteed to have its child immediately visible in the
+ * tasklist if we walk through it with RCU.
+ */
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ WARN_ON_ONCE(!list_empty(&p->cg_list) ||
+ task_css_set(p) != &init_css_set);
-static void cgroup_exit_root_id(struct cgroupfs_root *root)
-{
- lockdep_assert_held(&cgroup_mutex);
- lockdep_assert_held(&cgroup_root_mutex);
+ /*
+ * We should check if the process is exiting, otherwise
+ * it will race with cgroup_exit() in that the list
+ * entry won't be deleted though the process has exited.
+ * Do it while holding siglock so that we don't end up
+ * racing against cgroup_exit().
+ */
+ spin_lock_irq(&p->sighand->siglock);
+ if (!(p->flags & PF_EXITING)) {
+ struct css_set *cset = task_css_set(p);
- if (root->hierarchy_id) {
- idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
- root->hierarchy_id = 0;
- }
+ list_add(&p->cg_list, &cset->tasks);
+ get_css_set(cset);
+ }
+ spin_unlock_irq(&p->sighand->siglock);
+ } while_each_thread(g, p);
+ read_unlock(&tasklist_lock);
+out_unlock:
+ up_write(&css_set_rwsem);
}
-static int cgroup_test_super(struct super_block *sb, void *data)
+static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
- struct cgroup_sb_opts *opts = data;
- struct cgroupfs_root *root = sb->s_fs_info;
+ struct cgroup_subsys *ss;
+ int ssid;
- /* If we asked for a name then it must match */
- if (opts->name && strcmp(opts->name, root->name))
- return 0;
+ INIT_LIST_HEAD(&cgrp->self.sibling);
+ INIT_LIST_HEAD(&cgrp->self.children);
+ INIT_LIST_HEAD(&cgrp->cset_links);
+ INIT_LIST_HEAD(&cgrp->release_list);
+ INIT_LIST_HEAD(&cgrp->pidlists);
+ mutex_init(&cgrp->pidlist_mutex);
+ cgrp->self.cgroup = cgrp;
+ cgrp->self.flags |= CSS_ONLINE;
- /*
- * If we asked for subsystems (or explicitly for no
- * subsystems) then they must match
- */
- if ((opts->subsys_mask || opts->none)
- && (opts->subsys_mask != root->subsys_mask))
- return 0;
+ for_each_subsys(ss, ssid)
+ INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
- return 1;
+ init_waitqueue_head(&cgrp->offline_waitq);
}
-static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
+static void init_cgroup_root(struct cgroup_root *root,
+ struct cgroup_sb_opts *opts)
{
- struct cgroupfs_root *root;
+ struct cgroup *cgrp = &root->cgrp;
- if (!opts->subsys_mask && !opts->none)
- return NULL;
-
- root = kzalloc(sizeof(*root), GFP_KERNEL);
- if (!root)
- return ERR_PTR(-ENOMEM);
-
- init_cgroup_root(root);
+ INIT_LIST_HEAD(&root->root_list);
+ atomic_set(&root->nr_cgrps, 1);
+ cgrp->root = root;
+ init_cgroup_housekeeping(cgrp);
+ idr_init(&root->cgroup_idr);
- /*
- * We need to set @root->subsys_mask now so that @root can be
- * matched by cgroup_test_super() before it finishes
- * initialization; otherwise, competing mounts with the same
- * options may try to bind the same subsystems instead of waiting
- * for the first one leading to unexpected mount errors.
- * SUBSYS_BOUND will be set once actual binding is complete.
- */
- root->subsys_mask = opts->subsys_mask;
root->flags = opts->flags;
if (opts->release_agent)
strcpy(root->release_agent_path, opts->release_agent);
if (opts->name)
strcpy(root->name, opts->name);
if (opts->cpuset_clone_children)
- set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags);
- return root;
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
-static void cgroup_free_root(struct cgroupfs_root *root)
+static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
{
- if (root) {
- /* hierarhcy ID shoulid already have been released */
- WARN_ON_ONCE(root->hierarchy_id);
+ LIST_HEAD(tmp_links);
+ struct cgroup *root_cgrp = &root->cgrp;
+ struct cftype *base_files;
+ struct css_set *cset;
+ int i, ret;
- idr_destroy(&root->cgroup_idr);
- kfree(root);
- }
-}
+ lockdep_assert_held(&cgroup_mutex);
-static int cgroup_set_super(struct super_block *sb, void *data)
-{
- int ret;
- struct cgroup_sb_opts *opts = data;
+ ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
+ if (ret < 0)
+ goto out;
+ root_cgrp->id = ret;
- /* If we don't have a new root, we can't set up a new sb */
- if (!opts->new_root)
- return -EINVAL;
+ ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release);
+ if (ret)
+ goto out;
- BUG_ON(!opts->subsys_mask && !opts->none);
+ /*
+ * We're accessing css_set_count without locking css_set_rwsem here,
+ * but that's OK - it can only be increased by someone holding
+ * cgroup_lock, and that's us. The worst that can happen is that we
+ * have some link structures left over
+ */
+ ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
+ if (ret)
+ goto cancel_ref;
- ret = set_anon_super(sb, NULL);
+ ret = cgroup_init_root_id(root);
if (ret)
- return ret;
+ goto cancel_ref;
+
+ root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
+ KERNFS_ROOT_CREATE_DEACTIVATED,
+ root_cgrp);
+ if (IS_ERR(root->kf_root)) {
+ ret = PTR_ERR(root->kf_root);
+ goto exit_root_id;
+ }
+ root_cgrp->kn = root->kf_root->kn;
- sb->s_fs_info = opts->new_root;
- opts->new_root->sb = sb;
+ if (root == &cgrp_dfl_root)
+ base_files = cgroup_dfl_base_files;
+ else
+ base_files = cgroup_legacy_base_files;
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
- sb->s_magic = CGROUP_SUPER_MAGIC;
- sb->s_op = &cgroup_ops;
+ ret = cgroup_addrm_files(root_cgrp, base_files, true);
+ if (ret)
+ goto destroy_root;
- return 0;
-}
+ ret = rebind_subsystems(root, ss_mask);
+ if (ret)
+ goto destroy_root;
-static int cgroup_get_rootdir(struct super_block *sb)
-{
- static const struct dentry_operations cgroup_dops = {
- .d_iput = cgroup_diput,
- .d_delete = always_delete_dentry,
- };
+ /*
+ * There must be no failure case after here, since rebinding takes
+ * care of subsystems' refcounts, which are explicitly dropped in
+ * the failure exit path.
+ */
+ list_add(&root->root_list, &cgroup_roots);
+ cgroup_root_count++;
+
+ /*
+ * Link the root cgroup in this hierarchy into all the css_set
+ * objects.
+ */
+ down_write(&css_set_rwsem);
+ hash_for_each(css_set_table, i, cset, hlist)
+ link_css_set(&tmp_links, cset, root_cgrp);
+ up_write(&css_set_rwsem);
- struct inode *inode =
- cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
+ BUG_ON(!list_empty(&root_cgrp->self.children));
+ BUG_ON(atomic_read(&root->nr_cgrps) != 1);
- if (!inode)
- return -ENOMEM;
+ kernfs_activate(root_cgrp->kn);
+ ret = 0;
+ goto out;
- inode->i_fop = &simple_dir_operations;
- inode->i_op = &cgroup_dir_inode_operations;
- /* directories start off with i_nlink == 2 (for "." entry) */
- inc_nlink(inode);
- sb->s_root = d_make_root(inode);
- if (!sb->s_root)
- return -ENOMEM;
- /* for everything else we want ->d_op set */
- sb->s_d_op = &cgroup_dops;
- return 0;
+destroy_root:
+ kernfs_destroy_root(root->kf_root);
+ root->kf_root = NULL;
+exit_root_id:
+ cgroup_exit_root_id(root);
+cancel_ref:
+ percpu_ref_exit(&root_cgrp->self.refcnt);
+out:
+ free_cgrp_cset_links(&tmp_links);
+ return ret;
}
static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
void *data)
{
+ struct super_block *pinned_sb = NULL;
+ struct cgroup_subsys *ss;
+ struct cgroup_root *root;
struct cgroup_sb_opts opts;
- struct cgroupfs_root *root;
- int ret = 0;
- struct super_block *sb;
- struct cgroupfs_root *new_root;
- struct list_head tmp_links;
- struct inode *inode;
- const struct cred *cred;
+ struct dentry *dentry;
+ int ret;
+ int i;
+ bool new_sb;
+
+ /*
+ * The first time anyone tries to mount a cgroup, enable the list
+ * linking each css_set to its tasks and fix up all existing tasks.
+ */
+ if (!use_task_css_set_links)
+ cgroup_enable_task_cg_lists();
- /* First find the desired set of subsystems */
mutex_lock(&cgroup_mutex);
+
+ /* First find the desired set of subsystems */
ret = parse_cgroupfs_options(data, &opts);
- mutex_unlock(&cgroup_mutex);
if (ret)
- goto out_err;
+ goto out_unlock;
+
+ /* look for a matching existing root */
+ if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
+ cgrp_dfl_root_visible = true;
+ root = &cgrp_dfl_root;
+ cgroup_get(&root->cgrp);
+ ret = 0;
+ goto out_unlock;
+ }
/*
- * Allocate a new cgroup root. We may not need it if we're
- * reusing an existing hierarchy.
+ * Destruction of cgroup root is asynchronous, so subsystems may
+ * still be dying after the previous unmount. Let's drain the
+ * dying subsystems. We just need to ensure that the ones
+ * unmounted previously finish dying and don't care about new ones
+ * starting. Testing ref liveliness is good enough.
*/
- new_root = cgroup_root_from_opts(&opts);
- if (IS_ERR(new_root)) {
- ret = PTR_ERR(new_root);
- goto out_err;
- }
- opts.new_root = new_root;
+ for_each_subsys(ss, i) {
+ if (!(opts.subsys_mask & (1 << i)) ||
+ ss->root == &cgrp_dfl_root)
+ continue;
- /* Locate an existing or new sb for this hierarchy */
- sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts);
- if (IS_ERR(sb)) {
- ret = PTR_ERR(sb);
- cgroup_free_root(opts.new_root);
- goto out_err;
+ if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+ mutex_unlock(&cgroup_mutex);
+ msleep(10);
+ ret = restart_syscall();
+ goto out_free;
+ }
+ cgroup_put(&ss->root->cgrp);
}
- root = sb->s_fs_info;
- BUG_ON(!root);
- if (root == opts.new_root) {
- /* We used the new root structure, so this is a new hierarchy */
- struct cgroup *root_cgrp = &root->top_cgroup;
- struct cgroupfs_root *existing_root;
- int i;
- struct css_set *cset;
-
- BUG_ON(sb->s_root != NULL);
-
- ret = cgroup_get_rootdir(sb);
- if (ret)
- goto drop_new_super;
- inode = sb->s_root->d_inode;
-
- mutex_lock(&inode->i_mutex);
- mutex_lock(&cgroup_mutex);
- mutex_lock(&cgroup_root_mutex);
-
- ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
- if (ret < 0)
- goto unlock_drop;
- root_cgrp->id = ret;
+ for_each_root(root) {
+ bool name_match = false;
- /* Check for name clashes with existing mounts */
- ret = -EBUSY;
- if (strlen(root->name))
- for_each_active_root(existing_root)
- if (!strcmp(existing_root->name, root->name))
- goto unlock_drop;
+ if (root == &cgrp_dfl_root)
+ continue;
/*
- * We're accessing css_set_count without locking
- * css_set_lock here, but that's OK - it can only be
- * increased by someone holding cgroup_lock, and
- * that's us. The worst that can happen is that we
- * have some link structures left over
+ * If we asked for a name then it must match. Also, if
+ * name matches but sybsys_mask doesn't, we should fail.
+ * Remember whether name matched.
*/
- ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
- if (ret)
- goto unlock_drop;
-
- /* ID 0 is reserved for dummy root, 1 for unified hierarchy */
- ret = cgroup_init_root_id(root, 2, 0);
- if (ret)
- goto unlock_drop;
-
- sb->s_root->d_fsdata = root_cgrp;
- root_cgrp->dentry = sb->s_root;
+ if (opts.name) {
+ if (strcmp(opts.name, root->name))
+ continue;
+ name_match = true;
+ }
/*
- * We're inside get_sb() and will call lookup_one_len() to
- * create the root files, which doesn't work if SELinux is
- * in use. The following cred dancing somehow works around
- * it. See 2ce9738ba ("cgroupfs: use init_cred when
- * populating new cgroupfs mount") for more details.
+ * If we asked for subsystems (or explicitly for no
+ * subsystems) then they must match.
*/
- cred = override_creds(&init_cred);
-
- ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
- if (ret)
- goto rm_base_files;
-
- ret = rebind_subsystems(root, root->subsys_mask, 0);
- if (ret)
- goto rm_base_files;
+ if ((opts.subsys_mask || opts.none) &&
+ (opts.subsys_mask != root->subsys_mask)) {
+ if (!name_match)
+ continue;
+ ret = -EBUSY;
+ goto out_unlock;
+ }
- revert_creds(cred);
+ if (root->flags ^ opts.flags)
+ pr_warn("new mount options do not match the existing superblock, will be ignored\n");
/*
- * There must be no failure case after here, since rebinding
- * takes care of subsystems' refcounts, which are explicitly
- * dropped in the failure exit path.
+ * We want to reuse @root whose lifetime is governed by its
+ * ->cgrp. Let's check whether @root is alive and keep it
+ * that way. As cgroup_kill_sb() can happen anytime, we
+ * want to block it by pinning the sb so that @root doesn't
+ * get killed before mount is complete.
+ *
+ * With the sb pinned, tryget_live can reliably indicate
+ * whether @root can be reused. If it's being killed,
+ * drain it. We can use wait_queue for the wait but this
+ * path is super cold. Let's just sleep a bit and retry.
*/
+ pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
+ if (IS_ERR(pinned_sb) ||
+ !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+ mutex_unlock(&cgroup_mutex);
+ if (!IS_ERR_OR_NULL(pinned_sb))
+ deactivate_super(pinned_sb);
+ msleep(10);
+ ret = restart_syscall();
+ goto out_free;
+ }
- list_add(&root->root_list, &cgroup_roots);
- cgroup_root_count++;
-
- /* Link the top cgroup in this hierarchy into all
- * the css_set objects */
- write_lock(&css_set_lock);
- hash_for_each(css_set_table, i, cset, hlist)
- link_css_set(&tmp_links, cset, root_cgrp);
- write_unlock(&css_set_lock);
-
- free_cgrp_cset_links(&tmp_links);
-
- BUG_ON(!list_empty(&root_cgrp->children));
- BUG_ON(root->number_of_cgroups != 1);
+ ret = 0;
+ goto out_unlock;
+ }
- mutex_unlock(&cgroup_root_mutex);
- mutex_unlock(&cgroup_mutex);
- mutex_unlock(&inode->i_mutex);
- } else {
- /*
- * We re-used an existing hierarchy - the new root (if
- * any) is not needed
- */
- cgroup_free_root(opts.new_root);
+ /*
+ * No such thing, create a new one. name= matching without subsys
+ * specification is allowed for already existing hierarchies but we
+ * can't create new one without subsys specification.
+ */
+ if (!opts.subsys_mask && !opts.none) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
- if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
- if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
- pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
- ret = -EINVAL;
- goto drop_new_super;
- } else {
- pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
- }
- }
+ root = kzalloc(sizeof(*root), GFP_KERNEL);
+ if (!root) {
+ ret = -ENOMEM;
+ goto out_unlock;
}
- kfree(opts.release_agent);
- kfree(opts.name);
- return dget(sb->s_root);
+ init_cgroup_root(root, &opts);
- rm_base_files:
- free_cgrp_cset_links(&tmp_links);
- cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false);
- revert_creds(cred);
- unlock_drop:
- cgroup_exit_root_id(root);
- mutex_unlock(&cgroup_root_mutex);
+ ret = cgroup_setup_root(root, opts.subsys_mask);
+ if (ret)
+ cgroup_free_root(root);
+
+out_unlock:
mutex_unlock(&cgroup_mutex);
- mutex_unlock(&inode->i_mutex);
- drop_new_super:
- deactivate_locked_super(sb);
- out_err:
+out_free:
kfree(opts.release_agent);
kfree(opts.name);
- return ERR_PTR(ret);
-}
-static void cgroup_kill_sb(struct super_block *sb)
-{
- struct cgroupfs_root *root = sb->s_fs_info;
- struct cgroup *cgrp = &root->top_cgroup;
- struct cgrp_cset_link *link, *tmp_link;
- int ret;
-
- BUG_ON(!root);
-
- BUG_ON(root->number_of_cgroups != 1);
- BUG_ON(!list_empty(&cgrp->children));
-
- mutex_lock(&cgrp->dentry->d_inode->i_mutex);
- mutex_lock(&cgroup_mutex);
- mutex_lock(&cgroup_root_mutex);
+ if (ret)
+ return ERR_PTR(ret);
- /* Rebind all subsystems back to the default hierarchy */
- if (root->flags & CGRP_ROOT_SUBSYS_BOUND) {
- ret = rebind_subsystems(root, 0, root->subsys_mask);
- /* Shouldn't be able to fail ... */
- BUG_ON(ret);
- }
+ dentry = kernfs_mount(fs_type, flags, root->kf_root,
+ CGROUP_SUPER_MAGIC, &new_sb);
+ if (IS_ERR(dentry) || !new_sb)
+ cgroup_put(&root->cgrp);
/*
- * Release all the links from cset_links to this hierarchy's
- * root cgroup
+ * If @pinned_sb, we're reusing an existing root and holding an
+ * extra ref on its sb. Mount is complete. Put the extra ref.
*/
- write_lock(&css_set_lock);
-
- list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
- list_del(&link->cset_link);
- list_del(&link->cgrp_link);
- kfree(link);
- }
- write_unlock(&css_set_lock);
-
- if (!list_empty(&root->root_list)) {
- list_del(&root->root_list);
- cgroup_root_count--;
+ if (pinned_sb) {
+ WARN_ON(new_sb);
+ deactivate_super(pinned_sb);
}
- cgroup_exit_root_id(root);
+ return dentry;
+}
- mutex_unlock(&cgroup_root_mutex);
- mutex_unlock(&cgroup_mutex);
- mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
+static void cgroup_kill_sb(struct super_block *sb)
+{
+ struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
- simple_xattrs_free(&cgrp->xattrs);
+ /*
+ * If @root doesn't have any mounts or children, start killing it.
+ * This prevents new mounts by disabling percpu_ref_tryget_live().
+ * cgroup_mount() may wait for @root's release.
+ *
+ * And don't kill the default root.
+ */
+ if (css_has_online_children(&root->cgrp.self) ||
+ root == &cgrp_dfl_root)
+ cgroup_put(&root->cgrp);
+ else
+ percpu_ref_kill(&root->cgrp.self.refcnt);
- kill_litter_super(sb);
- cgroup_free_root(root);
+ kernfs_kill_sb(sb);
}
static struct file_system_type cgroup_fs_type = {
@@ -1743,57 +1904,6 @@ static struct file_system_type cgroup_fs_type = {
static struct kobject *cgroup_kobj;
/**
- * cgroup_path - generate the path of a cgroup
- * @cgrp: the cgroup in question
- * @buf: the buffer to write the path into
- * @buflen: the length of the buffer
- *
- * Writes path of cgroup into buf. Returns 0 on success, -errno on error.
- *
- * We can't generate cgroup path using dentry->d_name, as accessing
- * dentry->name must be protected by irq-unsafe dentry->d_lock or parent
- * inode's i_mutex, while on the other hand cgroup_path() can be called
- * with some irq-safe spinlocks held.
- */
-int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
-{
- int ret = -ENAMETOOLONG;
- char *start;
-
- if (!cgrp->parent) {
- if (strlcpy(buf, "/", buflen) >= buflen)
- return -ENAMETOOLONG;
- return 0;
- }
-
- start = buf + buflen - 1;
- *start = '\0';
-
- rcu_read_lock();
- do {
- const char *name = cgroup_name(cgrp);
- int len;
-
- len = strlen(name);
- if ((start -= len) < buf)
- goto out;
- memcpy(start, name, len);
-
- if (--start < buf)
- goto out;
- *start = '/';
-
- cgrp = cgrp->parent;
- } while (cgrp->parent);
- ret = 0;
- memmove(buf, start, buf + buflen - start);
-out:
- rcu_read_unlock();
- return ret;
-}
-EXPORT_SYMBOL_GPL(cgroup_path);
-
-/**
* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
* @task: target task
* @buf: the buffer to write the path into
@@ -1804,49 +1914,55 @@ EXPORT_SYMBOL_GPL(cgroup_path);
* function grabs cgroup_mutex and shouldn't be used inside locks used by
* cgroup controller callbacks.
*
- * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short.
+ * Return value is the same as kernfs_path().
*/
-int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
+char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
{
- struct cgroupfs_root *root;
+ struct cgroup_root *root;
struct cgroup *cgrp;
- int hierarchy_id = 1, ret = 0;
-
- if (buflen < 2)
- return -ENAMETOOLONG;
+ int hierarchy_id = 1;
+ char *path = NULL;
mutex_lock(&cgroup_mutex);
+ down_read(&css_set_rwsem);
root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
if (root) {
cgrp = task_cgroup_from_root(task, root);
- ret = cgroup_path(cgrp, buf, buflen);
+ path = cgroup_path(cgrp, buf, buflen);
} else {
/* if no hierarchy exists, everyone is in "/" */
- memcpy(buf, "/", 2);
+ if (strlcpy(buf, "/", buflen) < buflen)
+ path = buf;
}
+ up_read(&css_set_rwsem);
mutex_unlock(&cgroup_mutex);
- return ret;
+ return path;
}
EXPORT_SYMBOL_GPL(task_cgroup_path);
-/*
- * Control Group taskset
- */
-struct task_and_cgroup {
- struct task_struct *task;
- struct cgroup *cgrp;
- struct css_set *cset;
-};
-
+/* used to track tasks and other necessary states during migration */
struct cgroup_taskset {
- struct task_and_cgroup single;
- struct flex_array *tc_array;
- int tc_array_len;
- int idx;
- struct cgroup *cur_cgrp;
+ /* the src and dst cset list running through cset->mg_node */
+ struct list_head src_csets;
+ struct list_head dst_csets;
+
+ /*
+ * Fields for cgroup_taskset_*() iteration.
+ *
+ * Before migration is committed, the target migration tasks are on
+ * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
+ * the csets on ->dst_csets. ->csets point to either ->src_csets
+ * or ->dst_csets depending on whether migration is committed.
+ *
+ * ->cur_csets and ->cur_task point to the current task position
+ * during iteration.
+ */
+ struct list_head *csets;
+ struct css_set *cur_cset;
+ struct task_struct *cur_task;
};
/**
@@ -1857,15 +1973,11 @@ struct cgroup_taskset {
*/
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
{
- if (tset->tc_array) {
- tset->idx = 0;
- return cgroup_taskset_next(tset);
- } else {
- tset->cur_cgrp = tset->single.cgrp;
- return tset->single.task;
- }
+ tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
+ tset->cur_task = NULL;
+
+ return cgroup_taskset_next(tset);
}
-EXPORT_SYMBOL_GPL(cgroup_taskset_first);
/**
* cgroup_taskset_next - iterate to the next task in taskset
@@ -1876,48 +1988,36 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_first);
*/
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
{
- struct task_and_cgroup *tc;
+ struct css_set *cset = tset->cur_cset;
+ struct task_struct *task = tset->cur_task;
- if (!tset->tc_array || tset->idx >= tset->tc_array_len)
- return NULL;
+ while (&cset->mg_node != tset->csets) {
+ if (!task)
+ task = list_first_entry(&cset->mg_tasks,
+ struct task_struct, cg_list);
+ else
+ task = list_next_entry(task, cg_list);
- tc = flex_array_get(tset->tc_array, tset->idx++);
- tset->cur_cgrp = tc->cgrp;
- return tc->task;
-}
-EXPORT_SYMBOL_GPL(cgroup_taskset_next);
+ if (&task->cg_list != &cset->mg_tasks) {
+ tset->cur_cset = cset;
+ tset->cur_task = task;
+ return task;
+ }
-/**
- * cgroup_taskset_cur_css - return the matching css for the current task
- * @tset: taskset of interest
- * @subsys_id: the ID of the target subsystem
- *
- * Return the css for the current (last returned) task of @tset for
- * subsystem specified by @subsys_id. This function must be preceded by
- * either cgroup_taskset_first() or cgroup_taskset_next().
- */
-struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
- int subsys_id)
-{
- return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]);
-}
-EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css);
+ cset = list_next_entry(cset, mg_node);
+ task = NULL;
+ }
-/**
- * cgroup_taskset_size - return the number of tasks in taskset
- * @tset: taskset of interest
- */
-int cgroup_taskset_size(struct cgroup_taskset *tset)
-{
- return tset->tc_array ? tset->tc_array_len : 1;
+ return NULL;
}
-EXPORT_SYMBOL_GPL(cgroup_taskset_size);
-
-/*
+/**
* cgroup_task_migrate - move a task from one cgroup to another.
+ * @old_cgrp: the cgroup @tsk is being migrated from
+ * @tsk: the task being migrated
+ * @new_cset: the new css_set @tsk is being attached to
*
- * Must be called with cgroup_mutex and threadgroup locked.
+ * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
*/
static void cgroup_task_migrate(struct cgroup *old_cgrp,
struct task_struct *tsk,
@@ -1925,6 +2025,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
{
struct css_set *old_cset;
+ lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&css_set_rwsem);
+
/*
* We are synchronized through threadgroup_lock() against PF_EXITING
* setting such that we can't race against cgroup_exit() changing the
@@ -1933,15 +2036,16 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
WARN_ON_ONCE(tsk->flags & PF_EXITING);
old_cset = task_css_set(tsk);
- task_lock(tsk);
+ get_css_set(new_cset);
rcu_assign_pointer(tsk->cgroups, new_cset);
- task_unlock(tsk);
- /* Update the css_set linked lists if we're using them */
- write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list))
- list_move(&tsk->cg_list, &new_cset->tasks);
- write_unlock(&css_set_lock);
+ /*
+ * Use move_tail so that cgroup_taskset_first() still returns the
+ * leader after migration. This works because cgroup_migrate()
+ * ensures that the dst_cset of the leader is the first on the
+ * tset's dst_csets list.
+ */
+ list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
/*
* We just gained a reference on old_cset by taking it from the
@@ -1949,100 +2053,219 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
* we're safe to drop it here; it will be freed under RCU.
*/
set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
- put_css_set(old_cset);
+ put_css_set_locked(old_cset, false);
}
/**
- * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
- * @cgrp: the cgroup to attach to
- * @tsk: the task or the leader of the threadgroup to be attached
- * @threadgroup: attach the whole threadgroup?
+ * cgroup_migrate_finish - cleanup after attach
+ * @preloaded_csets: list of preloaded css_sets
*
- * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
- * task_lock of @tsk or each thread in the threadgroup individually in turn.
+ * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See
+ * those functions for details.
*/
-static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
- bool threadgroup)
+static void cgroup_migrate_finish(struct list_head *preloaded_csets)
{
- int retval, i, group_size;
- struct cgroupfs_root *root = cgrp->root;
- struct cgroup_subsys_state *css, *failed_css = NULL;
- /* threadgroup list cursor and array */
- struct task_struct *leader = tsk;
- struct task_and_cgroup *tc;
- struct flex_array *group;
- struct cgroup_taskset tset = { };
+ struct css_set *cset, *tmp_cset;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ down_write(&css_set_rwsem);
+ list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
+ cset->mg_src_cgrp = NULL;
+ cset->mg_dst_cset = NULL;
+ list_del_init(&cset->mg_preload_node);
+ put_css_set_locked(cset, false);
+ }
+ up_write(&css_set_rwsem);
+}
+
+/**
+ * cgroup_migrate_add_src - add a migration source css_set
+ * @src_cset: the source css_set to add
+ * @dst_cgrp: the destination cgroup
+ * @preloaded_csets: list of preloaded css_sets
+ *
+ * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin
+ * @src_cset and add it to @preloaded_csets, which should later be cleaned
+ * up by cgroup_migrate_finish().
+ *
+ * This function may be called without holding threadgroup_lock even if the
+ * target is a process. Threads may be created and destroyed but as long
+ * as cgroup_mutex is not dropped, no new css_set can be put into play and
+ * the preloaded css_sets are guaranteed to cover all migrations.
+ */
+static void cgroup_migrate_add_src(struct css_set *src_cset,
+ struct cgroup *dst_cgrp,
+ struct list_head *preloaded_csets)
+{
+ struct cgroup *src_cgrp;
+
+ lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&css_set_rwsem);
+
+ src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
+
+ if (!list_empty(&src_cset->mg_preload_node))
+ return;
+
+ WARN_ON(src_cset->mg_src_cgrp);
+ WARN_ON(!list_empty(&src_cset->mg_tasks));
+ WARN_ON(!list_empty(&src_cset->mg_node));
+
+ src_cset->mg_src_cgrp = src_cgrp;
+ get_css_set(src_cset);
+ list_add(&src_cset->mg_preload_node, preloaded_csets);
+}
+
+/**
+ * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
+ * @dst_cgrp: the destination cgroup (may be %NULL)
+ * @preloaded_csets: list of preloaded source css_sets
+ *
+ * Tasks are about to be moved to @dst_cgrp and all the source css_sets
+ * have been preloaded to @preloaded_csets. This function looks up and
+ * pins all destination css_sets, links each to its source, and append them
+ * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each
+ * source css_set is assumed to be its cgroup on the default hierarchy.
+ *
+ * This function must be called after cgroup_migrate_add_src() has been
+ * called on each migration source css_set. After migration is performed
+ * using cgroup_migrate(), cgroup_migrate_finish() must be called on
+ * @preloaded_csets.
+ */
+static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
+ struct list_head *preloaded_csets)
+{
+ LIST_HEAD(csets);
+ struct css_set *src_cset, *tmp_cset;
+
+ lockdep_assert_held(&cgroup_mutex);
/*
- * step 0: in order to do expensive, possibly blocking operations for
- * every thread, we cannot iterate the thread group list, since it needs
- * rcu or tasklist locked. instead, build an array of all threads in the
- * group - group_rwsem prevents new threads from appearing, and if
- * threads exit, this will just be an over-estimate.
+ * Except for the root, child_subsys_mask must be zero for a cgroup
+ * with tasks so that child cgroups don't compete against tasks.
*/
- if (threadgroup)
- group_size = get_nr_threads(tsk);
- else
- group_size = 1;
- /* flex_array supports very large thread-groups better than kmalloc. */
- group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
- if (!group)
- return -ENOMEM;
- /* pre-allocate to guarantee space while iterating in rcu read-side. */
- retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
- if (retval)
- goto out_free_group_list;
+ if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
+ dst_cgrp->child_subsys_mask)
+ return -EBUSY;
+
+ /* look up the dst cset for each src cset and link it to src */
+ list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
+ struct css_set *dst_cset;
+
+ dst_cset = find_css_set(src_cset,
+ dst_cgrp ?: src_cset->dfl_cgrp);
+ if (!dst_cset)
+ goto err;
+
+ WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
+
+ /*
+ * If src cset equals dst, it's noop. Drop the src.
+ * cgroup_migrate() will skip the cset too. Note that we
+ * can't handle src == dst as some nodes are used by both.
+ */
+ if (src_cset == dst_cset) {
+ src_cset->mg_src_cgrp = NULL;
+ list_del_init(&src_cset->mg_preload_node);
+ put_css_set(src_cset, false);
+ put_css_set(dst_cset, false);
+ continue;
+ }
+
+ src_cset->mg_dst_cset = dst_cset;
+
+ if (list_empty(&dst_cset->mg_preload_node))
+ list_add(&dst_cset->mg_preload_node, &csets);
+ else
+ put_css_set(dst_cset, false);
+ }
+
+ list_splice_tail(&csets, preloaded_csets);
+ return 0;
+err:
+ cgroup_migrate_finish(&csets);
+ return -ENOMEM;
+}
+
+/**
+ * cgroup_migrate - migrate a process or task to a cgroup
+ * @cgrp: the destination cgroup
+ * @leader: the leader of the process or the task to migrate
+ * @threadgroup: whether @leader points to the whole process or a single task
+ *
+ * Migrate a process or task denoted by @leader to @cgrp. If migrating a
+ * process, the caller must be holding threadgroup_lock of @leader. The
+ * caller is also responsible for invoking cgroup_migrate_add_src() and
+ * cgroup_migrate_prepare_dst() on the targets before invoking this
+ * function and following up with cgroup_migrate_finish().
+ *
+ * As long as a controller's ->can_attach() doesn't fail, this function is
+ * guaranteed to succeed. This means that, excluding ->can_attach()
+ * failure, when migrating multiple targets, the success or failure can be
+ * decided for all targets by invoking group_migrate_prepare_dst() before
+ * actually starting migrating.
+ */
+static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
+ bool threadgroup)
+{
+ struct cgroup_taskset tset = {
+ .src_csets = LIST_HEAD_INIT(tset.src_csets),
+ .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
+ .csets = &tset.src_csets,
+ };
+ struct cgroup_subsys_state *css, *failed_css = NULL;
+ struct css_set *cset, *tmp_cset;
+ struct task_struct *task, *tmp_task;
+ int i, ret;
- i = 0;
/*
* Prevent freeing of tasks while we take a snapshot. Tasks that are
* already PF_EXITING could be freed from underneath us unless we
* take an rcu_read_lock.
*/
+ down_write(&css_set_rwsem);
rcu_read_lock();
+ task = leader;
do {
- struct task_and_cgroup ent;
+ /* @task either already exited or can't exit until the end */
+ if (task->flags & PF_EXITING)
+ goto next;
- /* @tsk either already exited or can't exit until the end */
- if (tsk->flags & PF_EXITING)
+ /* leave @task alone if post_fork() hasn't linked it yet */
+ if (list_empty(&task->cg_list))
goto next;
- /* as per above, nr_threads may decrease, but not increase. */
- BUG_ON(i >= group_size);
- ent.task = tsk;
- ent.cgrp = task_cgroup_from_root(tsk, root);
- /* nothing to do if this task is already in the cgroup */
- if (ent.cgrp == cgrp)
+ cset = task_css_set(task);
+ if (!cset->mg_src_cgrp)
goto next;
+
/*
- * saying GFP_ATOMIC has no effect here because we did prealloc
- * earlier, but it's good form to communicate our expectations.
+ * cgroup_taskset_first() must always return the leader.
+ * Take care to avoid disturbing the ordering.
*/
- retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
- BUG_ON(retval != 0);
- i++;
+ list_move_tail(&task->cg_list, &cset->mg_tasks);
+ if (list_empty(&cset->mg_node))
+ list_add_tail(&cset->mg_node, &tset.src_csets);
+ if (list_empty(&cset->mg_dst_cset->mg_node))
+ list_move_tail(&cset->mg_dst_cset->mg_node,
+ &tset.dst_csets);
next:
if (!threadgroup)
break;
- } while_each_thread(leader, tsk);
+ } while_each_thread(leader, task);
rcu_read_unlock();
- /* remember the number of threads in the array for later. */
- group_size = i;
- tset.tc_array = group;
- tset.tc_array_len = group_size;
+ up_write(&css_set_rwsem);
/* methods shouldn't be called if no task is actually migrating */
- retval = 0;
- if (!group_size)
- goto out_free_group_list;
+ if (list_empty(&tset.src_csets))
+ return 0;
- /*
- * step 1: check that we can legitimately attach to the cgroup.
- */
- for_each_css(css, i, cgrp) {
+ /* check that we can legitimately attach to the cgroup */
+ for_each_e_css(css, i, cgrp) {
if (css->ss->can_attach) {
- retval = css->ss->can_attach(css, &tset);
- if (retval) {
+ ret = css->ss->can_attach(css, &tset);
+ if (ret) {
failed_css = css;
goto out_cancel_attach;
}
@@ -2050,78 +2273,106 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
}
/*
- * step 2: make sure css_sets exist for all threads to be migrated.
- * we use find_css_set, which allocates a new one if necessary.
+ * Now that we're guaranteed success, proceed to move all tasks to
+ * the new cgroup. There are no failure cases after here, so this
+ * is the commit point.
*/
- for (i = 0; i < group_size; i++) {
- struct css_set *old_cset;
-
- tc = flex_array_get(group, i);
- old_cset = task_css_set(tc->task);
- tc->cset = find_css_set(old_cset, cgrp);
- if (!tc->cset) {
- retval = -ENOMEM;
- goto out_put_css_set_refs;
- }
+ down_write(&css_set_rwsem);
+ list_for_each_entry(cset, &tset.src_csets, mg_node) {
+ list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
+ cgroup_task_migrate(cset->mg_src_cgrp, task,
+ cset->mg_dst_cset);
}
+ up_write(&css_set_rwsem);
/*
- * step 3: now that we're guaranteed success wrt the css_sets,
- * proceed to move all tasks to the new cgroup. There are no
- * failure cases after here, so this is the commit point.
+ * Migration is committed, all target tasks are now on dst_csets.
+ * Nothing is sensitive to fork() after this point. Notify
+ * controllers that migration is complete.
*/
- for (i = 0; i < group_size; i++) {
- tc = flex_array_get(group, i);
- cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
- }
- /* nothing is sensitive to fork() after this point. */
+ tset.csets = &tset.dst_csets;
- /*
- * step 4: do subsystem attach callbacks.
- */
- for_each_css(css, i, cgrp)
+ for_each_e_css(css, i, cgrp)
if (css->ss->attach)
css->ss->attach(css, &tset);
- /*
- * step 5: success! and cleanup
- */
- retval = 0;
-out_put_css_set_refs:
- if (retval) {
- for (i = 0; i < group_size; i++) {
- tc = flex_array_get(group, i);
- if (!tc->cset)
- break;
- put_css_set(tc->cset);
- }
- }
+ ret = 0;
+ goto out_release_tset;
+
out_cancel_attach:
- if (retval) {
- for_each_css(css, i, cgrp) {
- if (css == failed_css)
- break;
- if (css->ss->cancel_attach)
- css->ss->cancel_attach(css, &tset);
- }
+ for_each_e_css(css, i, cgrp) {
+ if (css == failed_css)
+ break;
+ if (css->ss->cancel_attach)
+ css->ss->cancel_attach(css, &tset);
}
-out_free_group_list:
- flex_array_free(group);
- return retval;
+out_release_tset:
+ down_write(&css_set_rwsem);
+ list_splice_init(&tset.dst_csets, &tset.src_csets);
+ list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
+ list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
+ list_del_init(&cset->mg_node);
+ }
+ up_write(&css_set_rwsem);
+ return ret;
+}
+
+/**
+ * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
+ * @dst_cgrp: the cgroup to attach to
+ * @leader: the task or the leader of the threadgroup to be attached
+ * @threadgroup: attach the whole threadgroup?
+ *
+ * Call holding cgroup_mutex and threadgroup_lock of @leader.
+ */
+static int cgroup_attach_task(struct cgroup *dst_cgrp,
+ struct task_struct *leader, bool threadgroup)
+{
+ LIST_HEAD(preloaded_csets);
+ struct task_struct *task;
+ int ret;
+
+ /* look up all src csets */
+ down_read(&css_set_rwsem);
+ rcu_read_lock();
+ task = leader;
+ do {
+ cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
+ &preloaded_csets);
+ if (!threadgroup)
+ break;
+ } while_each_thread(leader, task);
+ rcu_read_unlock();
+ up_read(&css_set_rwsem);
+
+ /* prepare dst csets and commit */
+ ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
+ if (!ret)
+ ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
+
+ cgroup_migrate_finish(&preloaded_csets);
+ return ret;
}
/*
* Find the task_struct of the task to attach by vpid and pass it along to the
* function to attach either it or all tasks in its threadgroup. Will lock
- * cgroup_mutex and threadgroup; may take task_lock of task.
+ * cgroup_mutex and threadgroup.
*/
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
+static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off, bool threadgroup)
{
struct task_struct *tsk;
const struct cred *cred = current_cred(), *tcred;
+ struct cgroup *cgrp;
+ pid_t pid;
int ret;
- if (!cgroup_lock_live_group(cgrp))
+ if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
+ return -EINVAL;
+
+ cgrp = cgroup_kn_lock_live(of->kn);
+ if (!cgrp)
return -ENODEV;
retry_find_task:
@@ -2187,8 +2438,8 @@ retry_find_task:
put_task_struct(tsk);
out_unlock_cgroup:
- mutex_unlock(&cgroup_mutex);
- return ret;
+ cgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
}
/**
@@ -2198,12 +2449,19 @@ out_unlock_cgroup:
*/
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
{
- struct cgroupfs_root *root;
+ struct cgroup_root *root;
int retval = 0;
mutex_lock(&cgroup_mutex);
- for_each_active_root(root) {
- struct cgroup *from_cgrp = task_cgroup_from_root(from, root);
+ for_each_root(root) {
+ struct cgroup *from_cgrp;
+
+ if (root == &cgrp_dfl_root)
+ continue;
+
+ down_read(&css_set_rwsem);
+ from_cgrp = task_cgroup_from_root(from, root);
+ up_read(&css_set_rwsem);
retval = cgroup_attach_task(from_cgrp, tsk, false);
if (retval)
@@ -2215,472 +2473,585 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
}
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
-static int cgroup_tasks_write(struct cgroup_subsys_state *css,
- struct cftype *cft, u64 pid)
+static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
- return attach_task_by_pid(css->cgroup, pid, false);
+ return __cgroup_procs_write(of, buf, nbytes, off, false);
}
-static int cgroup_procs_write(struct cgroup_subsys_state *css,
- struct cftype *cft, u64 tgid)
+static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
- return attach_task_by_pid(css->cgroup, tgid, true);
+ return __cgroup_procs_write(of, buf, nbytes, off, true);
}
-static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
- struct cftype *cft, const char *buffer)
+static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
- BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX);
- if (strlen(buffer) >= PATH_MAX)
- return -EINVAL;
- if (!cgroup_lock_live_group(css->cgroup))
+ struct cgroup *cgrp;
+
+ BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+
+ cgrp = cgroup_kn_lock_live(of->kn);
+ if (!cgrp)
return -ENODEV;
- mutex_lock(&cgroup_root_mutex);
- strcpy(css->cgroup->root->release_agent_path, buffer);
- mutex_unlock(&cgroup_root_mutex);
- mutex_unlock(&cgroup_mutex);
- return 0;
+ spin_lock(&release_agent_path_lock);
+ strlcpy(cgrp->root->release_agent_path, strstrip(buf),
+ sizeof(cgrp->root->release_agent_path));
+ spin_unlock(&release_agent_path_lock);
+ cgroup_kn_unlock(of->kn);
+ return nbytes;
}
static int cgroup_release_agent_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ spin_lock(&release_agent_path_lock);
seq_puts(seq, cgrp->root->release_agent_path);
+ spin_unlock(&release_agent_path_lock);
seq_putc(seq, '\n');
- mutex_unlock(&cgroup_mutex);
return 0;
}
static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
{
+ seq_puts(seq, "0\n");
+ return 0;
+}
+
+static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
+{
+ struct cgroup_subsys *ss;
+ bool printed = false;
+ int ssid;
+
+ for_each_subsys(ss, ssid) {
+ if (ss_mask & (1 << ssid)) {
+ if (printed)
+ seq_putc(seq, ' ');
+ seq_printf(seq, "%s", ss->name);
+ printed = true;
+ }
+ }
+ if (printed)
+ seq_putc(seq, '\n');
+}
+
+/* show controllers which are currently attached to the default hierarchy */
+static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
+{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
+ cgroup_print_ss_mask(seq, cgrp->root->subsys_mask &
+ ~cgrp_dfl_root_inhibit_ss_mask);
return 0;
}
-/* A buffer size big enough for numbers or short strings */
-#define CGROUP_LOCAL_BUFFER_SIZE 64
+/* show controllers which are enabled from the parent */
+static int cgroup_controllers_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
-static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
- size_t nbytes, loff_t *ppos)
+ cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control);
+ return 0;
+}
+
+/* show controllers which are enabled for a given cgroup's children */
+static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
{
- struct cfent *cfe = __d_cfe(file->f_dentry);
- struct cftype *cft = __d_cft(file->f_dentry);
- struct cgroup_subsys_state *css = cfe->css;
- size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1;
- char *buf;
- int ret;
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
- if (nbytes >= max_bytes)
- return -E2BIG;
+ cgroup_print_ss_mask(seq, cgrp->subtree_control);
+ return 0;
+}
- buf = kmalloc(nbytes + 1, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
+/**
+ * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
+ * @cgrp: root of the subtree to update csses for
+ *
+ * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
+ * css associations need to be updated accordingly. This function looks up
+ * all css_sets which are attached to the subtree, creates the matching
+ * updated css_sets and migrates the tasks to the new ones.
+ */
+static int cgroup_update_dfl_csses(struct cgroup *cgrp)
+{
+ LIST_HEAD(preloaded_csets);
+ struct cgroup_subsys_state *css;
+ struct css_set *src_cset;
+ int ret;
- if (copy_from_user(buf, userbuf, nbytes)) {
- ret = -EFAULT;
- goto out_free;
- }
+ lockdep_assert_held(&cgroup_mutex);
- buf[nbytes] = '\0';
+ /* look up all csses currently attached to @cgrp's subtree */
+ down_read(&css_set_rwsem);
+ css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
+ struct cgrp_cset_link *link;
- if (cft->write_string) {
- ret = cft->write_string(css, cft, strstrip(buf));
- } else if (cft->write_u64) {
- unsigned long long v;
- ret = kstrtoull(buf, 0, &v);
- if (!ret)
- ret = cft->write_u64(css, cft, v);
- } else if (cft->write_s64) {
- long long v;
- ret = kstrtoll(buf, 0, &v);
- if (!ret)
- ret = cft->write_s64(css, cft, v);
- } else if (cft->trigger) {
- ret = cft->trigger(css, (unsigned int)cft->private);
- } else {
- ret = -EINVAL;
+ /* self is not affected by child_subsys_mask change */
+ if (css->cgroup == cgrp)
+ continue;
+
+ list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
+ cgroup_migrate_add_src(link->cset, cgrp,
+ &preloaded_csets);
}
-out_free:
- kfree(buf);
- return ret ?: nbytes;
-}
+ up_read(&css_set_rwsem);
-/*
- * seqfile ops/methods for returning structured data. Currently just
- * supports string->u64 maps, but can be extended in future.
- */
+ /* NULL dst indicates self on default hierarchy */
+ ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
+ if (ret)
+ goto out_finish;
-static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
-{
- struct cftype *cft = seq_cft(seq);
+ list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
+ struct task_struct *last_task = NULL, *task;
+
+ /* src_csets precede dst_csets, break on the first dst_cset */
+ if (!src_cset->mg_src_cgrp)
+ break;
- if (cft->seq_start) {
- return cft->seq_start(seq, ppos);
- } else {
/*
- * The same behavior and code as single_open(). Returns
- * !NULL if pos is at the beginning; otherwise, NULL.
+ * All tasks in src_cset need to be migrated to the
+ * matching dst_cset. Empty it process by process. We
+ * walk tasks but migrate processes. The leader might even
+ * belong to a different cset but such src_cset would also
+ * be among the target src_csets because the default
+ * hierarchy enforces per-process membership.
*/
- return NULL + !*ppos;
+ while (true) {
+ down_read(&css_set_rwsem);
+ task = list_first_entry_or_null(&src_cset->tasks,
+ struct task_struct, cg_list);
+ if (task) {
+ task = task->group_leader;
+ WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
+ get_task_struct(task);
+ }
+ up_read(&css_set_rwsem);
+
+ if (!task)
+ break;
+
+ /* guard against possible infinite loop */
+ if (WARN(last_task == task,
+ "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
+ goto out_finish;
+ last_task = task;
+
+ threadgroup_lock(task);
+ /* raced against de_thread() from another thread? */
+ if (!thread_group_leader(task)) {
+ threadgroup_unlock(task);
+ put_task_struct(task);
+ continue;
+ }
+
+ ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
+
+ threadgroup_unlock(task);
+ put_task_struct(task);
+
+ if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
+ goto out_finish;
+ }
}
+
+out_finish:
+ cgroup_migrate_finish(&preloaded_csets);
+ return ret;
}
-static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
+/* change the enabled child controllers for a cgroup in the default hierarchy */
+static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
{
- struct cftype *cft = seq_cft(seq);
+ unsigned int enable = 0, disable = 0;
+ unsigned int css_enable, css_disable, old_ctrl, new_ctrl;
+ struct cgroup *cgrp, *child;
+ struct cgroup_subsys *ss;
+ char *tok;
+ int ssid, ret;
- if (cft->seq_next) {
- return cft->seq_next(seq, v, ppos);
- } else {
- /*
- * The same behavior and code as single_open(), always
- * terminate after the initial read.
- */
- ++*ppos;
- return NULL;
+ /*
+ * Parse input - space separated list of subsystem names prefixed
+ * with either + or -.
+ */
+ buf = strstrip(buf);
+ while ((tok = strsep(&buf, " "))) {
+ if (tok[0] == '\0')
+ continue;
+ for_each_subsys(ss, ssid) {
+ if (ss->disabled || strcmp(tok + 1, ss->name) ||
+ ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask))
+ continue;
+
+ if (*tok == '+') {
+ enable |= 1 << ssid;
+ disable &= ~(1 << ssid);
+ } else if (*tok == '-') {
+ disable |= 1 << ssid;
+ enable &= ~(1 << ssid);
+ } else {
+ return -EINVAL;
+ }
+ break;
+ }
+ if (ssid == CGROUP_SUBSYS_COUNT)
+ return -EINVAL;
}
-}
-static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
-{
- struct cftype *cft = seq_cft(seq);
+ cgrp = cgroup_kn_lock_live(of->kn);
+ if (!cgrp)
+ return -ENODEV;
- if (cft->seq_stop)
- cft->seq_stop(seq, v);
-}
+ for_each_subsys(ss, ssid) {
+ if (enable & (1 << ssid)) {
+ if (cgrp->subtree_control & (1 << ssid)) {
+ enable &= ~(1 << ssid);
+ continue;
+ }
-static int cgroup_seqfile_show(struct seq_file *m, void *arg)
-{
- struct cftype *cft = seq_cft(m);
- struct cgroup_subsys_state *css = seq_css(m);
+ /* unavailable or not enabled on the parent? */
+ if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
+ (cgroup_parent(cgrp) &&
+ !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
- if (cft->seq_show)
- return cft->seq_show(m, arg);
+ /*
+ * @ss is already enabled through dependency and
+ * we'll just make it visible. Skip draining.
+ */
+ if (cgrp->child_subsys_mask & (1 << ssid))
+ continue;
- if (cft->read_u64)
- seq_printf(m, "%llu\n", cft->read_u64(css, cft));
- else if (cft->read_s64)
- seq_printf(m, "%lld\n", cft->read_s64(css, cft));
- else
- return -EINVAL;
- return 0;
-}
+ /*
+ * Because css offlining is asynchronous, userland
+ * might try to re-enable the same controller while
+ * the previous instance is still around. In such
+ * cases, wait till it's gone using offline_waitq.
+ */
+ cgroup_for_each_live_child(child, cgrp) {
+ DEFINE_WAIT(wait);
-static struct seq_operations cgroup_seq_operations = {
- .start = cgroup_seqfile_start,
- .next = cgroup_seqfile_next,
- .stop = cgroup_seqfile_stop,
- .show = cgroup_seqfile_show,
-};
+ if (!cgroup_css(child, ss))
+ continue;
-static int cgroup_file_open(struct inode *inode, struct file *file)
-{
- struct cfent *cfe = __d_cfe(file->f_dentry);
- struct cftype *cft = __d_cft(file->f_dentry);
- struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
- struct cgroup_subsys_state *css;
- struct cgroup_open_file *of;
- int err;
+ cgroup_get(child);
+ prepare_to_wait(&child->offline_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ cgroup_kn_unlock(of->kn);
+ schedule();
+ finish_wait(&child->offline_waitq, &wait);
+ cgroup_put(child);
- err = generic_file_open(inode, file);
- if (err)
- return err;
+ return restart_syscall();
+ }
+ } else if (disable & (1 << ssid)) {
+ if (!(cgrp->subtree_control & (1 << ssid))) {
+ disable &= ~(1 << ssid);
+ continue;
+ }
- /*
- * If the file belongs to a subsystem, pin the css. Will be
- * unpinned either on open failure or release. This ensures that
- * @css stays alive for all file operations.
- */
- rcu_read_lock();
- css = cgroup_css(cgrp, cft->ss);
- if (cft->ss && !css_tryget(css))
- css = NULL;
- rcu_read_unlock();
+ /* a child has it enabled? */
+ cgroup_for_each_live_child(child, cgrp) {
+ if (child->subtree_control & (1 << ssid)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+ }
+ }
+ }
- if (!css)
- return -ENODEV;
+ if (!enable && !disable) {
+ ret = 0;
+ goto out_unlock;
+ }
/*
- * @cfe->css is used by read/write/close to determine the
- * associated css. @file->private_data would be a better place but
- * that's already used by seqfile. Multiple accessors may use it
- * simultaneously which is okay as the association never changes.
+ * Except for the root, subtree_control must be zero for a cgroup
+ * with tasks so that child cgroups don't compete against tasks.
*/
- WARN_ON_ONCE(cfe->css && cfe->css != css);
- cfe->css = css;
-
- of = __seq_open_private(file, &cgroup_seq_operations,
- sizeof(struct cgroup_open_file));
- if (of) {
- of->cfe = cfe;
- return 0;
+ if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
+ ret = -EBUSY;
+ goto out_unlock;
}
- if (css->ss)
- css_put(css);
- return -ENOMEM;
-}
-
-static int cgroup_file_release(struct inode *inode, struct file *file)
-{
- struct cfent *cfe = __d_cfe(file->f_dentry);
- struct cgroup_subsys_state *css = cfe->css;
+ /*
+ * Update subsys masks and calculate what needs to be done. More
+ * subsystems than specified may need to be enabled or disabled
+ * depending on subsystem dependencies.
+ */
+ cgrp->subtree_control |= enable;
+ cgrp->subtree_control &= ~disable;
- if (css->ss)
- css_put(css);
- return seq_release_private(inode, file);
-}
+ old_ctrl = cgrp->child_subsys_mask;
+ cgroup_refresh_child_subsys_mask(cgrp);
+ new_ctrl = cgrp->child_subsys_mask;
-/*
- * cgroup_rename - Only allow simple rename of directories in place.
- */
-static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
-{
- int ret;
- struct cgroup_name *name, *old_name;
- struct cgroup *cgrp;
+ css_enable = ~old_ctrl & new_ctrl;
+ css_disable = old_ctrl & ~new_ctrl;
+ enable |= css_enable;
+ disable |= css_disable;
/*
- * It's convinient to use parent dir's i_mutex to protected
- * cgrp->name.
+ * Create new csses or make the existing ones visible. A css is
+ * created invisible if it's being implicitly enabled through
+ * dependency. An invisible css is made visible when the userland
+ * explicitly enables it.
*/
- lockdep_assert_held(&old_dir->i_mutex);
+ for_each_subsys(ss, ssid) {
+ if (!(enable & (1 << ssid)))
+ continue;
- if (!S_ISDIR(old_dentry->d_inode->i_mode))
- return -ENOTDIR;
- if (new_dentry->d_inode)
- return -EEXIST;
- if (old_dir != new_dir)
- return -EIO;
+ cgroup_for_each_live_child(child, cgrp) {
+ if (css_enable & (1 << ssid))
+ ret = create_css(child, ss,
+ cgrp->subtree_control & (1 << ssid));
+ else
+ ret = cgroup_populate_dir(child, 1 << ssid);
+ if (ret)
+ goto err_undo_css;
+ }
+ }
- cgrp = __d_cgrp(old_dentry);
+ /*
+ * At this point, cgroup_e_css() results reflect the new csses
+ * making the following cgroup_update_dfl_csses() properly update
+ * css associations of all tasks in the subtree.
+ */
+ ret = cgroup_update_dfl_csses(cgrp);
+ if (ret)
+ goto err_undo_css;
/*
- * This isn't a proper migration and its usefulness is very
- * limited. Disallow if sane_behavior.
+ * All tasks are migrated out of disabled csses. Kill or hide
+ * them. A css is hidden when the userland requests it to be
+ * disabled while other subsystems are still depending on it. The
+ * css must not actively control resources and be in the vanilla
+ * state if it's made visible again later. Controllers which may
+ * be depended upon should provide ->css_reset() for this purpose.
*/
- if (cgroup_sane_behavior(cgrp))
- return -EPERM;
+ for_each_subsys(ss, ssid) {
+ if (!(disable & (1 << ssid)))
+ continue;
- name = cgroup_alloc_name(new_dentry);
- if (!name)
- return -ENOMEM;
+ cgroup_for_each_live_child(child, cgrp) {
+ struct cgroup_subsys_state *css = cgroup_css(child, ss);
- ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry);
- if (ret) {
- kfree(name);
- return ret;
+ if (css_disable & (1 << ssid)) {
+ kill_css(css);
+ } else {
+ cgroup_clear_dir(child, 1 << ssid);
+ if (ss->css_reset)
+ ss->css_reset(css);
+ }
+ }
}
- old_name = rcu_dereference_protected(cgrp->name, true);
- rcu_assign_pointer(cgrp->name, name);
+ kernfs_activate(cgrp->kn);
+ ret = 0;
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
- kfree_rcu(old_name, rcu_head);
- return 0;
-}
+err_undo_css:
+ cgrp->subtree_control &= ~enable;
+ cgrp->subtree_control |= disable;
+ cgroup_refresh_child_subsys_mask(cgrp);
-static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
-{
- if (S_ISDIR(dentry->d_inode->i_mode))
- return &__d_cgrp(dentry)->xattrs;
- else
- return &__d_cfe(dentry)->xattrs;
+ for_each_subsys(ss, ssid) {
+ if (!(enable & (1 << ssid)))
+ continue;
+
+ cgroup_for_each_live_child(child, cgrp) {
+ struct cgroup_subsys_state *css = cgroup_css(child, ss);
+
+ if (!css)
+ continue;
+
+ if (css_enable & (1 << ssid))
+ kill_css(css);
+ else
+ cgroup_clear_dir(child, 1 << ssid);
+ }
+ }
+ goto out_unlock;
}
-static inline int xattr_enabled(struct dentry *dentry)
+static int cgroup_populated_show(struct seq_file *seq, void *v)
{
- struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
- return root->flags & CGRP_ROOT_XATTR;
+ seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
+ return 0;
}
-static bool is_valid_xattr(const char *name)
+static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
{
- if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
- !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
- return true;
- return false;
+ struct cgroup *cgrp = of->kn->parent->priv;
+ struct cftype *cft = of->kn->priv;
+ struct cgroup_subsys_state *css;
+ int ret;
+
+ if (cft->write)
+ return cft->write(of, buf, nbytes, off);
+
+ /*
+ * kernfs guarantees that a file isn't deleted with operations in
+ * flight, which means that the matching css is and stays alive and
+ * doesn't need to be pinned. The RCU locking is not necessary
+ * either. It's just for the convenience of using cgroup_css().
+ */
+ rcu_read_lock();
+ css = cgroup_css(cgrp, cft->ss);
+ rcu_read_unlock();
+
+ if (cft->write_u64) {
+ unsigned long long v;
+ ret = kstrtoull(buf, 0, &v);
+ if (!ret)
+ ret = cft->write_u64(css, cft, v);
+ } else if (cft->write_s64) {
+ long long v;
+ ret = kstrtoll(buf, 0, &v);
+ if (!ret)
+ ret = cft->write_s64(css, cft, v);
+ } else {
+ ret = -EINVAL;
+ }
+
+ return ret ?: nbytes;
}
-static int cgroup_setxattr(struct dentry *dentry, const char *name,
- const void *val, size_t size, int flags)
+static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
{
- if (!xattr_enabled(dentry))
- return -EOPNOTSUPP;
- if (!is_valid_xattr(name))
- return -EINVAL;
- return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags);
+ return seq_cft(seq)->seq_start(seq, ppos);
}
-static int cgroup_removexattr(struct dentry *dentry, const char *name)
+static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
{
- if (!xattr_enabled(dentry))
- return -EOPNOTSUPP;
- if (!is_valid_xattr(name))
- return -EINVAL;
- return simple_xattr_remove(__d_xattrs(dentry), name);
+ return seq_cft(seq)->seq_next(seq, v, ppos);
}
-static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name,
- void *buf, size_t size)
+static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
{
- if (!xattr_enabled(dentry))
- return -EOPNOTSUPP;
- if (!is_valid_xattr(name))
- return -EINVAL;
- return simple_xattr_get(__d_xattrs(dentry), name, buf, size);
+ seq_cft(seq)->seq_stop(seq, v);
}
-static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
+static int cgroup_seqfile_show(struct seq_file *m, void *arg)
{
- if (!xattr_enabled(dentry))
- return -EOPNOTSUPP;
- return simple_xattr_list(__d_xattrs(dentry), buf, size);
-}
+ struct cftype *cft = seq_cft(m);
+ struct cgroup_subsys_state *css = seq_css(m);
-static const struct file_operations cgroup_file_operations = {
- .read = seq_read,
- .write = cgroup_file_write,
- .llseek = generic_file_llseek,
- .open = cgroup_file_open,
- .release = cgroup_file_release,
-};
+ if (cft->seq_show)
+ return cft->seq_show(m, arg);
-static const struct inode_operations cgroup_file_inode_operations = {
- .setxattr = cgroup_setxattr,
- .getxattr = cgroup_getxattr,
- .listxattr = cgroup_listxattr,
- .removexattr = cgroup_removexattr,
+ if (cft->read_u64)
+ seq_printf(m, "%llu\n", cft->read_u64(css, cft));
+ else if (cft->read_s64)
+ seq_printf(m, "%lld\n", cft->read_s64(css, cft));
+ else
+ return -EINVAL;
+ return 0;
+}
+
+static struct kernfs_ops cgroup_kf_single_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .write = cgroup_file_write,
+ .seq_show = cgroup_seqfile_show,
};
-static const struct inode_operations cgroup_dir_inode_operations = {
- .lookup = simple_lookup,
- .mkdir = cgroup_mkdir,
- .rmdir = cgroup_rmdir,
- .rename = cgroup_rename,
- .setxattr = cgroup_setxattr,
- .getxattr = cgroup_getxattr,
- .listxattr = cgroup_listxattr,
- .removexattr = cgroup_removexattr,
+static struct kernfs_ops cgroup_kf_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .write = cgroup_file_write,
+ .seq_start = cgroup_seqfile_start,
+ .seq_next = cgroup_seqfile_next,
+ .seq_stop = cgroup_seqfile_stop,
+ .seq_show = cgroup_seqfile_show,
};
-static int cgroup_create_file(struct dentry *dentry, umode_t mode,
- struct super_block *sb)
+/*
+ * cgroup_rename - Only allow simple rename of directories in place.
+ */
+static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
+ const char *new_name_str)
{
- struct inode *inode;
+ struct cgroup *cgrp = kn->priv;
+ int ret;
- if (!dentry)
- return -ENOENT;
- if (dentry->d_inode)
- return -EEXIST;
+ if (kernfs_type(kn) != KERNFS_DIR)
+ return -ENOTDIR;
+ if (kn->parent != new_parent)
+ return -EIO;
- inode = cgroup_new_inode(mode, sb);
- if (!inode)
- return -ENOMEM;
+ /*
+ * This isn't a proper migration and its usefulness is very
+ * limited. Disallow on the default hierarchy.
+ */
+ if (cgroup_on_dfl(cgrp))
+ return -EPERM;
- if (S_ISDIR(mode)) {
- inode->i_op = &cgroup_dir_inode_operations;
- inode->i_fop = &simple_dir_operations;
+ /*
+ * We're gonna grab cgroup_mutex which nests outside kernfs
+ * active_ref. kernfs_rename() doesn't require active_ref
+ * protection. Break them before grabbing cgroup_mutex.
+ */
+ kernfs_break_active_protection(new_parent);
+ kernfs_break_active_protection(kn);
- /* start off with i_nlink == 2 (for "." entry) */
- inc_nlink(inode);
- inc_nlink(dentry->d_parent->d_inode);
+ mutex_lock(&cgroup_mutex);
- /*
- * Control reaches here with cgroup_mutex held.
- * @inode->i_mutex should nest outside cgroup_mutex but we
- * want to populate it immediately without releasing
- * cgroup_mutex. As @inode isn't visible to anyone else
- * yet, trylock will always succeed without affecting
- * lockdep checks.
- */
- WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
- } else if (S_ISREG(mode)) {
- inode->i_size = 0;
- inode->i_fop = &cgroup_file_operations;
- inode->i_op = &cgroup_file_inode_operations;
- }
- d_instantiate(dentry, inode);
- dget(dentry); /* Extra count - pin the dentry in core */
- return 0;
-}
+ ret = kernfs_rename(kn, new_parent, new_name_str);
-/**
- * cgroup_file_mode - deduce file mode of a control file
- * @cft: the control file in question
- *
- * returns cft->mode if ->mode is not 0
- * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
- * returns S_IRUGO if it has only a read handler
- * returns S_IWUSR if it has only a write hander
- */
-static umode_t cgroup_file_mode(const struct cftype *cft)
-{
- umode_t mode = 0;
+ mutex_unlock(&cgroup_mutex);
- if (cft->mode)
- return cft->mode;
+ kernfs_unbreak_active_protection(kn);
+ kernfs_unbreak_active_protection(new_parent);
+ return ret;
+}
- if (cft->read_u64 || cft->read_s64 || cft->seq_show)
- mode |= S_IRUGO;
+/* set uid and gid of cgroup dirs and files to that of the creator */
+static int cgroup_kn_set_ugid(struct kernfs_node *kn)
+{
+ struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
+ .ia_uid = current_fsuid(),
+ .ia_gid = current_fsgid(), };
- if (cft->write_u64 || cft->write_s64 || cft->write_string ||
- cft->trigger)
- mode |= S_IWUSR;
+ if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
+ gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
+ return 0;
- return mode;
+ return kernfs_setattr(kn, &iattr);
}
static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
{
- struct dentry *dir = cgrp->dentry;
- struct cgroup *parent = __d_cgrp(dir);
- struct dentry *dentry;
- struct cfent *cfe;
- int error;
- umode_t mode;
- char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
-
- if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
- !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
- strcpy(name, cft->ss->name);
- strcat(name, ".");
- }
- strcat(name, cft->name);
-
- BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
+ char name[CGROUP_FILE_NAME_MAX];
+ struct kernfs_node *kn;
+ struct lock_class_key *key = NULL;
+ int ret;
- cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
- if (!cfe)
- return -ENOMEM;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ key = &cft->lockdep_key;
+#endif
+ kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
+ cgroup_file_mode(cft), 0, cft->kf_ops, cft,
+ NULL, false, key);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
- dentry = lookup_one_len(name, dir, strlen(name));
- if (IS_ERR(dentry)) {
- error = PTR_ERR(dentry);
- goto out;
+ ret = cgroup_kn_set_ugid(kn);
+ if (ret) {
+ kernfs_remove(kn);
+ return ret;
}
- cfe->type = (void *)cft;
- cfe->dentry = dentry;
- dentry->d_fsdata = cfe;
- simple_xattrs_init(&cfe->xattrs);
-
- mode = cgroup_file_mode(cft);
- error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
- if (!error) {
- list_add_tail(&cfe->node, &parent->files);
- cfe = NULL;
- }
- dput(dentry);
-out:
- kfree(cfe);
- return error;
+ if (cft->seq_show == cgroup_populated_show)
+ cgrp->populated_kn = kn;
+ return 0;
}
/**
@@ -2700,23 +3071,24 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
struct cftype *cft;
int ret;
- lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
lockdep_assert_held(&cgroup_mutex);
for (cft = cfts; cft->name[0] != '\0'; cft++) {
/* does cft->flags tell us to skip this file on @cgrp? */
- if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
+ if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
continue;
- if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
+ if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
continue;
- if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
+ if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
+ continue;
+ if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
continue;
if (is_add) {
ret = cgroup_add_file(cgrp, cft);
if (ret) {
- pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
- cft->name, ret);
+ pr_warn("%s: failed to add %s, err=%d\n",
+ __func__, cft->name, ret);
return ret;
}
} else {
@@ -2726,44 +3098,15 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
return 0;
}
-static void cgroup_cfts_prepare(void)
- __acquires(&cgroup_mutex)
-{
- /*
- * Thanks to the entanglement with vfs inode locking, we can't walk
- * the existing cgroups under cgroup_mutex and create files.
- * Instead, we use css_for_each_descendant_pre() and drop RCU read
- * lock before calling cgroup_addrm_files().
- */
- mutex_lock(&cgroup_mutex);
-}
-
-static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
- __releases(&cgroup_mutex)
+static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
{
LIST_HEAD(pending);
struct cgroup_subsys *ss = cfts[0].ss;
- struct cgroup *root = &ss->root->top_cgroup;
- struct super_block *sb = ss->root->sb;
- struct dentry *prev = NULL;
- struct inode *inode;
+ struct cgroup *root = &ss->root->cgrp;
struct cgroup_subsys_state *css;
- u64 update_before;
int ret = 0;
- /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
- if (!cfts || ss->root == &cgroup_dummy_root ||
- !atomic_inc_not_zero(&sb->s_active)) {
- mutex_unlock(&cgroup_mutex);
- return 0;
- }
-
- /*
- * All cgroups which are created after we drop cgroup_mutex will
- * have the updated set of files, so we only need to update the
- * cgroups created before the current @cgroup_serial_nr_next.
- */
- update_before = cgroup_serial_nr_next;
+ lockdep_assert_held(&cgroup_mutex);
/* add/rm files for all cgroups created before */
css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
@@ -2772,23 +3115,97 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
if (cgroup_is_dead(cgrp))
continue;
- inode = cgrp->dentry->d_inode;
- dget(cgrp->dentry);
- dput(prev);
- prev = cgrp->dentry;
-
- mutex_unlock(&cgroup_mutex);
- mutex_lock(&inode->i_mutex);
- mutex_lock(&cgroup_mutex);
- if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
- ret = cgroup_addrm_files(cgrp, cfts, is_add);
- mutex_unlock(&inode->i_mutex);
+ ret = cgroup_addrm_files(cgrp, cfts, is_add);
if (ret)
break;
}
+
+ if (is_add && !ret)
+ kernfs_activate(root->kn);
+ return ret;
+}
+
+static void cgroup_exit_cftypes(struct cftype *cfts)
+{
+ struct cftype *cft;
+
+ for (cft = cfts; cft->name[0] != '\0'; cft++) {
+ /* free copy for custom atomic_write_len, see init_cftypes() */
+ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
+ kfree(cft->kf_ops);
+ cft->kf_ops = NULL;
+ cft->ss = NULL;
+
+ /* revert flags set by cgroup core while adding @cfts */
+ cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
+ }
+}
+
+static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+{
+ struct cftype *cft;
+
+ for (cft = cfts; cft->name[0] != '\0'; cft++) {
+ struct kernfs_ops *kf_ops;
+
+ WARN_ON(cft->ss || cft->kf_ops);
+
+ if (cft->seq_start)
+ kf_ops = &cgroup_kf_ops;
+ else
+ kf_ops = &cgroup_kf_single_ops;
+
+ /*
+ * Ugh... if @cft wants a custom max_write_len, we need to
+ * make a copy of kf_ops to set its atomic_write_len.
+ */
+ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
+ kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
+ if (!kf_ops) {
+ cgroup_exit_cftypes(cfts);
+ return -ENOMEM;
+ }
+ kf_ops->atomic_write_len = cft->max_write_len;
+ }
+
+ cft->kf_ops = kf_ops;
+ cft->ss = ss;
+ }
+
+ return 0;
+}
+
+static int cgroup_rm_cftypes_locked(struct cftype *cfts)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!cfts || !cfts[0].ss)
+ return -ENOENT;
+
+ list_del(&cfts->node);
+ cgroup_apply_cftypes(cfts, false);
+ cgroup_exit_cftypes(cfts);
+ return 0;
+}
+
+/**
+ * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Unregister @cfts. Files described by @cfts are removed from all
+ * existing cgroups and all future cgroups won't have them either. This
+ * function can be called anytime whether @cfts' subsys is attached or not.
+ *
+ * Returns 0 on successful unregistration, -ENOENT if @cfts is not
+ * registered.
+ */
+int cgroup_rm_cftypes(struct cftype *cfts)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = cgroup_rm_cftypes_locked(cfts);
mutex_unlock(&cgroup_mutex);
- dput(prev);
- deactivate_super(sb);
return ret;
}
@@ -2806,60 +3223,72 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
* function currently returns 0 as long as @cfts registration is successful
* even if some file creation attempts on existing cgroups fail.
*/
-int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
- struct cftype_set *set;
- struct cftype *cft;
int ret;
- set = kzalloc(sizeof(*set), GFP_KERNEL);
- if (!set)
- return -ENOMEM;
+ if (ss->disabled)
+ return 0;
- for (cft = cfts; cft->name[0] != '\0'; cft++)
- cft->ss = ss;
+ if (!cfts || cfts[0].name[0] == '\0')
+ return 0;
+
+ ret = cgroup_init_cftypes(ss, cfts);
+ if (ret)
+ return ret;
+
+ mutex_lock(&cgroup_mutex);
- cgroup_cfts_prepare();
- set->cfts = cfts;
- list_add_tail(&set->node, &ss->cftsets);
- ret = cgroup_cfts_commit(cfts, true);
+ list_add_tail(&cfts->node, &ss->cfts);
+ ret = cgroup_apply_cftypes(cfts, true);
if (ret)
- cgroup_rm_cftypes(cfts);
+ cgroup_rm_cftypes_locked(cfts);
+
+ mutex_unlock(&cgroup_mutex);
return ret;
}
-EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
/**
- * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
+ * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
+ * @ss: target cgroup subsystem
* @cfts: zero-length name terminated array of cftypes
*
- * Unregister @cfts. Files described by @cfts are removed from all
- * existing cgroups and all future cgroups won't have them either. This
- * function can be called anytime whether @cfts' subsys is attached or not.
- *
- * Returns 0 on successful unregistration, -ENOENT if @cfts is not
- * registered.
+ * Similar to cgroup_add_cftypes() but the added files are only used for
+ * the default hierarchy.
*/
-int cgroup_rm_cftypes(struct cftype *cfts)
+int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
- struct cftype_set *set;
+ struct cftype *cft;
- if (!cfts || !cfts[0].ss)
- return -ENOENT;
+ for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+ cft->flags |= __CFTYPE_ONLY_ON_DFL;
+ return cgroup_add_cftypes(ss, cfts);
+}
- cgroup_cfts_prepare();
+/**
+ * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Similar to cgroup_add_cftypes() but the added files are only used for
+ * the legacy hierarchies.
+ */
+int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+{
+ struct cftype *cft;
- list_for_each_entry(set, &cfts[0].ss->cftsets, node) {
- if (set->cfts == cfts) {
- list_del(&set->node);
- kfree(set);
- cgroup_cfts_commit(cfts, false);
- return 0;
- }
+ /*
+ * If legacy_flies_on_dfl, we want to show the legacy files on the
+ * dfl hierarchy but iff the target subsystem hasn't been updated
+ * for the dfl hierarchy yet.
+ */
+ if (!cgroup_legacy_files_on_dfl ||
+ ss->dfl_cftypes != ss->legacy_cftypes) {
+ for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+ cft->flags |= __CFTYPE_NOT_ON_DFL;
}
- cgroup_cfts_commit(NULL, false);
- return -ENOENT;
+ return cgroup_add_cftypes(ss, cfts);
}
/**
@@ -2868,112 +3297,80 @@ int cgroup_rm_cftypes(struct cftype *cfts)
*
* Return the number of tasks in the cgroup.
*/
-int cgroup_task_count(const struct cgroup *cgrp)
+static int cgroup_task_count(const struct cgroup *cgrp)
{
int count = 0;
struct cgrp_cset_link *link;
- read_lock(&css_set_lock);
+ down_read(&css_set_rwsem);
list_for_each_entry(link, &cgrp->cset_links, cset_link)
count += atomic_read(&link->cset->refcount);
- read_unlock(&css_set_lock);
+ up_read(&css_set_rwsem);
return count;
}
-/*
- * To reduce the fork() overhead for systems that are not actually using
- * their cgroups capability, we don't maintain the lists running through
- * each css_set to its tasks until we see the list actually used - in other
- * words after the first call to css_task_iter_start().
- */
-static void cgroup_enable_task_cg_lists(void)
-{
- struct task_struct *p, *g;
- write_lock(&css_set_lock);
- use_task_css_set_links = 1;
- /*
- * We need tasklist_lock because RCU is not safe against
- * while_each_thread(). Besides, a forking task that has passed
- * cgroup_post_fork() without seeing use_task_css_set_links = 1
- * is not guaranteed to have its child immediately visible in the
- * tasklist if we walk through it with RCU.
- */
- read_lock(&tasklist_lock);
- do_each_thread(g, p) {
- task_lock(p);
- /*
- * We should check if the process is exiting, otherwise
- * it will race with cgroup_exit() in that the list
- * entry won't be deleted though the process has exited.
- * Do it while holding siglock so that we don't end up
- * racing against cgroup_exit().
- */
- spin_lock_irq(&p->sighand->siglock);
- if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
- list_add(&p->cg_list, &task_css_set(p)->tasks);
- spin_unlock_irq(&p->sighand->siglock);
-
- task_unlock(p);
- } while_each_thread(g, p);
- read_unlock(&tasklist_lock);
- write_unlock(&css_set_lock);
-}
-
/**
* css_next_child - find the next child of a given css
- * @pos_css: the current position (%NULL to initiate traversal)
- * @parent_css: css whose children to walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @parent: css whose children to walk
*
- * This function returns the next child of @parent_css and should be called
+ * This function returns the next child of @parent and should be called
* under either cgroup_mutex or RCU read lock. The only requirement is
- * that @parent_css and @pos_css are accessible. The next sibling is
- * guaranteed to be returned regardless of their states.
+ * that @parent and @pos are accessible. The next sibling is guaranteed to
+ * be returned regardless of their states.
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal. It's each subsystem's
+ * responsibility to synchronize against on/offlining.
*/
-struct cgroup_subsys_state *
-css_next_child(struct cgroup_subsys_state *pos_css,
- struct cgroup_subsys_state *parent_css)
+struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
+ struct cgroup_subsys_state *parent)
{
- struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;
- struct cgroup *cgrp = parent_css->cgroup;
- struct cgroup *next;
+ struct cgroup_subsys_state *next;
cgroup_assert_mutex_or_rcu_locked();
/*
- * @pos could already have been removed. Once a cgroup is removed,
- * its ->sibling.next is no longer updated when its next sibling
- * changes. As CGRP_DEAD assertion is serialized and happens
- * before the cgroup is taken off the ->sibling list, if we see it
- * unasserted, it's guaranteed that the next sibling hasn't
- * finished its grace period even if it's already removed, and thus
- * safe to dereference from this RCU critical section. If
- * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
- * to be visible as %true here.
+ * @pos could already have been unlinked from the sibling list.
+ * Once a cgroup is removed, its ->sibling.next is no longer
+ * updated when its next sibling changes. CSS_RELEASED is set when
+ * @pos is taken off list, at which time its next pointer is valid,
+ * and, as releases are serialized, the one pointed to by the next
+ * pointer is guaranteed to not have started release yet. This
+ * implies that if we observe !CSS_RELEASED on @pos in this RCU
+ * critical section, the one pointed to by its next pointer is
+ * guaranteed to not have finished its RCU grace period even if we
+ * have dropped rcu_read_lock() inbetween iterations.
*
- * If @pos is dead, its next pointer can't be dereferenced;
- * however, as each cgroup is given a monotonically increasing
- * unique serial number and always appended to the sibling list,
- * the next one can be found by walking the parent's children until
- * we see a cgroup with higher serial number than @pos's. While
- * this path can be slower, it's taken only when either the current
- * cgroup is removed or iteration and removal race.
+ * If @pos has CSS_RELEASED set, its next pointer can't be
+ * dereferenced; however, as each css is given a monotonically
+ * increasing unique serial number and always appended to the
+ * sibling list, the next one can be found by walking the parent's
+ * children until the first css with higher serial number than
+ * @pos's. While this path can be slower, it happens iff iteration
+ * races against release and the race window is very small.
*/
if (!pos) {
- next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);
- } else if (likely(!cgroup_is_dead(pos))) {
- next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
+ next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
+ } else if (likely(!(pos->flags & CSS_RELEASED))) {
+ next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
} else {
- list_for_each_entry_rcu(next, &cgrp->children, sibling)
+ list_for_each_entry_rcu(next, &parent->children, sibling)
if (next->serial_nr > pos->serial_nr)
break;
}
- if (&next->sibling == &cgrp->children)
- return NULL;
-
- return cgroup_css(next, parent_css->ss);
+ /*
+ * @next, if not pointing to the head, can be dereferenced and is
+ * the next sibling.
+ */
+ if (&next->sibling != &parent->children)
+ return next;
+ return NULL;
}
-EXPORT_SYMBOL_GPL(css_next_child);
/**
* css_next_descendant_pre - find the next descendant for pre-order walk
@@ -2988,6 +3385,13 @@ EXPORT_SYMBOL_GPL(css_next_child);
* doesn't require the whole traversal to be contained in a single critical
* section. This function will return the correct next descendant as long
* as both @pos and @root are accessible and @pos is a descendant of @root.
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal. It's each subsystem's
+ * responsibility to synchronize against on/offlining.
*/
struct cgroup_subsys_state *
css_next_descendant_pre(struct cgroup_subsys_state *pos,
@@ -3008,15 +3412,14 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
/* no child, visit my or the closest ancestor's next sibling */
while (pos != root) {
- next = css_next_child(pos, css_parent(pos));
+ next = css_next_child(pos, pos->parent);
if (next)
return next;
- pos = css_parent(pos);
+ pos = pos->parent;
}
return NULL;
}
-EXPORT_SYMBOL_GPL(css_next_descendant_pre);
/**
* css_rightmost_descendant - return the rightmost descendant of a css
@@ -3048,7 +3451,6 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos)
return last;
}
-EXPORT_SYMBOL_GPL(css_rightmost_descendant);
static struct cgroup_subsys_state *
css_leftmost_descendant(struct cgroup_subsys_state *pos)
@@ -3077,6 +3479,13 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos)
* section. This function will return the correct next descendant as long
* as both @pos and @cgroup are accessible and @pos is a descendant of
* @cgroup.
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal. It's each subsystem's
+ * responsibility to synchronize against on/offlining.
*/
struct cgroup_subsys_state *
css_next_descendant_post(struct cgroup_subsys_state *pos,
@@ -3095,14 +3504,37 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
return NULL;
/* if there's an unvisited sibling, visit its leftmost descendant */
- next = css_next_child(pos, css_parent(pos));
+ next = css_next_child(pos, pos->parent);
if (next)
return css_leftmost_descendant(next);
/* no sibling left, visit parent */
- return css_parent(pos);
+ return pos->parent;
+}
+
+/**
+ * css_has_online_children - does a css have online children
+ * @css: the target css
+ *
+ * Returns %true if @css has any online children; otherwise, %false. This
+ * function can be called from any context but the caller is responsible
+ * for synchronizing against on/offlining as necessary.
+ */
+bool css_has_online_children(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys_state *child;
+ bool ret = false;
+
+ rcu_read_lock();
+ css_for_each_child(child, css) {
+ if (child->flags & CSS_ONLINE) {
+ ret = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return ret;
}
-EXPORT_SYMBOL_GPL(css_next_descendant_post);
/**
* css_advance_task_iter - advance a task itererator to the next css_set
@@ -3112,22 +3544,36 @@ EXPORT_SYMBOL_GPL(css_next_descendant_post);
*/
static void css_advance_task_iter(struct css_task_iter *it)
{
- struct list_head *l = it->cset_link;
+ struct list_head *l = it->cset_pos;
struct cgrp_cset_link *link;
struct css_set *cset;
/* Advance to the next non-empty css_set */
do {
l = l->next;
- if (l == &it->origin_css->cgroup->cset_links) {
- it->cset_link = NULL;
+ if (l == it->cset_head) {
+ it->cset_pos = NULL;
return;
}
- link = list_entry(l, struct cgrp_cset_link, cset_link);
- cset = link->cset;
- } while (list_empty(&cset->tasks));
- it->cset_link = l;
- it->task = cset->tasks.next;
+
+ if (it->ss) {
+ cset = container_of(l, struct css_set,
+ e_cset_node[it->ss->id]);
+ } else {
+ link = list_entry(l, struct cgrp_cset_link, cset_link);
+ cset = link->cset;
+ }
+ } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
+
+ it->cset_pos = l;
+
+ if (!list_empty(&cset->tasks))
+ it->task_pos = cset->tasks.next;
+ else
+ it->task_pos = cset->mg_tasks.next;
+
+ it->tasks_head = &cset->tasks;
+ it->mg_tasks_head = &cset->mg_tasks;
}
/**
@@ -3146,20 +3592,21 @@ static void css_advance_task_iter(struct css_task_iter *it)
*/
void css_task_iter_start(struct cgroup_subsys_state *css,
struct css_task_iter *it)
- __acquires(css_set_lock)
+ __acquires(css_set_rwsem)
{
- /*
- * The first time anyone tries to iterate across a css, we need to
- * enable the list linking each css_set to its tasks, and fix up
- * all existing tasks.
- */
- if (!use_task_css_set_links)
- cgroup_enable_task_cg_lists();
+ /* no one should try to iterate before mounting cgroups */
+ WARN_ON_ONCE(!use_task_css_set_links);
+
+ down_read(&css_set_rwsem);
- read_lock(&css_set_lock);
+ it->ss = css->ss;
- it->origin_css = css;
- it->cset_link = &css->cgroup->cset_links;
+ if (it->ss)
+ it->cset_pos = &css->cgroup->e_csets[css->ss->id];
+ else
+ it->cset_pos = &css->cgroup->cset_links;
+
+ it->cset_head = it->cset_pos;
css_advance_task_iter(it);
}
@@ -3175,25 +3622,28 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
struct task_struct *res;
- struct list_head *l = it->task;
- struct cgrp_cset_link *link;
+ struct list_head *l = it->task_pos;
/* If the iterator cg is NULL, we have no tasks */
- if (!it->cset_link)
+ if (!it->cset_pos)
return NULL;
res = list_entry(l, struct task_struct, cg_list);
- /* Advance iterator to find next entry */
+
+ /*
+ * Advance iterator to find next entry. cset->tasks is consumed
+ * first and then ->mg_tasks. After ->mg_tasks, we move onto the
+ * next cset.
+ */
l = l->next;
- link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);
- if (l == &link->cset->tasks) {
- /*
- * We reached the end of this task list - move on to the
- * next cgrp_cset_link.
- */
+
+ if (l == it->tasks_head)
+ l = it->mg_tasks_head->next;
+
+ if (l == it->mg_tasks_head)
css_advance_task_iter(it);
- } else {
- it->task = l;
- }
+ else
+ it->task_pos = l;
+
return res;
}
@@ -3204,191 +3654,62 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
* Finish task iteration started by css_task_iter_start().
*/
void css_task_iter_end(struct css_task_iter *it)
- __releases(css_set_lock)
+ __releases(css_set_rwsem)
{
- read_unlock(&css_set_lock);
-}
-
-static inline int started_after_time(struct task_struct *t1,
- struct timespec *time,
- struct task_struct *t2)
-{
- int start_diff = timespec_compare(&t1->start_time, time);
- if (start_diff > 0) {
- return 1;
- } else if (start_diff < 0) {
- return 0;
- } else {
- /*
- * Arbitrarily, if two processes started at the same
- * time, we'll say that the lower pointer value
- * started first. Note that t2 may have exited by now
- * so this may not be a valid pointer any longer, but
- * that's fine - it still serves to distinguish
- * between two tasks started (effectively) simultaneously.
- */
- return t1 > t2;
- }
-}
-
-/*
- * This function is a callback from heap_insert() and is used to order
- * the heap.
- * In this case we order the heap in descending task start time.
- */
-static inline int started_after(void *p1, void *p2)
-{
- struct task_struct *t1 = p1;
- struct task_struct *t2 = p2;
- return started_after_time(t1, &t2->start_time, t2);
+ up_read(&css_set_rwsem);
}
/**
- * css_scan_tasks - iterate though all the tasks in a css
- * @css: the css to iterate tasks of
- * @test: optional test callback
- * @process: process callback
- * @data: data passed to @test and @process
- * @heap: optional pre-allocated heap used for task iteration
- *
- * Iterate through all the tasks in @css, calling @test for each, and if it
- * returns %true, call @process for it also.
- *
- * @test may be NULL, meaning always true (select all tasks), which
- * effectively duplicates css_task_iter_{start,next,end}() but does not
- * lock css_set_lock for the call to @process.
- *
- * It is guaranteed that @process will act on every task that is a member
- * of @css for the duration of this call. This function may or may not
- * call @process for tasks that exit or move to a different css during the
- * call, or are forked or move into the css during the call.
- *
- * Note that @test may be called with locks held, and may in some
- * situations be called multiple times for the same task, so it should be
- * cheap.
+ * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
+ * @to: cgroup to which the tasks will be moved
+ * @from: cgroup in which the tasks currently reside
*
- * If @heap is non-NULL, a heap has been pre-allocated and will be used for
- * heap operations (and its "gt" member will be overwritten), else a
- * temporary heap will be used (allocation of which may cause this function
- * to fail).
+ * Locking rules between cgroup_post_fork() and the migration path
+ * guarantee that, if a task is forking while being migrated, the new child
+ * is guaranteed to be either visible in the source cgroup after the
+ * parent's migration is complete or put into the target cgroup. No task
+ * can slip out of migration through forking.
*/
-int css_scan_tasks(struct cgroup_subsys_state *css,
- bool (*test)(struct task_struct *, void *),
- void (*process)(struct task_struct *, void *),
- void *data, struct ptr_heap *heap)
+int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
{
- int retval, i;
+ LIST_HEAD(preloaded_csets);
+ struct cgrp_cset_link *link;
struct css_task_iter it;
- struct task_struct *p, *dropped;
- /* Never dereference latest_task, since it's not refcounted */
- struct task_struct *latest_task = NULL;
- struct ptr_heap tmp_heap;
- struct timespec latest_time = { 0, 0 };
-
- if (heap) {
- /* The caller supplied our heap and pre-allocated its memory */
- heap->gt = &started_after;
- } else {
- /* We need to allocate our own heap memory */
- heap = &tmp_heap;
- retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
- if (retval)
- /* cannot allocate the heap */
- return retval;
- }
+ struct task_struct *task;
+ int ret;
- again:
- /*
- * Scan tasks in the css, using the @test callback to determine
- * which are of interest, and invoking @process callback on the
- * ones which need an update. Since we don't want to hold any
- * locks during the task updates, gather tasks to be processed in a
- * heap structure. The heap is sorted by descending task start
- * time. If the statically-sized heap fills up, we overflow tasks
- * that started later, and in future iterations only consider tasks
- * that started after the latest task in the previous pass. This
- * guarantees forward progress and that we don't miss any tasks.
- */
- heap->size = 0;
- css_task_iter_start(css, &it);
- while ((p = css_task_iter_next(&it))) {
- /*
- * Only affect tasks that qualify per the caller's callback,
- * if he provided one
- */
- if (test && !test(p, data))
- continue;
- /*
- * Only process tasks that started after the last task
- * we processed
- */
- if (!started_after_time(p, &latest_time, latest_task))
- continue;
- dropped = heap_insert(heap, p);
- if (dropped == NULL) {
- /*
- * The new task was inserted; the heap wasn't
- * previously full
- */
- get_task_struct(p);
- } else if (dropped != p) {
- /*
- * The new task was inserted, and pushed out a
- * different task
- */
- get_task_struct(p);
- put_task_struct(dropped);
- }
- /*
- * Else the new task was newer than anything already in
- * the heap and wasn't inserted
- */
- }
- css_task_iter_end(&it);
+ mutex_lock(&cgroup_mutex);
- if (heap->size) {
- for (i = 0; i < heap->size; i++) {
- struct task_struct *q = heap->ptrs[i];
- if (i == 0) {
- latest_time = q->start_time;
- latest_task = q;
- }
- /* Process the task per the caller's callback */
- process(q, data);
- put_task_struct(q);
- }
- /*
- * If we had to process any tasks at all, scan again
- * in case some of them were in the middle of forking
- * children that didn't get processed.
- * Not the most efficient way to do it, but it avoids
- * having to take callback_mutex in the fork path
- */
- goto again;
- }
- if (heap == &tmp_heap)
- heap_free(&tmp_heap);
- return 0;
-}
+ /* all tasks in @from are being moved, all csets are source */
+ down_read(&css_set_rwsem);
+ list_for_each_entry(link, &from->cset_links, cset_link)
+ cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
+ up_read(&css_set_rwsem);
-static void cgroup_transfer_one_task(struct task_struct *task, void *data)
-{
- struct cgroup *new_cgroup = data;
+ ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
+ if (ret)
+ goto out_err;
- mutex_lock(&cgroup_mutex);
- cgroup_attach_task(new_cgroup, task, false);
+ /*
+ * Migrate tasks one-by-one until @form is empty. This fails iff
+ * ->can_attach() fails.
+ */
+ do {
+ css_task_iter_start(&from->self, &it);
+ task = css_task_iter_next(&it);
+ if (task)
+ get_task_struct(task);
+ css_task_iter_end(&it);
+
+ if (task) {
+ ret = cgroup_migrate(to, task, false);
+ put_task_struct(task);
+ }
+ } while (task && !ret);
+out_err:
+ cgroup_migrate_finish(&preloaded_csets);
mutex_unlock(&cgroup_mutex);
-}
-
-/**
- * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
- * @to: cgroup to which the tasks will be moved
- * @from: cgroup in which the tasks currently reside
- */
-int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
-{
- return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task,
- to, NULL);
+ return ret;
}
/*
@@ -3535,8 +3856,9 @@ after:
*
* All this extra complexity was caused by the original implementation
* committing to an entirely unnecessary property. In the long term, we
- * want to do away with it. Explicitly scramble sort order if
- * sane_behavior so that no such expectation exists in the new interface.
+ * want to do away with it. Explicitly scramble sort order if on the
+ * default hierarchy so that no such expectation exists in the new
+ * interface.
*
* Scrambling is done by swapping every two consecutive bits, which is
* non-identity one-to-one mapping which disturbs sort order sufficiently.
@@ -3551,7 +3873,7 @@ static pid_t pid_fry(pid_t pid)
static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
{
- if (cgroup_sane_behavior(cgrp))
+ if (cgroup_on_dfl(cgrp))
return pid_fry(pid);
else
return pid;
@@ -3639,7 +3961,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
if (!array)
return -ENOMEM;
/* now, populate the array */
- css_task_iter_start(&cgrp->dummy_css, &it);
+ css_task_iter_start(&cgrp->self, &it);
while ((tsk = css_task_iter_next(&it))) {
if (unlikely(n == length))
break;
@@ -3654,7 +3976,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
css_task_iter_end(&it);
length = n;
/* now sort & (if procs) strip out duplicates */
- if (cgroup_sane_behavior(cgrp))
+ if (cgroup_on_dfl(cgrp))
sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
else
sort(array, length, sizeof(pid_t), cmppid, NULL);
@@ -3663,7 +3985,6 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
l = cgroup_pidlist_find_create(cgrp, type);
if (!l) {
- mutex_unlock(&cgrp->pidlist_mutex);
pidlist_free(array);
return -ENOMEM;
}
@@ -3687,23 +4008,33 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
*/
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
- int ret = -EINVAL;
+ struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
struct cgroup *cgrp;
struct css_task_iter it;
struct task_struct *tsk;
+ /* it should be kernfs_node belonging to cgroupfs and is a directory */
+ if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
+ kernfs_type(kn) != KERNFS_DIR)
+ return -EINVAL;
+
+ mutex_lock(&cgroup_mutex);
+
/*
- * Validate dentry by checking the superblock operations,
- * and make sure it's a directory.
+ * We aren't being called from kernfs and there's no guarantee on
+ * @kn->priv's validity. For this and css_tryget_online_from_dir(),
+ * @kn->priv is RCU safe. Let's do the RCU dancing.
*/
- if (dentry->d_sb->s_op != &cgroup_ops ||
- !S_ISDIR(dentry->d_inode->i_mode))
- goto err;
-
- ret = 0;
- cgrp = dentry->d_fsdata;
+ rcu_read_lock();
+ cgrp = rcu_dereference(kn->priv);
+ if (!cgrp || cgroup_is_dead(cgrp)) {
+ rcu_read_unlock();
+ mutex_unlock(&cgroup_mutex);
+ return -ENOENT;
+ }
+ rcu_read_unlock();
- css_task_iter_start(&cgrp->dummy_css, &it);
+ css_task_iter_start(&cgrp->self, &it);
while ((tsk = css_task_iter_next(&it))) {
switch (tsk->state) {
case TASK_RUNNING:
@@ -3726,8 +4057,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
}
css_task_iter_end(&it);
-err:
- return ret;
+ mutex_unlock(&cgroup_mutex);
+ return 0;
}
@@ -3745,7 +4076,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
- struct cgroup_open_file *of = s->private;
+ struct kernfs_open_file *of = s->private;
struct cgroup *cgrp = seq_css(s)->cgroup;
struct cgroup_pidlist *l;
enum cgroup_filetype type = seq_cft(s)->private;
@@ -3800,7 +4131,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
- struct cgroup_open_file *of = s->private;
+ struct kernfs_open_file *of = s->private;
struct cgroup_pidlist *l = of->priv;
if (l)
@@ -3811,7 +4142,7 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v)
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
- struct cgroup_open_file *of = s->private;
+ struct kernfs_open_file *of = s->private;
struct cgroup_pidlist *l = of->priv;
pid_t *p = v;
pid_t *end = l->list + l->length;
@@ -3833,17 +4164,6 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v)
return seq_printf(s, "%d\n", *(int *)v);
}
-/*
- * seq_operations functions for iterating on pidlists through seq_file -
- * independent of whether it's tasks or procs
- */
-static const struct seq_operations cgroup_pidlist_seq_operations = {
- .start = cgroup_pidlist_start,
- .stop = cgroup_pidlist_stop,
- .next = cgroup_pidlist_next,
- .show = cgroup_pidlist_show,
-};
-
static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@@ -3861,23 +4181,6 @@ static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
return 0;
}
-/*
- * When dput() is called asynchronously, if umount has been done and
- * then deactivate_super() in cgroup_free_fn() kills the superblock,
- * there's a small window that vfs will see the root dentry with non-zero
- * refcnt and trigger BUG().
- *
- * That's why we hold a reference before dput() and drop it right after.
- */
-static void cgroup_dput(struct cgroup *cgrp)
-{
- struct super_block *sb = cgrp->root->sb;
-
- atomic_inc(&sb->s_active);
- dput(cgrp->dentry);
- deactivate_super(sb);
-}
-
static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@@ -3894,7 +4197,8 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
return 0;
}
-static struct cftype cgroup_base_files[] = {
+/* cgroup core interface files for the default hierarchy */
+static struct cftype cgroup_dfl_base_files[] = {
{
.name = "cgroup.procs",
.seq_start = cgroup_pidlist_start,
@@ -3902,12 +4206,46 @@ static struct cftype cgroup_base_files[] = {
.seq_stop = cgroup_pidlist_stop,
.seq_show = cgroup_pidlist_show,
.private = CGROUP_FILE_PROCS,
- .write_u64 = cgroup_procs_write,
+ .write = cgroup_procs_write,
+ .mode = S_IRUGO | S_IWUSR,
+ },
+ {
+ .name = "cgroup.controllers",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_root_controllers_show,
+ },
+ {
+ .name = "cgroup.controllers",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_controllers_show,
+ },
+ {
+ .name = "cgroup.subtree_control",
+ .seq_show = cgroup_subtree_control_show,
+ .write = cgroup_subtree_control_write,
+ },
+ {
+ .name = "cgroup.populated",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_populated_show,
+ },
+ { } /* terminate */
+};
+
+/* cgroup core interface files for the legacy hierarchies */
+static struct cftype cgroup_legacy_base_files[] = {
+ {
+ .name = "cgroup.procs",
+ .seq_start = cgroup_pidlist_start,
+ .seq_next = cgroup_pidlist_next,
+ .seq_stop = cgroup_pidlist_stop,
+ .seq_show = cgroup_pidlist_show,
+ .private = CGROUP_FILE_PROCS,
+ .write = cgroup_procs_write,
.mode = S_IRUGO | S_IWUSR,
},
{
.name = "cgroup.clone_children",
- .flags = CFTYPE_INSANE,
.read_u64 = cgroup_clone_children_read,
.write_u64 = cgroup_clone_children_write,
},
@@ -3916,35 +4254,27 @@ static struct cftype cgroup_base_files[] = {
.flags = CFTYPE_ONLY_ON_ROOT,
.seq_show = cgroup_sane_behavior_show,
},
-
- /*
- * Historical crazy stuff. These don't have "cgroup." prefix and
- * don't exist if sane_behavior. If you're depending on these, be
- * prepared to be burned.
- */
{
.name = "tasks",
- .flags = CFTYPE_INSANE, /* use "procs" instead */
.seq_start = cgroup_pidlist_start,
.seq_next = cgroup_pidlist_next,
.seq_stop = cgroup_pidlist_stop,
.seq_show = cgroup_pidlist_show,
.private = CGROUP_FILE_TASKS,
- .write_u64 = cgroup_tasks_write,
+ .write = cgroup_tasks_write,
.mode = S_IRUGO | S_IWUSR,
},
{
.name = "notify_on_release",
- .flags = CFTYPE_INSANE,
.read_u64 = cgroup_read_notify_on_release,
.write_u64 = cgroup_write_notify_on_release,
},
{
.name = "release_agent",
- .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
+ .flags = CFTYPE_ONLY_ON_ROOT,
.seq_show = cgroup_release_agent_show,
- .write_string = cgroup_release_agent_write,
- .max_write_len = PATH_MAX,
+ .write = cgroup_release_agent_write,
+ .max_write_len = PATH_MAX - 1,
},
{ } /* terminate */
};
@@ -3956,20 +4286,20 @@ static struct cftype cgroup_base_files[] = {
*
* On failure, no file is added.
*/
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
{
struct cgroup_subsys *ss;
int i, ret = 0;
/* process cftsets of each subsystem */
for_each_subsys(ss, i) {
- struct cftype_set *set;
+ struct cftype *cfts;
- if (!test_bit(i, &subsys_mask))
+ if (!(subsys_mask & (1 << i)))
continue;
- list_for_each_entry(set, &ss->cftsets, node) {
- ret = cgroup_addrm_files(cgrp, set->cfts, true);
+ list_for_each_entry(cfts, &ss->cfts, node) {
+ ret = cgroup_addrm_files(cgrp, cfts, true);
if (ret < 0)
goto err;
}
@@ -3987,9 +4317,9 @@ err:
* Implemented in kill_css().
*
* 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
- * and thus css_tryget() is guaranteed to fail, the css can be offlined
- * by invoking offline_css(). After offlining, the base ref is put.
- * Implemented in css_killed_work_fn().
+ * and thus css_tryget_online() is guaranteed to fail, the css can be
+ * offlined by invoking offline_css(). After offlining, the base ref is
+ * put. Implemented in css_killed_work_fn().
*
* 3. When the percpu_ref reaches zero, the only possible remaining
* accessors are inside RCU read sections. css_release() schedules the
@@ -4008,11 +4338,39 @@ static void css_free_work_fn(struct work_struct *work)
container_of(work, struct cgroup_subsys_state, destroy_work);
struct cgroup *cgrp = css->cgroup;
- if (css->parent)
- css_put(css->parent);
+ percpu_ref_exit(&css->refcnt);
- css->ss->css_free(css);
- cgroup_dput(cgrp);
+ if (css->ss) {
+ /* css free path */
+ if (css->parent)
+ css_put(css->parent);
+
+ css->ss->css_free(css);
+ cgroup_put(cgrp);
+ } else {
+ /* cgroup free path */
+ atomic_dec(&cgrp->root->nr_cgrps);
+ cgroup_pidlist_destroy_all(cgrp);
+
+ if (cgroup_parent(cgrp)) {
+ /*
+ * We get a ref to the parent, and put the ref when
+ * this cgroup is being freed, so it's guaranteed
+ * that the parent won't be destroyed before its
+ * children.
+ */
+ cgroup_put(cgroup_parent(cgrp));
+ kernfs_put(cgrp->kn);
+ kfree(cgrp);
+ } else {
+ /*
+ * This is root cgroup's refcnt reaching zero,
+ * which indicates that the root should be
+ * released.
+ */
+ cgroup_destroy_root(cgrp->root);
+ }
+ }
}
static void css_free_rcu_fn(struct rcu_head *rcu_head)
@@ -4020,34 +4378,72 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
struct cgroup_subsys_state *css =
container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
- /*
- * css holds an extra ref to @cgrp->dentry which is put on the last
- * css_put(). dput() requires process context which we don't have.
- */
INIT_WORK(&css->destroy_work, css_free_work_fn);
queue_work(cgroup_destroy_wq, &css->destroy_work);
}
+static void css_release_work_fn(struct work_struct *work)
+{
+ struct cgroup_subsys_state *css =
+ container_of(work, struct cgroup_subsys_state, destroy_work);
+ struct cgroup_subsys *ss = css->ss;
+ struct cgroup *cgrp = css->cgroup;
+
+ mutex_lock(&cgroup_mutex);
+
+ css->flags |= CSS_RELEASED;
+ list_del_rcu(&css->sibling);
+
+ if (ss) {
+ /* css release path */
+ cgroup_idr_remove(&ss->css_idr, css->id);
+ } else {
+ /* cgroup release path */
+ cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
+ cgrp->id = -1;
+
+ /*
+ * There are two control paths which try to determine
+ * cgroup from dentry without going through kernfs -
+ * cgroupstats_build() and css_tryget_online_from_dir().
+ * Those are supported by RCU protecting clearing of
+ * cgrp->kn->priv backpointer.
+ */
+ RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
+ }
+
+ mutex_unlock(&cgroup_mutex);
+
+ call_rcu(&css->rcu_head, css_free_rcu_fn);
+}
+
static void css_release(struct percpu_ref *ref)
{
struct cgroup_subsys_state *css =
container_of(ref, struct cgroup_subsys_state, refcnt);
- rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL);
- call_rcu(&css->rcu_head, css_free_rcu_fn);
+ INIT_WORK(&css->destroy_work, css_release_work_fn);
+ queue_work(cgroup_destroy_wq, &css->destroy_work);
}
-static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
- struct cgroup *cgrp)
+static void init_and_link_css(struct cgroup_subsys_state *css,
+ struct cgroup_subsys *ss, struct cgroup *cgrp)
{
+ lockdep_assert_held(&cgroup_mutex);
+
+ cgroup_get(cgrp);
+
+ memset(css, 0, sizeof(*css));
css->cgroup = cgrp;
css->ss = ss;
- css->flags = 0;
+ INIT_LIST_HEAD(&css->sibling);
+ INIT_LIST_HEAD(&css->children);
+ css->serial_nr = css_serial_nr_next++;
- if (cgrp->parent)
- css->parent = cgroup_css(cgrp->parent, ss);
- else
- css->flags |= CSS_ROOT;
+ if (cgroup_parent(cgrp)) {
+ css->parent = cgroup_css(cgroup_parent(cgrp), ss);
+ css_get(css->parent);
+ }
BUG_ON(cgroup_css(cgrp, ss));
}
@@ -4064,8 +4460,7 @@ static int online_css(struct cgroup_subsys_state *css)
ret = ss->css_online(css);
if (!ret) {
css->flags |= CSS_ONLINE;
- css->cgroup->nr_css++;
- rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css);
+ rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
}
return ret;
}
@@ -4084,133 +4479,128 @@ static void offline_css(struct cgroup_subsys_state *css)
ss->css_offline(css);
css->flags &= ~CSS_ONLINE;
- css->cgroup->nr_css--;
- RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css);
+ RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
+
+ wake_up_all(&css->cgroup->offline_waitq);
}
/**
* create_css - create a cgroup_subsys_state
* @cgrp: the cgroup new css will be associated with
* @ss: the subsys of new css
+ * @visible: whether to create control knobs for the new css or not
*
* Create a new css associated with @cgrp - @ss pair. On success, the new
- * css is online and installed in @cgrp with all interface files created.
- * Returns 0 on success, -errno on failure.
+ * css is online and installed in @cgrp with all interface files created if
+ * @visible. Returns 0 on success, -errno on failure.
*/
-static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
+static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
+ bool visible)
{
- struct cgroup *parent = cgrp->parent;
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
struct cgroup_subsys_state *css;
int err;
- lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
lockdep_assert_held(&cgroup_mutex);
- css = ss->css_alloc(cgroup_css(parent, ss));
+ css = ss->css_alloc(parent_css);
if (IS_ERR(css))
return PTR_ERR(css);
+ init_and_link_css(css, ss, cgrp);
+
err = percpu_ref_init(&css->refcnt, css_release);
if (err)
- goto err_free;
+ goto err_free_css;
- init_css(css, ss, cgrp);
+ err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
+ if (err < 0)
+ goto err_free_percpu_ref;
+ css->id = err;
- err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id);
- if (err)
- goto err_free;
+ if (visible) {
+ err = cgroup_populate_dir(cgrp, 1 << ss->id);
+ if (err)
+ goto err_free_id;
+ }
+
+ /* @css is ready to be brought online now, make it visible */
+ list_add_tail_rcu(&css->sibling, &parent_css->children);
+ cgroup_idr_replace(&ss->css_idr, css, css->id);
err = online_css(css);
if (err)
- goto err_free;
-
- dget(cgrp->dentry);
- css_get(css->parent);
+ goto err_list_del;
if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
- parent->parent) {
- pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
- current->comm, current->pid, ss->name);
+ cgroup_parent(parent)) {
+ pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
+ current->comm, current->pid, ss->name);
if (!strcmp(ss->name, "memory"))
- pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
+ pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
ss->warned_broken_hierarchy = true;
}
return 0;
-err_free:
- percpu_ref_cancel_init(&css->refcnt);
- ss->css_free(css);
+err_list_del:
+ list_del_rcu(&css->sibling);
+ cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
+err_free_id:
+ cgroup_idr_remove(&ss->css_idr, css->id);
+err_free_percpu_ref:
+ percpu_ref_exit(&css->refcnt);
+err_free_css:
+ call_rcu(&css->rcu_head, css_free_rcu_fn);
return err;
}
-/*
- * cgroup_create - create a cgroup
- * @parent: cgroup that will be parent of the new cgroup
- * @dentry: dentry of the new cgroup
- * @mode: mode to set on new inode
- *
- * Must be called with the mutex on the parent inode held
- */
-static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
- umode_t mode)
+static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+ umode_t mode)
{
- struct cgroup *cgrp;
- struct cgroup_name *name;
- struct cgroupfs_root *root = parent->root;
- int ssid, err;
+ struct cgroup *parent, *cgrp;
+ struct cgroup_root *root;
struct cgroup_subsys *ss;
- struct super_block *sb = root->sb;
+ struct kernfs_node *kn;
+ struct cftype *base_files;
+ int ssid, ret;
+
+ /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
+ */
+ if (strchr(name, '\n'))
+ return -EINVAL;
+
+ parent = cgroup_kn_lock_live(parent_kn);
+ if (!parent)
+ return -ENODEV;
+ root = parent->root;
/* allocate the cgroup and its ID, 0 is reserved for the root */
cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
- if (!cgrp)
- return -ENOMEM;
-
- name = cgroup_alloc_name(dentry);
- if (!name) {
- err = -ENOMEM;
- goto err_free_cgrp;
+ if (!cgrp) {
+ ret = -ENOMEM;
+ goto out_unlock;
}
- rcu_assign_pointer(cgrp->name, name);
- /*
- * Only live parents can have children. Note that the liveliness
- * check isn't strictly necessary because cgroup_mkdir() and
- * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
- * anyway so that locking is contained inside cgroup proper and we
- * don't get nasty surprises if we ever grow another caller.
- */
- if (!cgroup_lock_live_group(parent)) {
- err = -ENODEV;
- goto err_free_name;
- }
+ ret = percpu_ref_init(&cgrp->self.refcnt, css_release);
+ if (ret)
+ goto out_free_cgrp;
/*
* Temporarily set the pointer to NULL, so idr_find() won't return
* a half-baked cgroup.
*/
- cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
+ cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
if (cgrp->id < 0) {
- err = -ENOMEM;
- goto err_unlock;
+ ret = -ENOMEM;
+ goto out_cancel_ref;
}
- /* Grab a reference on the superblock so the hierarchy doesn't
- * get deleted on unmount if there are child cgroups. This
- * can be done outside cgroup_mutex, since the sb can't
- * disappear while someone has an open control file on the
- * fs */
- atomic_inc(&sb->s_active);
-
init_cgroup_housekeeping(cgrp);
- dentry->d_fsdata = cgrp;
- cgrp->dentry = dentry;
-
- cgrp->parent = parent;
- cgrp->dummy_css.parent = &parent->dummy_css;
- cgrp->root = parent->root;
+ cgrp->self.parent = &parent->self;
+ cgrp->root = root;
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -4218,111 +4608,99 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
+ /* create the directory */
+ kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
+ if (IS_ERR(kn)) {
+ ret = PTR_ERR(kn);
+ goto out_free_id;
+ }
+ cgrp->kn = kn;
+
/*
- * Create directory. cgroup_create_file() returns with the new
- * directory locked on success so that it can be populated without
- * dropping cgroup_mutex.
+ * This extra ref will be put in cgroup_free_fn() and guarantees
+ * that @cgrp->kn is always accessible.
*/
- err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
- if (err < 0)
- goto err_free_id;
- lockdep_assert_held(&dentry->d_inode->i_mutex);
+ kernfs_get(kn);
- cgrp->serial_nr = cgroup_serial_nr_next++;
+ cgrp->self.serial_nr = css_serial_nr_next++;
/* allocation complete, commit to creation */
- list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
- root->number_of_cgroups++;
-
- /* hold a ref to the parent's dentry */
- dget(parent->dentry);
+ list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
+ atomic_inc(&root->nr_cgrps);
+ cgroup_get(parent);
/*
* @cgrp is now fully operational. If something fails after this
* point, it'll be released via the normal destruction path.
*/
- idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
+ cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
- err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
- if (err)
- goto err_destroy;
+ ret = cgroup_kn_set_ugid(kn);
+ if (ret)
+ goto out_destroy;
+
+ if (cgroup_on_dfl(cgrp))
+ base_files = cgroup_dfl_base_files;
+ else
+ base_files = cgroup_legacy_base_files;
+
+ ret = cgroup_addrm_files(cgrp, base_files, true);
+ if (ret)
+ goto out_destroy;
/* let's create and online css's */
for_each_subsys(ss, ssid) {
- if (root->subsys_mask & (1 << ssid)) {
- err = create_css(cgrp, ss);
- if (err)
- goto err_destroy;
+ if (parent->child_subsys_mask & (1 << ssid)) {
+ ret = create_css(cgrp, ss,
+ parent->subtree_control & (1 << ssid));
+ if (ret)
+ goto out_destroy;
}
}
- mutex_unlock(&cgroup_mutex);
- mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
+ /*
+ * On the default hierarchy, a child doesn't automatically inherit
+ * subtree_control from the parent. Each is configured manually.
+ */
+ if (!cgroup_on_dfl(cgrp)) {
+ cgrp->subtree_control = parent->subtree_control;
+ cgroup_refresh_child_subsys_mask(cgrp);
+ }
- return 0;
+ kernfs_activate(kn);
-err_free_id:
- idr_remove(&root->cgroup_idr, cgrp->id);
- /* Release the reference count that we took on the superblock */
- deactivate_super(sb);
-err_unlock:
- mutex_unlock(&cgroup_mutex);
-err_free_name:
- kfree(rcu_dereference_raw(cgrp->name));
-err_free_cgrp:
+ ret = 0;
+ goto out_unlock;
+
+out_free_id:
+ cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
+out_cancel_ref:
+ percpu_ref_exit(&cgrp->self.refcnt);
+out_free_cgrp:
kfree(cgrp);
- return err;
+out_unlock:
+ cgroup_kn_unlock(parent_kn);
+ return ret;
-err_destroy:
+out_destroy:
cgroup_destroy_locked(cgrp);
- mutex_unlock(&cgroup_mutex);
- mutex_unlock(&dentry->d_inode->i_mutex);
- return err;
-}
-
-static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
-{
- struct cgroup *c_parent = dentry->d_parent->d_fsdata;
-
- /* the vfs holds inode->i_mutex already */
- return cgroup_create(c_parent, dentry, mode | S_IFDIR);
+ goto out_unlock;
}
/*
* This is called when the refcnt of a css is confirmed to be killed.
- * css_tryget() is now guaranteed to fail.
+ * css_tryget_online() is now guaranteed to fail. Tell the subsystem to
+ * initate destruction and put the css ref from kill_css().
*/
static void css_killed_work_fn(struct work_struct *work)
{
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, destroy_work);
- struct cgroup *cgrp = css->cgroup;
mutex_lock(&cgroup_mutex);
-
- /*
- * css_tryget() is guaranteed to fail now. Tell subsystems to
- * initate destruction.
- */
offline_css(css);
-
- /*
- * If @cgrp is marked dead, it's waiting for refs of all css's to
- * be disabled before proceeding to the second phase of cgroup
- * destruction. If we are the last one, kick it off.
- */
- if (!cgrp->nr_css && cgroup_is_dead(cgrp))
- cgroup_destroy_css_killed(cgrp);
-
mutex_unlock(&cgroup_mutex);
- /*
- * Put the css refs from kill_css(). Each css holds an extra
- * reference to the cgroup's dentry and cgroup removal proceeds
- * regardless of css refs. On the last put of each css, whenever
- * that may be, the extra dentry ref is put so that dentry
- * destruction happens only after all css's are released.
- */
css_put(css);
}
@@ -4342,12 +4720,18 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
*
* This function initiates destruction of @css by removing cgroup interface
* files and putting its base reference. ->css_offline() will be invoked
- * asynchronously once css_tryget() is guaranteed to fail and when the
- * reference count reaches zero, @css will be released.
+ * asynchronously once css_tryget_online() is guaranteed to fail and when
+ * the reference count reaches zero, @css will be released.
*/
static void kill_css(struct cgroup_subsys_state *css)
{
- cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id);
+ lockdep_assert_held(&cgroup_mutex);
+
+ /*
+ * This must happen before css is disassociated with its cgroup.
+ * See seq_css() for details.
+ */
+ cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
/*
* Killing would put the base ref, but we need to keep it alive
@@ -4358,7 +4742,7 @@ static void kill_css(struct cgroup_subsys_state *css)
/*
* cgroup core guarantees that, by the time ->css_offline() is
* invoked, no new css reference will be given out via
- * css_tryget(). We can't simply call percpu_ref_kill() and
+ * css_tryget_online(). We can't simply call percpu_ref_kill() and
* proceed to offlining css's because percpu_ref_kill() doesn't
* guarantee that the ref is seen as killed on all CPUs on return.
*
@@ -4374,9 +4758,9 @@ static void kill_css(struct cgroup_subsys_state *css)
*
* css's make use of percpu refcnts whose killing latency shouldn't be
* exposed to userland and are RCU protected. Also, cgroup core needs to
- * guarantee that css_tryget() won't succeed by the time ->css_offline() is
- * invoked. To satisfy all the requirements, destruction is implemented in
- * the following two steps.
+ * guarantee that css_tryget_online() won't succeed by the time
+ * ->css_offline() is invoked. To satisfy all the requirements,
+ * destruction is implemented in the following two steps.
*
* s1. Verify @cgrp can be destroyed and mark it dying. Remove all
* userland visible parts and start killing the percpu refcnts of
@@ -4395,141 +4779,88 @@ static void kill_css(struct cgroup_subsys_state *css)
static int cgroup_destroy_locked(struct cgroup *cgrp)
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
- struct dentry *d = cgrp->dentry;
struct cgroup_subsys_state *css;
- struct cgroup *child;
bool empty;
int ssid;
- lockdep_assert_held(&d->d_inode->i_mutex);
lockdep_assert_held(&cgroup_mutex);
/*
- * css_set_lock synchronizes access to ->cset_links and prevents
- * @cgrp from being removed while __put_css_set() is in progress.
+ * css_set_rwsem synchronizes access to ->cset_links and prevents
+ * @cgrp from being removed while put_css_set() is in progress.
*/
- read_lock(&css_set_lock);
+ down_read(&css_set_rwsem);
empty = list_empty(&cgrp->cset_links);
- read_unlock(&css_set_lock);
+ up_read(&css_set_rwsem);
if (!empty)
return -EBUSY;
/*
- * Make sure there's no live children. We can't test ->children
- * emptiness as dead children linger on it while being destroyed;
- * otherwise, "rmdir parent/child parent" may fail with -EBUSY.
+ * Make sure there's no live children. We can't test emptiness of
+ * ->self.children as dead children linger on it while being
+ * drained; otherwise, "rmdir parent/child parent" may fail.
*/
- empty = true;
- rcu_read_lock();
- list_for_each_entry_rcu(child, &cgrp->children, sibling) {
- empty = cgroup_is_dead(child);
- if (!empty)
- break;
- }
- rcu_read_unlock();
- if (!empty)
+ if (css_has_online_children(&cgrp->self))
return -EBUSY;
/*
- * Initiate massacre of all css's. cgroup_destroy_css_killed()
- * will be invoked to perform the rest of destruction once the
- * percpu refs of all css's are confirmed to be killed.
+ * Mark @cgrp dead. This prevents further task migration and child
+ * creation by disabling cgroup_lock_live_group().
*/
+ cgrp->self.flags &= ~CSS_ONLINE;
+
+ /* initiate massacre of all css's */
for_each_css(css, ssid, cgrp)
kill_css(css);
- /*
- * Mark @cgrp dead. This prevents further task migration and child
- * creation by disabling cgroup_lock_live_group(). Note that
- * CGRP_DEAD assertion is depended upon by css_next_child() to
- * resume iteration after dropping RCU read lock. See
- * css_next_child() for details.
- */
- set_bit(CGRP_DEAD, &cgrp->flags);
-
- /* CGRP_DEAD is set, remove from ->release_list for the last time */
+ /* CSS_ONLINE is clear, remove from ->release_list for the last time */
raw_spin_lock(&release_list_lock);
if (!list_empty(&cgrp->release_list))
list_del_init(&cgrp->release_list);
raw_spin_unlock(&release_list_lock);
/*
- * If @cgrp has css's attached, the second stage of cgroup
- * destruction is kicked off from css_killed_work_fn() after the
- * refs of all attached css's are killed. If @cgrp doesn't have
- * any css, we kick it off here.
+ * Remove @cgrp directory along with the base files. @cgrp has an
+ * extra ref on its kn.
*/
- if (!cgrp->nr_css)
- cgroup_destroy_css_killed(cgrp);
+ kernfs_remove(cgrp->kn);
- /*
- * Clear the base files and remove @cgrp directory. The removal
- * puts the base ref but we aren't quite done with @cgrp yet, so
- * hold onto it.
- */
- cgroup_addrm_files(cgrp, cgroup_base_files, false);
- dget(d);
- cgroup_d_remove_dir(d);
+ set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags);
+ check_for_release(cgroup_parent(cgrp));
+
+ /* put the base reference */
+ percpu_ref_kill(&cgrp->self.refcnt);
return 0;
};
-/**
- * cgroup_destroy_css_killed - the second step of cgroup destruction
- * @work: cgroup->destroy_free_work
- *
- * This function is invoked from a work item for a cgroup which is being
- * destroyed after all css's are offlined and performs the rest of
- * destruction. This is the second step of destruction described in the
- * comment above cgroup_destroy_locked().
- */
-static void cgroup_destroy_css_killed(struct cgroup *cgrp)
+static int cgroup_rmdir(struct kernfs_node *kn)
{
- struct cgroup *parent = cgrp->parent;
- struct dentry *d = cgrp->dentry;
-
- lockdep_assert_held(&cgroup_mutex);
-
- /* delete this cgroup from parent->children */
- list_del_rcu(&cgrp->sibling);
-
- dput(d);
+ struct cgroup *cgrp;
+ int ret = 0;
- set_bit(CGRP_RELEASABLE, &parent->flags);
- check_for_release(parent);
-}
+ cgrp = cgroup_kn_lock_live(kn);
+ if (!cgrp)
+ return 0;
+ cgroup_get(cgrp); /* for @kn->priv clearing */
-static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
-{
- int ret;
+ ret = cgroup_destroy_locked(cgrp);
- mutex_lock(&cgroup_mutex);
- ret = cgroup_destroy_locked(dentry->d_fsdata);
- mutex_unlock(&cgroup_mutex);
+ cgroup_kn_unlock(kn);
+ cgroup_put(cgrp);
return ret;
}
-static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
-{
- INIT_LIST_HEAD(&ss->cftsets);
-
- /*
- * base_cftset is embedded in subsys itself, no need to worry about
- * deregistration.
- */
- if (ss->base_cftypes) {
- struct cftype *cft;
-
- for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++)
- cft->ss = ss;
-
- ss->base_cftset.cfts = ss->base_cftypes;
- list_add_tail(&ss->base_cftset.node, &ss->cftsets);
- }
-}
+static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
+ .remount_fs = cgroup_remount,
+ .show_options = cgroup_show_options,
+ .mkdir = cgroup_mkdir,
+ .rmdir = cgroup_rmdir,
+ .rename = cgroup_rename,
+};
-static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
+static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
{
struct cgroup_subsys_state *css;
@@ -4537,21 +4868,35 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
mutex_lock(&cgroup_mutex);
- /* init base cftset */
- cgroup_init_cftsets(ss);
+ idr_init(&ss->css_idr);
+ INIT_LIST_HEAD(&ss->cfts);
- /* Create the top cgroup state for this subsystem */
- ss->root = &cgroup_dummy_root;
- css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
+ /* Create the root cgroup state for this subsystem */
+ ss->root = &cgrp_dfl_root;
+ css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
- init_css(css, ss, cgroup_dummy_top);
+ init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
+
+ /*
+ * Root csses are never destroyed and we can't initialize
+ * percpu_ref during early init. Disable refcnting.
+ */
+ css->flags |= CSS_NO_REF;
+
+ if (early) {
+ /* allocation can't be done safely during early init */
+ css->id = 1;
+ } else {
+ css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
+ BUG_ON(css->id < 0);
+ }
/* Update the init_css_set to contain a subsys
* pointer to this state - since the subsystem is
* newly registered, all tasks and hence the
- * init_css_set is in the subsystem's top cgroup. */
- init_css_set.subsys[ss->subsys_id] = css;
+ * init_css_set is in the subsystem's root cgroup. */
+ init_css_set.subsys[ss->id] = css;
need_forkexit_callback |= ss->fork || ss->exit;
@@ -4563,184 +4908,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
BUG_ON(online_css(css));
mutex_unlock(&cgroup_mutex);
-
- /* this function shouldn't be used with modular subsystems, since they
- * need to register a subsys_id, among other things */
- BUG_ON(ss->module);
-}
-
-/**
- * cgroup_load_subsys: load and register a modular subsystem at runtime
- * @ss: the subsystem to load
- *
- * This function should be called in a modular subsystem's initcall. If the
- * subsystem is built as a module, it will be assigned a new subsys_id and set
- * up for use. If the subsystem is built-in anyway, work is delegated to the
- * simpler cgroup_init_subsys.
- */
-int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
-{
- struct cgroup_subsys_state *css;
- int i, ret;
- struct hlist_node *tmp;
- struct css_set *cset;
- unsigned long key;
-
- /* check name and function validity */
- if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
- ss->css_alloc == NULL || ss->css_free == NULL)
- return -EINVAL;
-
- /*
- * we don't support callbacks in modular subsystems. this check is
- * before the ss->module check for consistency; a subsystem that could
- * be a module should still have no callbacks even if the user isn't
- * compiling it as one.
- */
- if (ss->fork || ss->exit)
- return -EINVAL;
-
- /*
- * an optionally modular subsystem is built-in: we want to do nothing,
- * since cgroup_init_subsys will have already taken care of it.
- */
- if (ss->module == NULL) {
- /* a sanity check */
- BUG_ON(cgroup_subsys[ss->subsys_id] != ss);
- return 0;
- }
-
- /* init base cftset */
- cgroup_init_cftsets(ss);
-
- mutex_lock(&cgroup_mutex);
- mutex_lock(&cgroup_root_mutex);
- cgroup_subsys[ss->subsys_id] = ss;
-
- /*
- * no ss->css_alloc seems to need anything important in the ss
- * struct, so this can happen first (i.e. before the dummy root
- * attachment).
- */
- css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
- if (IS_ERR(css)) {
- /* failure case - need to deassign the cgroup_subsys[] slot. */
- cgroup_subsys[ss->subsys_id] = NULL;
- mutex_unlock(&cgroup_root_mutex);
- mutex_unlock(&cgroup_mutex);
- return PTR_ERR(css);
- }
-
- ss->root = &cgroup_dummy_root;
-
- /* our new subsystem will be attached to the dummy hierarchy. */
- init_css(css, ss, cgroup_dummy_top);
-
- /*
- * Now we need to entangle the css into the existing css_sets. unlike
- * in cgroup_init_subsys, there are now multiple css_sets, so each one
- * will need a new pointer to it; done by iterating the css_set_table.
- * furthermore, modifying the existing css_sets will corrupt the hash
- * table state, so each changed css_set will need its hash recomputed.
- * this is all done under the css_set_lock.
- */
- write_lock(&css_set_lock);
- hash_for_each_safe(css_set_table, i, tmp, cset, hlist) {
- /* skip entries that we already rehashed */
- if (cset->subsys[ss->subsys_id])
- continue;
- /* remove existing entry */
- hash_del(&cset->hlist);
- /* set new value */
- cset->subsys[ss->subsys_id] = css;
- /* recompute hash and restore entry */
- key = css_set_hash(cset->subsys);
- hash_add(css_set_table, &cset->hlist, key);
- }
- write_unlock(&css_set_lock);
-
- ret = online_css(css);
- if (ret) {
- ss->css_free(css);
- goto err_unload;
- }
-
- /* success! */
- mutex_unlock(&cgroup_root_mutex);
- mutex_unlock(&cgroup_mutex);
- return 0;
-
-err_unload:
- mutex_unlock(&cgroup_root_mutex);
- mutex_unlock(&cgroup_mutex);
- /* @ss can't be mounted here as try_module_get() would fail */
- cgroup_unload_subsys(ss);
- return ret;
-}
-EXPORT_SYMBOL_GPL(cgroup_load_subsys);
-
-/**
- * cgroup_unload_subsys: unload a modular subsystem
- * @ss: the subsystem to unload
- *
- * This function should be called in a modular subsystem's exitcall. When this
- * function is invoked, the refcount on the subsystem's module will be 0, so
- * the subsystem will not be attached to any hierarchy.
- */
-void cgroup_unload_subsys(struct cgroup_subsys *ss)
-{
- struct cgrp_cset_link *link;
- struct cgroup_subsys_state *css;
-
- BUG_ON(ss->module == NULL);
-
- /*
- * we shouldn't be called if the subsystem is in use, and the use of
- * try_module_get() in rebind_subsystems() should ensure that it
- * doesn't start being used while we're killing it off.
- */
- BUG_ON(ss->root != &cgroup_dummy_root);
-
- mutex_lock(&cgroup_mutex);
- mutex_lock(&cgroup_root_mutex);
-
- css = cgroup_css(cgroup_dummy_top, ss);
- if (css)
- offline_css(css);
-
- /* deassign the subsys_id */
- cgroup_subsys[ss->subsys_id] = NULL;
-
- /*
- * disentangle the css from all css_sets attached to the dummy
- * top. as in loading, we need to pay our respects to the hashtable
- * gods.
- */
- write_lock(&css_set_lock);
- list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) {
- struct css_set *cset = link->cset;
- unsigned long key;
-
- hash_del(&cset->hlist);
- cset->subsys[ss->subsys_id] = NULL;
- key = css_set_hash(cset->subsys);
- hash_add(css_set_table, &cset->hlist, key);
- }
- write_unlock(&css_set_lock);
-
- /*
- * remove subsystem's css from the cgroup_dummy_top and free it -
- * need to free before marking as null because ss->css_free needs
- * the cgrp->subsys pointer to find their state.
- */
- if (css)
- ss->css_free(css);
- RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
-
- mutex_unlock(&cgroup_root_mutex);
- mutex_unlock(&cgroup_mutex);
}
-EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
/**
* cgroup_init_early - cgroup initialization at system boot
@@ -4750,37 +4918,28 @@ EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
*/
int __init cgroup_init_early(void)
{
+ static struct cgroup_sb_opts __initdata opts;
struct cgroup_subsys *ss;
int i;
- atomic_set(&init_css_set.refcount, 1);
- INIT_LIST_HEAD(&init_css_set.cgrp_links);
- INIT_LIST_HEAD(&init_css_set.tasks);
- INIT_HLIST_NODE(&init_css_set.hlist);
- css_set_count = 1;
- init_cgroup_root(&cgroup_dummy_root);
- cgroup_root_count = 1;
+ init_cgroup_root(&cgrp_dfl_root, &opts);
+ cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
+
RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
- init_cgrp_cset_link.cset = &init_css_set;
- init_cgrp_cset_link.cgrp = cgroup_dummy_top;
- list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links);
- list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links);
-
- /* at bootup time, we don't worry about modular subsystems */
- for_each_builtin_subsys(ss, i) {
- BUG_ON(!ss->name);
- BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
- BUG_ON(!ss->css_alloc);
- BUG_ON(!ss->css_free);
- if (ss->subsys_id != i) {
- printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
- ss->name, ss->subsys_id);
- BUG();
- }
+ for_each_subsys(ss, i) {
+ WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
+ "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
+ i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
+ ss->id, ss->name);
+ WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
+ "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
+
+ ss->id = i;
+ ss->name = cgroup_subsys_name[i];
if (ss->early_init)
- cgroup_init_subsys(ss);
+ cgroup_init_subsys(ss, true);
}
return 0;
}
@@ -4795,53 +4954,72 @@ int __init cgroup_init(void)
{
struct cgroup_subsys *ss;
unsigned long key;
- int i, err;
-
- err = bdi_init(&cgroup_backing_dev_info);
- if (err)
- return err;
+ int ssid, err;
- for_each_builtin_subsys(ss, i) {
- if (!ss->early_init)
- cgroup_init_subsys(ss);
- }
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
- /* allocate id for the dummy hierarchy */
mutex_lock(&cgroup_mutex);
- mutex_lock(&cgroup_root_mutex);
/* Add init_css_set to the hash table */
key = css_set_hash(init_css_set.subsys);
hash_add(css_set_table, &init_css_set.hlist, key);
- BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1));
-
- err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top,
- 0, 1, GFP_KERNEL);
- BUG_ON(err < 0);
+ BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
- mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
- cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
- if (!cgroup_kobj) {
- err = -ENOMEM;
- goto out;
+ for_each_subsys(ss, ssid) {
+ if (ss->early_init) {
+ struct cgroup_subsys_state *css =
+ init_css_set.subsys[ss->id];
+
+ css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
+ GFP_KERNEL);
+ BUG_ON(css->id < 0);
+ } else {
+ cgroup_init_subsys(ss, false);
+ }
+
+ list_add_tail(&init_css_set.e_cset_node[ssid],
+ &cgrp_dfl_root.cgrp.e_csets[ssid]);
+
+ /*
+ * Setting dfl_root subsys_mask needs to consider the
+ * disabled flag and cftype registration needs kmalloc,
+ * both of which aren't available during early_init.
+ */
+ if (ss->disabled)
+ continue;
+
+ cgrp_dfl_root.subsys_mask |= 1 << ss->id;
+
+ if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
+ ss->dfl_cftypes = ss->legacy_cftypes;
+
+ if (!ss->dfl_cftypes)
+ cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
+
+ if (ss->dfl_cftypes == ss->legacy_cftypes) {
+ WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
+ } else {
+ WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
+ WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
+ }
}
+ cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
+ if (!cgroup_kobj)
+ return -ENOMEM;
+
err = register_filesystem(&cgroup_fs_type);
if (err < 0) {
kobject_put(cgroup_kobj);
- goto out;
+ return err;
}
proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
-
-out:
- if (err)
- bdi_destroy(&cgroup_backing_dev_info);
-
- return err;
+ return 0;
}
static int __init cgroup_wq_init(void)
@@ -4873,12 +5051,6 @@ core_initcall(cgroup_wq_init);
* proc_cgroup_show()
* - Print task's cgroup paths into seq_file, one line for each hierarchy
* - Used for /proc/<pid>/cgroup.
- * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
- * doesn't really matter if tsk->cgroup changes after we read it,
- * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
- * anyway. No need to check that tsk->cgroup != NULL, thanks to
- * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
- * cgroup to top_cgroup.
*/
/* TODO: Use a proper seq_file iterator */
@@ -4886,12 +5058,12 @@ int proc_cgroup_show(struct seq_file *m, void *v)
{
struct pid *pid;
struct task_struct *tsk;
- char *buf;
+ char *buf, *path;
int retval;
- struct cgroupfs_root *root;
+ struct cgroup_root *root;
retval = -ENOMEM;
- buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!buf)
goto out;
@@ -4904,12 +5076,16 @@ int proc_cgroup_show(struct seq_file *m, void *v)
retval = 0;
mutex_lock(&cgroup_mutex);
+ down_read(&css_set_rwsem);
- for_each_active_root(root) {
+ for_each_root(root) {
struct cgroup_subsys *ss;
struct cgroup *cgrp;
int ssid, count = 0;
+ if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
+ continue;
+
seq_printf(m, "%d:", root->hierarchy_id);
for_each_subsys(ss, ssid)
if (root->subsys_mask & (1 << ssid))
@@ -4919,14 +5095,17 @@ int proc_cgroup_show(struct seq_file *m, void *v)
root->name);
seq_putc(m, ':');
cgrp = task_cgroup_from_root(tsk, root);
- retval = cgroup_path(cgrp, buf, PAGE_SIZE);
- if (retval < 0)
+ path = cgroup_path(cgrp, buf, PATH_MAX);
+ if (!path) {
+ retval = -ENAMETOOLONG;
goto out_unlock;
- seq_puts(m, buf);
+ }
+ seq_puts(m, path);
seq_putc(m, '\n');
}
out_unlock:
+ up_read(&css_set_rwsem);
mutex_unlock(&cgroup_mutex);
put_task_struct(tsk);
out_free:
@@ -4952,7 +5131,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
for_each_subsys(ss, i)
seq_printf(m, "%s\t%d\t%d\t%d\n",
ss->name, ss->root->hierarchy_id,
- ss->root->number_of_cgroups, !ss->disabled);
+ atomic_read(&ss->root->nr_cgrps), !ss->disabled);
mutex_unlock(&cgroup_mutex);
return 0;
@@ -4971,27 +5150,16 @@ static const struct file_operations proc_cgroupstats_operations = {
};
/**
- * cgroup_fork - attach newly forked task to its parents cgroup.
+ * cgroup_fork - initialize cgroup related fields during copy_process()
* @child: pointer to task_struct of forking parent process.
*
- * Description: A task inherits its parent's cgroup at fork().
- *
- * A pointer to the shared css_set was automatically copied in
- * fork.c by dup_task_struct(). However, we ignore that copy, since
- * it was not made under the protection of RCU or cgroup_mutex, so
- * might no longer be a valid cgroup pointer. cgroup_attach_task() might
- * have already changed current->cgroups, allowing the previously
- * referenced cgroup group to be removed and freed.
- *
- * At the point that cgroup_fork() is called, 'current' is the parent
- * task, and the passed argument 'child' points to the child task.
+ * A task is associated with the init_css_set until cgroup_post_fork()
+ * attaches it to the parent's css_set. Empty cg_list indicates that
+ * @child isn't holding reference to its css_set.
*/
void cgroup_fork(struct task_struct *child)
{
- task_lock(current);
- get_css_set(task_css_set(current));
- child->cgroups = current->cgroups;
- task_unlock(current);
+ RCU_INIT_POINTER(child->cgroups, &init_css_set);
INIT_LIST_HEAD(&child->cg_list);
}
@@ -5011,23 +5179,37 @@ void cgroup_post_fork(struct task_struct *child)
int i;
/*
- * use_task_css_set_links is set to 1 before we walk the tasklist
- * under the tasklist_lock and we read it here after we added the child
- * to the tasklist under the tasklist_lock as well. If the child wasn't
- * yet in the tasklist when we walked through it from
- * cgroup_enable_task_cg_lists(), then use_task_css_set_links value
- * should be visible now due to the paired locking and barriers implied
- * by LOCK/UNLOCK: it is written before the tasklist_lock unlock
- * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock
- * lock on fork.
+ * This may race against cgroup_enable_task_cg_links(). As that
+ * function sets use_task_css_set_links before grabbing
+ * tasklist_lock and we just went through tasklist_lock to add
+ * @child, it's guaranteed that either we see the set
+ * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
+ * @child during its iteration.
+ *
+ * If we won the race, @child is associated with %current's
+ * css_set. Grabbing css_set_rwsem guarantees both that the
+ * association is stable, and, on completion of the parent's
+ * migration, @child is visible in the source of migration or
+ * already in the destination cgroup. This guarantee is necessary
+ * when implementing operations which need to migrate all tasks of
+ * a cgroup to another.
+ *
+ * Note that if we lose to cgroup_enable_task_cg_links(), @child
+ * will remain in init_css_set. This is safe because all tasks are
+ * in the init_css_set before cg_links is enabled and there's no
+ * operation which transfers all tasks out of init_css_set.
*/
if (use_task_css_set_links) {
- write_lock(&css_set_lock);
- task_lock(child);
- if (list_empty(&child->cg_list))
- list_add(&child->cg_list, &task_css_set(child)->tasks);
- task_unlock(child);
- write_unlock(&css_set_lock);
+ struct css_set *cset;
+
+ down_write(&css_set_rwsem);
+ cset = task_css_set(current);
+ if (list_empty(&child->cg_list)) {
+ rcu_assign_pointer(child->cgroups, cset);
+ list_add(&child->cg_list, &cset->tasks);
+ get_css_set(cset);
+ }
+ up_write(&css_set_rwsem);
}
/*
@@ -5036,15 +5218,7 @@ void cgroup_post_fork(struct task_struct *child)
* and addition to css_set.
*/
if (need_forkexit_callback) {
- /*
- * fork/exit callbacks are supported only for builtin
- * subsystems, and the builtin section of the subsys
- * array is immutable, so we don't need to lock the
- * subsys array here. On the other hand, modular section
- * of the array can be freed at module unload, so we
- * can't touch that.
- */
- for_each_builtin_subsys(ss, i)
+ for_each_subsys(ss, i)
if (ss->fork)
ss->fork(child);
}
@@ -5053,7 +5227,6 @@ void cgroup_post_fork(struct task_struct *child)
/**
* cgroup_exit - detach cgroup from exiting task
* @tsk: pointer to task_struct of exiting process
- * @run_callback: run exit callbacks?
*
* Description: Detach cgroup from @tsk and release it.
*
@@ -5063,57 +5236,38 @@ void cgroup_post_fork(struct task_struct *child)
* use notify_on_release cgroups where very high task exit scaling
* is required on large systems.
*
- * the_top_cgroup_hack:
- *
- * Set the exiting tasks cgroup to the root cgroup (top_cgroup).
- *
- * We call cgroup_exit() while the task is still competent to
- * handle notify_on_release(), then leave the task attached to the
- * root cgroup in each hierarchy for the remainder of its exit.
- *
- * To do this properly, we would increment the reference count on
- * top_cgroup, and near the very end of the kernel/exit.c do_exit()
- * code we would add a second cgroup function call, to drop that
- * reference. This would just create an unnecessary hot spot on
- * the top_cgroup reference count, to no avail.
- *
- * Normally, holding a reference to a cgroup without bumping its
- * count is unsafe. The cgroup could go away, or someone could
- * attach us to a different cgroup, decrementing the count on
- * the first cgroup that we never incremented. But in this case,
- * top_cgroup isn't going away, and either task has PF_EXITING set,
- * which wards off any cgroup_attach_task() attempts, or task is a failed
- * fork, never visible to cgroup_attach_task.
+ * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We
+ * call cgroup_exit() while the task is still competent to handle
+ * notify_on_release(), then leave the task attached to the root cgroup in
+ * each hierarchy for the remainder of its exit. No need to bother with
+ * init_css_set refcnting. init_css_set never goes away and we can't race
+ * with migration path - PF_EXITING is visible to migration path.
*/
-void cgroup_exit(struct task_struct *tsk, int run_callbacks)
+void cgroup_exit(struct task_struct *tsk)
{
struct cgroup_subsys *ss;
struct css_set *cset;
+ bool put_cset = false;
int i;
/*
- * Unlink from the css_set task list if necessary.
- * Optimistically check cg_list before taking
- * css_set_lock
+ * Unlink from @tsk from its css_set. As migration path can't race
+ * with us, we can check cg_list without grabbing css_set_rwsem.
*/
if (!list_empty(&tsk->cg_list)) {
- write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list))
- list_del_init(&tsk->cg_list);
- write_unlock(&css_set_lock);
+ down_write(&css_set_rwsem);
+ list_del_init(&tsk->cg_list);
+ up_write(&css_set_rwsem);
+ put_cset = true;
}
/* Reassign the task to the init_css_set. */
- task_lock(tsk);
cset = task_css_set(tsk);
RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
- if (run_callbacks && need_forkexit_callback) {
- /*
- * fork/exit callbacks are supported only for builtin
- * subsystems, see cgroup_post_fork() for details.
- */
- for_each_builtin_subsys(ss, i) {
+ if (need_forkexit_callback) {
+ /* see cgroup_post_fork() for details */
+ for_each_subsys(ss, i) {
if (ss->exit) {
struct cgroup_subsys_state *old_css = cset->subsys[i];
struct cgroup_subsys_state *css = task_css(tsk, i);
@@ -5122,15 +5276,15 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
}
}
}
- task_unlock(tsk);
- put_css_set_taskexit(cset);
+ if (put_cset)
+ put_css_set(cset, true);
}
static void check_for_release(struct cgroup *cgrp)
{
- if (cgroup_is_releasable(cgrp) &&
- list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) {
+ if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) &&
+ !css_has_online_children(&cgrp->self)) {
/*
* Control Group is currently removeable. If it's not
* already queued for a userspace notification, queue
@@ -5181,16 +5335,17 @@ static void cgroup_release_agent(struct work_struct *work)
while (!list_empty(&release_list)) {
char *argv[3], *envp[3];
int i;
- char *pathbuf = NULL, *agentbuf = NULL;
+ char *pathbuf = NULL, *agentbuf = NULL, *path;
struct cgroup *cgrp = list_entry(release_list.next,
struct cgroup,
release_list);
list_del_init(&cgrp->release_list);
raw_spin_unlock(&release_list_lock);
- pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!pathbuf)
goto continue_free;
- if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
+ path = cgroup_path(cgrp, pathbuf, PATH_MAX);
+ if (!path)
goto continue_free;
agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
if (!agentbuf)
@@ -5198,7 +5353,7 @@ static void cgroup_release_agent(struct work_struct *work)
i = 0;
argv[i++] = agentbuf;
- argv[i++] = pathbuf;
+ argv[i++] = path;
argv[i] = NULL;
i = 0;
@@ -5232,11 +5387,7 @@ static int __init cgroup_disable(char *str)
if (!*token)
continue;
- /*
- * cgroup_disable, being at boot time, can't know about
- * module subsystems, so we don't worry about them.
- */
- for_each_builtin_subsys(ss, i) {
+ for_each_subsys(ss, i) {
if (!strcmp(token, ss->name)) {
ss->disabled = 1;
printk(KERN_INFO "Disabling %s control group"
@@ -5249,29 +5400,51 @@ static int __init cgroup_disable(char *str)
}
__setup("cgroup_disable=", cgroup_disable);
+static int __init cgroup_set_legacy_files_on_dfl(char *str)
+{
+ printk("cgroup: using legacy files on the default hierarchy\n");
+ cgroup_legacy_files_on_dfl = true;
+ return 0;
+}
+__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);
+
/**
- * css_from_dir - get corresponding css from the dentry of a cgroup dir
+ * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
* @dentry: directory dentry of interest
* @ss: subsystem of interest
*
- * Must be called under cgroup_mutex or RCU read lock. The caller is
- * responsible for pinning the returned css if it needs to be accessed
- * outside the critical section.
+ * If @dentry is a directory for a cgroup which has @ss enabled on it, try
+ * to get the corresponding css and return it. If such css doesn't exist
+ * or can't be pinned, an ERR_PTR value is returned.
*/
-struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
- struct cgroup_subsys *ss)
+struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
+ struct cgroup_subsys *ss)
{
+ struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
+ struct cgroup_subsys_state *css = NULL;
struct cgroup *cgrp;
- cgroup_assert_mutex_or_rcu_locked();
-
/* is @dentry a cgroup dir? */
- if (!dentry->d_inode ||
- dentry->d_inode->i_op != &cgroup_dir_inode_operations)
+ if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
+ kernfs_type(kn) != KERNFS_DIR)
return ERR_PTR(-EBADF);
- cgrp = __d_cgrp(dentry);
- return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT);
+ rcu_read_lock();
+
+ /*
+ * This path doesn't originate from kernfs and @kn could already
+ * have been or be removed at any point. @kn->priv is RCU
+ * protected for this access. See css_release_work_fn() for details.
+ */
+ cgrp = rcu_dereference(kn->priv);
+ if (cgrp)
+ css = cgroup_css(cgrp, ss);
+
+ if (!css || !css_tryget_online(css))
+ css = ERR_PTR(-ENOENT);
+
+ rcu_read_unlock();
+ return css;
}
/**
@@ -5284,14 +5457,8 @@ struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
*/
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
- struct cgroup *cgrp;
-
- cgroup_assert_mutex_or_rcu_locked();
-
- cgrp = idr_find(&ss->root->cgroup_idr, id);
- if (cgrp)
- return cgroup_css(cgrp, ss);
- return NULL;
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return idr_find(&ss->css_idr, id);
}
#ifdef CONFIG_CGROUP_DEBUG
@@ -5338,23 +5505,25 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
{
struct cgrp_cset_link *link;
struct css_set *cset;
+ char *name_buf;
- read_lock(&css_set_lock);
+ name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!name_buf)
+ return -ENOMEM;
+
+ down_read(&css_set_rwsem);
rcu_read_lock();
cset = rcu_dereference(current->cgroups);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
- const char *name;
- if (c->dentry)
- name = c->dentry->d_name.name;
- else
- name = "?";
+ cgroup_name(c, name_buf, NAME_MAX + 1);
seq_printf(seq, "Root %d group %s\n",
- c->root->hierarchy_id, name);
+ c->root->hierarchy_id, name_buf);
}
rcu_read_unlock();
- read_unlock(&css_set_lock);
+ up_read(&css_set_rwsem);
+ kfree(name_buf);
return 0;
}
@@ -5364,23 +5533,30 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
struct cgroup_subsys_state *css = seq_css(seq);
struct cgrp_cset_link *link;
- read_lock(&css_set_lock);
+ down_read(&css_set_rwsem);
list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
struct css_set *cset = link->cset;
struct task_struct *task;
int count = 0;
+
seq_printf(seq, "css_set %p\n", cset);
+
list_for_each_entry(task, &cset->tasks, cg_list) {
- if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
- seq_puts(seq, " ...\n");
- break;
- } else {
- seq_printf(seq, " task %d\n",
- task_pid_vnr(task));
- }
+ if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+ goto overflow;
+ seq_printf(seq, " task %d\n", task_pid_vnr(task));
+ }
+
+ list_for_each_entry(task, &cset->mg_tasks, cg_list) {
+ if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+ goto overflow;
+ seq_printf(seq, " task %d\n", task_pid_vnr(task));
}
+ continue;
+ overflow:
+ seq_puts(seq, " ...\n");
}
- read_unlock(&css_set_lock);
+ up_read(&css_set_rwsem);
return 0;
}
@@ -5423,11 +5599,9 @@ static struct cftype debug_files[] = {
{ } /* terminate */
};
-struct cgroup_subsys debug_subsys = {
- .name = "debug",
+struct cgroup_subsys debug_cgrp_subsys = {
.css_alloc = debug_css_alloc,
.css_free = debug_css_free,
- .subsys_id = debug_subsys_id,
- .base_cftypes = debug_files,
+ .legacy_cftypes = debug_files,
};
#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 6c3154e477f6..92b98cc0ee76 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -21,6 +21,7 @@
#include <linux/uaccess.h>
#include <linux/freezer.h>
#include <linux/seq_file.h>
+#include <linux/mutex.h>
/*
* A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
@@ -42,9 +43,10 @@ enum freezer_state_flags {
struct freezer {
struct cgroup_subsys_state css;
unsigned int state;
- spinlock_t lock;
};
+static DEFINE_MUTEX(freezer_mutex);
+
static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
{
return css ? container_of(css, struct freezer, css) : NULL;
@@ -52,12 +54,12 @@ static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
static inline struct freezer *task_freezer(struct task_struct *task)
{
- return css_freezer(task_css(task, freezer_subsys_id));
+ return css_freezer(task_css(task, freezer_cgrp_id));
}
static struct freezer *parent_freezer(struct freezer *freezer)
{
- return css_freezer(css_parent(&freezer->css));
+ return css_freezer(freezer->css.parent);
}
bool cgroup_freezing(struct task_struct *task)
@@ -71,10 +73,6 @@ bool cgroup_freezing(struct task_struct *task)
return ret;
}
-/*
- * cgroups_write_string() limits the size of freezer state strings to
- * CGROUP_LOCAL_BUFFER_SIZE
- */
static const char *freezer_state_strs(unsigned int state)
{
if (state & CGROUP_FROZEN)
@@ -84,8 +82,6 @@ static const char *freezer_state_strs(unsigned int state)
return "THAWED";
};
-struct cgroup_subsys freezer_subsys;
-
static struct cgroup_subsys_state *
freezer_css_alloc(struct cgroup_subsys_state *parent_css)
{
@@ -95,7 +91,6 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css)
if (!freezer)
return ERR_PTR(-ENOMEM);
- spin_lock_init(&freezer->lock);
return &freezer->css;
}
@@ -112,14 +107,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
struct freezer *freezer = css_freezer(css);
struct freezer *parent = parent_freezer(freezer);
- /*
- * The following double locking and freezing state inheritance
- * guarantee that @cgroup can never escape ancestors' freezing
- * states. See css_for_each_descendant_pre() for details.
- */
- if (parent)
- spin_lock_irq(&parent->lock);
- spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING);
+ mutex_lock(&freezer_mutex);
freezer->state |= CGROUP_FREEZER_ONLINE;
@@ -128,10 +116,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
atomic_inc(&system_freezing_cnt);
}
- spin_unlock(&freezer->lock);
- if (parent)
- spin_unlock_irq(&parent->lock);
-
+ mutex_unlock(&freezer_mutex);
return 0;
}
@@ -146,14 +131,14 @@ static void freezer_css_offline(struct cgroup_subsys_state *css)
{
struct freezer *freezer = css_freezer(css);
- spin_lock_irq(&freezer->lock);
+ mutex_lock(&freezer_mutex);
if (freezer->state & CGROUP_FREEZING)
atomic_dec(&system_freezing_cnt);
freezer->state = 0;
- spin_unlock_irq(&freezer->lock);
+ mutex_unlock(&freezer_mutex);
}
static void freezer_css_free(struct cgroup_subsys_state *css)
@@ -177,7 +162,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
struct task_struct *task;
bool clear_frozen = false;
- spin_lock_irq(&freezer->lock);
+ mutex_lock(&freezer_mutex);
/*
* Make the new tasks conform to the current state of @new_css.
@@ -189,7 +174,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
* current state before executing the following - !frozen tasks may
* be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
*/
- cgroup_taskset_for_each(task, new_css, tset) {
+ cgroup_taskset_for_each(task, tset) {
if (!(freezer->state & CGROUP_FREEZING)) {
__thaw_task(task);
} else {
@@ -199,43 +184,48 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
}
}
- spin_unlock_irq(&freezer->lock);
-
- /*
- * Propagate FROZEN clearing upwards. We may race with
- * update_if_frozen(), but as long as both work bottom-up, either
- * update_if_frozen() sees child's FROZEN cleared or we clear the
- * parent's FROZEN later. No parent w/ !FROZEN children can be
- * left FROZEN.
- */
+ /* propagate FROZEN clearing upwards */
while (clear_frozen && (freezer = parent_freezer(freezer))) {
- spin_lock_irq(&freezer->lock);
freezer->state &= ~CGROUP_FROZEN;
clear_frozen = freezer->state & CGROUP_FREEZING;
- spin_unlock_irq(&freezer->lock);
}
+
+ mutex_unlock(&freezer_mutex);
}
+/**
+ * freezer_fork - cgroup post fork callback
+ * @task: a task which has just been forked
+ *
+ * @task has just been created and should conform to the current state of
+ * the cgroup_freezer it belongs to. This function may race against
+ * freezer_attach(). Losing to freezer_attach() means that we don't have
+ * to do anything as freezer_attach() will put @task into the appropriate
+ * state.
+ */
static void freezer_fork(struct task_struct *task)
{
struct freezer *freezer;
- rcu_read_lock();
- freezer = task_freezer(task);
-
/*
- * The root cgroup is non-freezable, so we can skip the
- * following check.
+ * The root cgroup is non-freezable, so we can skip locking the
+ * freezer. This is safe regardless of race with task migration.
+ * If we didn't race or won, skipping is obviously the right thing
+ * to do. If we lost and root is the new cgroup, noop is still the
+ * right thing to do.
*/
- if (!parent_freezer(freezer))
- goto out;
+ if (task_css_is_root(task, freezer_cgrp_id))
+ return;
- spin_lock_irq(&freezer->lock);
+ mutex_lock(&freezer_mutex);
+ rcu_read_lock();
+
+ freezer = task_freezer(task);
if (freezer->state & CGROUP_FREEZING)
freeze_task(task);
- spin_unlock_irq(&freezer->lock);
-out:
+
rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
}
/**
@@ -261,22 +251,24 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
struct css_task_iter it;
struct task_struct *task;
- WARN_ON_ONCE(!rcu_read_lock_held());
-
- spin_lock_irq(&freezer->lock);
+ lockdep_assert_held(&freezer_mutex);
if (!(freezer->state & CGROUP_FREEZING) ||
(freezer->state & CGROUP_FROZEN))
- goto out_unlock;
+ return;
/* are all (live) children frozen? */
+ rcu_read_lock();
css_for_each_child(pos, css) {
struct freezer *child = css_freezer(pos);
if ((child->state & CGROUP_FREEZER_ONLINE) &&
- !(child->state & CGROUP_FROZEN))
- goto out_unlock;
+ !(child->state & CGROUP_FROZEN)) {
+ rcu_read_unlock();
+ return;
+ }
}
+ rcu_read_unlock();
/* are all tasks frozen? */
css_task_iter_start(css, &it);
@@ -297,21 +289,29 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
freezer->state |= CGROUP_FROZEN;
out_iter_end:
css_task_iter_end(&it);
-out_unlock:
- spin_unlock_irq(&freezer->lock);
}
static int freezer_read(struct seq_file *m, void *v)
{
struct cgroup_subsys_state *css = seq_css(m), *pos;
+ mutex_lock(&freezer_mutex);
rcu_read_lock();
/* update states bottom-up */
- css_for_each_descendant_post(pos, css)
+ css_for_each_descendant_post(pos, css) {
+ if (!css_tryget_online(pos))
+ continue;
+ rcu_read_unlock();
+
update_if_frozen(pos);
+ rcu_read_lock();
+ css_put(pos);
+ }
+
rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
seq_puts(m, freezer_state_strs(css_freezer(css)->state));
seq_putc(m, '\n');
@@ -353,7 +353,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
unsigned int state)
{
/* also synchronizes against task migration, see freezer_attach() */
- lockdep_assert_held(&freezer->lock);
+ lockdep_assert_held(&freezer_mutex);
if (!(freezer->state & CGROUP_FREEZER_ONLINE))
return;
@@ -394,47 +394,47 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
* descendant will try to inherit its parent's FREEZING state as
* CGROUP_FREEZING_PARENT.
*/
+ mutex_lock(&freezer_mutex);
rcu_read_lock();
css_for_each_descendant_pre(pos, &freezer->css) {
struct freezer *pos_f = css_freezer(pos);
struct freezer *parent = parent_freezer(pos_f);
- spin_lock_irq(&pos_f->lock);
+ if (!css_tryget_online(pos))
+ continue;
+ rcu_read_unlock();
- if (pos_f == freezer) {
+ if (pos_f == freezer)
freezer_apply_state(pos_f, freeze,
CGROUP_FREEZING_SELF);
- } else {
- /*
- * Our update to @parent->state is already visible
- * which is all we need. No need to lock @parent.
- * For more info on synchronization, see
- * freezer_post_create().
- */
+ else
freezer_apply_state(pos_f,
parent->state & CGROUP_FREEZING,
CGROUP_FREEZING_PARENT);
- }
- spin_unlock_irq(&pos_f->lock);
+ rcu_read_lock();
+ css_put(pos);
}
rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
}
-static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft,
- const char *buffer)
+static ssize_t freezer_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
bool freeze;
- if (strcmp(buffer, freezer_state_strs(0)) == 0)
+ buf = strstrip(buf);
+
+ if (strcmp(buf, freezer_state_strs(0)) == 0)
freeze = false;
- else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0)
+ else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0)
freeze = true;
else
return -EINVAL;
- freezer_change_state(css_freezer(css), freeze);
- return 0;
+ freezer_change_state(css_freezer(of_css(of)), freeze);
+ return nbytes;
}
static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css,
@@ -458,7 +458,7 @@ static struct cftype files[] = {
.name = "state",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = freezer_read,
- .write_string = freezer_write,
+ .write = freezer_write,
},
{
.name = "self_freezing",
@@ -473,14 +473,12 @@ static struct cftype files[] = {
{ } /* terminate */
};
-struct cgroup_subsys freezer_subsys = {
- .name = "freezer",
+struct cgroup_subsys freezer_cgrp_subsys = {
.css_alloc = freezer_css_alloc,
.css_online = freezer_css_online,
.css_offline = freezer_css_offline,
.css_free = freezer_css_free,
- .subsys_id = freezer_subsys_id,
.attach = freezer_attach,
.fork = freezer_fork,
- .base_cftypes = files,
+ .legacy_cftypes = files,
};
diff --git a/kernel/compat.c b/kernel/compat.c
index 0a09e481b70b..ebb3c369d03d 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -30,28 +30,6 @@
#include <asm/uaccess.h>
-/*
- * Get/set struct timeval with struct timespec on the native side
- */
-static int compat_get_timeval_convert(struct timespec *o,
- struct compat_timeval __user *i)
-{
- long usec;
-
- if (get_user(o->tv_sec, &i->tv_sec) ||
- get_user(usec, &i->tv_usec))
- return -EFAULT;
- o->tv_nsec = usec * 1000;
- return 0;
-}
-
-static int compat_put_timeval_convert(struct compat_timeval __user *o,
- struct timeval *i)
-{
- return (put_user(i->tv_sec, &o->tv_sec) ||
- put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
-}
-
static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp)
{
memset(txc, 0, sizeof(struct timex));
@@ -110,13 +88,13 @@ static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc)
return 0;
}
-asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
- struct timezone __user *tz)
+COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv,
+ struct timezone __user *, tz)
{
if (tv) {
struct timeval ktv;
do_gettimeofday(&ktv);
- if (compat_put_timeval_convert(tv, &ktv))
+ if (compat_put_timeval(&ktv, tv))
return -EFAULT;
}
if (tz) {
@@ -127,92 +105,114 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
return 0;
}
-asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
- struct timezone __user *tz)
+COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv,
+ struct timezone __user *, tz)
{
- struct timespec kts;
- struct timezone ktz;
+ struct timeval user_tv;
+ struct timespec new_ts;
+ struct timezone new_tz;
if (tv) {
- if (compat_get_timeval_convert(&kts, tv))
+ if (compat_get_timeval(&user_tv, tv))
return -EFAULT;
+ new_ts.tv_sec = user_tv.tv_sec;
+ new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
}
if (tz) {
- if (copy_from_user(&ktz, tz, sizeof(ktz)))
+ if (copy_from_user(&new_tz, tz, sizeof(*tz)))
return -EFAULT;
}
- return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
+ return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
}
-int get_compat_timeval(struct timeval *tv, const struct compat_timeval __user *ctv)
+static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv)
{
return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) ||
__get_user(tv->tv_sec, &ctv->tv_sec) ||
__get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
}
-EXPORT_SYMBOL_GPL(get_compat_timeval);
-int put_compat_timeval(const struct timeval *tv, struct compat_timeval __user *ctv)
+static int __compat_put_timeval(const struct timeval *tv, struct compat_timeval __user *ctv)
{
return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) ||
__put_user(tv->tv_sec, &ctv->tv_sec) ||
__put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
}
-EXPORT_SYMBOL_GPL(put_compat_timeval);
-int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
+static int __compat_get_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
{
return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
__get_user(ts->tv_sec, &cts->tv_sec) ||
__get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
}
-EXPORT_SYMBOL_GPL(get_compat_timespec);
-int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts)
+static int __compat_put_timespec(const struct timespec *ts, struct compat_timespec __user *cts)
{
return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) ||
__put_user(ts->tv_sec, &cts->tv_sec) ||
__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
}
-EXPORT_SYMBOL_GPL(put_compat_timespec);
int compat_get_timeval(struct timeval *tv, const void __user *utv)
{
if (COMPAT_USE_64BIT_TIME)
- return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0;
+ return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0;
else
- return get_compat_timeval(tv, utv);
+ return __compat_get_timeval(tv, utv);
}
EXPORT_SYMBOL_GPL(compat_get_timeval);
int compat_put_timeval(const struct timeval *tv, void __user *utv)
{
if (COMPAT_USE_64BIT_TIME)
- return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0;
+ return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0;
else
- return put_compat_timeval(tv, utv);
+ return __compat_put_timeval(tv, utv);
}
EXPORT_SYMBOL_GPL(compat_put_timeval);
int compat_get_timespec(struct timespec *ts, const void __user *uts)
{
if (COMPAT_USE_64BIT_TIME)
- return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0;
+ return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
else
- return get_compat_timespec(ts, uts);
+ return __compat_get_timespec(ts, uts);
}
EXPORT_SYMBOL_GPL(compat_get_timespec);
int compat_put_timespec(const struct timespec *ts, void __user *uts)
{
if (COMPAT_USE_64BIT_TIME)
- return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0;
+ return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
else
- return put_compat_timespec(ts, uts);
+ return __compat_put_timespec(ts, uts);
}
EXPORT_SYMBOL_GPL(compat_put_timespec);
+int compat_convert_timespec(struct timespec __user **kts,
+ const void __user *cts)
+{
+ struct timespec ts;
+ struct timespec __user *uts;
+
+ if (!cts || COMPAT_USE_64BIT_TIME) {
+ *kts = (struct timespec __user *)cts;
+ return 0;
+ }
+
+ uts = compat_alloc_user_space(sizeof(ts));
+ if (!uts)
+ return -EFAULT;
+ if (compat_get_timespec(&ts, cts))
+ return -EFAULT;
+ if (copy_to_user(uts, &ts, sizeof(ts)))
+ return -EFAULT;
+
+ *kts = uts;
+ return 0;
+}
+
static long compat_nanosleep_restart(struct restart_block *restart)
{
struct compat_timespec __user *rmtp;
@@ -226,24 +226,24 @@ static long compat_nanosleep_restart(struct restart_block *restart)
ret = hrtimer_nanosleep_restart(restart);
set_fs(oldfs);
- if (ret) {
+ if (ret == -ERESTART_RESTARTBLOCK) {
rmtp = restart->nanosleep.compat_rmtp;
- if (rmtp && put_compat_timespec(&rmt, rmtp))
+ if (rmtp && compat_put_timespec(&rmt, rmtp))
return -EFAULT;
}
return ret;
}
-asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
- struct compat_timespec __user *rmtp)
+COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
+ struct compat_timespec __user *, rmtp)
{
struct timespec tu, rmt;
mm_segment_t oldfs;
long ret;
- if (get_compat_timespec(&tu, rqtp))
+ if (compat_get_timespec(&tu, rqtp))
return -EFAULT;
if (!timespec_valid(&tu))
@@ -256,17 +256,35 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
HRTIMER_MODE_REL, CLOCK_MONOTONIC);
set_fs(oldfs);
- if (ret) {
+ /*
+ * hrtimer_nanosleep() can only return 0 or
+ * -ERESTART_RESTARTBLOCK here because:
+ *
+ * - we call it with HRTIMER_MODE_REL and therefor exclude the
+ * -ERESTARTNOHAND return path.
+ *
+ * - we supply the rmtp argument from the task stack (due to
+ * the necessary compat conversion. So the update cannot
+ * fail, which excludes the -EFAULT return path as well. If
+ * it fails nevertheless we have a bigger problem and wont
+ * reach this place anymore.
+ *
+ * - if the return value is 0, we do not have to update rmtp
+ * because there is no remaining time.
+ *
+ * We check for -ERESTART_RESTARTBLOCK nevertheless if the
+ * core implementation decides to return random nonsense.
+ */
+ if (ret == -ERESTART_RESTARTBLOCK) {
struct restart_block *restart
= &current_thread_info()->restart_block;
restart->fn = compat_nanosleep_restart;
restart->nanosleep.compat_rmtp = rmtp;
- if (rmtp && put_compat_timespec(&rmt, rmtp))
+ if (rmtp && compat_put_timespec(&rmt, rmtp))
return -EFAULT;
}
-
return ret;
}
@@ -328,7 +346,7 @@ static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
}
-asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
+COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf)
{
if (tbuf) {
struct tms tms;
@@ -354,7 +372,7 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
* types that can be passed to put_user()/get_user().
*/
-asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
+COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set)
{
old_sigset_t s;
long ret;
@@ -424,8 +442,8 @@ COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
#endif
-asmlinkage long compat_sys_setrlimit(unsigned int resource,
- struct compat_rlimit __user *rlim)
+COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
{
struct rlimit r;
@@ -443,15 +461,15 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource,
#ifdef COMPAT_RLIM_OLD_INFINITY
-asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
- struct compat_rlimit __user *rlim)
+COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
{
struct rlimit r;
int ret;
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
- ret = sys_old_getrlimit(resource, &r);
+ ret = sys_old_getrlimit(resource, (struct rlimit __user *)&r);
set_fs(old_fs);
if (!ret) {
@@ -470,8 +488,8 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
#endif
-asmlinkage long compat_sys_getrlimit(unsigned int resource,
- struct compat_rlimit __user *rlim)
+COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
{
struct rlimit r;
int ret;
@@ -596,9 +614,9 @@ static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
return compat_get_bitmap(k, user_mask_ptr, len * 8);
}
-asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid,
- unsigned int len,
- compat_ulong_t __user *user_mask_ptr)
+COMPAT_SYSCALL_DEFINE3(sched_setaffinity, compat_pid_t, pid,
+ unsigned int, len,
+ compat_ulong_t __user *, user_mask_ptr)
{
cpumask_var_t new_mask;
int retval;
@@ -616,8 +634,8 @@ out:
return retval;
}
-asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
- compat_ulong_t __user *user_mask_ptr)
+COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len,
+ compat_ulong_t __user *, user_mask_ptr)
{
int ret;
cpumask_var_t mask;
@@ -647,8 +665,8 @@ asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
int get_compat_itimerspec(struct itimerspec *dst,
const struct compat_itimerspec __user *src)
{
- if (get_compat_timespec(&dst->it_interval, &src->it_interval) ||
- get_compat_timespec(&dst->it_value, &src->it_value))
+ if (__compat_get_timespec(&dst->it_interval, &src->it_interval) ||
+ __compat_get_timespec(&dst->it_value, &src->it_value))
return -EFAULT;
return 0;
}
@@ -656,15 +674,15 @@ int get_compat_itimerspec(struct itimerspec *dst,
int put_compat_itimerspec(struct compat_itimerspec __user *dst,
const struct itimerspec *src)
{
- if (put_compat_timespec(&src->it_interval, &dst->it_interval) ||
- put_compat_timespec(&src->it_value, &dst->it_value))
+ if (__compat_put_timespec(&src->it_interval, &dst->it_interval) ||
+ __compat_put_timespec(&src->it_value, &dst->it_value))
return -EFAULT;
return 0;
}
-long compat_sys_timer_create(clockid_t which_clock,
- struct compat_sigevent __user *timer_event_spec,
- timer_t __user *created_timer_id)
+COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
+ struct compat_sigevent __user *, timer_event_spec,
+ timer_t __user *, created_timer_id)
{
struct sigevent __user *event = NULL;
@@ -680,9 +698,9 @@ long compat_sys_timer_create(clockid_t which_clock,
return sys_timer_create(which_clock, event, created_timer_id);
}
-long compat_sys_timer_settime(timer_t timer_id, int flags,
- struct compat_itimerspec __user *new,
- struct compat_itimerspec __user *old)
+COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
+ struct compat_itimerspec __user *, new,
+ struct compat_itimerspec __user *, old)
{
long err;
mm_segment_t oldfs;
@@ -703,8 +721,8 @@ long compat_sys_timer_settime(timer_t timer_id, int flags,
return err;
}
-long compat_sys_timer_gettime(timer_t timer_id,
- struct compat_itimerspec __user *setting)
+COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
+ struct compat_itimerspec __user *, setting)
{
long err;
mm_segment_t oldfs;
@@ -720,14 +738,14 @@ long compat_sys_timer_gettime(timer_t timer_id,
return err;
}
-long compat_sys_clock_settime(clockid_t which_clock,
- struct compat_timespec __user *tp)
+COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
+ struct compat_timespec __user *, tp)
{
long err;
mm_segment_t oldfs;
struct timespec ts;
- if (get_compat_timespec(&ts, tp))
+ if (compat_get_timespec(&ts, tp))
return -EFAULT;
oldfs = get_fs();
set_fs(KERNEL_DS);
@@ -737,8 +755,8 @@ long compat_sys_clock_settime(clockid_t which_clock,
return err;
}
-long compat_sys_clock_gettime(clockid_t which_clock,
- struct compat_timespec __user *tp)
+COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
+ struct compat_timespec __user *, tp)
{
long err;
mm_segment_t oldfs;
@@ -749,13 +767,13 @@ long compat_sys_clock_gettime(clockid_t which_clock,
err = sys_clock_gettime(which_clock,
(struct timespec __user *) &ts);
set_fs(oldfs);
- if (!err && put_compat_timespec(&ts, tp))
+ if (!err && compat_put_timespec(&ts, tp))
return -EFAULT;
return err;
}
-long compat_sys_clock_adjtime(clockid_t which_clock,
- struct compat_timex __user *utp)
+COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
+ struct compat_timex __user *, utp)
{
struct timex txc;
mm_segment_t oldfs;
@@ -777,8 +795,8 @@ long compat_sys_clock_adjtime(clockid_t which_clock,
return ret;
}
-long compat_sys_clock_getres(clockid_t which_clock,
- struct compat_timespec __user *tp)
+COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
+ struct compat_timespec __user *, tp)
{
long err;
mm_segment_t oldfs;
@@ -789,7 +807,7 @@ long compat_sys_clock_getres(clockid_t which_clock,
err = sys_clock_getres(which_clock,
(struct timespec __user *) &ts);
set_fs(oldfs);
- if (!err && tp && put_compat_timespec(&ts, tp))
+ if (!err && tp && compat_put_timespec(&ts, tp))
return -EFAULT;
return err;
}
@@ -799,7 +817,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart)
long err;
mm_segment_t oldfs;
struct timespec tu;
- struct compat_timespec *rmtp = restart->nanosleep.compat_rmtp;
+ struct compat_timespec __user *rmtp = restart->nanosleep.compat_rmtp;
restart->nanosleep.rmtp = (struct timespec __user *) &tu;
oldfs = get_fs();
@@ -808,7 +826,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart)
set_fs(oldfs);
if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
- put_compat_timespec(&tu, rmtp))
+ compat_put_timespec(&tu, rmtp))
return -EFAULT;
if (err == -ERESTART_RESTARTBLOCK) {
@@ -818,16 +836,16 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart)
return err;
}
-long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
- struct compat_timespec __user *rqtp,
- struct compat_timespec __user *rmtp)
+COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
+ struct compat_timespec __user *, rqtp,
+ struct compat_timespec __user *, rmtp)
{
long err;
mm_segment_t oldfs;
struct timespec in, out;
struct restart_block *restart;
- if (get_compat_timespec(&in, rqtp))
+ if (compat_get_timespec(&in, rqtp))
return -EFAULT;
oldfs = get_fs();
@@ -838,7 +856,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
set_fs(oldfs);
if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
- put_compat_timespec(&out, rmtp))
+ compat_put_timespec(&out, rmtp))
return -EFAULT;
if (err == -ERESTART_RESTARTBLOCK) {
@@ -1010,7 +1028,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
/* compat_time_t is a 32 bit "long" and needs to get converted. */
-asmlinkage long compat_sys_time(compat_time_t __user * tloc)
+COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc)
{
compat_time_t i;
struct timeval tv;
@@ -1026,7 +1044,7 @@ asmlinkage long compat_sys_time(compat_time_t __user * tloc)
return i;
}
-asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
+COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr)
{
struct timespec tv;
int err;
@@ -1046,7 +1064,7 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
-asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
+COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp)
{
struct timex txc;
int err, ret;
@@ -1065,11 +1083,11 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
}
#ifdef CONFIG_NUMA
-asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
- compat_uptr_t __user *pages32,
- const int __user *nodes,
- int __user *status,
- int flags)
+COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
+ compat_uptr_t __user *, pages32,
+ const int __user *, nodes,
+ int __user *, status,
+ int, flags)
{
const void __user * __user *pages;
int i;
@@ -1085,10 +1103,10 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
}
-asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
- compat_ulong_t maxnode,
- const compat_ulong_t __user *old_nodes,
- const compat_ulong_t __user *new_nodes)
+COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
+ compat_ulong_t, maxnode,
+ const compat_ulong_t __user *, old_nodes,
+ const compat_ulong_t __user *, new_nodes)
{
unsigned long __user *old = NULL;
unsigned long __user *new = NULL;
@@ -1130,7 +1148,7 @@ COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
set_fs(KERNEL_DS);
ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
set_fs(old_fs);
- if (put_compat_timespec(&t, interval))
+ if (compat_put_timespec(&t, interval))
return -EFAULT;
return ret;
}
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
new file mode 100644
index 000000000000..c2de56ab0fce
--- /dev/null
+++ b/kernel/configs/tiny.config
@@ -0,0 +1,4 @@
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+CONFIG_KERNEL_XZ=y
+CONFIG_OPTIMIZE_INLINING=y
+CONFIG_SLOB=y
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 6cb20d2e7ee0..5664985c46a0 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -19,6 +19,7 @@
#include <linux/sched.h>
#include <linux/hardirq.h>
#include <linux/export.h>
+#include <linux/kprobes.h>
#define CREATE_TRACE_POINTS
#include <trace/events/context_tracking.h>
@@ -104,6 +105,7 @@ void context_tracking_user_enter(void)
}
local_irq_restore(flags);
}
+NOKPROBE_SYMBOL(context_tracking_user_enter);
#ifdef CONFIG_PREEMPT
/**
@@ -120,7 +122,7 @@ void context_tracking_user_enter(void)
* instead of preempt_schedule() to exit user context if needed before
* calling the scheduler.
*/
-asmlinkage void __sched notrace preempt_schedule_context(void)
+asmlinkage __visible void __sched notrace preempt_schedule_context(void)
{
enum ctx_state prev_ctx;
@@ -181,6 +183,7 @@ void context_tracking_user_exit(void)
}
local_irq_restore(flags);
}
+NOKPROBE_SYMBOL(context_tracking_user_exit);
/**
* __context_tracking_task_switch - context switch the syscall callbacks
diff --git a/kernel/cpu.c b/kernel/cpu.c
index deff2e693766..81e2a388a0f6 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -19,6 +19,8 @@
#include <linux/mutex.h>
#include <linux/gfp.h>
#include <linux/suspend.h>
+#include <linux/lockdep.h>
+#include <trace/events/power.h>
#include "smpboot.h"
@@ -27,18 +29,23 @@
static DEFINE_MUTEX(cpu_add_remove_lock);
/*
- * The following two API's must be used when attempting
- * to serialize the updates to cpu_online_mask, cpu_present_mask.
+ * The following two APIs (cpu_maps_update_begin/done) must be used when
+ * attempting to serialize the updates to cpu_online_mask & cpu_present_mask.
+ * The APIs cpu_notifier_register_begin/done() must be used to protect CPU
+ * hotplug callback (un)registration performed using __register_cpu_notifier()
+ * or __unregister_cpu_notifier().
*/
void cpu_maps_update_begin(void)
{
mutex_lock(&cpu_add_remove_lock);
}
+EXPORT_SYMBOL(cpu_notifier_register_begin);
void cpu_maps_update_done(void)
{
mutex_unlock(&cpu_add_remove_lock);
}
+EXPORT_SYMBOL(cpu_notifier_register_done);
static RAW_NOTIFIER_HEAD(cpu_chain);
@@ -57,17 +64,30 @@ static struct {
* an ongoing cpu hotplug operation.
*/
int refcount;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
} cpu_hotplug = {
.active_writer = NULL,
.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
.refcount = 0,
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ .dep_map = {.name = "cpu_hotplug.lock" },
+#endif
};
+/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
+#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
+#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
+#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
+
void get_online_cpus(void)
{
might_sleep();
if (cpu_hotplug.active_writer == current)
return;
+ cpuhp_lock_acquire_read();
mutex_lock(&cpu_hotplug.lock);
cpu_hotplug.refcount++;
mutex_unlock(&cpu_hotplug.lock);
@@ -87,6 +107,7 @@ void put_online_cpus(void)
if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
wake_up_process(cpu_hotplug.active_writer);
mutex_unlock(&cpu_hotplug.lock);
+ cpuhp_lock_release();
}
EXPORT_SYMBOL_GPL(put_online_cpus);
@@ -117,6 +138,7 @@ void cpu_hotplug_begin(void)
{
cpu_hotplug.active_writer = current;
+ cpuhp_lock_acquire();
for (;;) {
mutex_lock(&cpu_hotplug.lock);
if (likely(!cpu_hotplug.refcount))
@@ -131,6 +153,7 @@ void cpu_hotplug_done(void)
{
cpu_hotplug.active_writer = NULL;
mutex_unlock(&cpu_hotplug.lock);
+ cpuhp_lock_release();
}
/*
@@ -166,6 +189,11 @@ int __ref register_cpu_notifier(struct notifier_block *nb)
return ret;
}
+int __ref __register_cpu_notifier(struct notifier_block *nb)
+{
+ return raw_notifier_chain_register(&cpu_chain, nb);
+}
+
static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
int *nr_calls)
{
@@ -189,6 +217,7 @@ static void cpu_notify_nofail(unsigned long val, void *v)
BUG_ON(cpu_notify(val, v));
}
EXPORT_SYMBOL(register_cpu_notifier);
+EXPORT_SYMBOL(__register_cpu_notifier);
void __ref unregister_cpu_notifier(struct notifier_block *nb)
{
@@ -198,6 +227,12 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_cpu_notifier);
+void __ref __unregister_cpu_notifier(struct notifier_block *nb)
+{
+ raw_notifier_chain_unregister(&cpu_chain, nb);
+}
+EXPORT_SYMBOL(__unregister_cpu_notifier);
+
/**
* clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
* @cpu: a CPU id
@@ -239,22 +274,28 @@ void clear_tasks_mm_cpumask(int cpu)
rcu_read_unlock();
}
-static inline void check_for_tasks(int cpu)
+static inline void check_for_tasks(int dead_cpu)
{
- struct task_struct *p;
- cputime_t utime, stime;
+ struct task_struct *g, *p;
- write_lock_irq(&tasklist_lock);
- for_each_process(p) {
- task_cputime(p, &utime, &stime);
- if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
- (utime || stime))
- printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
- "(state = %ld, flags = %x)\n",
- p->comm, task_pid_nr(p), cpu,
- p->state, p->flags);
- }
- write_unlock_irq(&tasklist_lock);
+ read_lock_irq(&tasklist_lock);
+ do_each_thread(g, p) {
+ if (!p->on_rq)
+ continue;
+ /*
+ * We do the check with unlocked task_rq(p)->lock.
+ * Order the reading to do not warn about a task,
+ * which was running on this cpu in the past, and
+ * it's just been woken on another cpu.
+ */
+ rmb();
+ if (task_cpu(p) != dead_cpu)
+ continue;
+
+ pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
+ p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
+ } while_each_thread(g, p);
+ read_unlock_irq(&tasklist_lock);
}
struct take_cpu_down_param {
@@ -302,8 +343,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
if (err) {
nr_calls--;
__cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
- printk("%s: attempt to take down CPU %u failed\n",
- __func__, cpu);
+ pr_warn("%s: attempt to take down CPU %u failed\n",
+ __func__, cpu);
goto out_release;
}
@@ -410,8 +451,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)
ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
if (ret) {
nr_calls--;
- printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n",
- __func__, cpu);
+ pr_warn("%s: attempt to bring up CPU %u failed\n",
+ __func__, cpu);
goto out_notify;
}
@@ -441,11 +482,10 @@ int cpu_up(unsigned int cpu)
int err = 0;
if (!cpu_possible(cpu)) {
- printk(KERN_ERR "can't online cpu %d because it is not "
- "configured as may-hotadd at boot time\n", cpu);
+ pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
+ cpu);
#if defined(CONFIG_IA64)
- printk(KERN_ERR "please check additional_cpus= boot "
- "parameter\n");
+ pr_err("please check additional_cpus= boot parameter\n");
#endif
return -EINVAL;
}
@@ -484,16 +524,17 @@ int disable_nonboot_cpus(void)
*/
cpumask_clear(frozen_cpus);
- printk("Disabling non-boot CPUs ...\n");
+ pr_info("Disabling non-boot CPUs ...\n");
for_each_online_cpu(cpu) {
if (cpu == first_cpu)
continue;
+ trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
error = _cpu_down(cpu, 1);
+ trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
if (!error)
cpumask_set_cpu(cpu, frozen_cpus);
else {
- printk(KERN_ERR "Error taking CPU%d down: %d\n",
- cpu, error);
+ pr_err("Error taking CPU%d down: %d\n", cpu, error);
break;
}
}
@@ -503,7 +544,7 @@ int disable_nonboot_cpus(void)
/* Make sure the CPUs won't be enabled by someone else */
cpu_hotplug_disabled = 1;
} else {
- printk(KERN_ERR "Non-boot CPUs are not disabled\n");
+ pr_err("Non-boot CPUs are not disabled\n");
}
cpu_maps_update_done();
return error;
@@ -527,17 +568,19 @@ void __ref enable_nonboot_cpus(void)
if (cpumask_empty(frozen_cpus))
goto out;
- printk(KERN_INFO "Enabling non-boot CPUs ...\n");
+ pr_info("Enabling non-boot CPUs ...\n");
arch_enable_nonboot_cpus_begin();
for_each_cpu(cpu, frozen_cpus) {
+ trace_suspend_resume(TPS("CPU_ON"), cpu, true);
error = _cpu_up(cpu, 1);
+ trace_suspend_resume(TPS("CPU_ON"), cpu, false);
if (!error) {
- printk(KERN_INFO "CPU%d is up\n", cpu);
+ pr_info("CPU%d is up\n", cpu);
continue;
}
- printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
+ pr_warn("Error taking CPU%d up: %d\n", cpu, error);
}
arch_enable_nonboot_cpus_end();
@@ -692,10 +735,12 @@ void set_cpu_present(unsigned int cpu, bool present)
void set_cpu_online(unsigned int cpu, bool online)
{
- if (online)
+ if (online) {
cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
- else
+ cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
+ } else {
cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
+ }
}
void set_cpu_active(unsigned int cpu, bool active)
diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile
deleted file mode 100644
index 59ab052ef7a0..000000000000
--- a/kernel/cpu/Makefile
+++ /dev/null
@@ -1 +0,0 @@
-obj-y = idle.o
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
deleted file mode 100644
index 277f494c2a9a..000000000000
--- a/kernel/cpu/idle.c
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * Generic entry point for the idle threads
- */
-#include <linux/sched.h>
-#include <linux/cpu.h>
-#include <linux/tick.h>
-#include <linux/mm.h>
-#include <linux/stackprotector.h>
-
-#include <asm/tlb.h>
-
-#include <trace/events/power.h>
-
-static int __read_mostly cpu_idle_force_poll;
-
-void cpu_idle_poll_ctrl(bool enable)
-{
- if (enable) {
- cpu_idle_force_poll++;
- } else {
- cpu_idle_force_poll--;
- WARN_ON_ONCE(cpu_idle_force_poll < 0);
- }
-}
-
-#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
-static int __init cpu_idle_poll_setup(char *__unused)
-{
- cpu_idle_force_poll = 1;
- return 1;
-}
-__setup("nohlt", cpu_idle_poll_setup);
-
-static int __init cpu_idle_nopoll_setup(char *__unused)
-{
- cpu_idle_force_poll = 0;
- return 1;
-}
-__setup("hlt", cpu_idle_nopoll_setup);
-#endif
-
-static inline int cpu_idle_poll(void)
-{
- rcu_idle_enter();
- trace_cpu_idle_rcuidle(0, smp_processor_id());
- local_irq_enable();
- while (!tif_need_resched())
- cpu_relax();
- trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
- rcu_idle_exit();
- return 1;
-}
-
-/* Weak implementations for optional arch specific functions */
-void __weak arch_cpu_idle_prepare(void) { }
-void __weak arch_cpu_idle_enter(void) { }
-void __weak arch_cpu_idle_exit(void) { }
-void __weak arch_cpu_idle_dead(void) { }
-void __weak arch_cpu_idle(void)
-{
- cpu_idle_force_poll = 1;
- local_irq_enable();
-}
-
-/*
- * Generic idle loop implementation
- */
-static void cpu_idle_loop(void)
-{
- while (1) {
- tick_nohz_idle_enter();
-
- while (!need_resched()) {
- check_pgt_cache();
- rmb();
-
- if (cpu_is_offline(smp_processor_id()))
- arch_cpu_idle_dead();
-
- local_irq_disable();
- arch_cpu_idle_enter();
-
- /*
- * In poll mode we reenable interrupts and spin.
- *
- * Also if we detected in the wakeup from idle
- * path that the tick broadcast device expired
- * for us, we don't want to go deep idle as we
- * know that the IPI is going to arrive right
- * away
- */
- if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
- cpu_idle_poll();
- } else {
- if (!current_clr_polling_and_test()) {
- stop_critical_timings();
- rcu_idle_enter();
- arch_cpu_idle();
- WARN_ON_ONCE(irqs_disabled());
- rcu_idle_exit();
- start_critical_timings();
- } else {
- local_irq_enable();
- }
- __current_set_polling();
- }
- arch_cpu_idle_exit();
- }
-
- /*
- * Since we fell out of the loop above, we know
- * TIF_NEED_RESCHED must be set, propagate it into
- * PREEMPT_NEED_RESCHED.
- *
- * This is required because for polling idle loops we will
- * not have had an IPI to fold the state for us.
- */
- preempt_set_need_resched();
- tick_nohz_idle_exit();
- schedule_preempt_disabled();
- }
-}
-
-void cpu_startup_entry(enum cpuhp_state state)
-{
- /*
- * This #ifdef needs to die, but it's too late in the cycle to
- * make this generic (arm and sh have never invoked the canary
- * init for the non boot cpus!). Will be fixed in 3.11
- */
-#ifdef CONFIG_X86
- /*
- * If we're the non-boot CPU, nothing set the stack canary up
- * for us. The boot CPU already has it initialized but no harm
- * in doing it again. This is a good place for updating it, as
- * we wont ever return from this function (so the invalid
- * canaries already on the stack wont ever trigger).
- */
- boot_init_stack_canary();
-#endif
- __current_set_polling();
- arch_cpu_idle_prepare();
- cpu_idle_loop();
-}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e6b1b66afe52..52cb04c993b7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,12 +61,7 @@
#include <linux/cgroup.h>
#include <linux/wait.h>
-/*
- * Tracks how many cpusets are currently defined in system.
- * When there is only one cpuset (the root cpuset) we can
- * short circuit some hooks.
- */
-int number_of_cpusets __read_mostly;
+struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
/* See "Frequency meter" comments, below. */
@@ -81,8 +76,34 @@ struct cpuset {
struct cgroup_subsys_state css;
unsigned long flags; /* "unsigned long" so bitops work */
- cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
- nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
+
+ /*
+ * On default hierarchy:
+ *
+ * The user-configured masks can only be changed by writing to
+ * cpuset.cpus and cpuset.mems, and won't be limited by the
+ * parent masks.
+ *
+ * The effective masks is the real masks that apply to the tasks
+ * in the cpuset. They may be changed if the configured masks are
+ * changed or hotplug happens.
+ *
+ * effective_mask == configured_mask & parent's effective_mask,
+ * and if it ends up empty, it will inherit the parent's mask.
+ *
+ *
+ * On legacy hierachy:
+ *
+ * The user-configured masks are always the same with effective masks.
+ */
+
+ /* user-configured CPUs and Memory Nodes allow to tasks */
+ cpumask_var_t cpus_allowed;
+ nodemask_t mems_allowed;
+
+ /* effective CPUs and Memory Nodes allow to tasks */
+ cpumask_var_t effective_cpus;
+ nodemask_t effective_mems;
/*
* This is old Memory Nodes tasks took on.
@@ -119,12 +140,12 @@ static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
- return css_cs(task_css(task, cpuset_subsys_id));
+ return css_cs(task_css(task, cpuset_cgrp_id));
}
static inline struct cpuset *parent_cs(struct cpuset *cs)
{
- return css_cs(css_parent(&cs->css));
+ return css_cs(cs->css.parent);
}
#ifdef CONFIG_NUMA
@@ -312,9 +333,9 @@ static struct file_system_type cpuset_fs_type = {
*/
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
- while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
+ while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
cs = parent_cs(cs);
- cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
+ cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
}
/*
@@ -330,9 +351,9 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
*/
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
- while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
+ while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
cs = parent_cs(cs);
- nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);
+ nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}
/*
@@ -344,13 +365,14 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
struct task_struct *tsk)
{
if (is_spread_page(cs))
- tsk->flags |= PF_SPREAD_PAGE;
+ task_set_spread_page(tsk);
else
- tsk->flags &= ~PF_SPREAD_PAGE;
+ task_clear_spread_page(tsk);
+
if (is_spread_slab(cs))
- tsk->flags |= PF_SPREAD_SLAB;
+ task_set_spread_slab(tsk);
else
- tsk->flags &= ~PF_SPREAD_SLAB;
+ task_clear_spread_slab(tsk);
}
/*
@@ -381,13 +403,20 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
if (!trial)
return NULL;
- if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
- kfree(trial);
- return NULL;
- }
- cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+ if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
+ goto free_cs;
+ if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
+ goto free_cpus;
+ cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+ cpumask_copy(trial->effective_cpus, cs->effective_cpus);
return trial;
+
+free_cpus:
+ free_cpumask_var(trial->cpus_allowed);
+free_cs:
+ kfree(trial);
+ return NULL;
}
/**
@@ -396,6 +425,7 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
*/
static void free_trial_cpuset(struct cpuset *trial)
{
+ free_cpumask_var(trial->effective_cpus);
free_cpumask_var(trial->cpus_allowed);
kfree(trial);
}
@@ -441,9 +471,9 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
par = parent_cs(cur);
- /* We must be a subset of our parent cpuset */
+ /* On legacy hiearchy, we must be a subset of our parent cpuset. */
ret = -EACCES;
- if (!is_cpuset_subset(trial, par))
+ if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
goto out;
/*
@@ -467,7 +497,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
* be changed to have empty cpus_allowed or mems_allowed.
*/
ret = -ENOSPC;
- if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress)) {
+ if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) {
if (!cpumask_empty(cur->cpus_allowed) &&
cpumask_empty(trial->cpus_allowed))
goto out;
@@ -485,11 +515,11 @@ out:
#ifdef CONFIG_SMP
/*
* Helper routine for generate_sched_domains().
- * Do cpusets a, b have overlapping cpus_allowed masks?
+ * Do cpusets a, b have overlapping effective cpus_allowed masks?
*/
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
- return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
+ return cpumask_intersects(a->effective_cpus, b->effective_cpus);
}
static void
@@ -606,12 +636,12 @@ static int generate_sched_domains(cpumask_var_t **domains,
*dattr = SD_ATTR_INIT;
update_domain_attr_tree(dattr, &top_cpuset);
}
- cpumask_copy(doms[0], top_cpuset.cpus_allowed);
+ cpumask_copy(doms[0], top_cpuset.effective_cpus);
goto done;
}
- csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
+ csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
if (!csa)
goto done;
csn = 0;
@@ -696,11 +726,8 @@ restart:
if (nslot == ndoms) {
static int warnings = 10;
if (warnings) {
- printk(KERN_WARNING
- "rebuild_sched_domains confused:"
- " nslot %d, ndoms %d, csn %d, i %d,"
- " apn %d\n",
- nslot, ndoms, csn, i, apn);
+ pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
+ nslot, ndoms, csn, i, apn);
warnings--;
}
continue;
@@ -713,7 +740,7 @@ restart:
struct cpuset *b = csa[j];
if (apn == b->pn) {
- cpumask_or(dp, dp, b->cpus_allowed);
+ cpumask_or(dp, dp, b->effective_cpus);
if (dattr)
update_domain_attr_tree(dattr + nslot, b);
@@ -765,7 +792,7 @@ static void rebuild_sched_domains_locked(void)
* passing doms with offlined cpu to partition_sched_domains().
* Anyways, hotplug work item will rebuild sched domains.
*/
- if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
+ if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
goto out;
/* Generate domain masks and attrs */
@@ -789,134 +816,102 @@ void rebuild_sched_domains(void)
mutex_unlock(&cpuset_mutex);
}
-/*
- * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
- * @cs: the cpuset in interest
- *
- * A cpuset's effective cpumask is the cpumask of the nearest ancestor
- * with non-empty cpus. We use effective cpumask whenever:
- * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
- * if the cpuset they reside in has no cpus)
- * - we want to retrieve task_cs(tsk)'s cpus_allowed.
- *
- * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
- * exception. See comments there.
- */
-static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
-{
- while (cpumask_empty(cs->cpus_allowed))
- cs = parent_cs(cs);
- return cs;
-}
-
-/*
- * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
- * @cs: the cpuset in interest
- *
- * A cpuset's effective nodemask is the nodemask of the nearest ancestor
- * with non-empty memss. We use effective nodemask whenever:
- * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
- * if the cpuset they reside in has no mems)
- * - we want to retrieve task_cs(tsk)'s mems_allowed.
- *
- * Called with cpuset_mutex held.
- */
-static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
-{
- while (nodes_empty(cs->mems_allowed))
- cs = parent_cs(cs);
- return cs;
-}
-
-/**
- * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
- * @tsk: task to test
- * @data: cpuset to @tsk belongs to
- *
- * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed
- * mask needs to be changed.
- *
- * We don't need to re-check for the cgroup/cpuset membership, since we're
- * holding cpuset_mutex at this point.
- */
-static void cpuset_change_cpumask(struct task_struct *tsk, void *data)
-{
- struct cpuset *cs = data;
- struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
-
- set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed);
-}
-
/**
* update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
- * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
- *
- * Called with cpuset_mutex held
- *
- * The css_scan_tasks() function will scan all the tasks in a cgroup,
- * calling callback functions for each.
*
- * No return value. It's guaranteed that css_scan_tasks() always returns 0
- * if @heap != NULL.
+ * Iterate through each task of @cs updating its cpus_allowed to the
+ * effective cpuset's. As this function is called with cpuset_mutex held,
+ * cpuset membership stays stable.
*/
-static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
+static void update_tasks_cpumask(struct cpuset *cs)
{
- css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap);
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&cs->css, &it);
+ while ((task = css_task_iter_next(&it)))
+ set_cpus_allowed_ptr(task, cs->effective_cpus);
+ css_task_iter_end(&it);
}
/*
- * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
- * @root_cs: the root cpuset of the hierarchy
- * @update_root: update root cpuset or not?
- * @heap: the heap used by css_scan_tasks()
+ * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
+ * @cs: the cpuset to consider
+ * @new_cpus: temp variable for calculating new effective_cpus
+ *
+ * When congifured cpumask is changed, the effective cpumasks of this cpuset
+ * and all its descendants need to be updated.
*
- * This will update cpumasks of tasks in @root_cs and all other empty cpusets
- * which take on cpumask of @root_cs.
+ * On legacy hierachy, effective_cpus will be the same with cpu_allowed.
*
* Called with cpuset_mutex held
*/
-static void update_tasks_cpumask_hier(struct cpuset *root_cs,
- bool update_root, struct ptr_heap *heap)
+static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
+ bool need_rebuild_sched_domains = false;
rcu_read_lock();
- cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
- if (cp == root_cs) {
- if (!update_root)
- continue;
- } else {
- /* skip the whole subtree if @cp have some CPU */
- if (!cpumask_empty(cp->cpus_allowed)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
+ cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+ struct cpuset *parent = parent_cs(cp);
+
+ cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
+
+ /*
+ * If it becomes empty, inherit the effective mask of the
+ * parent, which is guaranteed to have some CPUs.
+ */
+ if (cpumask_empty(new_cpus))
+ cpumask_copy(new_cpus, parent->effective_cpus);
+
+ /* Skip the whole subtree if the cpumask remains the same. */
+ if (cpumask_equal(new_cpus, cp->effective_cpus)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
}
- if (!css_tryget(&cp->css))
+
+ if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();
- update_tasks_cpumask(cp, heap);
+ mutex_lock(&callback_mutex);
+ cpumask_copy(cp->effective_cpus, new_cpus);
+ mutex_unlock(&callback_mutex);
+
+ WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+ !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
+
+ update_tasks_cpumask(cp);
+
+ /*
+ * If the effective cpumask of any non-empty cpuset is changed,
+ * we need to rebuild sched domains.
+ */
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ is_sched_load_balance(cp))
+ need_rebuild_sched_domains = true;
rcu_read_lock();
css_put(&cp->css);
}
rcu_read_unlock();
+
+ if (need_rebuild_sched_domains)
+ rebuild_sched_domains_locked();
}
/**
* update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
* @cs: the cpuset to consider
+ * @trialcs: trial cpuset
* @buf: buffer of cpu numbers written to this cpuset
*/
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
- struct ptr_heap heap;
int retval;
- int is_load_balanced;
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
@@ -935,7 +930,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
return retval;
- if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
+ if (!cpumask_subset(trialcs->cpus_allowed,
+ top_cpuset.cpus_allowed))
return -EINVAL;
}
@@ -947,22 +943,12 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
return retval;
- retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
- if (retval)
- return retval;
-
- is_load_balanced = is_sched_load_balance(trialcs);
-
mutex_lock(&callback_mutex);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
mutex_unlock(&callback_mutex);
- update_tasks_cpumask_hier(cs, true, &heap);
-
- heap_free(&heap);
-
- if (is_load_balanced)
- rebuild_sched_domains_locked();
+ /* use trialcs->cpus_allowed as a temp variable */
+ update_cpumasks_hier(cs, trialcs->cpus_allowed);
return 0;
}
@@ -984,15 +970,13 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to)
{
struct task_struct *tsk = current;
- struct cpuset *mems_cs;
tsk->mems_allowed = *to;
do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
rcu_read_lock();
- mems_cs = effective_nodemask_cpuset(task_cs(tsk));
- guarantee_online_mems(mems_cs, &tsk->mems_allowed);
+ guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
rcu_read_unlock();
}
@@ -1022,7 +1006,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
task_lock(tsk);
/*
* Determine if a loop is necessary if another thread is doing
- * get_mems_allowed(). If at least one node remains unchanged and
+ * read_mems_allowed_begin(). If at least one node remains unchanged and
* tsk does not have a mempolicy, then an empty nodemask will not be
* possible when mems_allowed is larger than a word.
*/
@@ -1048,57 +1032,25 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
task_unlock(tsk);
}
-struct cpuset_change_nodemask_arg {
- struct cpuset *cs;
- nodemask_t *newmems;
-};
-
-/*
- * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
- * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
- * memory_migrate flag is set. Called with cpuset_mutex held.
- */
-static void cpuset_change_nodemask(struct task_struct *p, void *data)
-{
- struct cpuset_change_nodemask_arg *arg = data;
- struct cpuset *cs = arg->cs;
- struct mm_struct *mm;
- int migrate;
-
- cpuset_change_task_nodemask(p, arg->newmems);
-
- mm = get_task_mm(p);
- if (!mm)
- return;
-
- migrate = is_memory_migrate(cs);
-
- mpol_rebind_mm(mm, &cs->mems_allowed);
- if (migrate)
- cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems);
- mmput(mm);
-}
-
static void *cpuset_being_rebound;
/**
* update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
- * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
*
- * Called with cpuset_mutex held. No return value. It's guaranteed that
- * css_scan_tasks() always returns 0 if @heap != NULL.
+ * Iterate through each task of @cs updating its mems_allowed to the
+ * effective cpuset's. As this function is called with cpuset_mutex held,
+ * cpuset membership stays stable.
*/
-static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
+static void update_tasks_nodemask(struct cpuset *cs)
{
static nodemask_t newmems; /* protected by cpuset_mutex */
- struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
- struct cpuset_change_nodemask_arg arg = { .cs = cs,
- .newmems = &newmems };
+ struct css_task_iter it;
+ struct task_struct *task;
cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
- guarantee_online_mems(mems_cs, &newmems);
+ guarantee_online_mems(cs, &newmems);
/*
* The mpol_rebind_mm() call takes mmap_sem, which we couldn't
@@ -1110,7 +1062,25 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
*/
- css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap);
+ css_task_iter_start(&cs->css, &it);
+ while ((task = css_task_iter_next(&it))) {
+ struct mm_struct *mm;
+ bool migrate;
+
+ cpuset_change_task_nodemask(task, &newmems);
+
+ mm = get_task_mm(task);
+ if (!mm)
+ continue;
+
+ migrate = is_memory_migrate(cs);
+
+ mpol_rebind_mm(mm, &cs->mems_allowed);
+ if (migrate)
+ cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
+ mmput(mm);
+ }
+ css_task_iter_end(&it);
/*
* All the tasks' nodemasks have been updated, update
@@ -1123,39 +1093,53 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
}
/*
- * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
- * @cs: the root cpuset of the hierarchy
- * @update_root: update the root cpuset or not?
- * @heap: the heap used by css_scan_tasks()
+ * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
+ * @cs: the cpuset to consider
+ * @new_mems: a temp variable for calculating new effective_mems
*
- * This will update nodemasks of tasks in @root_cs and all other empty cpusets
- * which take on nodemask of @root_cs.
+ * When configured nodemask is changed, the effective nodemasks of this cpuset
+ * and all its descendants need to be updated.
+ *
+ * On legacy hiearchy, effective_mems will be the same with mems_allowed.
*
* Called with cpuset_mutex held
*/
-static void update_tasks_nodemask_hier(struct cpuset *root_cs,
- bool update_root, struct ptr_heap *heap)
+static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
rcu_read_lock();
- cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
- if (cp == root_cs) {
- if (!update_root)
- continue;
- } else {
- /* skip the whole subtree if @cp have some CPU */
- if (!nodes_empty(cp->mems_allowed)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
+ cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+ struct cpuset *parent = parent_cs(cp);
+
+ nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
+
+ /*
+ * If it becomes empty, inherit the effective mask of the
+ * parent, which is guaranteed to have some MEMs.
+ */
+ if (nodes_empty(*new_mems))
+ *new_mems = parent->effective_mems;
+
+ /* Skip the whole subtree if the nodemask remains the same. */
+ if (nodes_equal(*new_mems, cp->effective_mems)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
}
- if (!css_tryget(&cp->css))
+
+ if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();
- update_tasks_nodemask(cp, heap);
+ mutex_lock(&callback_mutex);
+ cp->effective_mems = *new_mems;
+ mutex_unlock(&callback_mutex);
+
+ WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+ !nodes_equal(cp->mems_allowed, cp->effective_mems));
+
+ update_tasks_nodemask(cp);
rcu_read_lock();
css_put(&cp->css);
@@ -1180,7 +1164,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
int retval;
- struct ptr_heap heap;
/*
* top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
@@ -1205,8 +1188,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
goto done;
if (!nodes_subset(trialcs->mems_allowed,
- node_states[N_MEMORY])) {
- retval = -EINVAL;
+ top_cpuset.mems_allowed)) {
+ retval = -EINVAL;
goto done;
}
}
@@ -1219,24 +1202,25 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
goto done;
- retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
- if (retval < 0)
- goto done;
-
mutex_lock(&callback_mutex);
cs->mems_allowed = trialcs->mems_allowed;
mutex_unlock(&callback_mutex);
- update_tasks_nodemask_hier(cs, true, &heap);
-
- heap_free(&heap);
+ /* use trialcs->mems_allowed as a temp variable */
+ update_nodemasks_hier(cs, &cs->mems_allowed);
done:
return retval;
}
int current_cpuset_is_being_rebound(void)
{
- return task_cs(current) == cpuset_being_rebound;
+ int ret;
+
+ rcu_read_lock();
+ ret = task_cs(current) == cpuset_being_rebound;
+ rcu_read_unlock();
+
+ return ret;
}
static int update_relax_domain_level(struct cpuset *cs, s64 val)
@@ -1257,38 +1241,22 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
}
/**
- * cpuset_change_flag - make a task's spread flags the same as its cpuset's
- * @tsk: task to be updated
- * @data: cpuset to @tsk belongs to
- *
- * Called by css_scan_tasks() for each task in a cgroup.
- *
- * We don't need to re-check for the cgroup/cpuset membership, since we're
- * holding cpuset_mutex at this point.
- */
-static void cpuset_change_flag(struct task_struct *tsk, void *data)
-{
- struct cpuset *cs = data;
-
- cpuset_update_task_spread_flag(cs, tsk);
-}
-
-/**
* update_tasks_flags - update the spread flags of tasks in the cpuset.
* @cs: the cpuset in which each task's spread flags needs to be changed
- * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
- *
- * Called with cpuset_mutex held
*
- * The css_scan_tasks() function will scan all the tasks in a cgroup,
- * calling callback functions for each.
- *
- * No return value. It's guaranteed that css_scan_tasks() always returns 0
- * if @heap != NULL.
+ * Iterate through each task of @cs updating its spread flags. As this
+ * function is called with cpuset_mutex held, cpuset membership stays
+ * stable.
*/
-static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
+static void update_tasks_flags(struct cpuset *cs)
{
- css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap);
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&cs->css, &it);
+ while ((task = css_task_iter_next(&it)))
+ cpuset_update_task_spread_flag(cs, task);
+ css_task_iter_end(&it);
}
/*
@@ -1306,7 +1274,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
struct cpuset *trialcs;
int balance_flag_changed;
int spread_flag_changed;
- struct ptr_heap heap;
int err;
trialcs = alloc_trial_cpuset(cs);
@@ -1322,10 +1289,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
if (err < 0)
goto out;
- err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
- if (err < 0)
- goto out;
-
balance_flag_changed = (is_sched_load_balance(cs) !=
is_sched_load_balance(trialcs));
@@ -1340,8 +1303,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
rebuild_sched_domains_locked();
if (spread_flag_changed)
- update_tasks_flags(cs, &heap);
- heap_free(&heap);
+ update_tasks_flags(cs);
out:
free_trial_cpuset(trialcs);
return err;
@@ -1445,6 +1407,8 @@ static int fmeter_getrate(struct fmeter *fmp)
return val;
}
+static struct cpuset *cpuset_attach_old_cs;
+
/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
static int cpuset_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
@@ -1453,18 +1417,18 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
struct task_struct *task;
int ret;
+ /* used later by cpuset_attach() */
+ cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset));
+
mutex_lock(&cpuset_mutex);
- /*
- * We allow to move tasks into an empty cpuset if sane_behavior
- * flag is set.
- */
+ /* allow moving tasks into an empty cpuset if on default hierarchy */
ret = -ENOSPC;
- if (!cgroup_sane_behavior(css->cgroup) &&
+ if (!cgroup_on_dfl(css->cgroup) &&
(cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
goto out_unlock;
- cgroup_taskset_for_each(task, css, tset) {
+ cgroup_taskset_for_each(task, tset) {
/*
* Kthreads which disallow setaffinity shouldn't be moved
* to a new cpuset; we don't want to change their cpu
@@ -1516,12 +1480,8 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
struct mm_struct *mm;
struct task_struct *task;
struct task_struct *leader = cgroup_taskset_first(tset);
- struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset,
- cpuset_subsys_id);
struct cpuset *cs = css_cs(css);
- struct cpuset *oldcs = css_cs(oldcss);
- struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
- struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
+ struct cpuset *oldcs = cpuset_attach_old_cs;
mutex_lock(&cpuset_mutex);
@@ -1529,11 +1489,11 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
if (cs == &top_cpuset)
cpumask_copy(cpus_attach, cpu_possible_mask);
else
- guarantee_online_cpus(cpus_cs, cpus_attach);
+ guarantee_online_cpus(cs, cpus_attach);
- guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
- cgroup_taskset_for_each(task, css, tset) {
+ cgroup_taskset_for_each(task, tset) {
/*
* can_attach beforehand should guarantee that this doesn't
* fail. TODO: have a better way to handle failure here
@@ -1548,11 +1508,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
* Change mm, possibly for multiple threads in a threadgroup. This is
* expensive and may sleep.
*/
- cpuset_attach_nodemask_to = cs->mems_allowed;
+ cpuset_attach_nodemask_to = cs->effective_mems;
mm = get_task_mm(leader);
if (mm) {
- struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs);
-
mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
/*
@@ -1563,7 +1521,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
* mm from.
*/
if (is_memory_migrate(cs)) {
- cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed,
+ cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
&cpuset_attach_nodemask_to);
}
mmput(mm);
@@ -1584,6 +1542,8 @@ typedef enum {
FILE_MEMORY_MIGRATE,
FILE_CPULIST,
FILE_MEMLIST,
+ FILE_EFFECTIVE_CPULIST,
+ FILE_EFFECTIVE_MEMLIST,
FILE_CPU_EXCLUSIVE,
FILE_MEM_EXCLUSIVE,
FILE_MEM_HARDWALL,
@@ -1672,13 +1632,15 @@ out_unlock:
/*
* Common handling for a write to a "cpus" or "mems" file.
*/
-static int cpuset_write_resmask(struct cgroup_subsys_state *css,
- struct cftype *cft, const char *buf)
+static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
- struct cpuset *cs = css_cs(css);
+ struct cpuset *cs = css_cs(of_css(of));
struct cpuset *trialcs;
int retval = -ENODEV;
+ buf = strstrip(buf);
+
/*
* CPU or memory hotunplug may leave @cs w/o any execution
* resources, in which case the hotplug code asynchronously updates
@@ -1689,7 +1651,17 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css,
* resources, wait for the previously scheduled operations before
* proceeding, so that we don't end up keep removing tasks added
* after execution capability is restored.
+ *
+ * cpuset_hotplug_work calls back into cgroup core via
+ * cgroup_transfer_tasks() and waiting for it from a cgroupfs
+ * operation like this one can lead to a deadlock through kernfs
+ * active_ref protection. Let's break the protection. Losing the
+ * protection is okay as we check whether @cs is online after
+ * grabbing cpuset_mutex anyway. This only happens on the legacy
+ * hierarchies.
*/
+ css_get(&cs->css);
+ kernfs_break_active_protection(of->kn);
flush_work(&cpuset_hotplug_work);
mutex_lock(&cpuset_mutex);
@@ -1702,7 +1674,7 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css,
goto out_unlock;
}
- switch (cft->private) {
+ switch (of_cft(of)->private) {
case FILE_CPULIST:
retval = update_cpumask(cs, trialcs, buf);
break;
@@ -1717,7 +1689,9 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css,
free_trial_cpuset(trialcs);
out_unlock:
mutex_unlock(&cpuset_mutex);
- return retval;
+ kernfs_unbreak_active_protection(of->kn);
+ css_put(&cs->css);
+ return retval ?: nbytes;
}
/*
@@ -1748,6 +1722,12 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
case FILE_MEMLIST:
s += nodelist_scnprintf(s, count, cs->mems_allowed);
break;
+ case FILE_EFFECTIVE_CPULIST:
+ s += cpulist_scnprintf(s, count, cs->effective_cpus);
+ break;
+ case FILE_EFFECTIVE_MEMLIST:
+ s += nodelist_scnprintf(s, count, cs->effective_mems);
+ break;
default:
ret = -EINVAL;
goto out_unlock;
@@ -1819,7 +1799,7 @@ static struct cftype files[] = {
{
.name = "cpus",
.seq_show = cpuset_common_seq_show,
- .write_string = cpuset_write_resmask,
+ .write = cpuset_write_resmask,
.max_write_len = (100U + 6 * NR_CPUS),
.private = FILE_CPULIST,
},
@@ -1827,12 +1807,24 @@ static struct cftype files[] = {
{
.name = "mems",
.seq_show = cpuset_common_seq_show,
- .write_string = cpuset_write_resmask,
+ .write = cpuset_write_resmask,
.max_write_len = (100U + 6 * MAX_NUMNODES),
.private = FILE_MEMLIST,
},
{
+ .name = "effective_cpus",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_CPULIST,
+ },
+
+ {
+ .name = "effective_mems",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_MEMLIST,
+ },
+
+ {
.name = "cpu_exclusive",
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
@@ -1923,18 +1915,26 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
cs = kzalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return ERR_PTR(-ENOMEM);
- if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
- kfree(cs);
- return ERR_PTR(-ENOMEM);
- }
+ if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
+ goto free_cs;
+ if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
+ goto free_cpus;
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
+ cpumask_clear(cs->effective_cpus);
+ nodes_clear(cs->effective_mems);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
return &cs->css;
+
+free_cpus:
+ free_cpumask_var(cs->cpus_allowed);
+free_cs:
+ kfree(cs);
+ return ERR_PTR(-ENOMEM);
}
static int cpuset_css_online(struct cgroup_subsys_state *css)
@@ -1955,7 +1955,14 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
if (is_spread_slab(parent))
set_bit(CS_SPREAD_SLAB, &cs->flags);
- number_of_cpusets++;
+ cpuset_inc();
+
+ mutex_lock(&callback_mutex);
+ if (cgroup_on_dfl(cs->css.cgroup)) {
+ cpumask_copy(cs->effective_cpus, parent->effective_cpus);
+ cs->effective_mems = parent->effective_mems;
+ }
+ mutex_unlock(&callback_mutex);
if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
goto out_unlock;
@@ -2006,7 +2013,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
if (is_sched_load_balance(cs))
update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
- number_of_cpusets--;
+ cpuset_dec();
clear_bit(CS_ONLINE, &cs->flags);
mutex_unlock(&cpuset_mutex);
@@ -2016,22 +2023,40 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
+ free_cpumask_var(cs->effective_cpus);
free_cpumask_var(cs->cpus_allowed);
kfree(cs);
}
-struct cgroup_subsys cpuset_subsys = {
- .name = "cpuset",
- .css_alloc = cpuset_css_alloc,
- .css_online = cpuset_css_online,
- .css_offline = cpuset_css_offline,
- .css_free = cpuset_css_free,
- .can_attach = cpuset_can_attach,
- .cancel_attach = cpuset_cancel_attach,
- .attach = cpuset_attach,
- .subsys_id = cpuset_subsys_id,
- .base_cftypes = files,
- .early_init = 1,
+static void cpuset_bind(struct cgroup_subsys_state *root_css)
+{
+ mutex_lock(&cpuset_mutex);
+ mutex_lock(&callback_mutex);
+
+ if (cgroup_on_dfl(root_css->cgroup)) {
+ cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
+ top_cpuset.mems_allowed = node_possible_map;
+ } else {
+ cpumask_copy(top_cpuset.cpus_allowed,
+ top_cpuset.effective_cpus);
+ top_cpuset.mems_allowed = top_cpuset.effective_mems;
+ }
+
+ mutex_unlock(&callback_mutex);
+ mutex_unlock(&cpuset_mutex);
+}
+
+struct cgroup_subsys cpuset_cgrp_subsys = {
+ .css_alloc = cpuset_css_alloc,
+ .css_online = cpuset_css_online,
+ .css_offline = cpuset_css_offline,
+ .css_free = cpuset_css_free,
+ .can_attach = cpuset_can_attach,
+ .cancel_attach = cpuset_cancel_attach,
+ .attach = cpuset_attach,
+ .bind = cpuset_bind,
+ .legacy_cftypes = files,
+ .early_init = 1,
};
/**
@@ -2046,9 +2071,13 @@ int __init cpuset_init(void)
if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
BUG();
+ if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
+ BUG();
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
+ cpumask_setall(top_cpuset.effective_cpus);
+ nodes_setall(top_cpuset.effective_mems);
fmeter_init(&top_cpuset.fmeter);
set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
@@ -2061,7 +2090,6 @@ int __init cpuset_init(void)
if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
BUG();
- number_of_cpusets = 1;
return 0;
}
@@ -2086,13 +2114,72 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
parent = parent_cs(parent);
if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
- rcu_read_lock();
- printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n",
- cgroup_name(cs->css.cgroup));
- rcu_read_unlock();
+ pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
+ pr_cont_cgroup_name(cs->css.cgroup);
+ pr_cont("\n");
}
}
+static void
+hotplug_update_tasks_legacy(struct cpuset *cs,
+ struct cpumask *new_cpus, nodemask_t *new_mems,
+ bool cpus_updated, bool mems_updated)
+{
+ bool is_empty;
+
+ mutex_lock(&callback_mutex);
+ cpumask_copy(cs->cpus_allowed, new_cpus);
+ cpumask_copy(cs->effective_cpus, new_cpus);
+ cs->mems_allowed = *new_mems;
+ cs->effective_mems = *new_mems;
+ mutex_unlock(&callback_mutex);
+
+ /*
+ * Don't call update_tasks_cpumask() if the cpuset becomes empty,
+ * as the tasks will be migratecd to an ancestor.
+ */
+ if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
+ update_tasks_cpumask(cs);
+ if (mems_updated && !nodes_empty(cs->mems_allowed))
+ update_tasks_nodemask(cs);
+
+ is_empty = cpumask_empty(cs->cpus_allowed) ||
+ nodes_empty(cs->mems_allowed);
+
+ mutex_unlock(&cpuset_mutex);
+
+ /*
+ * Move tasks to the nearest ancestor with execution resources,
+ * This is full cgroup operation which will also call back into
+ * cpuset. Should be done outside any lock.
+ */
+ if (is_empty)
+ remove_tasks_in_empty_cpuset(cs);
+
+ mutex_lock(&cpuset_mutex);
+}
+
+static void
+hotplug_update_tasks(struct cpuset *cs,
+ struct cpumask *new_cpus, nodemask_t *new_mems,
+ bool cpus_updated, bool mems_updated)
+{
+ if (cpumask_empty(new_cpus))
+ cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
+ if (nodes_empty(*new_mems))
+ *new_mems = parent_cs(cs)->effective_mems;
+
+ mutex_lock(&callback_mutex);
+ cpumask_copy(cs->effective_cpus, new_cpus);
+ cs->effective_mems = *new_mems;
+ mutex_unlock(&callback_mutex);
+
+ if (cpus_updated)
+ update_tasks_cpumask(cs);
+ if (mems_updated)
+ update_tasks_nodemask(cs);
+}
+
/**
* cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
* @cs: cpuset in interest
@@ -2103,11 +2190,10 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
*/
static void cpuset_hotplug_update_tasks(struct cpuset *cs)
{
- static cpumask_t off_cpus;
- static nodemask_t off_mems;
- bool is_empty;
- bool sane = cgroup_sane_behavior(cs->css.cgroup);
-
+ static cpumask_t new_cpus;
+ static nodemask_t new_mems;
+ bool cpus_updated;
+ bool mems_updated;
retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
@@ -2122,51 +2208,20 @@ retry:
goto retry;
}
- cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
- nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
-
- mutex_lock(&callback_mutex);
- cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
- mutex_unlock(&callback_mutex);
-
- /*
- * If sane_behavior flag is set, we need to update tasks' cpumask
- * for empty cpuset to take on ancestor's cpumask. Otherwise, don't
- * call update_tasks_cpumask() if the cpuset becomes empty, as
- * the tasks in it will be migrated to an ancestor.
- */
- if ((sane && cpumask_empty(cs->cpus_allowed)) ||
- (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
- update_tasks_cpumask(cs, NULL);
-
- mutex_lock(&callback_mutex);
- nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
- mutex_unlock(&callback_mutex);
+ cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
+ nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
- /*
- * If sane_behavior flag is set, we need to update tasks' nodemask
- * for empty cpuset to take on ancestor's nodemask. Otherwise, don't
- * call update_tasks_nodemask() if the cpuset becomes empty, as
- * the tasks in it will be migratd to an ancestor.
- */
- if ((sane && nodes_empty(cs->mems_allowed)) ||
- (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
- update_tasks_nodemask(cs, NULL);
+ cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
+ mems_updated = !nodes_equal(new_mems, cs->effective_mems);
- is_empty = cpumask_empty(cs->cpus_allowed) ||
- nodes_empty(cs->mems_allowed);
+ if (cgroup_on_dfl(cs->css.cgroup))
+ hotplug_update_tasks(cs, &new_cpus, &new_mems,
+ cpus_updated, mems_updated);
+ else
+ hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
+ cpus_updated, mems_updated);
mutex_unlock(&cpuset_mutex);
-
- /*
- * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
- *
- * Otherwise move tasks to the nearest ancestor with execution
- * resources. This is full cgroup operation which will
- * also call back into cpuset. Should be done outside any lock.
- */
- if (!sane && is_empty)
- remove_tasks_in_empty_cpuset(cs);
}
/**
@@ -2190,6 +2245,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
+ bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
mutex_lock(&cpuset_mutex);
@@ -2197,13 +2253,15 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
cpumask_copy(&new_cpus, cpu_active_mask);
new_mems = node_states[N_MEMORY];
- cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
- mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
+ cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
+ mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
/* synchronize cpus_allowed to cpu_active_mask */
if (cpus_updated) {
mutex_lock(&callback_mutex);
- cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+ if (!on_dfl)
+ cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+ cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
mutex_unlock(&callback_mutex);
/* we don't mess with cpumasks of tasks in top_cpuset */
}
@@ -2211,9 +2269,11 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
/* synchronize mems_allowed to N_MEMORY */
if (mems_updated) {
mutex_lock(&callback_mutex);
- top_cpuset.mems_allowed = new_mems;
+ if (!on_dfl)
+ top_cpuset.mems_allowed = new_mems;
+ top_cpuset.effective_mems = new_mems;
mutex_unlock(&callback_mutex);
- update_tasks_nodemask(&top_cpuset, NULL);
+ update_tasks_nodemask(&top_cpuset);
}
mutex_unlock(&cpuset_mutex);
@@ -2225,7 +2285,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
rcu_read_lock();
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
- if (cs == &top_cpuset || !css_tryget(&cs->css))
+ if (cs == &top_cpuset || !css_tryget_online(&cs->css))
continue;
rcu_read_unlock();
@@ -2286,6 +2346,9 @@ void __init cpuset_init_smp(void)
top_cpuset.mems_allowed = node_states[N_MEMORY];
top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
+ cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
+ top_cpuset.effective_mems = node_states[N_MEMORY];
+
register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
}
@@ -2302,23 +2365,17 @@ void __init cpuset_init_smp(void)
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
- struct cpuset *cpus_cs;
-
mutex_lock(&callback_mutex);
- task_lock(tsk);
- cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
- guarantee_online_cpus(cpus_cs, pmask);
- task_unlock(tsk);
+ rcu_read_lock();
+ guarantee_online_cpus(task_cs(tsk), pmask);
+ rcu_read_unlock();
mutex_unlock(&callback_mutex);
}
void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
- struct cpuset *cpus_cs;
-
rcu_read_lock();
- cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
- do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
+ do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
rcu_read_unlock();
/*
@@ -2357,14 +2414,12 @@ void cpuset_init_current_mems_allowed(void)
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
- struct cpuset *mems_cs;
nodemask_t mask;
mutex_lock(&callback_mutex);
- task_lock(tsk);
- mems_cs = effective_nodemask_cpuset(task_cs(tsk));
- guarantee_online_mems(mems_cs, &mask);
- task_unlock(tsk);
+ rcu_read_lock();
+ guarantee_online_mems(task_cs(tsk), &mask);
+ rcu_read_unlock();
mutex_unlock(&callback_mutex);
return mask;
@@ -2480,10 +2535,10 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
/* Not hardwall and node outside mems_allowed: scan up cpusets */
mutex_lock(&callback_mutex);
- task_lock(current);
+ rcu_read_lock();
cs = nearest_hardwall_ancestor(task_cs(current));
allowed = node_isset(node, cs->mems_allowed);
- task_unlock(current);
+ rcu_read_unlock();
mutex_unlock(&callback_mutex);
return allowed;
@@ -2606,30 +2661,30 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
/**
* cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
- * @task: pointer to task_struct of some task.
+ * @tsk: pointer to task_struct of some task.
*
* Description: Prints @task's name, cpuset name, and cached copy of its
- * mems_allowed to the kernel log. Must hold task_lock(task) to allow
- * dereferencing task_cs(task).
+ * mems_allowed to the kernel log.
*/
void cpuset_print_task_mems_allowed(struct task_struct *tsk)
{
/* Statically allocated to prevent using excess stack. */
static char cpuset_nodelist[CPUSET_NODELIST_LEN];
static DEFINE_SPINLOCK(cpuset_buffer_lock);
+ struct cgroup *cgrp;
- struct cgroup *cgrp = task_cs(tsk)->css.cgroup;
-
- rcu_read_lock();
spin_lock(&cpuset_buffer_lock);
+ rcu_read_lock();
+ cgrp = task_cs(tsk)->css.cgroup;
nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
tsk->mems_allowed);
- printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
- tsk->comm, cgroup_name(cgrp), cpuset_nodelist);
+ pr_info("%s cpuset=", tsk->comm);
+ pr_cont_cgroup_name(cgrp);
+ pr_cont(" mems_allowed=%s\n", cpuset_nodelist);
- spin_unlock(&cpuset_buffer_lock);
rcu_read_unlock();
+ spin_unlock(&cpuset_buffer_lock);
}
/*
@@ -2660,9 +2715,9 @@ int cpuset_memory_pressure_enabled __read_mostly;
void __cpuset_memory_pressure_bump(void)
{
- task_lock(current);
+ rcu_read_lock();
fmeter_markevent(&task_cs(current)->fmeter);
- task_unlock(current);
+ rcu_read_unlock();
}
#ifdef CONFIG_PROC_PID_CPUSET
@@ -2679,12 +2734,12 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
{
struct pid *pid;
struct task_struct *tsk;
- char *buf;
+ char *buf, *p;
struct cgroup_subsys_state *css;
int retval;
retval = -ENOMEM;
- buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!buf)
goto out;
@@ -2694,14 +2749,16 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
if (!tsk)
goto out_free;
+ retval = -ENAMETOOLONG;
rcu_read_lock();
- css = task_css(tsk, cpuset_subsys_id);
- retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
+ css = task_css(tsk, cpuset_cgrp_id);
+ p = cgroup_path(css->cgroup, buf, PATH_MAX);
rcu_read_unlock();
- if (retval < 0)
+ if (!p)
goto out_put_task;
- seq_puts(m, buf);
+ seq_puts(m, p);
seq_putc(m, '\n');
+ retval = 0;
out_put_task:
put_task_struct(tsk);
out_free:
@@ -2714,10 +2771,10 @@ out:
/* Display task mems_allowed in /proc/<pid>/status file. */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
- seq_printf(m, "Mems_allowed:\t");
+ seq_puts(m, "Mems_allowed:\t");
seq_nodemask(m, &task->mems_allowed);
- seq_printf(m, "\n");
- seq_printf(m, "Mems_allowed_list:\t");
+ seq_puts(m, "\n");
+ seq_puts(m, "Mems_allowed_list:\t");
seq_nodemask_list(m, &task->mems_allowed);
- seq_printf(m, "\n");
+ seq_puts(m, "\n");
}
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 334b3980ffc1..1adf62b39b96 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -49,6 +49,7 @@
#include <linux/pid.h>
#include <linux/smp.h>
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/rcupdate.h>
#include <asm/cacheflush.h>
@@ -224,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
if (!CACHE_FLUSH_IS_SAFE)
return;
- if (current->mm && current->mm->mmap_cache) {
- flush_cache_range(current->mm->mmap_cache,
- addr, addr + BREAK_INSTR_SIZE);
+ if (current->mm) {
+ int i;
+
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ if (!current->vmacache[i])
+ continue;
+ flush_cache_range(current->vmacache[i],
+ addr, addr + BREAK_INSTR_SIZE);
+ }
}
+
/* Force flush instruction cache if it was outside the mm */
flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
}
@@ -526,7 +534,7 @@ return_normal:
kgdb_info[cpu].exception_state &=
~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
kgdb_info[cpu].enter_kgdb--;
- smp_mb__before_atomic_dec();
+ smp_mb__before_atomic();
atomic_dec(&slaves_in_kgdb);
dbg_touch_watchdogs();
local_irq_restore(flags);
@@ -654,7 +662,7 @@ kgdb_restore:
kgdb_info[cpu].exception_state &=
~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
kgdb_info[cpu].enter_kgdb--;
- smp_mb__before_atomic_dec();
+ smp_mb__before_atomic();
atomic_dec(&masters_in_kgdb);
/* Free kgdb_active */
atomic_set(&kgdb_active, -1);
@@ -1035,7 +1043,7 @@ int dbg_io_get_char(void)
* otherwise as a quick means to stop program execution and "break" into
* the debugger.
*/
-void kgdb_breakpoint(void)
+noinline void kgdb_breakpoint(void)
{
atomic_inc(&kgdb_setting_breakpoint);
wmb(); /* Sync point before breakpoint */
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index b03e0e814e43..fe15fff5df53 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -21,7 +21,7 @@
static void kdb_show_stack(struct task_struct *p, void *addr)
{
int old_lvl = console_loglevel;
- console_loglevel = 15;
+ console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
kdb_trap_printk++;
kdb_set_current_task(p);
if (addr) {
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 14ff4849262c..7c70812caea5 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -710,7 +710,7 @@ kdb_printit:
}
if (logging) {
saved_loglevel = console_loglevel;
- console_loglevel = 0;
+ console_loglevel = CONSOLE_LOGLEVEL_SILENT;
printk(KERN_INFO "%s", kdb_buffer);
}
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 0b097c8a1e50..379650b984f8 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1091,7 +1091,7 @@ static int kdb_reboot(int argc, const char **argv)
static void kdb_dumpregs(struct pt_regs *regs)
{
int old_lvl = console_loglevel;
- console_loglevel = 15;
+ console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
kdb_trap_printk++;
show_regs(regs);
kdb_trap_printk--;
@@ -2472,7 +2472,7 @@ static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm)
static void kdb_sysinfo(struct sysinfo *val)
{
struct timespec uptime;
- do_posix_clock_monotonic_gettime(&uptime);
+ ktime_get_ts(&uptime);
memset(val, 0, sizeof(*val));
val->uptime = uptime.tv_sec;
val->loads[0] = avenrun[0];
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 54996b71e66d..ef90b04d783f 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -46,42 +46,25 @@ void __delayacct_tsk_init(struct task_struct *tsk)
}
/*
- * Start accounting for a delay statistic using
- * its starting timestamp (@start)
+ * Finish delay accounting for a statistic using its timestamps (@start),
+ * accumalator (@total) and @count
*/
-
-static inline void delayacct_start(struct timespec *start)
+static void delayacct_end(u64 *start, u64 *total, u32 *count)
{
- do_posix_clock_monotonic_gettime(start);
-}
-
-/*
- * Finish delay accounting for a statistic using
- * its timestamps (@start, @end), accumalator (@total) and @count
- */
-
-static void delayacct_end(struct timespec *start, struct timespec *end,
- u64 *total, u32 *count)
-{
- struct timespec ts;
- s64 ns;
+ s64 ns = ktime_get_ns() - *start;
unsigned long flags;
- do_posix_clock_monotonic_gettime(end);
- ts = timespec_sub(*end, *start);
- ns = timespec_to_ns(&ts);
- if (ns < 0)
- return;
-
- spin_lock_irqsave(&current->delays->lock, flags);
- *total += ns;
- (*count)++;
- spin_unlock_irqrestore(&current->delays->lock, flags);
+ if (ns > 0) {
+ spin_lock_irqsave(&current->delays->lock, flags);
+ *total += ns;
+ (*count)++;
+ spin_unlock_irqrestore(&current->delays->lock, flags);
+ }
}
void __delayacct_blkio_start(void)
{
- delayacct_start(&current->delays->blkio_start);
+ current->delays->blkio_start = ktime_get_ns();
}
void __delayacct_blkio_end(void)
@@ -89,35 +72,29 @@ void __delayacct_blkio_end(void)
if (current->delays->flags & DELAYACCT_PF_SWAPIN)
/* Swapin block I/O */
delayacct_end(&current->delays->blkio_start,
- &current->delays->blkio_end,
&current->delays->swapin_delay,
&current->delays->swapin_count);
else /* Other block I/O */
delayacct_end(&current->delays->blkio_start,
- &current->delays->blkio_end,
&current->delays->blkio_delay,
&current->delays->blkio_count);
}
int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
{
- s64 tmp;
- unsigned long t1;
- unsigned long long t2, t3;
- unsigned long flags;
- struct timespec ts;
cputime_t utime, stime, stimescaled, utimescaled;
+ unsigned long long t2, t3;
+ unsigned long flags, t1;
+ s64 tmp;
- tmp = (s64)d->cpu_run_real_total;
task_cputime(tsk, &utime, &stime);
- cputime_to_timespec(utime + stime, &ts);
- tmp += timespec_to_ns(&ts);
+ tmp = (s64)d->cpu_run_real_total;
+ tmp += cputime_to_nsecs(utime + stime);
d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
- tmp = (s64)d->cpu_scaled_run_real_total;
task_cputime_scaled(tsk, &utimescaled, &stimescaled);
- cputime_to_timespec(utimescaled + stimescaled, &ts);
- tmp += timespec_to_ns(&ts);
+ tmp = (s64)d->cpu_scaled_run_real_total;
+ tmp += cputime_to_nsecs(utimescaled + stimescaled);
d->cpu_scaled_run_real_total =
(tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
@@ -169,13 +146,12 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
void __delayacct_freepages_start(void)
{
- delayacct_start(&current->delays->freepages_start);
+ current->delays->freepages_start = ktime_get_ns();
}
void __delayacct_freepages_end(void)
{
delayacct_end(&current->delays->freepages_start,
- &current->delays->freepages_end,
&current->delays->freepages_delay,
&current->delays->freepages_count);
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fa0b2d4ad83c..963bf139e2b2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -39,6 +39,9 @@
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/cgroup.h>
+#include <linux/module.h>
+#include <linux/mman.h>
+#include <linux/compat.h>
#include "internal.h"
@@ -231,11 +234,29 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
#define NR_ACCUMULATED_SAMPLES 128
static DEFINE_PER_CPU(u64, running_sample_length);
-void perf_sample_event_took(u64 sample_len_ns)
+static void perf_duration_warn(struct irq_work *w)
{
+ u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
u64 avg_local_sample_len;
u64 local_samples_len;
+
+ local_samples_len = __get_cpu_var(running_sample_length);
+ avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
+
+ printk_ratelimited(KERN_WARNING
+ "perf interrupt took too long (%lld > %lld), lowering "
+ "kernel.perf_event_max_sample_rate to %d\n",
+ avg_local_sample_len, allowed_ns >> 1,
+ sysctl_perf_event_sample_rate);
+}
+
+static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
+
+void perf_sample_event_took(u64 sample_len_ns)
+{
u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
+ u64 avg_local_sample_len;
+ u64 local_samples_len;
if (allowed_ns == 0)
return;
@@ -263,13 +284,14 @@ void perf_sample_event_took(u64 sample_len_ns)
sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
- printk_ratelimited(KERN_WARNING
- "perf samples too long (%lld > %lld), lowering "
- "kernel.perf_event_max_sample_rate to %d\n",
- avg_local_sample_len, allowed_ns,
- sysctl_perf_event_sample_rate);
-
update_perf_cpu_limits();
+
+ if (!irq_work_queue(&perf_duration_work)) {
+ early_printk("perf interrupt took too long (%lld > %lld), lowering "
+ "kernel.perf_event_max_sample_rate to %d\n",
+ avg_local_sample_len, allowed_ns >> 1,
+ sysctl_perf_event_sample_rate);
+ }
}
static atomic64_t perf_event_id;
@@ -342,7 +364,7 @@ struct perf_cgroup {
static inline struct perf_cgroup *
perf_cgroup_from_task(struct task_struct *task)
{
- return container_of(task_css(task, perf_subsys_id),
+ return container_of(task_css(task, perf_event_cgrp_id),
struct perf_cgroup, css);
}
@@ -370,11 +392,6 @@ perf_cgroup_match(struct perf_event *event)
event->cgrp->css.cgroup);
}
-static inline bool perf_tryget_cgroup(struct perf_event *event)
-{
- return css_tryget(&event->cgrp->css);
-}
-
static inline void perf_put_cgroup(struct perf_event *event)
{
css_put(&event->cgrp->css);
@@ -593,9 +610,8 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
if (!f.file)
return -EBADF;
- rcu_read_lock();
-
- css = css_from_dir(f.file->f_dentry, &perf_subsys);
+ css = css_tryget_online_from_dir(f.file->f_dentry,
+ &perf_event_cgrp_subsys);
if (IS_ERR(css)) {
ret = PTR_ERR(css);
goto out;
@@ -604,13 +620,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
cgrp = container_of(css, struct perf_cgroup, css);
event->cgrp = cgrp;
- /* must be done before we fput() the file */
- if (!perf_tryget_cgroup(event)) {
- event->cgrp = NULL;
- ret = -ENOENT;
- goto out;
- }
-
/*
* all events in a group must monitor
* the same cgroup because a task belongs
@@ -621,7 +630,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
ret = -EINVAL;
}
out:
- rcu_read_unlock();
fdput(f);
return ret;
}
@@ -1439,6 +1447,11 @@ group_sched_out(struct perf_event *group_event,
cpuctx->exclusive = 0;
}
+struct remove_event {
+ struct perf_event *event;
+ bool detach_group;
+};
+
/*
* Cross CPU call to remove a performance event
*
@@ -1447,12 +1460,15 @@ group_sched_out(struct perf_event *group_event,
*/
static int __perf_remove_from_context(void *info)
{
- struct perf_event *event = info;
+ struct remove_event *re = info;
+ struct perf_event *event = re->event;
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
raw_spin_lock(&ctx->lock);
event_sched_out(event, cpuctx, ctx);
+ if (re->detach_group)
+ perf_group_detach(event);
list_del_event(event, ctx);
if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
ctx->is_active = 0;
@@ -1477,10 +1493,14 @@ static int __perf_remove_from_context(void *info)
* When called from perf_event_exit_task, it's OK because the
* context has been detached from its task.
*/
-static void perf_remove_from_context(struct perf_event *event)
+static void perf_remove_from_context(struct perf_event *event, bool detach_group)
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = ctx->task;
+ struct remove_event re = {
+ .event = event,
+ .detach_group = detach_group,
+ };
lockdep_assert_held(&ctx->mutex);
@@ -1489,12 +1509,12 @@ static void perf_remove_from_context(struct perf_event *event)
* Per cpu events are removed via an smp call and
* the removal is always successful.
*/
- cpu_function_call(event->cpu, __perf_remove_from_context, event);
+ cpu_function_call(event->cpu, __perf_remove_from_context, &re);
return;
}
retry:
- if (!task_function_call(task, __perf_remove_from_context, event))
+ if (!task_function_call(task, __perf_remove_from_context, &re))
return;
raw_spin_lock_irq(&ctx->lock);
@@ -1504,6 +1524,11 @@ retry:
*/
if (ctx->is_active) {
raw_spin_unlock_irq(&ctx->lock);
+ /*
+ * Reload the task pointer, it might have been changed by
+ * a concurrent perf_event_context_sched_out().
+ */
+ task = ctx->task;
goto retry;
}
@@ -1511,6 +1536,8 @@ retry:
* Since the task isn't running, its safe to remove the event, us
* holding the ctx->lock ensures the task won't get scheduled in.
*/
+ if (detach_group)
+ perf_group_detach(event);
list_del_event(event, ctx);
raw_spin_unlock_irq(&ctx->lock);
}
@@ -1659,6 +1686,8 @@ event_sched_in(struct perf_event *event,
u64 tstamp = perf_event_time(event);
int ret = 0;
+ lockdep_assert_held(&ctx->lock);
+
if (event->state <= PERF_EVENT_STATE_OFF)
return 0;
@@ -1714,7 +1743,7 @@ group_sched_in(struct perf_event *group_event,
struct perf_event_context *ctx)
{
struct perf_event *event, *partial_group = NULL;
- struct pmu *pmu = group_event->pmu;
+ struct pmu *pmu = ctx->pmu;
u64 now = ctx->time;
bool simulate = false;
@@ -1943,6 +1972,11 @@ retry:
*/
if (ctx->is_active) {
raw_spin_unlock_irq(&ctx->lock);
+ /*
+ * Reload the task pointer, it might have been changed by
+ * a concurrent perf_event_context_sched_out().
+ */
+ task = ctx->task;
goto retry;
}
@@ -2297,7 +2331,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
next_parent = rcu_dereference(next_ctx->parent_ctx);
/* If neither context have a parent context; they cannot be clones. */
- if (!parent && !next_parent)
+ if (!parent || !next_parent)
goto unlock;
if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
@@ -2563,8 +2597,6 @@ static void perf_branch_stack_sched_in(struct task_struct *prev,
if (cpuctx->ctx.nr_branch_stack > 0
&& pmu->flush_branch_stack) {
- pmu = cpuctx->ctx.pmu;
-
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(pmu);
@@ -2954,6 +2986,22 @@ out:
local_irq_restore(flags);
}
+void perf_event_exec(void)
+{
+ struct perf_event_context *ctx;
+ int ctxn;
+
+ rcu_read_lock();
+ for_each_task_context_nr(ctxn) {
+ ctx = current->perf_event_ctxp[ctxn];
+ if (!ctx)
+ continue;
+
+ perf_event_enable_on_exec(ctx);
+ }
+ rcu_read_unlock();
+}
+
/*
* Cross CPU call to read the hardware event
*/
@@ -3176,7 +3224,8 @@ static void free_event_rcu(struct rcu_head *head)
}
static void ring_buffer_put(struct ring_buffer *rb);
-static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
+static void ring_buffer_attach(struct perf_event *event,
+ struct ring_buffer *rb);
static void unaccount_event_cpu(struct perf_event *event, int cpu)
{
@@ -3227,17 +3276,19 @@ static void __free_event(struct perf_event *event)
if (event->ctx)
put_ctx(event->ctx);
+ if (event->pmu)
+ module_put(event->pmu->module);
+
call_rcu(&event->rcu_head, free_event_rcu);
}
-static void free_event(struct perf_event *event)
+
+static void _free_event(struct perf_event *event)
{
irq_work_sync(&event->pending);
unaccount_event(event);
if (event->rb) {
- struct ring_buffer *rb;
-
/*
* Can happen when we close an event with re-directed output.
*
@@ -3245,57 +3296,38 @@ static void free_event(struct perf_event *event)
* over us; possibly making our ring_buffer_put() the last.
*/
mutex_lock(&event->mmap_mutex);
- rb = event->rb;
- if (rb) {
- rcu_assign_pointer(event->rb, NULL);
- ring_buffer_detach(event, rb);
- ring_buffer_put(rb); /* could be last */
- }
+ ring_buffer_attach(event, NULL);
mutex_unlock(&event->mmap_mutex);
}
if (is_cgroup_event(event))
perf_detach_cgroup(event);
-
__free_event(event);
}
-int perf_event_release_kernel(struct perf_event *event)
+/*
+ * Used to free events which have a known refcount of 1, such as in error paths
+ * where the event isn't exposed yet and inherited events.
+ */
+static void free_event(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
-
- WARN_ON_ONCE(ctx->parent_ctx);
- /*
- * There are two ways this annotation is useful:
- *
- * 1) there is a lock recursion from perf_event_exit_task
- * see the comment there.
- *
- * 2) there is a lock-inversion with mmap_sem through
- * perf_event_read_group(), which takes faults while
- * holding ctx->mutex, however this is called after
- * the last filedesc died, so there is no possibility
- * to trigger the AB-BA case.
- */
- mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
- raw_spin_lock_irq(&ctx->lock);
- perf_group_detach(event);
- raw_spin_unlock_irq(&ctx->lock);
- perf_remove_from_context(event);
- mutex_unlock(&ctx->mutex);
-
- free_event(event);
+ if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
+ "unexpected event refcount: %ld; ptr=%p\n",
+ atomic_long_read(&event->refcount), event)) {
+ /* leak to avoid use-after-free */
+ return;
+ }
- return 0;
+ _free_event(event);
}
-EXPORT_SYMBOL_GPL(perf_event_release_kernel);
/*
* Called when the last reference to the file is gone.
*/
static void put_event(struct perf_event *event)
{
+ struct perf_event_context *ctx = event->ctx;
struct task_struct *owner;
if (!atomic_long_dec_and_test(&event->refcount))
@@ -3334,9 +3366,33 @@ static void put_event(struct perf_event *event)
put_task_struct(owner);
}
- perf_event_release_kernel(event);
+ WARN_ON_ONCE(ctx->parent_ctx);
+ /*
+ * There are two ways this annotation is useful:
+ *
+ * 1) there is a lock recursion from perf_event_exit_task
+ * see the comment there.
+ *
+ * 2) there is a lock-inversion with mmap_sem through
+ * perf_event_read_group(), which takes faults while
+ * holding ctx->mutex, however this is called after
+ * the last filedesc died, so there is no possibility
+ * to trigger the AB-BA case.
+ */
+ mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
+ perf_remove_from_context(event, true);
+ mutex_unlock(&ctx->mutex);
+
+ _free_event(event);
}
+int perf_event_release_kernel(struct perf_event *event)
+{
+ put_event(event);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(perf_event_release_kernel);
+
static int perf_release(struct inode *inode, struct file *file)
{
put_event(file->private_data);
@@ -3672,6 +3728,26 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return 0;
}
+#ifdef CONFIG_COMPAT
+static long perf_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ switch (_IOC_NR(cmd)) {
+ case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
+ case _IOC_NR(PERF_EVENT_IOC_ID):
+ /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */
+ if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
+ cmd &= ~IOCSIZE_MASK;
+ cmd |= sizeof(void *) << IOCSIZE_SHIFT;
+ }
+ break;
+ }
+ return perf_ioctl(file, cmd, arg);
+}
+#else
+# define perf_compat_ioctl NULL
+#endif
+
int perf_event_task_enable(void)
{
struct perf_event *event;
@@ -3837,28 +3913,47 @@ unlock:
static void ring_buffer_attach(struct perf_event *event,
struct ring_buffer *rb)
{
+ struct ring_buffer *old_rb = NULL;
unsigned long flags;
- if (!list_empty(&event->rb_entry))
- return;
+ if (event->rb) {
+ /*
+ * Should be impossible, we set this when removing
+ * event->rb_entry and wait/clear when adding event->rb_entry.
+ */
+ WARN_ON_ONCE(event->rcu_pending);
- spin_lock_irqsave(&rb->event_lock, flags);
- if (list_empty(&event->rb_entry))
- list_add(&event->rb_entry, &rb->event_list);
- spin_unlock_irqrestore(&rb->event_lock, flags);
-}
+ old_rb = event->rb;
+ event->rcu_batches = get_state_synchronize_rcu();
+ event->rcu_pending = 1;
-static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
-{
- unsigned long flags;
+ spin_lock_irqsave(&old_rb->event_lock, flags);
+ list_del_rcu(&event->rb_entry);
+ spin_unlock_irqrestore(&old_rb->event_lock, flags);
+ }
- if (list_empty(&event->rb_entry))
- return;
+ if (event->rcu_pending && rb) {
+ cond_synchronize_rcu(event->rcu_batches);
+ event->rcu_pending = 0;
+ }
+
+ if (rb) {
+ spin_lock_irqsave(&rb->event_lock, flags);
+ list_add_rcu(&event->rb_entry, &rb->event_list);
+ spin_unlock_irqrestore(&rb->event_lock, flags);
+ }
- spin_lock_irqsave(&rb->event_lock, flags);
- list_del_init(&event->rb_entry);
- wake_up_all(&event->waitq);
- spin_unlock_irqrestore(&rb->event_lock, flags);
+ rcu_assign_pointer(event->rb, rb);
+
+ if (old_rb) {
+ ring_buffer_put(old_rb);
+ /*
+ * Since we detached before setting the new rb, so that we
+ * could attach the new rb, we could have missed a wakeup.
+ * Provide it now.
+ */
+ wake_up_all(&event->waitq);
+ }
}
static void ring_buffer_wakeup(struct perf_event *event)
@@ -3927,7 +4022,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
- struct ring_buffer *rb = event->rb;
+ struct ring_buffer *rb = ring_buffer_get(event);
struct user_struct *mmap_user = rb->mmap_user;
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);
@@ -3935,18 +4030,14 @@ static void perf_mmap_close(struct vm_area_struct *vma)
atomic_dec(&rb->mmap_count);
if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
- return;
+ goto out_put;
- /* Detach current event from the buffer. */
- rcu_assign_pointer(event->rb, NULL);
- ring_buffer_detach(event, rb);
+ ring_buffer_attach(event, NULL);
mutex_unlock(&event->mmap_mutex);
/* If there's still other mmap()s of this buffer, we're done. */
- if (atomic_read(&rb->mmap_count)) {
- ring_buffer_put(rb); /* can't be last */
- return;
- }
+ if (atomic_read(&rb->mmap_count))
+ goto out_put;
/*
* No other mmap()s, detach from all other events that might redirect
@@ -3976,11 +4067,9 @@ again:
* still restart the iteration to make sure we're not now
* iterating the wrong list.
*/
- if (event->rb == rb) {
- rcu_assign_pointer(event->rb, NULL);
- ring_buffer_detach(event, rb);
- ring_buffer_put(rb); /* can't be last, we still have one */
- }
+ if (event->rb == rb)
+ ring_buffer_attach(event, NULL);
+
mutex_unlock(&event->mmap_mutex);
put_event(event);
@@ -4005,6 +4094,7 @@ again:
vma->vm_mm->pinned_vm -= mmap_locked;
free_uid(mmap_user);
+out_put:
ring_buffer_put(rb); /* could be last */
}
@@ -4122,7 +4212,6 @@ again:
vma->vm_mm->pinned_vm += extra;
ring_buffer_attach(event, rb);
- rcu_assign_pointer(event->rb, rb);
perf_event_init_userpage(event);
perf_event_update_userpage(event);
@@ -4164,7 +4253,7 @@ static const struct file_operations perf_fops = {
.read = perf_read,
.poll = perf_poll,
.unlocked_ioctl = perf_ioctl,
- .compat_ioctl = perf_ioctl,
+ .compat_ioctl = perf_compat_ioctl,
.mmap = perf_mmap,
.fasync = perf_fasync,
};
@@ -5034,21 +5123,9 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
NULL);
}
-void perf_event_comm(struct task_struct *task)
+void perf_event_comm(struct task_struct *task, bool exec)
{
struct perf_comm_event comm_event;
- struct perf_event_context *ctx;
- int ctxn;
-
- rcu_read_lock();
- for_each_task_context_nr(ctxn) {
- ctx = task->perf_event_ctxp[ctxn];
- if (!ctx)
- continue;
-
- perf_event_enable_on_exec(ctx);
- }
- rcu_read_unlock();
if (!atomic_read(&nr_comm_events))
return;
@@ -5060,7 +5137,7 @@ void perf_event_comm(struct task_struct *task)
.event_id = {
.header = {
.type = PERF_RECORD_COMM,
- .misc = 0,
+ .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
/* .size */
},
/* .pid */
@@ -5083,6 +5160,7 @@ struct perf_mmap_event {
int maj, min;
u64 ino;
u64 ino_generation;
+ u32 prot, flags;
struct {
struct perf_event_header header;
@@ -5124,6 +5202,8 @@ static void perf_event_mmap_output(struct perf_event *event,
mmap_event->event_id.header.size += sizeof(mmap_event->min);
mmap_event->event_id.header.size += sizeof(mmap_event->ino);
mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
+ mmap_event->event_id.header.size += sizeof(mmap_event->prot);
+ mmap_event->event_id.header.size += sizeof(mmap_event->flags);
}
perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
@@ -5142,6 +5222,8 @@ static void perf_event_mmap_output(struct perf_event *event,
perf_output_put(&handle, mmap_event->min);
perf_output_put(&handle, mmap_event->ino);
perf_output_put(&handle, mmap_event->ino_generation);
+ perf_output_put(&handle, mmap_event->prot);
+ perf_output_put(&handle, mmap_event->flags);
}
__output_copy(&handle, mmap_event->file_name,
@@ -5160,6 +5242,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
struct file *file = vma->vm_file;
int maj = 0, min = 0;
u64 ino = 0, gen = 0;
+ u32 prot = 0, flags = 0;
unsigned int size;
char tmp[16];
char *buf = NULL;
@@ -5190,8 +5273,36 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
gen = inode->i_generation;
maj = MAJOR(dev);
min = MINOR(dev);
+
+ if (vma->vm_flags & VM_READ)
+ prot |= PROT_READ;
+ if (vma->vm_flags & VM_WRITE)
+ prot |= PROT_WRITE;
+ if (vma->vm_flags & VM_EXEC)
+ prot |= PROT_EXEC;
+
+ if (vma->vm_flags & VM_MAYSHARE)
+ flags = MAP_SHARED;
+ else
+ flags = MAP_PRIVATE;
+
+ if (vma->vm_flags & VM_DENYWRITE)
+ flags |= MAP_DENYWRITE;
+ if (vma->vm_flags & VM_MAYEXEC)
+ flags |= MAP_EXECUTABLE;
+ if (vma->vm_flags & VM_LOCKED)
+ flags |= MAP_LOCKED;
+ if (vma->vm_flags & VM_HUGETLB)
+ flags |= MAP_HUGETLB;
+
goto got_name;
} else {
+ if (vma->vm_ops && vma->vm_ops->name) {
+ name = (char *) vma->vm_ops->name(vma);
+ if (name)
+ goto cpy_name;
+ }
+
name = (char *)arch_vma_name(vma);
if (name)
goto cpy_name;
@@ -5230,6 +5341,8 @@ got_name:
mmap_event->min = min;
mmap_event->ino = ino;
mmap_event->ino_generation = gen;
+ mmap_event->prot = prot;
+ mmap_event->flags = flags;
if (!(vma->vm_flags & VM_EXEC))
mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
@@ -5270,6 +5383,8 @@ void perf_event_mmap(struct vm_area_struct *vma)
/* .min (attr_mmap2 only) */
/* .ino (attr_mmap2 only) */
/* .ino_generation (attr_mmap2 only) */
+ /* .prot (attr_mmap2 only) */
+ /* .flags (attr_mmap2 only) */
};
perf_event_mmap_event(&mmap_event);
@@ -5406,6 +5521,9 @@ struct swevent_htable {
/* Recursion avoidance in each contexts */
int recursion[PERF_NR_CONTEXTS];
+
+ /* Keeps track of cpu being initialized/exited */
+ bool online;
};
static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
@@ -5652,8 +5770,14 @@ static int perf_swevent_add(struct perf_event *event, int flags)
hwc->state = !(flags & PERF_EF_START);
head = find_swevent_head(swhash, event);
- if (WARN_ON_ONCE(!head))
+ if (!head) {
+ /*
+ * We can race with cpu hotplug code. Do not
+ * WARN if the cpu just got unplugged.
+ */
+ WARN_ON_ONCE(swhash->online);
return -EINVAL;
+ }
hlist_add_head_rcu(&event->hlist_entry, head);
@@ -6294,7 +6418,7 @@ static int perf_event_idx_default(struct perf_event *event)
* Ensures all contexts with the same task_ctx_nr have the same
* pmu_cpu_context too.
*/
-static void *find_pmu_context(int ctxn)
+static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
{
struct pmu *pmu;
@@ -6549,6 +6673,7 @@ free_pdc:
free_percpu(pmu->pmu_disable_count);
goto unlock;
}
+EXPORT_SYMBOL_GPL(perf_pmu_register);
void perf_pmu_unregister(struct pmu *pmu)
{
@@ -6570,6 +6695,7 @@ void perf_pmu_unregister(struct pmu *pmu)
put_device(pmu->dev);
free_pmu_context(pmu);
}
+EXPORT_SYMBOL_GPL(perf_pmu_unregister);
struct pmu *perf_init_event(struct perf_event *event)
{
@@ -6583,6 +6709,10 @@ struct pmu *perf_init_event(struct perf_event *event)
pmu = idr_find(&pmu_idr, event->attr.type);
rcu_read_unlock();
if (pmu) {
+ if (!try_module_get(pmu->module)) {
+ pmu = ERR_PTR(-ENODEV);
+ goto unlock;
+ }
event->pmu = pmu;
ret = pmu->event_init(event);
if (ret)
@@ -6591,6 +6721,10 @@ struct pmu *perf_init_event(struct perf_event *event)
}
list_for_each_entry_rcu(pmu, &pmus, entry) {
+ if (!try_module_get(pmu->module)) {
+ pmu = ERR_PTR(-ENODEV);
+ goto unlock;
+ }
event->pmu = pmu;
ret = pmu->event_init(event);
if (!ret)
@@ -6769,6 +6903,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
err_pmu:
if (event->destroy)
event->destroy(event);
+ module_put(pmu->module);
err_ns:
if (event->ns)
put_pid_ns(event->ns);
@@ -6832,10 +6967,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
if (ret)
return -EFAULT;
- /* disabled for now */
- if (attr->mmap2)
- return -EINVAL;
-
if (attr->__reserved_1)
return -EINVAL;
@@ -6912,7 +7043,7 @@ err_size:
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
- struct ring_buffer *rb = NULL, *old_rb = NULL;
+ struct ring_buffer *rb = NULL;
int ret = -EINVAL;
if (!output_event)
@@ -6940,8 +7071,6 @@ set:
if (atomic_read(&event->mmap_count))
goto unlock;
- old_rb = event->rb;
-
if (output_event) {
/* get the rb we want to redirect to */
rb = ring_buffer_get(output_event);
@@ -6949,23 +7078,7 @@ set:
goto unlock;
}
- if (old_rb)
- ring_buffer_detach(event, old_rb);
-
- if (rb)
- ring_buffer_attach(event, rb);
-
- rcu_assign_pointer(event->rb, rb);
-
- if (old_rb) {
- ring_buffer_put(old_rb);
- /*
- * Since we detached before setting the new rb, so that we
- * could attach the new rb, we could have missed a wakeup.
- * Provide it now.
- */
- wake_up_all(&event->waitq);
- }
+ ring_buffer_attach(event, rb);
ret = 0;
unlock:
@@ -7016,6 +7129,9 @@ SYSCALL_DEFINE5(perf_event_open,
if (attr.freq) {
if (attr.sample_freq > sysctl_perf_event_sample_rate)
return -EINVAL;
+ } else {
+ if (attr.sample_period & (1ULL << 63))
+ return -EINVAL;
}
/*
@@ -7053,20 +7169,33 @@ SYSCALL_DEFINE5(perf_event_open,
}
}
+ if (task && group_leader &&
+ group_leader->attr.inherit != attr.inherit) {
+ err = -EINVAL;
+ goto err_task;
+ }
+
get_online_cpus();
event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
NULL, NULL);
if (IS_ERR(event)) {
err = PTR_ERR(event);
- goto err_task;
+ goto err_cpus;
}
if (flags & PERF_FLAG_PID_CGROUP) {
err = perf_cgroup_connect(pid, event, &attr, group_leader);
if (err) {
__free_event(event);
- goto err_task;
+ goto err_cpus;
+ }
+ }
+
+ if (is_sampling_event(event)) {
+ if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
+ err = -ENOTSUPP;
+ goto err_alloc;
}
}
@@ -7163,7 +7292,7 @@ SYSCALL_DEFINE5(perf_event_open,
struct perf_event_context *gctx = group_leader->ctx;
mutex_lock(&gctx->mutex);
- perf_remove_from_context(group_leader);
+ perf_remove_from_context(group_leader, false);
/*
* Removing from the context ends up with disabled
@@ -7173,7 +7302,7 @@ SYSCALL_DEFINE5(perf_event_open,
perf_event__state_init(group_leader);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
- perf_remove_from_context(sibling);
+ perf_remove_from_context(sibling, false);
perf_event__state_init(sibling);
put_ctx(gctx);
}
@@ -7228,8 +7357,9 @@ err_context:
put_ctx(ctx);
err_alloc:
free_event(event);
-err_task:
+err_cpus:
put_online_cpus();
+err_task:
if (task)
put_task_struct(task);
err_group_fd:
@@ -7303,7 +7433,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
mutex_lock(&src_ctx->mutex);
list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
event_entry) {
- perf_remove_from_context(event);
+ perf_remove_from_context(event, false);
unaccount_event_cpu(event, src_cpu);
put_ctx(src_ctx);
list_add(&event->migrate_entry, &events);
@@ -7365,13 +7495,19 @@ __perf_event_exit_task(struct perf_event *child_event,
struct perf_event_context *child_ctx,
struct task_struct *child)
{
- if (child_event->parent) {
- raw_spin_lock_irq(&child_ctx->lock);
- perf_group_detach(child_event);
- raw_spin_unlock_irq(&child_ctx->lock);
- }
-
- perf_remove_from_context(child_event);
+ /*
+ * Do not destroy the 'original' grouping; because of the context
+ * switch optimization the original events could've ended up in a
+ * random child task.
+ *
+ * If we were to destroy the original group, all group related
+ * operations would cease to function properly after this random
+ * child dies.
+ *
+ * Do destroy all inherited groups, we don't care about those
+ * and being thorough is better.
+ */
+ perf_remove_from_context(child_event, !!child_event->parent);
/*
* It can happen that the parent exits first, and has events
@@ -7386,8 +7522,8 @@ __perf_event_exit_task(struct perf_event *child_event,
static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
{
- struct perf_event *child_event, *tmp;
- struct perf_event_context *child_ctx;
+ struct perf_event *child_event, *next;
+ struct perf_event_context *child_ctx, *parent_ctx;
unsigned long flags;
if (likely(!child->perf_event_ctxp[ctxn])) {
@@ -7412,6 +7548,15 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
raw_spin_lock(&child_ctx->lock);
task_ctx_sched_out(child_ctx);
child->perf_event_ctxp[ctxn] = NULL;
+
+ /*
+ * In order to avoid freeing: child_ctx->parent_ctx->task
+ * under perf_event_context::lock, grab another reference.
+ */
+ parent_ctx = child_ctx->parent_ctx;
+ if (parent_ctx)
+ get_ctx(parent_ctx);
+
/*
* If this context is a clone; unclone it so it can't get
* swapped to another process while we're removing all
@@ -7422,6 +7567,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
/*
+ * Now that we no longer hold perf_event_context::lock, drop
+ * our extra child_ctx->parent_ctx reference.
+ */
+ if (parent_ctx)
+ put_ctx(parent_ctx);
+
+ /*
* Report the task dead after unscheduling the events so that we
* won't get any samples after PERF_RECORD_EXIT. We can however still
* get a few PERF_RECORD_READ events.
@@ -7440,24 +7592,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
*/
mutex_lock(&child_ctx->mutex);
-again:
- list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
- group_entry)
- __perf_event_exit_task(child_event, child_ctx, child);
-
- list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
- group_entry)
+ list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
__perf_event_exit_task(child_event, child_ctx, child);
- /*
- * If the last event was a group event, it will have appended all
- * its siblings to the list, but we obtained 'tmp' before that which
- * will still point to the list head terminating the iteration.
- */
- if (!list_empty(&child_ctx->pinned_groups) ||
- !list_empty(&child_ctx->flexible_groups))
- goto again;
-
mutex_unlock(&child_ctx->mutex);
put_ctx(child_ctx);
@@ -7704,7 +7841,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
/*
* Initialize the perf_event context in task_struct
*/
-int perf_event_init_context(struct task_struct *child, int ctxn)
+static int perf_event_init_context(struct task_struct *child, int ctxn)
{
struct perf_event_context *child_ctx, *parent_ctx;
struct perf_event_context *cloned_ctx;
@@ -7722,6 +7859,8 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
* swapped under us.
*/
parent_ctx = perf_pin_task_context(parent, ctxn);
+ if (!parent_ctx)
+ return 0;
/*
* No need to check if parent_ctx != NULL here; since we saw
@@ -7809,8 +7948,10 @@ int perf_event_init_task(struct task_struct *child)
for_each_task_context_nr(ctxn) {
ret = perf_event_init_context(child, ctxn);
- if (ret)
+ if (ret) {
+ perf_event_free_task(child);
return ret;
+ }
}
return 0;
@@ -7833,6 +7974,7 @@ static void perf_event_init_cpu(int cpu)
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
mutex_lock(&swhash->hlist_mutex);
+ swhash->online = true;
if (swhash->hlist_refcount > 0) {
struct swevent_hlist *hlist;
@@ -7855,14 +7997,14 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)
static void __perf_event_exit_context(void *__info)
{
+ struct remove_event re = { .detach_group = false };
struct perf_event_context *ctx = __info;
- struct perf_event *event;
perf_pmu_rotate_stop(ctx->pmu);
rcu_read_lock();
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry)
- __perf_remove_from_context(event);
+ list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
+ __perf_remove_from_context(&re);
rcu_read_unlock();
}
@@ -7890,6 +8032,7 @@ static void perf_event_exit_cpu(int cpu)
perf_event_exit_cpu_context(cpu);
mutex_lock(&swhash->hlist_mutex);
+ swhash->online = false;
swevent_hlist_release(swhash);
mutex_unlock(&swhash->hlist_mutex);
}
@@ -8036,7 +8179,7 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css,
{
struct task_struct *task;
- cgroup_taskset_for_each(task, css, tset)
+ cgroup_taskset_for_each(task, tset)
task_function_call(task, __perf_cgroup_move, task);
}
@@ -8055,9 +8198,7 @@ static void perf_cgroup_exit(struct cgroup_subsys_state *css,
task_function_call(task, __perf_cgroup_move, task);
}
-struct cgroup_subsys perf_subsys = {
- .name = "perf_event",
- .subsys_id = perf_subsys_id,
+struct cgroup_subsys perf_event_cgrp_subsys = {
.css_alloc = perf_cgroup_css_alloc,
.css_free = perf_cgroup_css_free,
.exit = perf_cgroup_exit,
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 307d87c0991a..1d0af8a2c646 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -36,6 +36,7 @@
#include "../../mm/internal.h" /* munlock_vma_page */
#include <linux/percpu-rwsem.h>
#include <linux/task_work.h>
+#include <linux/shmem_fs.h>
#include <linux/uprobes.h>
@@ -60,8 +61,6 @@ static struct percpu_rw_semaphore dup_mmap_sem;
/* Have a copy of original instruction */
#define UPROBE_COPY_INSN 0
-/* Can skip singlestep */
-#define UPROBE_SKIP_SSTEP 1
struct uprobe {
struct rb_node rb_node; /* node in the rb tree */
@@ -129,7 +128,7 @@ struct xol_area {
*/
static bool valid_vma(struct vm_area_struct *vma, bool is_register)
{
- vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED;
+ vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
if (is_register)
flags |= VM_WRITE;
@@ -168,6 +167,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
/* For mmu_notifiers */
const unsigned long mmun_start = addr;
const unsigned long mmun_end = addr + PAGE_SIZE;
+ struct mem_cgroup *memcg;
+
+ err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg);
+ if (err)
+ return err;
/* For try_to_free_swap() and munlock_vma_page() below */
lock_page(page);
@@ -180,6 +184,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
get_page(kpage);
page_add_new_anon_rmap(kpage, vma, addr);
+ mem_cgroup_commit_charge(kpage, memcg, false);
+ lru_cache_add_active_or_unevictable(kpage, vma);
if (!PageAnon(page)) {
dec_mm_counter(mm, MM_FILEPAGES);
@@ -201,6 +207,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
err = 0;
unlock:
+ mem_cgroup_cancel_charge(kpage, memcg);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
unlock_page(page);
return err;
@@ -281,18 +288,13 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
* supported by that architecture then we need to modify is_trap_at_addr and
* uprobe_write_opcode accordingly. This would never be a problem for archs
* that have fixed length instructions.
- */
-
-/*
+ *
* uprobe_write_opcode - write the opcode at a given virtual address.
* @mm: the probed process address space.
* @vaddr: the virtual address to store the opcode.
* @opcode: opcode to be written at @vaddr.
*
- * Called with mm->mmap_sem held (for read and with a reference to
- * mm).
- *
- * For mm @mm, write the opcode at @vaddr.
+ * Called with mm->mmap_sem held for write.
* Return 0 (success) or a negative errno.
*/
int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
@@ -312,23 +314,20 @@ retry:
if (ret <= 0)
goto put_old;
+ ret = anon_vma_prepare(vma);
+ if (ret)
+ goto put_old;
+
ret = -ENOMEM;
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
if (!new_page)
goto put_old;
__SetPageUptodate(new_page);
-
copy_highpage(new_page, old_page);
copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
- ret = anon_vma_prepare(vma);
- if (ret)
- goto put_new;
-
ret = __replace_page(vma, vaddr, old_page, new_page);
-
-put_new:
page_cache_release(new_page);
put_old:
put_page(old_page);
@@ -491,12 +490,9 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
uprobe->offset = offset;
init_rwsem(&uprobe->register_rwsem);
init_rwsem(&uprobe->consumer_rwsem);
- /* For now assume that the instruction need not be single-stepped */
- __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
/* add to uprobes_tree, sorted on inode:offset */
cur_uprobe = insert_uprobe(uprobe);
-
/* a uprobe exists for this inode:offset combination */
if (cur_uprobe) {
kfree(uprobe);
@@ -542,14 +538,15 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
void *insn, int nbytes, loff_t offset)
{
struct page *page;
-
- if (!mapping->a_ops->readpage)
- return -EIO;
/*
- * Ensure that the page that has the original instruction is
- * populated and in page-cache.
+ * Ensure that the page that has the original instruction is populated
+ * and in page-cache. If ->readpage == NULL it must be shmem_mapping(),
+ * see uprobe_register().
*/
- page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
+ if (mapping->a_ops->readpage)
+ page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
+ else
+ page = shmem_read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT);
if (IS_ERR(page))
return PTR_ERR(page);
@@ -850,7 +847,7 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u
{
int err;
- if (!consumer_del(uprobe, uc)) /* WARN? */
+ if (WARN_ON(!consumer_del(uprobe, uc)))
return;
err = register_for_each_vma(uprobe, NULL);
@@ -885,6 +882,9 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
if (!uc->handler && !uc->ret_handler)
return -EINVAL;
+ /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
+ if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
+ return -EIO;
/* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))
return -EINVAL;
@@ -928,7 +928,7 @@ int uprobe_apply(struct inode *inode, loff_t offset,
int ret = -ENOENT;
uprobe = find_uprobe(inode, offset);
- if (!uprobe)
+ if (WARN_ON(!uprobe))
return ret;
down_write(&uprobe->register_rwsem);
@@ -953,7 +953,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
struct uprobe *uprobe;
uprobe = find_uprobe(inode, offset);
- if (!uprobe)
+ if (WARN_ON(!uprobe))
return;
down_write(&uprobe->register_rwsem);
@@ -1296,14 +1296,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
if (unlikely(!xol_vaddr))
return 0;
- /* Initialize the slot */
- copy_to_page(area->page, xol_vaddr,
- &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
- /*
- * We probably need flush_icache_user_range() but it needs vma.
- * This should work on supported architectures too.
- */
- flush_dcache_page(area->page);
+ arch_uprobe_copy_ixol(area->page, xol_vaddr,
+ &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
return xol_vaddr;
}
@@ -1346,6 +1340,21 @@ static void xol_free_insn_slot(struct task_struct *tsk)
}
}
+void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
+ void *src, unsigned long len)
+{
+ /* Initialize the slot */
+ copy_to_page(page, vaddr, src, len);
+
+ /*
+ * We probably need flush_icache_user_range() but it needs vma.
+ * This should work on most of architectures by default. If
+ * architecture needs to do something different it can define
+ * its own version of the function.
+ */
+ flush_dcache_page(page);
+}
+
/**
* uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
* @regs: Reflects the saved state of the task after it has hit a breakpoint
@@ -1357,6 +1366,16 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
}
+unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ if (unlikely(utask && utask->active_uprobe))
+ return utask->vaddr;
+
+ return instruction_pointer(regs);
+}
+
/*
* Called with no locks held.
* Called in context of a exiting or a exec-ing thread.
@@ -1628,20 +1647,6 @@ bool uprobe_deny_signal(void)
return true;
}
-/*
- * Avoid singlestepping the original instruction if the original instruction
- * is a NOP or can be emulated.
- */
-static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
-{
- if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) {
- if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
- return true;
- clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
- }
- return false;
-}
-
static void mmf_recalc_uprobes(struct mm_struct *mm)
{
struct vm_area_struct *vma;
@@ -1804,6 +1809,11 @@ static bool handle_trampoline(struct pt_regs *regs)
return true;
}
+bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
+{
+ return false;
+}
+
/*
* Run handler and ask thread to singlestep.
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -1858,14 +1868,18 @@ static void handle_swbp(struct pt_regs *regs)
if (!get_utask())
goto out;
+ if (arch_uprobe_ignore(&uprobe->arch, regs))
+ goto out;
+
handler_chain(uprobe, regs);
- if (can_skip_sstep(uprobe, regs))
+
+ if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
goto out;
if (!pre_ssout(uprobe, regs, bp_vaddr))
return;
- /* can_skip_sstep() succeeded, or restart if can't singlestep */
+ /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
out:
put_uprobe(uprobe);
}
@@ -1877,10 +1891,11 @@ out:
static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
{
struct uprobe *uprobe;
+ int err = 0;
uprobe = utask->active_uprobe;
if (utask->state == UTASK_SSTEP_ACK)
- arch_uprobe_post_xol(&uprobe->arch, regs);
+ err = arch_uprobe_post_xol(&uprobe->arch, regs);
else if (utask->state == UTASK_SSTEP_TRAPPED)
arch_uprobe_abort_xol(&uprobe->arch, regs);
else
@@ -1894,6 +1909,11 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
spin_lock_irq(&current->sighand->siglock);
recalc_sigpending(); /* see uprobe_deny_signal() */
spin_unlock_irq(&current->sighand->siglock);
+
+ if (unlikely(err)) {
+ uprobe_warn(current, "execute the probed insn, sending SIGILL.");
+ force_sig_info(SIGILL, SEND_SIG_FORCED, current);
+ }
}
/*
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 0dbeae374225..83d4382f5699 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -37,7 +37,7 @@ static unsigned long ident_map[32] = {
struct exec_domain default_exec_domain = {
.name = "Linux", /* name */
.handler = default_handler, /* lcall7 causes a seg fault. */
- .pers_low = 0, /* PER_LINUX personality. */
+ .pers_low = 0, /* PER_LINUX personality. */
.pers_high = 0, /* PER_LINUX personality. */
.signal_map = ident_map, /* Identity map signals. */
.signal_invmap = ident_map, /* - both ways. */
@@ -83,7 +83,7 @@ lookup_exec_domain(unsigned int personality)
ep = &default_exec_domain;
out:
read_unlock(&exec_domains_lock);
- return (ep);
+ return ep;
}
int
@@ -110,8 +110,9 @@ register_exec_domain(struct exec_domain *ep)
out:
write_unlock(&exec_domains_lock);
- return (err);
+ return err;
}
+EXPORT_SYMBOL(register_exec_domain);
int
unregister_exec_domain(struct exec_domain *ep)
@@ -133,6 +134,7 @@ unregister:
write_unlock(&exec_domains_lock);
return 0;
}
+EXPORT_SYMBOL(unregister_exec_domain);
int __set_personality(unsigned int personality)
{
@@ -144,6 +146,7 @@ int __set_personality(unsigned int personality)
return 0;
}
+EXPORT_SYMBOL(__set_personality);
#ifdef CONFIG_PROC_FS
static int execdomains_proc_show(struct seq_file *m, void *v)
@@ -188,8 +191,3 @@ SYSCALL_DEFINE1(personality, unsigned int, personality)
return old;
}
-
-
-EXPORT_SYMBOL(register_exec_domain);
-EXPORT_SYMBOL(unregister_exec_domain);
-EXPORT_SYMBOL(__set_personality);
diff --git a/kernel/exit.c b/kernel/exit.c
index 1e77fc645317..32c58f7433a3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -59,7 +59,7 @@
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
-static void exit_mm(struct task_struct * tsk);
+static void exit_mm(struct task_struct *tsk);
static void __unhash_process(struct task_struct *p, bool group_dead)
{
@@ -151,7 +151,7 @@ static void __exit_signal(struct task_struct *tsk)
spin_unlock(&sighand->siglock);
__cleanup_sighand(sighand);
- clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
+ clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
if (group_dead) {
flush_sigqueue(&sig->shared_pending);
tty_kref_put(tty);
@@ -168,7 +168,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
}
-void release_task(struct task_struct * p)
+void release_task(struct task_struct *p)
{
struct task_struct *leader;
int zap_leader;
@@ -192,7 +192,8 @@ repeat:
*/
zap_leader = 0;
leader = p->group_leader;
- if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
+ if (leader != p && thread_group_empty(leader)
+ && leader->exit_state == EXIT_ZOMBIE) {
/*
* If we were the last child thread and the leader has
* exited already, and the leader's parent ignores SIGCHLD,
@@ -241,7 +242,8 @@ struct pid *session_of_pgrp(struct pid *pgrp)
*
* "I ask you, have you ever known what it is to be an orphan?"
*/
-static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task)
+static int will_become_orphaned_pgrp(struct pid *pgrp,
+ struct task_struct *ignored_task)
{
struct task_struct *p;
@@ -294,9 +296,9 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
struct task_struct *ignored_task = tsk;
if (!parent)
- /* exit: our father is in a different pgrp than
- * we are and we were the only connection outside.
- */
+ /* exit: our father is in a different pgrp than
+ * we are and we were the only connection outside.
+ */
parent = tsk->real_parent;
else
/* reparent: our child is in a different pgrp than
@@ -313,46 +315,7 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
}
}
-/*
- * Let kernel threads use this to say that they allow a certain signal.
- * Must not be used if kthread was cloned with CLONE_SIGHAND.
- */
-int allow_signal(int sig)
-{
- if (!valid_signal(sig) || sig < 1)
- return -EINVAL;
-
- spin_lock_irq(&current->sighand->siglock);
- /* This is only needed for daemonize()'ed kthreads */
- sigdelset(&current->blocked, sig);
- /*
- * Kernel threads handle their own signals. Let the signal code
- * know it'll be handled, so that they don't get converted to
- * SIGKILL or just silently dropped.
- */
- current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
- return 0;
-}
-
-EXPORT_SYMBOL(allow_signal);
-
-int disallow_signal(int sig)
-{
- if (!valid_signal(sig) || sig < 1)
- return -EINVAL;
-
- spin_lock_irq(&current->sighand->siglock);
- current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
- return 0;
-}
-
-EXPORT_SYMBOL(disallow_signal);
-
-#ifdef CONFIG_MM_OWNER
+#ifdef CONFIG_MEMCG
/*
* A task is exiting. If it owned this mm, find a new owner for the mm.
*/
@@ -395,14 +358,18 @@ retry:
}
/*
- * Search through everything else. We should not get
- * here often
+ * Search through everything else, we should not get here often.
*/
- do_each_thread(g, c) {
- if (c->mm == mm)
- goto assign_new_owner;
- } while_each_thread(g, c);
-
+ for_each_process(g) {
+ if (g->flags & PF_KTHREAD)
+ continue;
+ for_each_thread(g, c) {
+ if (c->mm == mm)
+ goto assign_new_owner;
+ if (c->mm)
+ break;
+ }
+ }
read_unlock(&tasklist_lock);
/*
* We found no owner yet mm_users > 1: this implies that we are
@@ -434,13 +401,13 @@ assign_new_owner:
task_unlock(c);
put_task_struct(c);
}
-#endif /* CONFIG_MM_OWNER */
+#endif /* CONFIG_MEMCG */
/*
* Turn us into a lazy TLB process if we
* aren't already..
*/
-static void exit_mm(struct task_struct * tsk)
+static void exit_mm(struct task_struct *tsk)
{
struct mm_struct *mm = tsk->mm;
struct core_state *core_state;
@@ -460,6 +427,7 @@ static void exit_mm(struct task_struct * tsk)
core_state = mm->core_state;
if (core_state) {
struct core_thread self;
+
up_read(&mm->mmap_sem);
self.task = tsk;
@@ -490,6 +458,7 @@ static void exit_mm(struct task_struct * tsk)
task_unlock(tsk);
mm_update_next_owner(mm);
mmput(mm);
+ clear_thread_flag(TIF_MEMDIE);
}
/*
@@ -570,7 +539,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
if (same_thread_group(p->real_parent, father))
return;
- /* We don't want people slaying init. */
+ /* We don't want people slaying init. */
p->exit_signal = SIGCHLD;
/* If it has exited notify the new parent about this child's death. */
@@ -600,6 +569,7 @@ static void forget_original_parent(struct task_struct *father)
list_for_each_entry_safe(p, n, &father->children, sibling) {
struct task_struct *t = p;
+
do {
t->real_parent = reaper;
if (t->parent == father) {
@@ -633,7 +603,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
/*
* This does two things:
*
- * A. Make init inherit all the child processes
+ * A. Make init inherit all the child processes
* B. Check to see if any process groups have become orphaned
* as a result of our exiting, and if they have any stopped
* jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
@@ -683,9 +653,8 @@ static void check_stack_usage(void)
spin_lock(&low_water_lock);
if (free < lowest_to_date) {
- printk(KERN_WARNING "%s (%d) used greatest stack depth: "
- "%lu bytes left\n",
- current->comm, task_pid_nr(current), free);
+ pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n",
+ current->comm, task_pid_nr(current), free);
lowest_to_date = free;
}
spin_unlock(&low_water_lock);
@@ -726,8 +695,7 @@ void do_exit(long code)
* leave this task alone and wait for reboot.
*/
if (unlikely(tsk->flags & PF_EXITING)) {
- printk(KERN_ALERT
- "Fixing recursive fault but reboot is needed!\n");
+ pr_alert("Fixing recursive fault but reboot is needed!\n");
/*
* We can do this unlocked here. The futex code uses
* this flag just to verify whether the pi state
@@ -751,9 +719,9 @@ void do_exit(long code)
raw_spin_unlock_wait(&tsk->pi_lock);
if (unlikely(in_atomic()))
- printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
- current->comm, task_pid_nr(current),
- preempt_count());
+ pr_info("note: %s[%d] exited with preempt_count %d\n",
+ current->comm, task_pid_nr(current),
+ preempt_count());
acct_update_integrals(tsk);
/* sync mm's RSS info before statistics gathering */
@@ -784,9 +752,10 @@ void do_exit(long code)
exit_shm(tsk);
exit_files(tsk);
exit_fs(tsk);
+ if (group_dead)
+ disassociate_ctty(1);
exit_task_namespaces(tsk);
exit_task_work(tsk);
- check_stack_usage();
exit_thread();
/*
@@ -797,21 +766,17 @@ void do_exit(long code)
*/
perf_event_exit_task(tsk);
- cgroup_exit(tsk, 1);
-
- if (group_dead)
- disassociate_ctty(1);
+ cgroup_exit(tsk);
module_put(task_thread_info(tsk)->exec_domain->module);
- proc_exit_connector(tsk);
-
/*
* FIXME: do that only when needed, using sched_exit tracepoint
*/
flush_ptrace_hw_breakpoint(tsk);
exit_notify(tsk, group_dead);
+ proc_exit_connector(tsk);
#ifdef CONFIG_NUMA
task_lock(tsk);
mpol_put(tsk->mempolicy);
@@ -844,6 +809,7 @@ void do_exit(long code)
validate_creds_for_do_exit(tsk);
+ check_stack_usage();
preempt_disable();
if (tsk->nr_dirtied)
__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
@@ -873,7 +839,6 @@ void do_exit(long code)
for (;;)
cpu_relax(); /* For when BUG is null */
}
-
EXPORT_SYMBOL_GPL(do_exit);
void complete_and_exit(struct completion *comp, long code)
@@ -883,7 +848,6 @@ void complete_and_exit(struct completion *comp, long code)
do_exit(code);
}
-
EXPORT_SYMBOL(complete_and_exit);
SYSCALL_DEFINE1(exit, int, error_code)
@@ -906,6 +870,7 @@ do_group_exit(int exit_code)
exit_code = sig->group_exit_code;
else if (!thread_group_empty(current)) {
struct sighand_struct *const sighand = current->sighand;
+
spin_lock_irq(&sighand->siglock);
if (signal_group_exit(sig))
/* Another thread got here before we took the lock. */
@@ -1038,17 +1003,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
return wait_noreap_copyout(wo, p, pid, uid, why, status);
}
+ traced = ptrace_reparented(p);
/*
- * Try to move the task's state to DEAD
- * only one thread is allowed to do this:
+ * Move the task's state to DEAD/TRACE, only one thread can do this.
*/
- state = xchg(&p->exit_state, EXIT_DEAD);
- if (state != EXIT_ZOMBIE) {
- BUG_ON(state != EXIT_DEAD);
+ state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD;
+ if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
return 0;
- }
-
- traced = ptrace_reparented(p);
/*
* It can be ptraced but not reparented, check
* thread_group_leader() to filter out sub-threads.
@@ -1074,9 +1035,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
* as other threads in the parent group can be right
* here reaping other children at the same time.
*
- * We use thread_group_cputime_adjusted() to get times for the thread
- * group, which consolidates times for all threads in the
- * group including the group leader.
+ * We use thread_group_cputime_adjusted() to get times for
+ * the thread group, which consolidates times for all threads
+ * in the group including the group leader.
*/
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
spin_lock_irq(&p->real_parent->sighand->siglock);
@@ -1109,7 +1070,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
/*
* Now we are sure this task is interesting, and no other
- * thread can reap it because we set its state to EXIT_DEAD.
+ * thread can reap it because we its state == DEAD/TRACE.
*/
read_unlock(&tasklist_lock);
@@ -1146,22 +1107,19 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
if (!retval)
retval = pid;
- if (traced) {
+ if (state == EXIT_TRACE) {
write_lock_irq(&tasklist_lock);
/* We dropped tasklist, ptracer could die and untrace */
ptrace_unlink(p);
- /*
- * If this is not a sub-thread, notify the parent.
- * If parent wants a zombie, don't release it now.
- */
- if (thread_group_leader(p) &&
- !do_notify_parent(p, p->exit_signal)) {
- p->exit_state = EXIT_ZOMBIE;
- p = NULL;
- }
+
+ /* If parent wants a zombie, don't release it now */
+ state = EXIT_ZOMBIE;
+ if (do_notify_parent(p, p->exit_signal))
+ state = EXIT_DEAD;
+ p->exit_state = state;
write_unlock_irq(&tasklist_lock);
}
- if (p != NULL)
+ if (state == EXIT_DEAD)
release_task(p);
return retval;
@@ -1338,7 +1296,12 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
static int wait_consider_task(struct wait_opts *wo, int ptrace,
struct task_struct *p)
{
- int ret = eligible_child(wo, p);
+ int ret;
+
+ if (unlikely(p->exit_state == EXIT_DEAD))
+ return 0;
+
+ ret = eligible_child(wo, p);
if (!ret)
return ret;
@@ -1356,33 +1319,44 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
return 0;
}
- /* dead body doesn't have much to contribute */
- if (unlikely(p->exit_state == EXIT_DEAD)) {
+ if (unlikely(p->exit_state == EXIT_TRACE)) {
/*
- * But do not ignore this task until the tracer does
- * wait_task_zombie()->do_notify_parent().
+ * ptrace == 0 means we are the natural parent. In this case
+ * we should clear notask_error, debugger will notify us.
*/
- if (likely(!ptrace) && unlikely(ptrace_reparented(p)))
+ if (likely(!ptrace))
wo->notask_error = 0;
return 0;
}
- /* slay zombie? */
- if (p->exit_state == EXIT_ZOMBIE) {
+ if (likely(!ptrace) && unlikely(p->ptrace)) {
/*
- * A zombie ptracee is only visible to its ptracer.
- * Notification and reaping will be cascaded to the real
- * parent when the ptracer detaches.
+ * If it is traced by its real parent's group, just pretend
+ * the caller is ptrace_do_wait() and reap this child if it
+ * is zombie.
+ *
+ * This also hides group stop state from real parent; otherwise
+ * a single stop can be reported twice as group and ptrace stop.
+ * If a ptracer wants to distinguish these two events for its
+ * own children it should create a separate process which takes
+ * the role of real parent.
*/
- if (likely(!ptrace) && unlikely(p->ptrace)) {
- /* it will become visible, clear notask_error */
- wo->notask_error = 0;
- return 0;
- }
+ if (!ptrace_reparented(p))
+ ptrace = 1;
+ }
+ /* slay zombie? */
+ if (p->exit_state == EXIT_ZOMBIE) {
/* we don't reap group leaders with subthreads */
- if (!delay_group_leader(p))
- return wait_task_zombie(wo, p);
+ if (!delay_group_leader(p)) {
+ /*
+ * A zombie ptracee is only visible to its ptracer.
+ * Notification and reaping will be cascaded to the
+ * real parent when the ptracer detaches.
+ */
+ if (unlikely(ptrace) || likely(!p->ptrace))
+ return wait_task_zombie(wo, p);
+ }
/*
* Allow access to stopped/continued state via zombie by
@@ -1408,19 +1382,6 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
wo->notask_error = 0;
} else {
/*
- * If @p is ptraced by a task in its real parent's group,
- * hide group stop/continued state when looking at @p as
- * the real parent; otherwise, a single stop can be
- * reported twice as group and ptrace stops.
- *
- * If a ptracer wants to distinguish the two events for its
- * own children, it should create a separate process which
- * takes the role of real parent.
- */
- if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p))
- return 0;
-
- /*
* @p is alive and it's gonna stop, continue or exit, so
* there always is something to wait for.
*/
@@ -1458,6 +1419,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
list_for_each_entry(p, &tsk->children, sibling) {
int ret = wait_consider_task(wo, 0, p);
+
if (ret)
return ret;
}
@@ -1471,6 +1433,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
int ret = wait_consider_task(wo, 1, p);
+
if (ret)
return ret;
}
diff --git a/kernel/extable.c b/kernel/extable.c
index 763faf037ec1..d8a6446adbcb 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -36,7 +36,7 @@ extern struct exception_table_entry __start___ex_table[];
extern struct exception_table_entry __stop___ex_table[];
/* Cleared by build time tools if the table is already sorted. */
-u32 __initdata main_extable_sort_needed = 1;
+u32 __initdata __visible main_extable_sort_needed = 1;
/* Sort the kernel's built-in exception table */
void __init sort_main_extable(void)
diff --git a/kernel/fork.c b/kernel/fork.c
index a17621c6cd42..a91e47d86de2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -28,6 +28,8 @@
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
@@ -71,6 +73,7 @@
#include <linux/signalfd.h>
#include <linux/uprobes.h>
#include <linux/aio.h>
+#include <linux/compiler.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -147,15 +150,15 @@ void __weak arch_release_thread_info(struct thread_info *ti)
static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
int node)
{
- struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED,
- THREAD_SIZE_ORDER);
+ struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
+ THREAD_SIZE_ORDER);
return page ? page_address(page) : NULL;
}
static inline void free_thread_info(struct thread_info *ti)
{
- free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+ free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
}
# else
static struct kmem_cache *thread_info_cache;
@@ -237,6 +240,7 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(atomic_read(&tsk->usage));
WARN_ON(tsk == current);
+ task_numa_free(tsk);
security_task_free(tsk);
exit_creds(tsk);
delayacct_tsk_free(tsk);
@@ -283,7 +287,7 @@ void __init fork_init(unsigned long mempages)
init_task.signal->rlim[RLIMIT_NPROC];
}
-int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst,
+int __weak arch_dup_task_struct(struct task_struct *dst,
struct task_struct *src)
{
*dst = *src;
@@ -311,6 +315,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
goto free_ti;
tsk->stack = ti;
+#ifdef CONFIG_SECCOMP
+ /*
+ * We must handle setting up seccomp filters once we're under
+ * the sighand lock in case orig has changed between now and
+ * then. Until then, filter must be NULL to avoid messing up
+ * the usage counts on the error path calling free_task.
+ */
+ tsk->seccomp.filter = NULL;
+#endif
setup_thread_stack(tsk, orig);
clear_user_return_notifier(tsk);
@@ -361,12 +374,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
*/
down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
- mm->locked_vm = 0;
- mm->mmap = NULL;
- mm->mmap_cache = NULL;
- mm->map_count = 0;
- cpumask_clear(mm_cpumask(mm));
- mm->mm_rb = RB_ROOT;
+ mm->total_vm = oldmm->total_vm;
+ mm->shared_vm = oldmm->shared_vm;
+ mm->exec_vm = oldmm->exec_vm;
+ mm->stack_vm = oldmm->stack_vm;
+
rb_link = &mm->mm_rb.rb_node;
rb_parent = NULL;
pprev = &mm->mmap;
@@ -417,7 +429,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
atomic_dec(&inode->i_writecount);
mutex_lock(&mapping->i_mmap_mutex);
if (tmp->vm_flags & VM_SHARED)
- mapping->i_mmap_writable++;
+ atomic_inc(&mapping->i_mmap_writable);
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
if (unlikely(tmp->vm_flags & VM_NONLINEAR))
@@ -523,28 +535,57 @@ static void mm_init_aio(struct mm_struct *mm)
#endif
}
+static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
+{
+#ifdef CONFIG_MEMCG
+ mm->owner = p;
+#endif
+}
+
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
{
+ mm->mmap = NULL;
+ mm->mm_rb = RB_ROOT;
+ mm->vmacache_seqnum = 0;
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);
INIT_LIST_HEAD(&mm->mmlist);
- mm->flags = (current->mm) ?
- (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
mm->core_state = NULL;
atomic_long_set(&mm->nr_ptes, 0);
+ mm->map_count = 0;
+ mm->locked_vm = 0;
+ mm->pinned_vm = 0;
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
+ mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
+ mmu_notifier_mm_init(mm);
clear_tlb_flush_pending(mm);
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+ mm->pmd_huge_pte = NULL;
+#endif
- if (likely(!mm_alloc_pgd(mm))) {
+ if (current->mm) {
+ mm->flags = current->mm->flags & MMF_INIT_MASK;
+ mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
+ } else {
+ mm->flags = default_dump_filter;
mm->def_flags = 0;
- mmu_notifier_mm_init(mm);
- return mm;
}
+ if (mm_alloc_pgd(mm))
+ goto fail_nopgd;
+
+ if (init_new_context(p, mm))
+ goto fail_nocontext;
+
+ return mm;
+
+fail_nocontext:
+ mm_free_pgd(mm);
+fail_nopgd:
free_mm(mm);
return NULL;
}
@@ -578,7 +619,6 @@ struct mm_struct *mm_alloc(void)
return NULL;
memset(mm, 0, sizeof(*mm));
- mm_init_cpumask(mm);
return mm_init(mm, current);
}
@@ -810,17 +850,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
goto fail_nomem;
memcpy(mm, oldmm, sizeof(*mm));
- mm_init_cpumask(mm);
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
- mm->pmd_huge_pte = NULL;
-#endif
if (!mm_init(mm, tsk))
goto fail_nomem;
- if (init_new_context(tsk, mm))
- goto fail_nocontext;
-
dup_mm_exe_file(oldmm, mm);
err = dup_mmap(mm, oldmm);
@@ -842,15 +875,6 @@ free_pt:
fail_nomem:
return NULL;
-
-fail_nocontext:
- /*
- * If init_new_context() failed, we cannot use mmput() to free the mm
- * because it calls destroy_context()
- */
- mm_free_pgd(mm);
- free_mm(mm);
- return NULL;
}
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
@@ -876,6 +900,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
if (!oldmm)
return 0;
+ /* initialize the new vmacache entries */
+ vmacache_flush(tsk);
+
if (clone_flags & CLONE_VM) {
atomic_inc(&oldmm->mm_users);
mm = oldmm;
@@ -1069,13 +1096,37 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
return 0;
}
-static void copy_flags(unsigned long clone_flags, struct task_struct *p)
+static void copy_seccomp(struct task_struct *p)
{
- unsigned long new_flags = p->flags;
+#ifdef CONFIG_SECCOMP
+ /*
+ * Must be called with sighand->lock held, which is common to
+ * all threads in the group. Holding cred_guard_mutex is not
+ * needed because this new task is not yet running and cannot
+ * be racing exec.
+ */
+ assert_spin_locked(&current->sighand->siglock);
+
+ /* Ref-count the new filter user, and assign it. */
+ get_seccomp_filter(current);
+ p->seccomp = current->seccomp;
- new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
- new_flags |= PF_FORKNOEXEC;
- p->flags = new_flags;
+ /*
+ * Explicitly enable no_new_privs here in case it got set
+ * between the task_struct being duplicated and holding the
+ * sighand lock. The seccomp state and nnp must be in sync.
+ */
+ if (task_no_new_privs(current))
+ task_set_no_new_privs(p);
+
+ /*
+ * If the parent gained a seccomp mode after copying thread
+ * flags and between before we held the sighand lock, we have
+ * to manually enable the seccomp thread flag here.
+ */
+ if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
+ set_tsk_thread_flag(p, TIF_SECCOMP);
+#endif
}
SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
@@ -1092,17 +1143,9 @@ static void rt_mutex_init_task(struct task_struct *p)
p->pi_waiters = RB_ROOT;
p->pi_waiters_leftmost = NULL;
p->pi_blocked_on = NULL;
- p->pi_top_task = NULL;
#endif
}
-#ifdef CONFIG_MM_OWNER
-void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
-{
- mm->owner = p;
-}
-#endif /* CONFIG_MM_OWNER */
-
/*
* Initialize POSIX timer handling for a single task.
*/
@@ -1193,7 +1236,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto fork_out;
ftrace_graph_init_task(p);
- get_seccomp_filter(p);
rt_mutex_init_task(p);
@@ -1227,7 +1269,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto bad_fork_cleanup_count;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
- copy_flags(clone_flags, p);
+ p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
+ p->flags |= PF_FORKNOEXEC;
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
rcu_copy_process(p);
@@ -1258,9 +1301,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
posix_cpu_timers_init(p);
- do_posix_clock_monotonic_gettime(&p->start_time);
- p->real_start_time = p->start_time;
- monotonic_to_bootbased(&p->real_start_time);
+ p->start_time = ktime_get_ns();
+ p->real_start_time = ktime_get_boot_ns();
p->io_context = NULL;
p->audit_context = NULL;
if (clone_flags & CLONE_THREAD)
@@ -1271,9 +1313,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (IS_ERR(p->mempolicy)) {
retval = PTR_ERR(p->mempolicy);
p->mempolicy = NULL;
- goto bad_fork_cleanup_cgroup;
+ goto bad_fork_cleanup_threadgroup_lock;
}
- mpol_fix_fork_child_flag(p);
#endif
#ifdef CONFIG_CPUSETS
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
@@ -1304,10 +1345,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
#endif
-#ifdef CONFIG_MEMCG
- p->memcg_batch.do_batch = 0;
- p->memcg_batch.memcg = NULL;
-#endif
#ifdef CONFIG_BCACHE
p->sequential_io = 0;
p->sequential_io_avg = 0;
@@ -1323,8 +1360,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto bad_fork_cleanup_policy;
retval = audit_alloc(p);
if (retval)
- goto bad_fork_cleanup_policy;
+ goto bad_fork_cleanup_perf;
/* copy all the process information */
+ shm_init_task(p);
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_audit;
@@ -1434,6 +1472,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
spin_lock(&current->sighand->siglock);
/*
+ * Copy seccomp details explicitly here, in case they were changed
+ * before holding sighand lock.
+ */
+ copy_seccomp(p);
+
+ /*
* Process group and session signals need to be delivered to just the
* parent before the fork or both the parent and the child after the
* fork. Restart if a signal comes in before we add the new process to
@@ -1484,7 +1528,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
total_forks++;
spin_unlock(&current->sighand->siglock);
+ syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);
+
proc_fork_connector(p);
cgroup_post_fork(p);
if (clone_flags & CLONE_THREAD)
@@ -1520,15 +1566,15 @@ bad_fork_cleanup_semundo:
exit_sem(p);
bad_fork_cleanup_audit:
audit_free(p);
-bad_fork_cleanup_policy:
+bad_fork_cleanup_perf:
perf_event_free_task(p);
+bad_fork_cleanup_policy:
#ifdef CONFIG_NUMA
mpol_put(p->mempolicy);
-bad_fork_cleanup_cgroup:
+bad_fork_cleanup_threadgroup_lock:
#endif
if (clone_flags & CLONE_THREAD)
threadgroup_change_end(current);
- cgroup_exit(p, 0);
delayacct_tsk_free(p);
module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
@@ -1604,10 +1650,12 @@ long do_fork(unsigned long clone_flags,
*/
if (!IS_ERR(p)) {
struct completion vfork;
+ struct pid *pid;
trace_sched_process_fork(current, p);
- nr = task_pid_vnr(p);
+ pid = get_task_pid(p, PIDTYPE_PID);
+ nr = pid_vnr(pid);
if (clone_flags & CLONE_PARENT_SETTID)
put_user(nr, parent_tidptr);
@@ -1622,12 +1670,14 @@ long do_fork(unsigned long clone_flags,
/* forking complete and child started to run, tell ptracer */
if (unlikely(trace))
- ptrace_event(trace, nr);
+ ptrace_event_pid(trace, pid);
if (clone_flags & CLONE_VFORK) {
if (!wait_for_vfork_done(p, &vfork))
- ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
+ ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
}
+
+ put_pid(pid);
} else {
nr = PTR_ERR(p);
}
@@ -1865,6 +1915,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
*/
exit_sem(current);
}
+ if (unshare_flags & CLONE_NEWIPC) {
+ /* Orphan segments in old ns (see sem above). */
+ exit_shm(current);
+ shm_init_task(current);
+ }
if (new_nsproxy)
switch_task_namespaces(current, new_nsproxy);
diff --git a/kernel/futex.c b/kernel/futex.c
index 44a1261cb9ff..815d7af2ffe8 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -70,7 +70,10 @@
#include "locking/rtmutex_common.h"
/*
- * Basic futex operation and ordering guarantees:
+ * READ this before attempting to hack on futexes!
+ *
+ * Basic futex operation and ordering guarantees
+ * =============================================
*
* The waiter reads the futex value in user space and calls
* futex_wait(). This function computes the hash bucket and acquires
@@ -119,7 +122,7 @@
* sys_futex(WAIT, futex, val);
* futex_wait(futex, val);
*
- * waiters++;
+ * waiters++; (a)
* mb(); (A) <-- paired with -.
* |
* lock(hash_bucket(futex)); |
@@ -135,14 +138,14 @@
* unlock(hash_bucket(futex));
* schedule(); if (waiters)
* lock(hash_bucket(futex));
- * wake_waiters(futex);
- * unlock(hash_bucket(futex));
+ * else wake_waiters(futex);
+ * waiters--; (b) unlock(hash_bucket(futex));
*
- * Where (A) orders the waiters increment and the futex value read -- this
- * is guaranteed by the head counter in the hb spinlock; and where (B)
- * orders the write to futex and the waiters read -- this is done by the
- * barriers in get_futex_key_refs(), through either ihold or atomic_inc,
- * depending on the futex type.
+ * Where (A) orders the waiters increment and the futex value read through
+ * atomic operations (see hb_waiters_inc) and where (B) orders the write
+ * to futex and the waiters read -- this is done by the barriers in
+ * get_futex_key_refs(), through either ihold or atomic_inc, depending on the
+ * futex type.
*
* This yields the following case (where X:=waiters, Y:=futex):
*
@@ -155,9 +158,22 @@
* Which guarantees that x==0 && y==0 is impossible; which translates back into
* the guarantee that we cannot both miss the futex variable change and the
* enqueue.
+ *
+ * Note that a new waiter is accounted for in (a) even when it is possible that
+ * the wait call can return error, in which case we backtrack from it in (b).
+ * Refer to the comment in queue_lock().
+ *
+ * Similarly, in order to account for waiters being requeued on another
+ * address we always increment the waiters for the destination bucket before
+ * acquiring the lock. It then decrements them again after releasing it -
+ * the code that actually moves the futex(es) between hash buckets (requeue_futex)
+ * will do the additional required waiter count housekeeping. This is done for
+ * double_lock_hb() and double_unlock_hb(), respectively.
*/
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
int __read_mostly futex_cmpxchg_enabled;
+#endif
/*
* Futex flags used to encode options to functions and preserve them across
@@ -234,6 +250,7 @@ static const struct futex_q futex_q_init = {
* waiting on a futex.
*/
struct futex_hash_bucket {
+ atomic_t waiters;
spinlock_t lock;
struct plist_head chain;
} ____cacheline_aligned_in_smp;
@@ -250,25 +267,40 @@ static inline void futex_get_mm(union futex_key *key)
* get_futex_key() implies a full barrier. This is relied upon
* as full barrier (B), see the ordering comment above.
*/
- smp_mb__after_atomic_inc();
+ smp_mb__after_atomic();
}
-static inline bool hb_waiters_pending(struct futex_hash_bucket *hb)
+/*
+ * Reflects a new waiter being added to the waitqueue.
+ */
+static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
{
#ifdef CONFIG_SMP
+ atomic_inc(&hb->waiters);
/*
- * Tasks trying to enter the critical region are most likely
- * potential waiters that will be added to the plist. Ensure
- * that wakers won't miss to-be-slept tasks in the window between
- * the wait call and the actual plist_add.
+ * Full barrier (A), see the ordering comment above.
*/
- if (spin_is_locked(&hb->lock))
- return true;
- smp_rmb(); /* Make sure we check the lock state first */
+ smp_mb__after_atomic();
+#endif
+}
- return !plist_head_empty(&hb->chain);
+/*
+ * Reflects a waiter being removed from the waitqueue by wakeup
+ * paths.
+ */
+static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ atomic_dec(&hb->waiters);
+#endif
+}
+
+static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ return atomic_read(&hb->waiters);
#else
- return true;
+ return 1;
#endif
}
@@ -711,59 +743,142 @@ void exit_pi_state_list(struct task_struct *curr)
raw_spin_unlock_irq(&curr->pi_lock);
}
-static int
-lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
- union futex_key *key, struct futex_pi_state **ps)
+/*
+ * We need to check the following states:
+ *
+ * Waiter | pi_state | pi->owner | uTID | uODIED | ?
+ *
+ * [1] NULL | --- | --- | 0 | 0/1 | Valid
+ * [2] NULL | --- | --- | >0 | 0/1 | Valid
+ *
+ * [3] Found | NULL | -- | Any | 0/1 | Invalid
+ *
+ * [4] Found | Found | NULL | 0 | 1 | Valid
+ * [5] Found | Found | NULL | >0 | 1 | Invalid
+ *
+ * [6] Found | Found | task | 0 | 1 | Valid
+ *
+ * [7] Found | Found | NULL | Any | 0 | Invalid
+ *
+ * [8] Found | Found | task | ==taskTID | 0/1 | Valid
+ * [9] Found | Found | task | 0 | 0 | Invalid
+ * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
+ *
+ * [1] Indicates that the kernel can acquire the futex atomically. We
+ * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
+ *
+ * [2] Valid, if TID does not belong to a kernel thread. If no matching
+ * thread is found then it indicates that the owner TID has died.
+ *
+ * [3] Invalid. The waiter is queued on a non PI futex
+ *
+ * [4] Valid state after exit_robust_list(), which sets the user space
+ * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
+ *
+ * [5] The user space value got manipulated between exit_robust_list()
+ * and exit_pi_state_list()
+ *
+ * [6] Valid state after exit_pi_state_list() which sets the new owner in
+ * the pi_state but cannot access the user space value.
+ *
+ * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
+ *
+ * [8] Owner and user space value match
+ *
+ * [9] There is no transient state which sets the user space TID to 0
+ * except exit_robust_list(), but this is indicated by the
+ * FUTEX_OWNER_DIED bit. See [4]
+ *
+ * [10] There is no transient state which leaves owner and user space
+ * TID out of sync.
+ */
+
+/*
+ * Validate that the existing waiter has a pi_state and sanity check
+ * the pi_state against the user space value. If correct, attach to
+ * it.
+ */
+static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
+ struct futex_pi_state **ps)
{
- struct futex_pi_state *pi_state = NULL;
- struct futex_q *this, *next;
- struct task_struct *p;
pid_t pid = uval & FUTEX_TID_MASK;
- plist_for_each_entry_safe(this, next, &hb->chain, list) {
- if (match_futex(&this->key, key)) {
- /*
- * Another waiter already exists - bump up
- * the refcount and return its pi_state:
- */
- pi_state = this->pi_state;
+ /*
+ * Userspace might have messed up non-PI and PI futexes [3]
+ */
+ if (unlikely(!pi_state))
+ return -EINVAL;
+
+ WARN_ON(!atomic_read(&pi_state->refcount));
+
+ /*
+ * Handle the owner died case:
+ */
+ if (uval & FUTEX_OWNER_DIED) {
+ /*
+ * exit_pi_state_list sets owner to NULL and wakes the
+ * topmost waiter. The task which acquires the
+ * pi_state->rt_mutex will fixup owner.
+ */
+ if (!pi_state->owner) {
/*
- * Userspace might have messed up non-PI and PI futexes
+ * No pi state owner, but the user space TID
+ * is not 0. Inconsistent state. [5]
*/
- if (unlikely(!pi_state))
+ if (pid)
return -EINVAL;
-
- WARN_ON(!atomic_read(&pi_state->refcount));
-
/*
- * When pi_state->owner is NULL then the owner died
- * and another waiter is on the fly. pi_state->owner
- * is fixed up by the task which acquires
- * pi_state->rt_mutex.
- *
- * We do not check for pid == 0 which can happen when
- * the owner died and robust_list_exit() cleared the
- * TID.
+ * Take a ref on the state and return success. [4]
*/
- if (pid && pi_state->owner) {
- /*
- * Bail out if user space manipulated the
- * futex value.
- */
- if (pid != task_pid_vnr(pi_state->owner))
- return -EINVAL;
- }
-
- atomic_inc(&pi_state->refcount);
- *ps = pi_state;
-
- return 0;
+ goto out_state;
}
+
+ /*
+ * If TID is 0, then either the dying owner has not
+ * yet executed exit_pi_state_list() or some waiter
+ * acquired the rtmutex in the pi state, but did not
+ * yet fixup the TID in user space.
+ *
+ * Take a ref on the state and return success. [6]
+ */
+ if (!pid)
+ goto out_state;
+ } else {
+ /*
+ * If the owner died bit is not set, then the pi_state
+ * must have an owner. [7]
+ */
+ if (!pi_state->owner)
+ return -EINVAL;
}
/*
+ * Bail out if user space manipulated the futex value. If pi
+ * state exists then the owner TID must be the same as the
+ * user space TID. [9/10]
+ */
+ if (pid != task_pid_vnr(pi_state->owner))
+ return -EINVAL;
+out_state:
+ atomic_inc(&pi_state->refcount);
+ *ps = pi_state;
+ return 0;
+}
+
+/*
+ * Lookup the task for the TID provided from user space and attach to
+ * it after doing proper sanity checks.
+ */
+static int attach_to_pi_owner(u32 uval, union futex_key *key,
+ struct futex_pi_state **ps)
+{
+ pid_t pid = uval & FUTEX_TID_MASK;
+ struct futex_pi_state *pi_state;
+ struct task_struct *p;
+
+ /*
* We are the first waiter - try to look up the real owner and attach
- * the new pi_state to it, but bail out when TID = 0
+ * the new pi_state to it, but bail out when TID = 0 [1]
*/
if (!pid)
return -ESRCH;
@@ -771,6 +886,11 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
if (!p)
return -ESRCH;
+ if (!p->mm) {
+ put_task_struct(p);
+ return -EPERM;
+ }
+
/*
* We need to look at the task state flags to figure out,
* whether the task is exiting. To protect against the do_exit
@@ -791,10 +911,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
return ret;
}
+ /*
+ * No existing pi state. First waiter. [2]
+ */
pi_state = alloc_pi_state();
/*
- * Initialize the pi_mutex in locked state and make 'p'
+ * Initialize the pi_mutex in locked state and make @p
* the owner of it:
*/
rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
@@ -814,6 +937,36 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
return 0;
}
+static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+ union futex_key *key, struct futex_pi_state **ps)
+{
+ struct futex_q *match = futex_top_waiter(hb, key);
+
+ /*
+ * If there is a waiter on that futex, validate it and
+ * attach to the pi_state when the validation succeeds.
+ */
+ if (match)
+ return attach_to_pi_state(uval, match->pi_state, ps);
+
+ /*
+ * We are the first waiter - try to look up the owner based on
+ * @uval and attach to it.
+ */
+ return attach_to_pi_owner(uval, key, ps);
+}
+
+static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
+{
+ u32 uninitialized_var(curval);
+
+ if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
+ return -EFAULT;
+
+ /*If user space value changed, let the caller retry */
+ return curval != uval ? -EAGAIN : 0;
+}
+
/**
* futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
* @uaddr: the pi futex user address
@@ -837,105 +990,69 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
struct futex_pi_state **ps,
struct task_struct *task, int set_waiters)
{
- int lock_taken, ret, force_take = 0;
- u32 uval, newval, curval, vpid = task_pid_vnr(task);
-
-retry:
- ret = lock_taken = 0;
+ u32 uval, newval, vpid = task_pid_vnr(task);
+ struct futex_q *match;
+ int ret;
/*
- * To avoid races, we attempt to take the lock here again
- * (by doing a 0 -> TID atomic cmpxchg), while holding all
- * the locks. It will most likely not succeed.
+ * Read the user space value first so we can validate a few
+ * things before proceeding further.
*/
- newval = vpid;
- if (set_waiters)
- newval |= FUTEX_WAITERS;
-
- if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
+ if (get_futex_value_locked(&uval, uaddr))
return -EFAULT;
/*
* Detect deadlocks.
*/
- if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
+ if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
return -EDEADLK;
/*
- * Surprise - we got the lock. Just return to userspace:
- */
- if (unlikely(!curval))
- return 1;
-
- uval = curval;
-
- /*
- * Set the FUTEX_WAITERS flag, so the owner will know it has someone
- * to wake at the next unlock.
+ * Lookup existing state first. If it exists, try to attach to
+ * its pi_state.
*/
- newval = curval | FUTEX_WAITERS;
+ match = futex_top_waiter(hb, key);
+ if (match)
+ return attach_to_pi_state(uval, match->pi_state, ps);
/*
- * Should we force take the futex? See below.
+ * No waiter and user TID is 0. We are here because the
+ * waiters or the owner died bit is set or called from
+ * requeue_cmp_pi or for whatever reason something took the
+ * syscall.
*/
- if (unlikely(force_take)) {
+ if (!(uval & FUTEX_TID_MASK)) {
/*
- * Keep the OWNER_DIED and the WAITERS bit and set the
- * new TID value.
+ * We take over the futex. No other waiters and the user space
+ * TID is 0. We preserve the owner died bit.
*/
- newval = (curval & ~FUTEX_TID_MASK) | vpid;
- force_take = 0;
- lock_taken = 1;
- }
+ newval = uval & FUTEX_OWNER_DIED;
+ newval |= vpid;
- if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
- return -EFAULT;
- if (unlikely(curval != uval))
- goto retry;
+ /* The futex requeue_pi code can enforce the waiters bit */
+ if (set_waiters)
+ newval |= FUTEX_WAITERS;
+
+ ret = lock_pi_update_atomic(uaddr, uval, newval);
+ /* If the take over worked, return 1 */
+ return ret < 0 ? ret : 1;
+ }
/*
- * We took the lock due to forced take over.
+ * First waiter. Set the waiters bit before attaching ourself to
+ * the owner. If owner tries to unlock, it will be forced into
+ * the kernel and blocked on hb->lock.
*/
- if (unlikely(lock_taken))
- return 1;
-
+ newval = uval | FUTEX_WAITERS;
+ ret = lock_pi_update_atomic(uaddr, uval, newval);
+ if (ret)
+ return ret;
/*
- * We dont have the lock. Look up the PI state (or create it if
- * we are the first waiter):
+ * If the update of the user space value succeeded, we try to
+ * attach to the owner. If that fails, no harm done, we only
+ * set the FUTEX_WAITERS bit in the user space variable.
*/
- ret = lookup_pi_state(uval, hb, key, ps);
-
- if (unlikely(ret)) {
- switch (ret) {
- case -ESRCH:
- /*
- * We failed to find an owner for this
- * futex. So we have no pi_state to block
- * on. This can happen in two cases:
- *
- * 1) The owner died
- * 2) A stale FUTEX_WAITERS bit
- *
- * Re-read the futex value.
- */
- if (get_futex_value_locked(&curval, uaddr))
- return -EFAULT;
-
- /*
- * If the owner died or we have a stale
- * WAITERS bit the owner TID in the user space
- * futex is 0.
- */
- if (!(curval & FUTEX_TID_MASK)) {
- force_take = 1;
- goto retry;
- }
- default:
- break;
- }
- }
-
- return ret;
+ return attach_to_pi_owner(uval, key, ps);
}
/**
@@ -954,6 +1071,7 @@ static void __unqueue_futex(struct futex_q *q)
hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
plist_del(&q->list, &hb->chain);
+ hb_waiters_dec(hb);
}
/*
@@ -995,6 +1113,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
struct task_struct *new_owner;
struct futex_pi_state *pi_state = this->pi_state;
u32 uninitialized_var(curval), newval;
+ int ret = 0;
if (!pi_state)
return -EINVAL;
@@ -1018,23 +1137,19 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
new_owner = this->task;
/*
- * We pass it to the next owner. (The WAITERS bit is always
- * kept enabled while there is PI state around. We must also
- * preserve the owner died bit.)
+ * We pass it to the next owner. The WAITERS bit is always
+ * kept enabled while there is PI state around. We cleanup the
+ * owner died bit, because we are the owner.
*/
- if (!(uval & FUTEX_OWNER_DIED)) {
- int ret = 0;
-
- newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
+ newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
- ret = -EFAULT;
- else if (curval != uval)
- ret = -EINVAL;
- if (ret) {
- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
- return ret;
- }
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
+ ret = -EFAULT;
+ else if (curval != uval)
+ ret = -EINVAL;
+ if (ret) {
+ raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ return ret;
}
raw_spin_lock_irq(&pi_state->owner->pi_lock);
@@ -1054,22 +1169,6 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
return 0;
}
-static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
-{
- u32 uninitialized_var(oldval);
-
- /*
- * There is no waiter, so we unlock the futex. The owner died
- * bit has not to be preserved here. We are the owner:
- */
- if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
- return -EFAULT;
- if (oldval != uval)
- return -EAGAIN;
-
- return 0;
-}
-
/*
* Express the locking dependencies for lockdep:
*/
@@ -1257,7 +1356,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
*/
if (likely(&hb1->chain != &hb2->chain)) {
plist_del(&q->list, &hb1->chain);
+ hb_waiters_dec(hb1);
plist_add(&q->list, &hb2->chain);
+ hb_waiters_inc(hb2);
q->lock_ptr = &hb2->lock;
}
get_futex_key_refs(key2);
@@ -1312,7 +1413,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
*
* Return:
* 0 - failed to acquire the lock atomically;
- * 1 - acquired the lock;
+ * >0 - acquired the lock, return value is vpid of the top_waiter
* <0 - error
*/
static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1323,7 +1424,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
{
struct futex_q *top_waiter = NULL;
u32 curval;
- int ret;
+ int ret, vpid;
if (get_futex_value_locked(&curval, pifutex))
return -EFAULT;
@@ -1351,11 +1452,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
* the contended case or if set_waiters is 1. The pi_state is returned
* in ps in contended cases.
*/
+ vpid = task_pid_vnr(top_waiter->task);
ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
set_waiters);
- if (ret == 1)
+ if (ret == 1) {
requeue_pi_wake_futex(top_waiter, key2, hb2);
-
+ return vpid;
+ }
return ret;
}
@@ -1386,10 +1489,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
struct futex_pi_state *pi_state = NULL;
struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next;
- u32 curval2;
if (requeue_pi) {
/*
+ * Requeue PI only works on two distinct uaddrs. This
+ * check is only valid for private futexes. See below.
+ */
+ if (uaddr1 == uaddr2)
+ return -EINVAL;
+
+ /*
* requeue_pi requires a pi_state, try to allocate it now
* without any locks in case it fails.
*/
@@ -1427,10 +1536,20 @@ retry:
if (unlikely(ret != 0))
goto out_put_key1;
+ /*
+ * The check above which compares uaddrs is not sufficient for
+ * shared futexes. We need to compare the keys:
+ */
+ if (requeue_pi && match_futex(&key1, &key2)) {
+ ret = -EINVAL;
+ goto out_put_keys;
+ }
+
hb1 = hash_futex(&key1);
hb2 = hash_futex(&key2);
retry_private:
+ hb_waiters_inc(hb2);
double_lock_hb(hb1, hb2);
if (likely(cmpval != NULL)) {
@@ -1440,6 +1559,7 @@ retry_private:
if (unlikely(ret)) {
double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
ret = get_user(curval, uaddr1);
if (ret)
@@ -1472,16 +1592,25 @@ retry_private:
* At this point the top_waiter has either taken uaddr2 or is
* waiting on it. If the former, then the pi_state will not
* exist yet, look it up one more time to ensure we have a
- * reference to it.
+ * reference to it. If the lock was taken, ret contains the
+ * vpid of the top waiter task.
*/
- if (ret == 1) {
+ if (ret > 0) {
WARN_ON(pi_state);
drop_count++;
task_count++;
- ret = get_futex_value_locked(&curval2, uaddr2);
- if (!ret)
- ret = lookup_pi_state(curval2, hb2, &key2,
- &pi_state);
+ /*
+ * If we acquired the lock, then the user
+ * space value of uaddr2 should be vpid. It
+ * cannot be changed by the top waiter as it
+ * is blocked on hb2 lock if it tries to do
+ * so. If something fiddled with it behind our
+ * back the pi state lookup might unearth
+ * it. So we rather use the known value than
+ * rereading and handing potential crap to
+ * lookup_pi_state.
+ */
+ ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
}
switch (ret) {
@@ -1489,6 +1618,7 @@ retry_private:
break;
case -EFAULT:
double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
put_futex_key(&key2);
put_futex_key(&key1);
ret = fault_in_user_writeable(uaddr2);
@@ -1496,8 +1626,14 @@ retry_private:
goto retry;
goto out;
case -EAGAIN:
- /* The owner was exiting, try again. */
+ /*
+ * Two reasons for this:
+ * - Owner is exiting and we just wait for the
+ * exit to complete.
+ * - The user space value changed.
+ */
double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
put_futex_key(&key2);
put_futex_key(&key1);
cond_resched();
@@ -1554,7 +1690,7 @@ retry_private:
this->pi_state = pi_state;
ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
this->rt_waiter,
- this->task, 1);
+ this->task);
if (ret == 1) {
/* We got the lock. */
requeue_pi_wake_futex(this, &key2, hb2);
@@ -1573,6 +1709,7 @@ retry_private:
out_unlock:
double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
/*
* drop_futex_key_refs() must be called outside the spinlocks. During
@@ -1600,6 +1737,17 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
struct futex_hash_bucket *hb;
hb = hash_futex(&q->key);
+
+ /*
+ * Increment the counter before taking the lock so that
+ * a potential waker won't miss a to-be-slept task that is
+ * waiting for the spinlock. This is safe as all queue_lock()
+ * users end up calling queue_me(). Similarly, for housekeeping,
+ * decrement the counter at queue_unlock() when some error has
+ * occurred and we don't end up adding the task to the list.
+ */
+ hb_waiters_inc(hb);
+
q->lock_ptr = &hb->lock;
spin_lock(&hb->lock); /* implies MB (A) */
@@ -1611,6 +1759,7 @@ queue_unlock(struct futex_hash_bucket *hb)
__releases(&hb->lock)
{
spin_unlock(&hb->lock);
+ hb_waiters_dec(hb);
}
/**
@@ -2139,8 +2288,10 @@ retry_private:
goto uaddr_faulted;
case -EAGAIN:
/*
- * Task is exiting and we just wait for the
- * exit to complete.
+ * Two reasons for this:
+ * - Task is exiting and we just wait for the
+ * exit to complete.
+ * - The user space value changed.
*/
queue_unlock(hb);
put_futex_key(&q.key);
@@ -2160,9 +2311,9 @@ retry_private:
/*
* Block on the PI mutex:
*/
- if (!trylock)
- ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
- else {
+ if (!trylock) {
+ ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
+ } else {
ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
/* Fixup the trylock return value: */
ret = ret ? 0 : -EWOULDBLOCK;
@@ -2224,10 +2375,10 @@ uaddr_faulted:
*/
static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
- struct futex_hash_bucket *hb;
- struct futex_q *this, *next;
+ u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
union futex_key key = FUTEX_KEY_INIT;
- u32 uval, vpid = task_pid_vnr(current);
+ struct futex_hash_bucket *hb;
+ struct futex_q *match;
int ret;
retry:
@@ -2240,58 +2391,47 @@ retry:
return -EPERM;
ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
- if (unlikely(ret != 0))
- goto out;
+ if (ret)
+ return ret;
hb = hash_futex(&key);
spin_lock(&hb->lock);
/*
- * To avoid races, try to do the TID -> 0 atomic transition
- * again. If it succeeds then we can return without waking
- * anyone else up:
+ * Check waiters first. We do not trust user space values at
+ * all and we at least want to know if user space fiddled
+ * with the futex value instead of blindly unlocking.
*/
- if (!(uval & FUTEX_OWNER_DIED) &&
- cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
- goto pi_faulted;
- /*
- * Rare case: we managed to release the lock atomically,
- * no need to wake anyone else up:
- */
- if (unlikely(uval == vpid))
- goto out_unlock;
-
- /*
- * Ok, other tasks may need to be woken up - check waiters
- * and do the wakeup if necessary:
- */
- plist_for_each_entry_safe(this, next, &hb->chain, list) {
- if (!match_futex (&this->key, &key))
- continue;
- ret = wake_futex_pi(uaddr, uval, this);
+ match = futex_top_waiter(hb, &key);
+ if (match) {
+ ret = wake_futex_pi(uaddr, uval, match);
/*
- * The atomic access to the futex value
- * generated a pagefault, so retry the
- * user-access and the wakeup:
+ * The atomic access to the futex value generated a
+ * pagefault, so retry the user-access and the wakeup:
*/
if (ret == -EFAULT)
goto pi_faulted;
goto out_unlock;
}
+
/*
- * No waiters - kernel unlocks the futex:
+ * We have no kernel internal state, i.e. no waiters in the
+ * kernel. Waiters which are about to queue themselves are stuck
+ * on hb->lock. So we can safely ignore them. We do neither
+ * preserve the WAITERS bit not the OWNER_DIED one. We are the
+ * owner.
*/
- if (!(uval & FUTEX_OWNER_DIED)) {
- ret = unlock_futex_pi(uaddr, uval);
- if (ret == -EFAULT)
- goto pi_faulted;
- }
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
+ goto pi_faulted;
+
+ /*
+ * If uval has changed, let user space handle it.
+ */
+ ret = (curval == uval) ? 0 : -EAGAIN;
out_unlock:
spin_unlock(&hb->lock);
put_futex_key(&key);
-
-out:
return ret;
pi_faulted:
@@ -2342,6 +2482,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* Unqueue the futex_q and determine which it was.
*/
plist_del(&q->list, &hb->chain);
+ hb_waiters_dec(hb);
/* Handle spurious wakeups gracefully */
ret = -EWOULDBLOCK;
@@ -2446,6 +2587,16 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (ret)
goto out_key2;
+ /*
+ * The check above which compares uaddrs is not sufficient for
+ * shared futexes. We need to compare the keys:
+ */
+ if (match_futex(&q.key, &key2)) {
+ queue_unlock(hb);
+ ret = -EINVAL;
+ goto out_put_keys;
+ }
+
/* Queue the futex_q, drop the hb lock, wait for wakeup. */
futex_wait_queue_me(hb, &q, to);
@@ -2483,7 +2634,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
*/
WARN_ON(!q.pi_state);
pi_mutex = &q.pi_state->pi_mutex;
- ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
+ ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
debug_rt_mutex_free_waiter(&rt_waiter);
spin_lock(q.lock_ptr);
@@ -2843,9 +2994,28 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}
-static int __init futex_init(void)
+static void __init futex_detect_cmpxchg(void)
{
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
u32 curval;
+
+ /*
+ * This will fail and we want it. Some arch implementations do
+ * runtime detection of the futex_atomic_cmpxchg_inatomic()
+ * functionality. We want to know that before we call in any
+ * of the complex code paths. Also we want to prevent
+ * registration of robust lists in that case. NULL is
+ * guaranteed to fault and we get -EFAULT on functional
+ * implementation, the non-functional ones will return
+ * -ENOSYS.
+ */
+ if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
+ futex_cmpxchg_enabled = 1;
+#endif
+}
+
+static int __init futex_init(void)
+{
unsigned int futex_shift;
unsigned long i;
@@ -2861,20 +3031,11 @@ static int __init futex_init(void)
&futex_shift, NULL,
futex_hashsize, futex_hashsize);
futex_hashsize = 1UL << futex_shift;
- /*
- * This will fail and we want it. Some arch implementations do
- * runtime detection of the futex_atomic_cmpxchg_inatomic()
- * functionality. We want to know that before we call in any
- * of the complex code paths. Also we want to prevent
- * registration of robust lists in that case. NULL is
- * guaranteed to fault and we get -EFAULT on functional
- * implementation, the non-functional ones will return
- * -ENOSYS.
- */
- if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
- futex_cmpxchg_enabled = 1;
+
+ futex_detect_cmpxchg();
for (i = 0; i < futex_hashsize; i++) {
+ atomic_set(&futex_queues[i].waiters, 0);
plist_head_init(&futex_queues[i].chain);
spin_lock_init(&futex_queues[i].lock);
}
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index f9f44fd4d34d..55c8c9349cfe 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -183,7 +183,7 @@ COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
cmd == FUTEX_WAIT_BITSET ||
cmd == FUTEX_WAIT_REQUEUE_PI)) {
- if (get_compat_timespec(&ts, utime))
+ if (compat_get_timespec(&ts, utime))
return -EFAULT;
if (!timespec_valid(&ts))
return -EINVAL;
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index f45b75b713c0..b358a802fd18 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -85,6 +85,12 @@ void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
}
EXPORT_SYMBOL(__gcov_merge_ior);
+void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_time_profile);
+
/**
* gcov_enable_events - enable event reporting through gcov_event()
*
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 15ff01a76379..edf67c493a8e 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -784,8 +784,7 @@ static __init int gcov_fs_init(void)
err_remove:
pr_err("init failed\n");
- if (root_node.dentry)
- debugfs_remove(root_node.dentry);
+ debugfs_remove(root_node.dentry);
return rc;
}
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 2c6e4631c814..826ba9fb5e32 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -18,7 +18,12 @@
#include <linux/vmalloc.h>
#include "gcov.h"
+#if __GNUC__ == 4 && __GNUC_MINOR__ >= 9
+#define GCOV_COUNTERS 9
+#else
#define GCOV_COUNTERS 8
+#endif
+
#define GCOV_TAG_FUNCTION_LENGTH 3
static struct gcov_info *gcov_info_head;
diff --git a/kernel/groups.c b/kernel/groups.c
index 90cf1c38c8ea..451698f86cfa 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -157,17 +157,13 @@ int groups_search(const struct group_info *group_info, kgid_t grp)
* set_groups - Change a group subscription in a set of credentials
* @new: The newly prepared set of credentials to alter
* @group_info: The group list to install
- *
- * Validate a group subscription and, if valid, insert it into a set
- * of credentials.
*/
-int set_groups(struct cred *new, struct group_info *group_info)
+void set_groups(struct cred *new, struct group_info *group_info)
{
put_group_info(new->group_info);
groups_sort(group_info);
get_group_info(group_info);
new->group_info = group_info;
- return 0;
}
EXPORT_SYMBOL(set_groups);
@@ -182,18 +178,12 @@ EXPORT_SYMBOL(set_groups);
int set_current_groups(struct group_info *group_info)
{
struct cred *new;
- int ret;
new = prepare_creds();
if (!new)
return -ENOMEM;
- ret = set_groups(new, group_info);
- if (ret < 0) {
- abort_creds(new);
- return ret;
- }
-
+ set_groups(new, group_info);
return commit_creds(new);
}
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 0b9c169d577f..06db12434d72 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -52,8 +52,10 @@ unsigned int __read_mostly sysctl_hung_task_panic =
static int __init hung_task_panic_setup(char *str)
{
- sysctl_hung_task_panic = simple_strtoul(str, NULL, 0);
+ int rc = kstrtouint(str, 0, &sysctl_hung_task_panic);
+ if (rc)
+ return rc;
return 1;
}
__setup("hung_task_panic=", hung_task_panic_setup);
@@ -246,5 +248,4 @@ static int __init hung_task_init(void)
return 0;
}
-
-module_init(hung_task_init);
+subsys_initcall(hung_task_init);
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 07cbdfea9ae2..d269cecdfbf0 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -5,6 +5,10 @@ menu "IRQ subsystem"
config MAY_HAVE_SPARSE_IRQ
bool
+# Legacy support, required for itanic
+config GENERIC_IRQ_LEGACY
+ bool
+
# Enable the generic irq autoprobe mechanism
config GENERIC_IRQ_PROBE
bool
@@ -17,6 +21,11 @@ config GENERIC_IRQ_SHOW
config GENERIC_IRQ_SHOW_LEVEL
bool
+# Facility to allocate a hardware interrupt. This is legacy support
+# and should not be used in new code. Use irq domains instead.
+config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
+ bool
+
# Support for delayed migration from interrupt context
config GENERIC_PENDING_IRQ
bool
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index dc04c166c54d..6223fab9a9d2 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -40,10 +40,9 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip)
irq_put_desc_unlock(desc, flags);
/*
* For !CONFIG_SPARSE_IRQ make the irq show up in
- * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is
- * already marked, and this call is harmless.
+ * allocated_irqs.
*/
- irq_reserve_irq(irq);
+ irq_mark_irq(irq);
return 0;
}
EXPORT_SYMBOL(irq_set_chip);
@@ -281,6 +280,19 @@ void unmask_irq(struct irq_desc *desc)
}
}
+void unmask_threaded_irq(struct irq_desc *desc)
+{
+ struct irq_chip *chip = desc->irq_data.chip;
+
+ if (chip->flags & IRQCHIP_EOI_THREADED)
+ chip->irq_eoi(&desc->irq_data);
+
+ if (chip->irq_unmask) {
+ chip->irq_unmask(&desc->irq_data);
+ irq_state_clr_masked(desc);
+ }
+}
+
/*
* handle_nested_irq - Handle a nested irq from a irq thread
* @irq: the interrupt number
@@ -435,6 +447,27 @@ static inline void preflow_handler(struct irq_desc *desc)
static inline void preflow_handler(struct irq_desc *desc) { }
#endif
+static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip)
+{
+ if (!(desc->istate & IRQS_ONESHOT)) {
+ chip->irq_eoi(&desc->irq_data);
+ return;
+ }
+ /*
+ * We need to unmask in the following cases:
+ * - Oneshot irq which did not wake the thread (caused by a
+ * spurious interrupt or a primary handler handling it
+ * completely).
+ */
+ if (!irqd_irq_disabled(&desc->irq_data) &&
+ irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot) {
+ chip->irq_eoi(&desc->irq_data);
+ unmask_irq(desc);
+ } else if (!(chip->flags & IRQCHIP_EOI_THREADED)) {
+ chip->irq_eoi(&desc->irq_data);
+ }
+}
+
/**
* handle_fasteoi_irq - irq handler for transparent controllers
* @irq: the interrupt number
@@ -448,6 +481,8 @@ static inline void preflow_handler(struct irq_desc *desc) { }
void
handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
{
+ struct irq_chip *chip = desc->irq_data.chip;
+
raw_spin_lock(&desc->lock);
if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
@@ -473,19 +508,16 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
preflow_handler(desc);
handle_irq_event(desc);
- if (desc->istate & IRQS_ONESHOT)
- cond_unmask_irq(desc);
+ cond_unmask_eoi_irq(desc, chip);
-out_eoi:
- desc->irq_data.chip->irq_eoi(&desc->irq_data);
-out_unlock:
raw_spin_unlock(&desc->lock);
return;
out:
- if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED))
- goto out_eoi;
- goto out_unlock;
+ if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED))
+ chip->irq_eoi(&desc->irq_data);
+ raw_spin_unlock(&desc->lock);
}
+EXPORT_SYMBOL_GPL(handle_fasteoi_irq);
/**
* handle_edge_irq - edge type IRQ handler
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 452d6f2ba21d..cf80e7b0ddab 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -341,8 +341,8 @@ static struct lock_class_key irq_nested_lock_class;
/*
* irq_map_generic_chip - Map a generic chip for an irq domain
*/
-static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
- irq_hw_number_t hw_irq)
+int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
+ irq_hw_number_t hw_irq)
{
struct irq_data *data = irq_get_irq_data(virq);
struct irq_domain_chip_generic *dgc = d->gc;
@@ -394,6 +394,7 @@ static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set);
return 0;
}
+EXPORT_SYMBOL_GPL(irq_map_generic_chip);
struct irq_domain_ops irq_generic_chip_ops = {
.map = irq_map_generic_chip,
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 131ca176b497..635480270858 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -41,6 +41,7 @@ irqreturn_t no_action(int cpl, void *dev_id)
{
return IRQ_NONE;
}
+EXPORT_SYMBOL_GPL(no_action);
static void warn_no_thread(unsigned int irq, struct irqaction *action)
{
@@ -51,7 +52,7 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
"but no thread function available.", irq, action->name);
}
-static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
+void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
{
/*
* In case the thread crashed and was killed we just pretend that
@@ -157,7 +158,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
break;
}
- irq_wake_thread(desc, action);
+ __irq_wake_thread(desc, action);
/* Fall through to add to randomness */
case IRQ_HANDLED:
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 001fa5bab490..099ea2e0eb88 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -6,6 +6,7 @@
* of this file for your non core code.
*/
#include <linux/irqdesc.h>
+#include <linux/kernel_stat.h>
#ifdef CONFIG_SPARSE_IRQ
# define IRQ_BITMAP_BITS (NR_IRQS + 8196)
@@ -32,7 +33,7 @@ enum {
};
/*
- * Bit masks for desc->state
+ * Bit masks for desc->core_internal_state__do_not_mess_with_it
*
* IRQS_AUTODETECT - autodetection in progress
* IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt
@@ -73,6 +74,13 @@ extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu);
extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu);
extern void mask_irq(struct irq_desc *desc);
extern void unmask_irq(struct irq_desc *desc);
+extern void unmask_threaded_irq(struct irq_desc *desc);
+
+#ifdef CONFIG_SPARSE_IRQ
+static inline void irq_mark_irq(unsigned int irq) { }
+#else
+extern void irq_mark_irq(unsigned int irq);
+#endif
extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
@@ -82,6 +90,7 @@ irqreturn_t handle_irq_event(struct irq_desc *desc);
/* Resending of interrupts :*/
void check_irq_resend(struct irq_desc *desc, unsigned int irq);
bool irq_wait_for_poll(struct irq_desc *desc);
+void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action);
#ifdef CONFIG_PROC_FS
extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
@@ -179,3 +188,9 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
{
return d->state_use_accessors & mask;
}
+
+static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc)
+{
+ __this_cpu_inc(*desc->kstat_irqs);
+ __this_cpu_inc(kstat.irqs_sum);
+}
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 8ab8e9390297..1487a123db5c 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -278,7 +278,12 @@ EXPORT_SYMBOL(irq_to_desc);
static void free_desc(unsigned int irq)
{
- dynamic_irq_cleanup(irq);
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ desc_set_defaults(irq, desc, desc_node(desc), NULL);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
}
static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
@@ -299,6 +304,20 @@ static int irq_expand_nr_irqs(unsigned int nr)
return -ENOMEM;
}
+void irq_mark_irq(unsigned int irq)
+{
+ mutex_lock(&sparse_irq_lock);
+ bitmap_set(allocated_irqs, irq, 1);
+ mutex_unlock(&sparse_irq_lock);
+}
+
+#ifdef CONFIG_GENERIC_IRQ_LEGACY
+void irq_init_desc(unsigned int irq)
+{
+ free_desc(irq);
+}
+#endif
+
#endif /* !CONFIG_SPARSE_IRQ */
/**
@@ -363,6 +382,13 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
if (from > irq)
return -EINVAL;
from = irq;
+ } else {
+ /*
+ * For interrupts which are freely allocated the
+ * architecture can force a lower bound to the @from
+ * argument. x86 uses this to exclude the GSI space.
+ */
+ from = arch_dynirq_lower_bound(from);
}
mutex_lock(&sparse_irq_lock);
@@ -389,30 +415,56 @@ err:
}
EXPORT_SYMBOL_GPL(__irq_alloc_descs);
+#ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
/**
- * irq_reserve_irqs - mark irqs allocated
- * @from: mark from irq number
- * @cnt: number of irqs to mark
+ * irq_alloc_hwirqs - Allocate an irq descriptor and initialize the hardware
+ * @cnt: number of interrupts to allocate
+ * @node: node on which to allocate
*
- * Returns 0 on success or an appropriate error code
+ * Returns an interrupt number > 0 or 0, if the allocation fails.
*/
-int irq_reserve_irqs(unsigned int from, unsigned int cnt)
+unsigned int irq_alloc_hwirqs(int cnt, int node)
{
- unsigned int start;
- int ret = 0;
+ int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL);
- if (!cnt || (from + cnt) > nr_irqs)
- return -EINVAL;
+ if (irq < 0)
+ return 0;
- mutex_lock(&sparse_irq_lock);
- start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0);
- if (start == from)
- bitmap_set(allocated_irqs, start, cnt);
- else
- ret = -EEXIST;
- mutex_unlock(&sparse_irq_lock);
- return ret;
+ for (i = irq; cnt > 0; i++, cnt--) {
+ if (arch_setup_hwirq(i, node))
+ goto err;
+ irq_clear_status_flags(i, _IRQ_NOREQUEST);
+ }
+ return irq;
+
+err:
+ for (i--; i >= irq; i--) {
+ irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
+ arch_teardown_hwirq(i);
+ }
+ irq_free_descs(irq, cnt);
+ return 0;
}
+EXPORT_SYMBOL_GPL(irq_alloc_hwirqs);
+
+/**
+ * irq_free_hwirqs - Free irq descriptor and cleanup the hardware
+ * @from: Free from irq number
+ * @cnt: number of interrupts to free
+ *
+ */
+void irq_free_hwirqs(unsigned int from, int cnt)
+{
+ int i, j;
+
+ for (i = from, j = cnt; j > 0; i++, j--) {
+ irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
+ arch_teardown_hwirq(i);
+ }
+ irq_free_descs(from, cnt);
+}
+EXPORT_SYMBOL_GPL(irq_free_hwirqs);
+#endif
/**
* irq_get_next_irq - get next allocated irq number
@@ -475,18 +527,9 @@ int irq_set_percpu_devid(unsigned int irq)
return 0;
}
-/**
- * dynamic_irq_cleanup - cleanup a dynamically allocated irq
- * @irq: irq number to initialize
- */
-void dynamic_irq_cleanup(unsigned int irq)
+void kstat_incr_irq_this_cpu(unsigned int irq)
{
- struct irq_desc *desc = irq_to_desc(irq);
- unsigned long flags;
-
- raw_spin_lock_irqsave(&desc->lock, flags);
- desc_set_defaults(irq, desc, desc_node(desc), NULL);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
}
unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index f14033700c25..6534ff6ce02e 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -27,14 +27,14 @@ static struct irq_domain *irq_default_domain;
* __irq_domain_add() - Allocate a new irq_domain data structure
* @of_node: optional device-tree node of the interrupt controller
* @size: Size of linear map; 0 for radix mapping only
+ * @hwirq_max: Maximum number of interrupts supported by controller
* @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
* direct mapping
* @ops: map/unmap domain callbacks
* @host_data: Controller private data pointer
*
- * Allocates and initialize and irq_domain structure. Caller is expected to
- * register allocated irq_domain with irq_domain_register(). Returns pointer
- * to IRQ domain, or NULL on failure.
+ * Allocates and initialize and irq_domain structure.
+ * Returns pointer to IRQ domain, or NULL on failure.
*/
struct irq_domain *__irq_domain_add(struct device_node *of_node, int size,
irq_hw_number_t hwirq_max, int direct_max,
@@ -231,7 +231,7 @@ void irq_set_default_host(struct irq_domain *domain)
}
EXPORT_SYMBOL_GPL(irq_set_default_host);
-static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
+void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
{
struct irq_data *irq_data = irq_get_irq_data(irq);
irq_hw_number_t hwirq;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d3bf660cb57f..3dc6a61bf06a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -32,24 +32,10 @@ static int __init setup_forced_irqthreads(char *arg)
early_param("threadirqs", setup_forced_irqthreads);
#endif
-/**
- * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
- * @irq: interrupt number to wait for
- *
- * This function waits for any pending IRQ handlers for this interrupt
- * to complete before returning. If you use this function while
- * holding a resource the IRQ handler may need you will deadlock.
- *
- * This function may be called - with care - from IRQ context.
- */
-void synchronize_irq(unsigned int irq)
+static void __synchronize_hardirq(struct irq_desc *desc)
{
- struct irq_desc *desc = irq_to_desc(irq);
bool inprogress;
- if (!desc)
- return;
-
do {
unsigned long flags;
@@ -67,12 +53,56 @@ void synchronize_irq(unsigned int irq)
/* Oops, that failed? */
} while (inprogress);
+}
- /*
- * We made sure that no hardirq handler is running. Now verify
- * that no threaded handlers are active.
- */
- wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active));
+/**
+ * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs)
+ * @irq: interrupt number to wait for
+ *
+ * This function waits for any pending hard IRQ handlers for this
+ * interrupt to complete before returning. If you use this
+ * function while holding a resource the IRQ handler may need you
+ * will deadlock. It does not take associated threaded handlers
+ * into account.
+ *
+ * Do not use this for shutdown scenarios where you must be sure
+ * that all parts (hardirq and threaded handler) have completed.
+ *
+ * This function may be called - with care - from IRQ context.
+ */
+void synchronize_hardirq(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (desc)
+ __synchronize_hardirq(desc);
+}
+EXPORT_SYMBOL(synchronize_hardirq);
+
+/**
+ * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
+ * @irq: interrupt number to wait for
+ *
+ * This function waits for any pending IRQ handlers for this interrupt
+ * to complete before returning. If you use this function while
+ * holding a resource the IRQ handler may need you will deadlock.
+ *
+ * This function may be called - with care - from IRQ context.
+ */
+void synchronize_irq(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (desc) {
+ __synchronize_hardirq(desc);
+ /*
+ * We made sure that no hardirq handler is
+ * running. Now verify that no threaded handlers are
+ * active.
+ */
+ wait_event(desc->wait_for_threads,
+ !atomic_read(&desc->threads_active));
+ }
}
EXPORT_SYMBOL(synchronize_irq);
@@ -150,7 +180,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
struct irq_chip *chip = irq_data_get_irq_chip(data);
int ret;
- ret = chip->irq_set_affinity(data, mask, false);
+ ret = chip->irq_set_affinity(data, mask, force);
switch (ret) {
case IRQ_SET_MASK_OK:
cpumask_copy(data->affinity, mask);
@@ -162,7 +192,8 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
return ret;
}
-int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
+int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
+ bool force)
{
struct irq_chip *chip = irq_data_get_irq_chip(data);
struct irq_desc *desc = irq_data_to_desc(data);
@@ -172,7 +203,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
return -EINVAL;
if (irq_can_move_pcntxt(data)) {
- ret = irq_do_set_affinity(data, mask, false);
+ ret = irq_do_set_affinity(data, mask, force);
} else {
irqd_set_move_pending(data);
irq_copy_pending(desc, mask);
@@ -187,13 +218,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
return ret;
}
-/**
- * irq_set_affinity - Set the irq affinity of a given irq
- * @irq: Interrupt to set affinity
- * @mask: cpumask
- *
- */
-int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
+int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
@@ -203,7 +228,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
return -EINVAL;
raw_spin_lock_irqsave(&desc->lock, flags);
- ret = __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask);
+ ret = irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force);
raw_spin_unlock_irqrestore(&desc->lock, flags);
return ret;
}
@@ -718,7 +743,7 @@ again:
if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) &&
irqd_irq_masked(&desc->irq_data))
- unmask_irq(desc);
+ unmask_threaded_irq(desc);
out_unlock:
raw_spin_unlock_irq(&desc->lock);
@@ -727,7 +752,7 @@ out_unlock:
#ifdef CONFIG_SMP
/*
- * Check whether we need to chasnge the affinity of the interrupt thread.
+ * Check whether we need to change the affinity of the interrupt thread.
*/
static void
irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
@@ -861,8 +886,8 @@ static int irq_thread(void *data)
irq_thread_check_affinity(desc, action);
action_ret = handler_fn(desc, action);
- if (!noirqdebug)
- note_interrupt(action->irq, desc, action_ret);
+ if (action_ret == IRQ_HANDLED)
+ atomic_inc(&desc->threads_handled);
wake_threads_waitq(desc);
}
@@ -880,6 +905,33 @@ static int irq_thread(void *data)
return 0;
}
+/**
+ * irq_wake_thread - wake the irq thread for the action identified by dev_id
+ * @irq: Interrupt line
+ * @dev_id: Device identity for which the thread should be woken
+ *
+ */
+void irq_wake_thread(unsigned int irq, void *dev_id)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irqaction *action;
+ unsigned long flags;
+
+ if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
+ return;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ for (action = desc->action; action; action = action->next) {
+ if (action->dev_id == dev_id) {
+ if (action->thread)
+ __irq_wake_thread(desc, action);
+ break;
+ }
+ }
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+}
+EXPORT_SYMBOL_GPL(irq_wake_thread);
+
static void irq_setup_forced_threading(struct irqaction *new)
{
if (!force_irqthreads)
@@ -896,6 +948,23 @@ static void irq_setup_forced_threading(struct irqaction *new)
}
}
+static int irq_request_resources(struct irq_desc *desc)
+{
+ struct irq_data *d = &desc->irq_data;
+ struct irq_chip *c = d->chip;
+
+ return c->irq_request_resources ? c->irq_request_resources(d) : 0;
+}
+
+static void irq_release_resources(struct irq_desc *desc)
+{
+ struct irq_data *d = &desc->irq_data;
+ struct irq_chip *c = d->chip;
+
+ if (c->irq_release_resources)
+ c->irq_release_resources(d);
+}
+
/*
* Internal function to register an irqaction - typically used to
* allocate special interrupts that are part of the architecture.
@@ -1091,6 +1160,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
}
if (!shared) {
+ ret = irq_request_resources(desc);
+ if (ret) {
+ pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n",
+ new->name, irq, desc->irq_data.chip->name);
+ goto out_mask;
+ }
+
init_waitqueue_head(&desc->wait_for_threads);
/* Setup the type (level, edge polarity) if configured: */
@@ -1261,8 +1337,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
*action_ptr = action->next;
/* If this was the last handler, shut down the IRQ line: */
- if (!desc->action)
+ if (!desc->action) {
irq_shutdown(desc);
+ irq_release_resources(desc);
+ }
#ifdef CONFIG_SMP
/* make sure affinity_hint is cleaned up */
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 36f6ee181b0c..ac1ba2f11032 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -324,15 +324,15 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
#ifdef CONFIG_SMP
/* create /proc/irq/<irq>/smp_affinity */
- proc_create_data("smp_affinity", 0600, desc->dir,
+ proc_create_data("smp_affinity", 0644, desc->dir,
&irq_affinity_proc_fops, (void *)(long)irq);
/* create /proc/irq/<irq>/affinity_hint */
- proc_create_data("affinity_hint", 0400, desc->dir,
+ proc_create_data("affinity_hint", 0444, desc->dir,
&irq_affinity_hint_proc_fops, (void *)(long)irq);
/* create /proc/irq/<irq>/smp_affinity_list */
- proc_create_data("smp_affinity_list", 0600, desc->dir,
+ proc_create_data("smp_affinity_list", 0644, desc->dir,
&irq_affinity_list_proc_fops, (void *)(long)irq);
proc_create_data("node", 0444, desc->dir,
@@ -372,7 +372,7 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action)
static void register_default_affinity_proc(void)
{
#ifdef CONFIG_SMP
- proc_create("irq/default_smp_affinity", 0600, NULL,
+ proc_create("irq/default_smp_affinity", 0644, NULL,
&default_affinity_proc_fops);
#endif
}
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index a1d8cc63b56e..e2514b0e439e 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -270,6 +270,8 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
return action && (action->flags & IRQF_IRQPOLL);
}
+#define SPURIOUS_DEFERRED 0x80000000
+
void note_interrupt(unsigned int irq, struct irq_desc *desc,
irqreturn_t action_ret)
{
@@ -277,15 +279,111 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
irq_settings_is_polled(desc))
return;
- /* we get here again via the threaded handler */
- if (action_ret == IRQ_WAKE_THREAD)
- return;
-
if (bad_action_ret(action_ret)) {
report_bad_irq(irq, desc, action_ret);
return;
}
+ /*
+ * We cannot call note_interrupt from the threaded handler
+ * because we need to look at the compound of all handlers
+ * (primary and threaded). Aside of that in the threaded
+ * shared case we have no serialization against an incoming
+ * hardware interrupt while we are dealing with a threaded
+ * result.
+ *
+ * So in case a thread is woken, we just note the fact and
+ * defer the analysis to the next hardware interrupt.
+ *
+ * The threaded handlers store whether they sucessfully
+ * handled an interrupt and we check whether that number
+ * changed versus the last invocation.
+ *
+ * We could handle all interrupts with the delayed by one
+ * mechanism, but for the non forced threaded case we'd just
+ * add pointless overhead to the straight hardirq interrupts
+ * for the sake of a few lines less code.
+ */
+ if (action_ret & IRQ_WAKE_THREAD) {
+ /*
+ * There is a thread woken. Check whether one of the
+ * shared primary handlers returned IRQ_HANDLED. If
+ * not we defer the spurious detection to the next
+ * interrupt.
+ */
+ if (action_ret == IRQ_WAKE_THREAD) {
+ int handled;
+ /*
+ * We use bit 31 of thread_handled_last to
+ * denote the deferred spurious detection
+ * active. No locking necessary as
+ * thread_handled_last is only accessed here
+ * and we have the guarantee that hard
+ * interrupts are not reentrant.
+ */
+ if (!(desc->threads_handled_last & SPURIOUS_DEFERRED)) {
+ desc->threads_handled_last |= SPURIOUS_DEFERRED;
+ return;
+ }
+ /*
+ * Check whether one of the threaded handlers
+ * returned IRQ_HANDLED since the last
+ * interrupt happened.
+ *
+ * For simplicity we just set bit 31, as it is
+ * set in threads_handled_last as well. So we
+ * avoid extra masking. And we really do not
+ * care about the high bits of the handled
+ * count. We just care about the count being
+ * different than the one we saw before.
+ */
+ handled = atomic_read(&desc->threads_handled);
+ handled |= SPURIOUS_DEFERRED;
+ if (handled != desc->threads_handled_last) {
+ action_ret = IRQ_HANDLED;
+ /*
+ * Note: We keep the SPURIOUS_DEFERRED
+ * bit set. We are handling the
+ * previous invocation right now.
+ * Keep it for the current one, so the
+ * next hardware interrupt will
+ * account for it.
+ */
+ desc->threads_handled_last = handled;
+ } else {
+ /*
+ * None of the threaded handlers felt
+ * responsible for the last interrupt
+ *
+ * We keep the SPURIOUS_DEFERRED bit
+ * set in threads_handled_last as we
+ * need to account for the current
+ * interrupt as well.
+ */
+ action_ret = IRQ_NONE;
+ }
+ } else {
+ /*
+ * One of the primary handlers returned
+ * IRQ_HANDLED. So we don't care about the
+ * threaded handlers on the same line. Clear
+ * the deferred detection bit.
+ *
+ * In theory we could/should check whether the
+ * deferred bit is set and take the result of
+ * the previous run into account here as
+ * well. But it's really not worth the
+ * trouble. If every other interrupt is
+ * handled we never trigger the spurious
+ * detector. And if this is just the one out
+ * of 100k unhandled ones which is handled
+ * then we merily delay the spurious detection
+ * by one hard interrupt. Not a real problem.
+ */
+ desc->threads_handled_last &= ~SPURIOUS_DEFERRED;
+ }
+ }
+
if (unlikely(action_ret == IRQ_NONE)) {
/*
* If we are seeing only the odd spurious IRQ caused by
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 55fcce6065cf..e6bcbe756663 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -16,11 +16,12 @@
#include <linux/tick.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
+#include <linux/smp.h>
#include <asm/processor.h>
-static DEFINE_PER_CPU(struct llist_head, irq_work_list);
-static DEFINE_PER_CPU(int, irq_work_raised);
+static DEFINE_PER_CPU(struct llist_head, raised_list);
+static DEFINE_PER_CPU(struct llist_head, lazy_list);
/*
* Claim the entry so that no one else will poke at it.
@@ -55,43 +56,66 @@ void __weak arch_irq_work_raise(void)
*/
}
+#ifdef CONFIG_SMP
/*
- * Enqueue the irq_work @entry unless it's already pending
+ * Enqueue the irq_work @work on @cpu unless it's already pending
* somewhere.
*
* Can be re-enqueued while the callback is still in progress.
*/
-void irq_work_queue(struct irq_work *work)
+bool irq_work_queue_on(struct irq_work *work, int cpu)
{
+ /* All work should have been flushed before going offline */
+ WARN_ON_ONCE(cpu_is_offline(cpu));
+
+ /* Arch remote IPI send/receive backend aren't NMI safe */
+ WARN_ON_ONCE(in_nmi());
+
/* Only queue if not already pending */
if (!irq_work_claim(work))
- return;
+ return false;
+
+ if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
+ arch_send_call_function_single_ipi(cpu);
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(irq_work_queue_on);
+#endif
+
+/* Enqueue the irq work @work on the current CPU */
+bool irq_work_queue(struct irq_work *work)
+{
+ /* Only queue if not already pending */
+ if (!irq_work_claim(work))
+ return false;
/* Queue the entry and raise the IPI if needed. */
preempt_disable();
- llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
-
- /*
- * If the work is not "lazy" or the tick is stopped, raise the irq
- * work interrupt (if supported by the arch), otherwise, just wait
- * for the next tick.
- */
- if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) {
- if (!this_cpu_cmpxchg(irq_work_raised, 0, 1))
+ /* If the work is "lazy", handle it from next tick if any */
+ if (work->flags & IRQ_WORK_LAZY) {
+ if (llist_add(&work->llnode, &__get_cpu_var(lazy_list)) &&
+ tick_nohz_tick_stopped())
+ arch_irq_work_raise();
+ } else {
+ if (llist_add(&work->llnode, &__get_cpu_var(raised_list)))
arch_irq_work_raise();
}
preempt_enable();
+
+ return true;
}
EXPORT_SYMBOL_GPL(irq_work_queue);
bool irq_work_needs_cpu(void)
{
- struct llist_head *this_list;
+ struct llist_head *raised, *lazy;
- this_list = &__get_cpu_var(irq_work_list);
- if (llist_empty(this_list))
+ raised = &__get_cpu_var(raised_list);
+ lazy = &__get_cpu_var(lazy_list);
+ if (llist_empty(raised) && llist_empty(lazy))
return false;
/* All work should have been flushed before going offline */
@@ -100,28 +124,18 @@ bool irq_work_needs_cpu(void)
return true;
}
-static void __irq_work_run(void)
+static void irq_work_run_list(struct llist_head *list)
{
unsigned long flags;
struct irq_work *work;
- struct llist_head *this_list;
struct llist_node *llnode;
+ BUG_ON(!irqs_disabled());
- /*
- * Reset the "raised" state right before we check the list because
- * an NMI may enqueue after we find the list empty from the runner.
- */
- __this_cpu_write(irq_work_raised, 0);
- barrier();
-
- this_list = &__get_cpu_var(irq_work_list);
- if (llist_empty(this_list))
+ if (llist_empty(list))
return;
- BUG_ON(!irqs_disabled());
-
- llnode = llist_del_all(this_list);
+ llnode = llist_del_all(list);
while (llnode != NULL) {
work = llist_entry(llnode, struct irq_work, llnode);
@@ -147,13 +161,13 @@ static void __irq_work_run(void)
}
/*
- * Run the irq_work entries on this cpu. Requires to be ran from hardirq
- * context with local IRQs disabled.
+ * hotplug calls this through:
+ * hotplug_cfd() -> flush_smp_call_function_queue()
*/
void irq_work_run(void)
{
- BUG_ON(!in_irq());
- __irq_work_run();
+ irq_work_run_list(&__get_cpu_var(raised_list));
+ irq_work_run_list(&__get_cpu_var(lazy_list));
}
EXPORT_SYMBOL_GPL(irq_work_run);
@@ -169,35 +183,3 @@ void irq_work_sync(struct irq_work *work)
cpu_relax();
}
EXPORT_SYMBOL_GPL(irq_work_sync);
-
-#ifdef CONFIG_HOTPLUG_CPU
-static int irq_work_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- long cpu = (long)hcpu;
-
- switch (action) {
- case CPU_DYING:
- /* Called from stop_machine */
- if (WARN_ON_ONCE(cpu != smp_processor_id()))
- break;
- __irq_work_run();
- break;
- default:
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block cpu_notify;
-
-static __init int irq_work_init_cpu_notifier(void)
-{
- cpu_notify.notifier_call = irq_work_cpu_notify;
- cpu_notify.priority = 0;
- register_cpu_notifier(&cpu_notify);
- return 0;
-}
-device_initcall(irq_work_init_cpu_notifier);
-
-#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 3127ad52cdb2..ae5167087845 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -23,6 +23,7 @@
#include <linux/mm.h>
#include <linux/ctype.h>
#include <linux/slab.h>
+#include <linux/compiler.h>
#include <asm/sections.h>
@@ -36,8 +37,8 @@
* These will be re-linked against their real values
* during the second link stage.
*/
-extern const unsigned long kallsyms_addresses[] __attribute__((weak));
-extern const u8 kallsyms_names[] __attribute__((weak));
+extern const unsigned long kallsyms_addresses[] __weak;
+extern const u8 kallsyms_names[] __weak;
/*
* Tell the compiler that the count isn't in the small data section if the arch
@@ -46,10 +47,10 @@ extern const u8 kallsyms_names[] __attribute__((weak));
extern const unsigned long kallsyms_num_syms
__attribute__((weak, section(".rodata")));
-extern const u8 kallsyms_token_table[] __attribute__((weak));
-extern const u16 kallsyms_token_index[] __attribute__((weak));
+extern const u8 kallsyms_token_table[] __weak;
+extern const u16 kallsyms_token_index[] __weak;
-extern const unsigned long kallsyms_markers[] __attribute__((weak));
+extern const unsigned long kallsyms_markers[] __weak;
static inline int is_kernel_inittext(unsigned long addr)
{
@@ -363,7 +364,7 @@ static int __sprint_symbol(char *buffer, unsigned long address,
address += symbol_offset;
name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
if (!name)
- return sprintf(buffer, "0x%lx", address);
+ return sprintf(buffer, "0x%lx", address - symbol_offset);
if (name != buffer)
strcpy(buffer, name);
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index e30ac0fe61c3..0aa69ea1d8fd 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -44,11 +44,12 @@ static long kptr_obfuscate(long v, int type)
*/
static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type)
{
- long ret;
+ long t1, t2;
- ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type);
+ t1 = kptr_obfuscate((long)v1, type);
+ t2 = kptr_obfuscate((long)v2, type);
- return (ret < 0) | ((ret > 0) << 1);
+ return (t1 < t2) | ((t1 > t2) << 1);
}
/* The caller must have pinned the task */
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 60bafbed06ab..2bee072268d9 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -6,6 +6,8 @@
* Version 2. See the file COPYING for more details.
*/
+#define pr_fmt(fmt) "kexec: " fmt
+
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
@@ -32,12 +34,17 @@
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/syscore_ops.h>
+#include <linux/compiler.h>
+#include <linux/hugetlb.h>
#include <asm/page.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/sections.h>
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+
/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes;
@@ -50,6 +57,17 @@ size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
/* Flag to indicate we are going to kexec a new kernel */
bool kexec_in_progress = false;
+/*
+ * Declare these symbols weak so that if architecture provides a purgatory,
+ * these will be overridden.
+ */
+char __weak kexec_purgatory[0];
+size_t __weak kexec_purgatory_size = 0;
+
+#ifdef CONFIG_KEXEC_FILE
+static int kexec_calculate_store_digests(struct kimage *image);
+#endif
+
/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
.name = "Crash kernel",
@@ -123,45 +141,27 @@ static struct page *kimage_alloc_page(struct kimage *image,
gfp_t gfp_mask,
unsigned long dest);
-static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
- unsigned long nr_segments,
- struct kexec_segment __user *segments)
+static int copy_user_segment_list(struct kimage *image,
+ unsigned long nr_segments,
+ struct kexec_segment __user *segments)
{
+ int ret;
size_t segment_bytes;
- struct kimage *image;
- unsigned long i;
- int result;
-
- /* Allocate a controlling structure */
- result = -ENOMEM;
- image = kzalloc(sizeof(*image), GFP_KERNEL);
- if (!image)
- goto out;
-
- image->head = 0;
- image->entry = &image->head;
- image->last_entry = &image->head;
- image->control_page = ~0; /* By default this does not apply */
- image->start = entry;
- image->type = KEXEC_TYPE_DEFAULT;
-
- /* Initialize the list of control pages */
- INIT_LIST_HEAD(&image->control_pages);
-
- /* Initialize the list of destination pages */
- INIT_LIST_HEAD(&image->dest_pages);
-
- /* Initialize the list of unusable pages */
- INIT_LIST_HEAD(&image->unuseable_pages);
/* Read in the segments */
image->nr_segments = nr_segments;
segment_bytes = nr_segments * sizeof(*segments);
- result = copy_from_user(image->segment, segments, segment_bytes);
- if (result) {
- result = -EFAULT;
- goto out;
- }
+ ret = copy_from_user(image->segment, segments, segment_bytes);
+ if (ret)
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static int sanity_check_segment_list(struct kimage *image)
+{
+ int result, i;
+ unsigned long nr_segments = image->nr_segments;
/*
* Verify we have good destination addresses. The caller is
@@ -183,9 +183,9 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
mstart = image->segment[i].mem;
mend = mstart + image->segment[i].memsz;
if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
- goto out;
+ return result;
if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
- goto out;
+ return result;
}
/* Verify our destination addresses do not overlap.
@@ -206,7 +206,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
pend = pstart + image->segment[j].memsz;
/* Do the segments overlap ? */
if ((mend > pstart) && (mstart < pend))
- goto out;
+ return result;
}
}
@@ -218,131 +218,406 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
result = -EINVAL;
for (i = 0; i < nr_segments; i++) {
if (image->segment[i].bufsz > image->segment[i].memsz)
- goto out;
+ return result;
}
- result = 0;
-out:
- if (result == 0)
- *rimage = image;
- else
- kfree(image);
+ /*
+ * Verify we have good destination addresses. Normally
+ * the caller is responsible for making certain we don't
+ * attempt to load the new image into invalid or reserved
+ * areas of RAM. But crash kernels are preloaded into a
+ * reserved area of ram. We must ensure the addresses
+ * are in the reserved area otherwise preloading the
+ * kernel could corrupt things.
+ */
- return result;
+ if (image->type == KEXEC_TYPE_CRASH) {
+ result = -EADDRNOTAVAIL;
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz - 1;
+ /* Ensure we are within the crash kernel limits */
+ if ((mstart < crashk_res.start) ||
+ (mend > crashk_res.end))
+ return result;
+ }
+ }
+
+ return 0;
+}
+
+static struct kimage *do_kimage_alloc_init(void)
+{
+ struct kimage *image;
+ /* Allocate a controlling structure */
+ image = kzalloc(sizeof(*image), GFP_KERNEL);
+ if (!image)
+ return NULL;
+
+ image->head = 0;
+ image->entry = &image->head;
+ image->last_entry = &image->head;
+ image->control_page = ~0; /* By default this does not apply */
+ image->type = KEXEC_TYPE_DEFAULT;
+
+ /* Initialize the list of control pages */
+ INIT_LIST_HEAD(&image->control_pages);
+
+ /* Initialize the list of destination pages */
+ INIT_LIST_HEAD(&image->dest_pages);
+
+ /* Initialize the list of unusable pages */
+ INIT_LIST_HEAD(&image->unusable_pages);
+
+ return image;
}
static void kimage_free_page_list(struct list_head *list);
-static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
- unsigned long nr_segments,
- struct kexec_segment __user *segments)
+static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
+ unsigned long nr_segments,
+ struct kexec_segment __user *segments,
+ unsigned long flags)
{
- int result;
+ int ret;
struct kimage *image;
+ bool kexec_on_panic = flags & KEXEC_ON_CRASH;
+
+ if (kexec_on_panic) {
+ /* Verify we have a valid entry point */
+ if ((entry < crashk_res.start) || (entry > crashk_res.end))
+ return -EADDRNOTAVAIL;
+ }
/* Allocate and initialize a controlling structure */
- image = NULL;
- result = do_kimage_alloc(&image, entry, nr_segments, segments);
- if (result)
- goto out;
+ image = do_kimage_alloc_init();
+ if (!image)
+ return -ENOMEM;
+
+ image->start = entry;
+
+ ret = copy_user_segment_list(image, nr_segments, segments);
+ if (ret)
+ goto out_free_image;
+
+ ret = sanity_check_segment_list(image);
+ if (ret)
+ goto out_free_image;
+
+ /* Enable the special crash kernel control page allocation policy. */
+ if (kexec_on_panic) {
+ image->control_page = crashk_res.start;
+ image->type = KEXEC_TYPE_CRASH;
+ }
/*
* Find a location for the control code buffer, and add it
* the vector of segments so that it's pages will also be
* counted as destination pages.
*/
- result = -ENOMEM;
+ ret = -ENOMEM;
image->control_code_page = kimage_alloc_control_pages(image,
get_order(KEXEC_CONTROL_PAGE_SIZE));
if (!image->control_code_page) {
- printk(KERN_ERR "Could not allocate control_code_buffer\n");
- goto out_free;
+ pr_err("Could not allocate control_code_buffer\n");
+ goto out_free_image;
}
- image->swap_page = kimage_alloc_control_pages(image, 0);
- if (!image->swap_page) {
- printk(KERN_ERR "Could not allocate swap buffer\n");
- goto out_free;
+ if (!kexec_on_panic) {
+ image->swap_page = kimage_alloc_control_pages(image, 0);
+ if (!image->swap_page) {
+ pr_err("Could not allocate swap buffer\n");
+ goto out_free_control_pages;
+ }
}
*rimage = image;
return 0;
-
-out_free:
+out_free_control_pages:
kimage_free_page_list(&image->control_pages);
+out_free_image:
kfree(image);
-out:
- return result;
+ return ret;
}
-static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
- unsigned long nr_segments,
- struct kexec_segment __user *segments)
+#ifdef CONFIG_KEXEC_FILE
+static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
{
- int result;
- struct kimage *image;
- unsigned long i;
+ struct fd f = fdget(fd);
+ int ret;
+ struct kstat stat;
+ loff_t pos;
+ ssize_t bytes = 0;
- image = NULL;
- /* Verify we have a valid entry point */
- if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
- result = -EADDRNOTAVAIL;
+ if (!f.file)
+ return -EBADF;
+
+ ret = vfs_getattr(&f.file->f_path, &stat);
+ if (ret)
+ goto out;
+
+ if (stat.size > INT_MAX) {
+ ret = -EFBIG;
goto out;
}
- /* Allocate and initialize a controlling structure */
- result = do_kimage_alloc(&image, entry, nr_segments, segments);
- if (result)
+ /* Don't hand 0 to vmalloc, it whines. */
+ if (stat.size == 0) {
+ ret = -EINVAL;
goto out;
+ }
- /* Enable the special crash kernel control page
- * allocation policy.
- */
- image->control_page = crashk_res.start;
- image->type = KEXEC_TYPE_CRASH;
+ *buf = vmalloc(stat.size);
+ if (!*buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
- /*
- * Verify we have good destination addresses. Normally
- * the caller is responsible for making certain we don't
- * attempt to load the new image into invalid or reserved
- * areas of RAM. But crash kernels are preloaded into a
- * reserved area of ram. We must ensure the addresses
- * are in the reserved area otherwise preloading the
- * kernel could corrupt things.
- */
- result = -EADDRNOTAVAIL;
- for (i = 0; i < nr_segments; i++) {
- unsigned long mstart, mend;
+ pos = 0;
+ while (pos < stat.size) {
+ bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
+ stat.size - pos);
+ if (bytes < 0) {
+ vfree(*buf);
+ ret = bytes;
+ goto out;
+ }
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz - 1;
- /* Ensure we are within the crash kernel limits */
- if ((mstart < crashk_res.start) || (mend > crashk_res.end))
- goto out_free;
+ if (bytes == 0)
+ break;
+ pos += bytes;
}
+ if (pos != stat.size) {
+ ret = -EBADF;
+ vfree(*buf);
+ goto out;
+ }
+
+ *buf_len = pos;
+out:
+ fdput(f);
+ return ret;
+}
+
+/* Architectures can provide this probe function */
+int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ return -ENOEXEC;
+}
+
+void * __weak arch_kexec_kernel_image_load(struct kimage *image)
+{
+ return ERR_PTR(-ENOEXEC);
+}
+
+void __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
+{
+}
+
+int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ return -EKEYREJECTED;
+}
+
+/* Apply relocations of type RELA */
+int __weak
+arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+ unsigned int relsec)
+{
+ pr_err("RELA relocation unsupported.\n");
+ return -ENOEXEC;
+}
+
+/* Apply relocations of type REL */
+int __weak
+arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+ unsigned int relsec)
+{
+ pr_err("REL relocation unsupported.\n");
+ return -ENOEXEC;
+}
+
+/*
+ * Free up memory used by kernel, initrd, and comand line. This is temporary
+ * memory allocation which is not needed any more after these buffers have
+ * been loaded into separate segments and have been copied elsewhere.
+ */
+static void kimage_file_post_load_cleanup(struct kimage *image)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+
+ vfree(image->kernel_buf);
+ image->kernel_buf = NULL;
+
+ vfree(image->initrd_buf);
+ image->initrd_buf = NULL;
+
+ kfree(image->cmdline_buf);
+ image->cmdline_buf = NULL;
+
+ vfree(pi->purgatory_buf);
+ pi->purgatory_buf = NULL;
+
+ vfree(pi->sechdrs);
+ pi->sechdrs = NULL;
+
+ /* See if architecture has anything to cleanup post load */
+ arch_kimage_file_post_load_cleanup(image);
+
/*
- * Find a location for the control code buffer, and add
- * the vector of segments so that it's pages will also be
- * counted as destination pages.
+ * Above call should have called into bootloader to free up
+ * any data stored in kimage->image_loader_data. It should
+ * be ok now to free it up.
*/
- result = -ENOMEM;
+ kfree(image->image_loader_data);
+ image->image_loader_data = NULL;
+}
+
+/*
+ * In file mode list of segments is prepared by kernel. Copy relevant
+ * data from user space, do error checking, prepare segment list
+ */
+static int
+kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
+ const char __user *cmdline_ptr,
+ unsigned long cmdline_len, unsigned flags)
+{
+ int ret = 0;
+ void *ldata;
+
+ ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
+ &image->kernel_buf_len);
+ if (ret)
+ return ret;
+
+ /* Call arch image probe handlers */
+ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
+ image->kernel_buf_len);
+
+ if (ret)
+ goto out;
+
+#ifdef CONFIG_KEXEC_VERIFY_SIG
+ ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
+ image->kernel_buf_len);
+ if (ret) {
+ pr_debug("kernel signature verification failed.\n");
+ goto out;
+ }
+ pr_debug("kernel signature verification successful.\n");
+#endif
+ /* It is possible that there no initramfs is being loaded */
+ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
+ ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
+ &image->initrd_buf_len);
+ if (ret)
+ goto out;
+ }
+
+ if (cmdline_len) {
+ image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
+ if (!image->cmdline_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
+ cmdline_len);
+ if (ret) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ image->cmdline_buf_len = cmdline_len;
+
+ /* command line should be a string with last byte null */
+ if (image->cmdline_buf[cmdline_len - 1] != '\0') {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ /* Call arch image load handlers */
+ ldata = arch_kexec_kernel_image_load(image);
+
+ if (IS_ERR(ldata)) {
+ ret = PTR_ERR(ldata);
+ goto out;
+ }
+
+ image->image_loader_data = ldata;
+out:
+ /* In case of error, free up all allocated memory in this function */
+ if (ret)
+ kimage_file_post_load_cleanup(image);
+ return ret;
+}
+
+static int
+kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
+ int initrd_fd, const char __user *cmdline_ptr,
+ unsigned long cmdline_len, unsigned long flags)
+{
+ int ret;
+ struct kimage *image;
+ bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
+
+ image = do_kimage_alloc_init();
+ if (!image)
+ return -ENOMEM;
+
+ image->file_mode = 1;
+
+ if (kexec_on_panic) {
+ /* Enable special crash kernel control page alloc policy. */
+ image->control_page = crashk_res.start;
+ image->type = KEXEC_TYPE_CRASH;
+ }
+
+ ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
+ cmdline_ptr, cmdline_len, flags);
+ if (ret)
+ goto out_free_image;
+
+ ret = sanity_check_segment_list(image);
+ if (ret)
+ goto out_free_post_load_bufs;
+
+ ret = -ENOMEM;
image->control_code_page = kimage_alloc_control_pages(image,
get_order(KEXEC_CONTROL_PAGE_SIZE));
if (!image->control_code_page) {
- printk(KERN_ERR "Could not allocate control_code_buffer\n");
- goto out_free;
+ pr_err("Could not allocate control_code_buffer\n");
+ goto out_free_post_load_bufs;
+ }
+
+ if (!kexec_on_panic) {
+ image->swap_page = kimage_alloc_control_pages(image, 0);
+ if (!image->swap_page) {
+ pr_err(KERN_ERR "Could not allocate swap buffer\n");
+ goto out_free_control_pages;
+ }
}
*rimage = image;
return 0;
-
-out_free:
+out_free_control_pages:
+ kimage_free_page_list(&image->control_pages);
+out_free_post_load_bufs:
+ kimage_file_post_load_cleanup(image);
+out_free_image:
kfree(image);
-out:
- return result;
+ return ret;
}
+#else /* CONFIG_KEXEC_FILE */
+static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
+#endif /* CONFIG_KEXEC_FILE */
static int kimage_is_destination_range(struct kimage *image,
unsigned long start,
@@ -607,7 +882,7 @@ static void kimage_free_extra_pages(struct kimage *image)
kimage_free_page_list(&image->dest_pages);
/* Walk through and free any unusable pages I have cached */
- kimage_free_page_list(&image->unuseable_pages);
+ kimage_free_page_list(&image->unusable_pages);
}
static void kimage_terminate(struct kimage *image)
@@ -620,8 +895,8 @@ static void kimage_terminate(struct kimage *image)
#define for_each_kimage_entry(image, ptr, entry) \
for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
- ptr = (entry & IND_INDIRECTION)? \
- phys_to_virt((entry & PAGE_MASK)): ptr +1)
+ ptr = (entry & IND_INDIRECTION) ? \
+ phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
static void kimage_free_entry(kimage_entry_t entry)
{
@@ -649,8 +924,7 @@ static void kimage_free(struct kimage *image)
* done with it.
*/
ind = entry;
- }
- else if (entry & IND_SOURCE)
+ } else if (entry & IND_SOURCE)
kimage_free_entry(entry);
}
/* Free the final indirection page */
@@ -662,6 +936,14 @@ static void kimage_free(struct kimage *image)
/* Free the kexec control pages... */
kimage_free_page_list(&image->control_pages);
+
+ /*
+ * Free up any temporary buffers allocated. This might hit if
+ * error occurred much later after buffer allocation.
+ */
+ if (image->file_mode)
+ kimage_file_post_load_cleanup(image);
+
kfree(image);
}
@@ -731,7 +1013,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
/* If the page cannot be used file it away */
if (page_to_pfn(page) >
(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
- list_add(&page->lru, &image->unuseable_pages);
+ list_add(&page->lru, &image->unusable_pages);
continue;
}
addr = page_to_pfn(page) << PAGE_SHIFT;
@@ -773,8 +1055,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
addr = old_addr;
page = old_page;
break;
- }
- else {
+ } else {
/* Place the page on the destination list I
* will use it later.
*/
@@ -791,10 +1072,14 @@ static int kimage_load_normal_segment(struct kimage *image,
unsigned long maddr;
size_t ubytes, mbytes;
int result;
- unsigned char __user *buf;
+ unsigned char __user *buf = NULL;
+ unsigned char *kbuf = NULL;
result = 0;
- buf = segment->buf;
+ if (image->file_mode)
+ kbuf = segment->kbuf;
+ else
+ buf = segment->buf;
ubytes = segment->bufsz;
mbytes = segment->memsz;
maddr = segment->mem;
@@ -826,7 +1111,11 @@ static int kimage_load_normal_segment(struct kimage *image,
PAGE_SIZE - (maddr & ~PAGE_MASK));
uchunk = min(ubytes, mchunk);
- result = copy_from_user(ptr, buf, uchunk);
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
kunmap(page);
if (result) {
result = -EFAULT;
@@ -834,7 +1123,10 @@ static int kimage_load_normal_segment(struct kimage *image,
}
ubytes -= uchunk;
maddr += mchunk;
- buf += mchunk;
+ if (image->file_mode)
+ kbuf += mchunk;
+ else
+ buf += mchunk;
mbytes -= mchunk;
}
out:
@@ -851,10 +1143,14 @@ static int kimage_load_crash_segment(struct kimage *image,
unsigned long maddr;
size_t ubytes, mbytes;
int result;
- unsigned char __user *buf;
+ unsigned char __user *buf = NULL;
+ unsigned char *kbuf = NULL;
result = 0;
- buf = segment->buf;
+ if (image->file_mode)
+ kbuf = segment->kbuf;
+ else
+ buf = segment->buf;
ubytes = segment->bufsz;
mbytes = segment->memsz;
maddr = segment->mem;
@@ -877,7 +1173,12 @@ static int kimage_load_crash_segment(struct kimage *image,
/* Zero the trailing part of the page */
memset(ptr + uchunk, 0, mchunk - uchunk);
}
- result = copy_from_user(ptr, buf, uchunk);
+
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
kexec_flush_icache_page(page);
kunmap(page);
if (result) {
@@ -886,7 +1187,10 @@ static int kimage_load_crash_segment(struct kimage *image,
}
ubytes -= uchunk;
maddr += mchunk;
- buf += mchunk;
+ if (image->file_mode)
+ kbuf += mchunk;
+ else
+ buf += mchunk;
mbytes -= mchunk;
}
out:
@@ -986,16 +1290,16 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
/* Loading another kernel to reboot into */
if ((flags & KEXEC_ON_CRASH) == 0)
- result = kimage_normal_alloc(&image, entry,
- nr_segments, segments);
+ result = kimage_alloc_init(&image, entry, nr_segments,
+ segments, flags);
/* Loading another kernel to switch to if this one crashes */
else if (flags & KEXEC_ON_CRASH) {
/* Free any current crash dump kernel before
* we corrupt it.
*/
kimage_free(xchg(&kexec_crash_image, NULL));
- result = kimage_crash_alloc(&image, entry,
- nr_segments, segments);
+ result = kimage_alloc_init(&image, entry, nr_segments,
+ segments, flags);
crash_map_reserved_pages();
}
if (result)
@@ -1039,10 +1343,10 @@ void __weak crash_unmap_reserved_pages(void)
{}
#ifdef CONFIG_COMPAT
-asmlinkage long compat_sys_kexec_load(unsigned long entry,
- unsigned long nr_segments,
- struct compat_kexec_segment __user *segments,
- unsigned long flags)
+COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
+ compat_ulong_t, nr_segments,
+ struct compat_kexec_segment __user *, segments,
+ compat_ulong_t, flags)
{
struct compat_kexec_segment in;
struct kexec_segment out, __user *ksegments;
@@ -1058,7 +1362,7 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry,
return -EINVAL;
ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
- for (i=0; i < nr_segments; i++) {
+ for (i = 0; i < nr_segments; i++) {
result = copy_from_user(&in, &segments[i], sizeof(in));
if (result)
return -EFAULT;
@@ -1077,6 +1381,85 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry,
}
#endif
+#ifdef CONFIG_KEXEC_FILE
+SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
+ unsigned long, cmdline_len, const char __user *, cmdline_ptr,
+ unsigned long, flags)
+{
+ int ret = 0, i;
+ struct kimage **dest_image, *image;
+
+ /* We only trust the superuser with rebooting the system. */
+ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ return -EPERM;
+
+ /* Make sure we have a legal set of flags */
+ if (flags != (flags & KEXEC_FILE_FLAGS))
+ return -EINVAL;
+
+ image = NULL;
+
+ if (!mutex_trylock(&kexec_mutex))
+ return -EBUSY;
+
+ dest_image = &kexec_image;
+ if (flags & KEXEC_FILE_ON_CRASH)
+ dest_image = &kexec_crash_image;
+
+ if (flags & KEXEC_FILE_UNLOAD)
+ goto exchange;
+
+ /*
+ * In case of crash, new kernel gets loaded in reserved region. It is
+ * same memory where old crash kernel might be loaded. Free any
+ * current crash dump kernel before we corrupt it.
+ */
+ if (flags & KEXEC_FILE_ON_CRASH)
+ kimage_free(xchg(&kexec_crash_image, NULL));
+
+ ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
+ cmdline_len, flags);
+ if (ret)
+ goto out;
+
+ ret = machine_kexec_prepare(image);
+ if (ret)
+ goto out;
+
+ ret = kexec_calculate_store_digests(image);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < image->nr_segments; i++) {
+ struct kexec_segment *ksegment;
+
+ ksegment = &image->segment[i];
+ pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
+ i, ksegment->buf, ksegment->bufsz, ksegment->mem,
+ ksegment->memsz);
+
+ ret = kimage_load_segment(image, &image->segment[i]);
+ if (ret)
+ goto out;
+ }
+
+ kimage_terminate(image);
+
+ /*
+ * Free up any temporary buffers allocated which are not needed
+ * after image has been loaded
+ */
+ kimage_file_post_load_cleanup(image);
+exchange:
+ image = xchg(dest_image, image);
+out:
+ mutex_unlock(&kexec_mutex);
+ kimage_free(image);
+ return ret;
+}
+
+#endif /* CONFIG_KEXEC_FILE */
+
void crash_kexec(struct pt_regs *regs)
{
/* Take the kexec_mutex here to prevent sys_kexec_load
@@ -1213,14 +1596,14 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
* squirrelled away. ELF notes happen to provide
* all of that, so there is no need to invent something new.
*/
- buf = (u32*)per_cpu_ptr(crash_notes, cpu);
+ buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
if (!buf)
return;
memset(&prstatus, 0, sizeof(prstatus));
prstatus.pr_pid = current->pid;
elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
- &prstatus, sizeof(prstatus));
+ &prstatus, sizeof(prstatus));
final_note(buf);
}
@@ -1229,13 +1612,12 @@ static int __init crash_notes_memory_init(void)
/* Allocate memory for saving cpu registers. */
crash_notes = alloc_percpu(note_buf_t);
if (!crash_notes) {
- printk("Kexec: Memory allocation for saving cpu register"
- " states failed\n");
+ pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
return -ENOMEM;
}
return 0;
}
-module_init(crash_notes_memory_init)
+subsys_initcall(crash_notes_memory_init);
/*
@@ -1252,10 +1634,10 @@ module_init(crash_notes_memory_init)
*
* The function returns 0 on success and -EINVAL on failure.
*/
-static int __init parse_crashkernel_mem(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
+static int __init parse_crashkernel_mem(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
{
char *cur = cmdline, *tmp;
@@ -1266,12 +1648,12 @@ static int __init parse_crashkernel_mem(char *cmdline,
/* get the start of the range */
start = memparse(cur, &tmp);
if (cur == tmp) {
- pr_warning("crashkernel: Memory value expected\n");
+ pr_warn("crashkernel: Memory value expected\n");
return -EINVAL;
}
cur = tmp;
if (*cur != '-') {
- pr_warning("crashkernel: '-' expected\n");
+ pr_warn("crashkernel: '-' expected\n");
return -EINVAL;
}
cur++;
@@ -1280,31 +1662,30 @@ static int __init parse_crashkernel_mem(char *cmdline,
if (*cur != ':') {
end = memparse(cur, &tmp);
if (cur == tmp) {
- pr_warning("crashkernel: Memory "
- "value expected\n");
+ pr_warn("crashkernel: Memory value expected\n");
return -EINVAL;
}
cur = tmp;
if (end <= start) {
- pr_warning("crashkernel: end <= start\n");
+ pr_warn("crashkernel: end <= start\n");
return -EINVAL;
}
}
if (*cur != ':') {
- pr_warning("crashkernel: ':' expected\n");
+ pr_warn("crashkernel: ':' expected\n");
return -EINVAL;
}
cur++;
size = memparse(cur, &tmp);
if (cur == tmp) {
- pr_warning("Memory value expected\n");
+ pr_warn("Memory value expected\n");
return -EINVAL;
}
cur = tmp;
if (size >= system_ram) {
- pr_warning("crashkernel: invalid size\n");
+ pr_warn("crashkernel: invalid size\n");
return -EINVAL;
}
@@ -1322,8 +1703,7 @@ static int __init parse_crashkernel_mem(char *cmdline,
cur++;
*crash_base = memparse(cur, &tmp);
if (cur == tmp) {
- pr_warning("Memory value expected "
- "after '@'\n");
+ pr_warn("Memory value expected after '@'\n");
return -EINVAL;
}
}
@@ -1335,26 +1715,26 @@ static int __init parse_crashkernel_mem(char *cmdline,
/*
* That function parses "simple" (old) crashkernel command lines like
*
- * crashkernel=size[@offset]
+ * crashkernel=size[@offset]
*
* It returns 0 on success and -EINVAL on failure.
*/
-static int __init parse_crashkernel_simple(char *cmdline,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
+static int __init parse_crashkernel_simple(char *cmdline,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
{
char *cur = cmdline;
*crash_size = memparse(cmdline, &cur);
if (cmdline == cur) {
- pr_warning("crashkernel: memory value expected\n");
+ pr_warn("crashkernel: memory value expected\n");
return -EINVAL;
}
if (*cur == '@')
*crash_base = memparse(cur+1, &cur);
else if (*cur != ' ' && *cur != '\0') {
- pr_warning("crashkernel: unrecognized char\n");
+ pr_warn("crashkernel: unrecognized char\n");
return -EINVAL;
}
@@ -1551,10 +1931,10 @@ void vmcoreinfo_append_str(const char *fmt, ...)
* provide an empty default implementation here -- architecture
* code may override this
*/
-void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void)
+void __weak arch_crash_save_vmcoreinfo(void)
{}
-unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void)
+unsigned long __weak paddr_vmcoreinfo_note(void)
{
return __pa((unsigned long)(char *)&vmcoreinfo_note);
}
@@ -1621,7 +2001,11 @@ static int __init crash_save_vmcoreinfo_init(void)
#ifdef CONFIG_MEMORY_FAILURE
VMCOREINFO_NUMBER(PG_hwpoison);
#endif
+ VMCOREINFO_NUMBER(PG_head_mask);
VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
+#ifdef CONFIG_HUGETLBFS
+ VMCOREINFO_SYMBOL(free_huge_page);
+#endif
arch_crash_save_vmcoreinfo();
update_vmcoreinfo_note();
@@ -1629,7 +2013,686 @@ static int __init crash_save_vmcoreinfo_init(void)
return 0;
}
-module_init(crash_save_vmcoreinfo_init)
+subsys_initcall(crash_save_vmcoreinfo_init);
+
+#ifdef CONFIG_KEXEC_FILE
+static int __kexec_add_segment(struct kimage *image, char *buf,
+ unsigned long bufsz, unsigned long mem,
+ unsigned long memsz)
+{
+ struct kexec_segment *ksegment;
+
+ ksegment = &image->segment[image->nr_segments];
+ ksegment->kbuf = buf;
+ ksegment->bufsz = bufsz;
+ ksegment->mem = mem;
+ ksegment->memsz = memsz;
+ image->nr_segments++;
+
+ return 0;
+}
+
+static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
+ struct kexec_buf *kbuf)
+{
+ struct kimage *image = kbuf->image;
+ unsigned long temp_start, temp_end;
+
+ temp_end = min(end, kbuf->buf_max);
+ temp_start = temp_end - kbuf->memsz;
+
+ do {
+ /* align down start */
+ temp_start = temp_start & (~(kbuf->buf_align - 1));
+
+ if (temp_start < start || temp_start < kbuf->buf_min)
+ return 0;
+
+ temp_end = temp_start + kbuf->memsz - 1;
+
+ /*
+ * Make sure this does not conflict with any of existing
+ * segments
+ */
+ if (kimage_is_destination_range(image, temp_start, temp_end)) {
+ temp_start = temp_start - PAGE_SIZE;
+ continue;
+ }
+
+ /* We found a suitable memory range */
+ break;
+ } while (1);
+
+ /* If we are here, we found a suitable memory range */
+ __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start,
+ kbuf->memsz);
+
+ /* Success, stop navigating through remaining System RAM ranges */
+ return 1;
+}
+
+static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
+ struct kexec_buf *kbuf)
+{
+ struct kimage *image = kbuf->image;
+ unsigned long temp_start, temp_end;
+
+ temp_start = max(start, kbuf->buf_min);
+
+ do {
+ temp_start = ALIGN(temp_start, kbuf->buf_align);
+ temp_end = temp_start + kbuf->memsz - 1;
+
+ if (temp_end > end || temp_end > kbuf->buf_max)
+ return 0;
+ /*
+ * Make sure this does not conflict with any of existing
+ * segments
+ */
+ if (kimage_is_destination_range(image, temp_start, temp_end)) {
+ temp_start = temp_start + PAGE_SIZE;
+ continue;
+ }
+
+ /* We found a suitable memory range */
+ break;
+ } while (1);
+
+ /* If we are here, we found a suitable memory range */
+ __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start,
+ kbuf->memsz);
+
+ /* Success, stop navigating through remaining System RAM ranges */
+ return 1;
+}
+
+static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
+{
+ struct kexec_buf *kbuf = (struct kexec_buf *)arg;
+ unsigned long sz = end - start + 1;
+
+ /* Returning 0 will take to next memory range */
+ if (sz < kbuf->memsz)
+ return 0;
+
+ if (end < kbuf->buf_min || start > kbuf->buf_max)
+ return 0;
+
+ /*
+ * Allocate memory top down with-in ram range. Otherwise bottom up
+ * allocation.
+ */
+ if (kbuf->top_down)
+ return locate_mem_hole_top_down(start, end, kbuf);
+ return locate_mem_hole_bottom_up(start, end, kbuf);
+}
+
+/*
+ * Helper function for placing a buffer in a kexec segment. This assumes
+ * that kexec_mutex is held.
+ */
+int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
+ unsigned long memsz, unsigned long buf_align,
+ unsigned long buf_min, unsigned long buf_max,
+ bool top_down, unsigned long *load_addr)
+{
+
+ struct kexec_segment *ksegment;
+ struct kexec_buf buf, *kbuf;
+ int ret;
+
+ /* Currently adding segment this way is allowed only in file mode */
+ if (!image->file_mode)
+ return -EINVAL;
+
+ if (image->nr_segments >= KEXEC_SEGMENT_MAX)
+ return -EINVAL;
+
+ /*
+ * Make sure we are not trying to add buffer after allocating
+ * control pages. All segments need to be placed first before
+ * any control pages are allocated. As control page allocation
+ * logic goes through list of segments to make sure there are
+ * no destination overlaps.
+ */
+ if (!list_empty(&image->control_pages)) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ memset(&buf, 0, sizeof(struct kexec_buf));
+ kbuf = &buf;
+ kbuf->image = image;
+ kbuf->buffer = buffer;
+ kbuf->bufsz = bufsz;
+
+ kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
+ kbuf->buf_align = max(buf_align, PAGE_SIZE);
+ kbuf->buf_min = buf_min;
+ kbuf->buf_max = buf_max;
+ kbuf->top_down = top_down;
+
+ /* Walk the RAM ranges and allocate a suitable range for the buffer */
+ if (image->type == KEXEC_TYPE_CRASH)
+ ret = walk_iomem_res("Crash kernel",
+ IORESOURCE_MEM | IORESOURCE_BUSY,
+ crashk_res.start, crashk_res.end, kbuf,
+ locate_mem_hole_callback);
+ else
+ ret = walk_system_ram_res(0, -1, kbuf,
+ locate_mem_hole_callback);
+ if (ret != 1) {
+ /* A suitable memory range could not be found for buffer */
+ return -EADDRNOTAVAIL;
+ }
+
+ /* Found a suitable memory range */
+ ksegment = &image->segment[image->nr_segments - 1];
+ *load_addr = ksegment->mem;
+ return 0;
+}
+
+/* Calculate and store the digest of segments */
+static int kexec_calculate_store_digests(struct kimage *image)
+{
+ struct crypto_shash *tfm;
+ struct shash_desc *desc;
+ int ret = 0, i, j, zero_buf_sz, sha_region_sz;
+ size_t desc_size, nullsz;
+ char *digest;
+ void *zero_buf;
+ struct kexec_sha_region *sha_regions;
+ struct purgatory_info *pi = &image->purgatory_info;
+
+ zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
+ zero_buf_sz = PAGE_SIZE;
+
+ tfm = crypto_alloc_shash("sha256", 0, 0);
+ if (IS_ERR(tfm)) {
+ ret = PTR_ERR(tfm);
+ goto out;
+ }
+
+ desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
+ desc = kzalloc(desc_size, GFP_KERNEL);
+ if (!desc) {
+ ret = -ENOMEM;
+ goto out_free_tfm;
+ }
+
+ sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
+ sha_regions = vzalloc(sha_region_sz);
+ if (!sha_regions)
+ goto out_free_desc;
+
+ desc->tfm = tfm;
+ desc->flags = 0;
+
+ ret = crypto_shash_init(desc);
+ if (ret < 0)
+ goto out_free_sha_regions;
+
+ digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
+ if (!digest) {
+ ret = -ENOMEM;
+ goto out_free_sha_regions;
+ }
+
+ for (j = i = 0; i < image->nr_segments; i++) {
+ struct kexec_segment *ksegment;
+
+ ksegment = &image->segment[i];
+ /*
+ * Skip purgatory as it will be modified once we put digest
+ * info in purgatory.
+ */
+ if (ksegment->kbuf == pi->purgatory_buf)
+ continue;
+
+ ret = crypto_shash_update(desc, ksegment->kbuf,
+ ksegment->bufsz);
+ if (ret)
+ break;
+
+ /*
+ * Assume rest of the buffer is filled with zero and
+ * update digest accordingly.
+ */
+ nullsz = ksegment->memsz - ksegment->bufsz;
+ while (nullsz) {
+ unsigned long bytes = nullsz;
+
+ if (bytes > zero_buf_sz)
+ bytes = zero_buf_sz;
+ ret = crypto_shash_update(desc, zero_buf, bytes);
+ if (ret)
+ break;
+ nullsz -= bytes;
+ }
+
+ if (ret)
+ break;
+
+ sha_regions[j].start = ksegment->mem;
+ sha_regions[j].len = ksegment->memsz;
+ j++;
+ }
+
+ if (!ret) {
+ ret = crypto_shash_final(desc, digest);
+ if (ret)
+ goto out_free_digest;
+ ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
+ sha_regions, sha_region_sz, 0);
+ if (ret)
+ goto out_free_digest;
+
+ ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
+ digest, SHA256_DIGEST_SIZE, 0);
+ if (ret)
+ goto out_free_digest;
+ }
+
+out_free_digest:
+ kfree(digest);
+out_free_sha_regions:
+ vfree(sha_regions);
+out_free_desc:
+ kfree(desc);
+out_free_tfm:
+ kfree(tfm);
+out:
+ return ret;
+}
+
+/* Actually load purgatory. Lot of code taken from kexec-tools */
+static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
+ unsigned long max, int top_down)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+ unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
+ unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
+ unsigned char *buf_addr, *src;
+ int i, ret = 0, entry_sidx = -1;
+ const Elf_Shdr *sechdrs_c;
+ Elf_Shdr *sechdrs = NULL;
+ void *purgatory_buf = NULL;
+
+ /*
+ * sechdrs_c points to section headers in purgatory and are read
+ * only. No modifications allowed.
+ */
+ sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
+
+ /*
+ * We can not modify sechdrs_c[] and its fields. It is read only.
+ * Copy it over to a local copy where one can store some temporary
+ * data and free it at the end. We need to modify ->sh_addr and
+ * ->sh_offset fields to keep track of permanent and temporary
+ * locations of sections.
+ */
+ sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+ if (!sechdrs)
+ return -ENOMEM;
+
+ memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+
+ /*
+ * We seem to have multiple copies of sections. First copy is which
+ * is embedded in kernel in read only section. Some of these sections
+ * will be copied to a temporary buffer and relocated. And these
+ * sections will finally be copied to their final destination at
+ * segment load time.
+ *
+ * Use ->sh_offset to reflect section address in memory. It will
+ * point to original read only copy if section is not allocatable.
+ * Otherwise it will point to temporary copy which will be relocated.
+ *
+ * Use ->sh_addr to contain final address of the section where it
+ * will go during execution time.
+ */
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (sechdrs[i].sh_type == SHT_NOBITS)
+ continue;
+
+ sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
+ sechdrs[i].sh_offset;
+ }
+
+ /*
+ * Identify entry point section and make entry relative to section
+ * start.
+ */
+ entry = pi->ehdr->e_entry;
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
+ continue;
+
+ /* Make entry section relative */
+ if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
+ ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
+ pi->ehdr->e_entry)) {
+ entry_sidx = i;
+ entry -= sechdrs[i].sh_addr;
+ break;
+ }
+ }
+
+ /* Determine how much memory is needed to load relocatable object. */
+ buf_align = 1;
+ bss_align = 1;
+ buf_sz = 0;
+ bss_sz = 0;
+
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ align = sechdrs[i].sh_addralign;
+ if (sechdrs[i].sh_type != SHT_NOBITS) {
+ if (buf_align < align)
+ buf_align = align;
+ buf_sz = ALIGN(buf_sz, align);
+ buf_sz += sechdrs[i].sh_size;
+ } else {
+ /* bss section */
+ if (bss_align < align)
+ bss_align = align;
+ bss_sz = ALIGN(bss_sz, align);
+ bss_sz += sechdrs[i].sh_size;
+ }
+ }
+
+ /* Determine the bss padding required to align bss properly */
+ bss_pad = 0;
+ if (buf_sz & (bss_align - 1))
+ bss_pad = bss_align - (buf_sz & (bss_align - 1));
+
+ memsz = buf_sz + bss_pad + bss_sz;
+
+ /* Allocate buffer for purgatory */
+ purgatory_buf = vzalloc(buf_sz);
+ if (!purgatory_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (buf_align < bss_align)
+ buf_align = bss_align;
+
+ /* Add buffer to segment list */
+ ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
+ buf_align, min, max, top_down,
+ &pi->purgatory_load_addr);
+ if (ret)
+ goto out;
+
+ /* Load SHF_ALLOC sections */
+ buf_addr = purgatory_buf;
+ load_addr = curr_load_addr = pi->purgatory_load_addr;
+ bss_addr = load_addr + buf_sz + bss_pad;
+
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ align = sechdrs[i].sh_addralign;
+ if (sechdrs[i].sh_type != SHT_NOBITS) {
+ curr_load_addr = ALIGN(curr_load_addr, align);
+ offset = curr_load_addr - load_addr;
+ /* We already modifed ->sh_offset to keep src addr */
+ src = (char *) sechdrs[i].sh_offset;
+ memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
+
+ /* Store load address and source address of section */
+ sechdrs[i].sh_addr = curr_load_addr;
+
+ /*
+ * This section got copied to temporary buffer. Update
+ * ->sh_offset accordingly.
+ */
+ sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
+
+ /* Advance to the next address */
+ curr_load_addr += sechdrs[i].sh_size;
+ } else {
+ bss_addr = ALIGN(bss_addr, align);
+ sechdrs[i].sh_addr = bss_addr;
+ bss_addr += sechdrs[i].sh_size;
+ }
+ }
+
+ /* Update entry point based on load address of text section */
+ if (entry_sidx >= 0)
+ entry += sechdrs[entry_sidx].sh_addr;
+
+ /* Make kernel jump to purgatory after shutdown */
+ image->start = entry;
+
+ /* Used later to get/set symbol values */
+ pi->sechdrs = sechdrs;
+
+ /*
+ * Used later to identify which section is purgatory and skip it
+ * from checksumming.
+ */
+ pi->purgatory_buf = purgatory_buf;
+ return ret;
+out:
+ vfree(sechdrs);
+ vfree(purgatory_buf);
+ return ret;
+}
+
+static int kexec_apply_relocations(struct kimage *image)
+{
+ int i, ret;
+ struct purgatory_info *pi = &image->purgatory_info;
+ Elf_Shdr *sechdrs = pi->sechdrs;
+
+ /* Apply relocations */
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ Elf_Shdr *section, *symtab;
+
+ if (sechdrs[i].sh_type != SHT_RELA &&
+ sechdrs[i].sh_type != SHT_REL)
+ continue;
+
+ /*
+ * For section of type SHT_RELA/SHT_REL,
+ * ->sh_link contains section header index of associated
+ * symbol table. And ->sh_info contains section header
+ * index of section to which relocations apply.
+ */
+ if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
+ sechdrs[i].sh_link >= pi->ehdr->e_shnum)
+ return -ENOEXEC;
+
+ section = &sechdrs[sechdrs[i].sh_info];
+ symtab = &sechdrs[sechdrs[i].sh_link];
+
+ if (!(section->sh_flags & SHF_ALLOC))
+ continue;
+
+ /*
+ * symtab->sh_link contain section header index of associated
+ * string table.
+ */
+ if (symtab->sh_link >= pi->ehdr->e_shnum)
+ /* Invalid section number? */
+ continue;
+
+ /*
+ * Respective archicture needs to provide support for applying
+ * relocations of type SHT_RELA/SHT_REL.
+ */
+ if (sechdrs[i].sh_type == SHT_RELA)
+ ret = arch_kexec_apply_relocations_add(pi->ehdr,
+ sechdrs, i);
+ else if (sechdrs[i].sh_type == SHT_REL)
+ ret = arch_kexec_apply_relocations(pi->ehdr,
+ sechdrs, i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/* Load relocatable purgatory object and relocate it appropriately */
+int kexec_load_purgatory(struct kimage *image, unsigned long min,
+ unsigned long max, int top_down,
+ unsigned long *load_addr)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+ int ret;
+
+ if (kexec_purgatory_size <= 0)
+ return -EINVAL;
+
+ if (kexec_purgatory_size < sizeof(Elf_Ehdr))
+ return -ENOEXEC;
+
+ pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
+
+ if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
+ || pi->ehdr->e_type != ET_REL
+ || !elf_check_arch(pi->ehdr)
+ || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
+ return -ENOEXEC;
+
+ if (pi->ehdr->e_shoff >= kexec_purgatory_size
+ || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
+ kexec_purgatory_size - pi->ehdr->e_shoff))
+ return -ENOEXEC;
+
+ ret = __kexec_load_purgatory(image, min, max, top_down);
+ if (ret)
+ return ret;
+
+ ret = kexec_apply_relocations(image);
+ if (ret)
+ goto out;
+
+ *load_addr = pi->purgatory_load_addr;
+ return 0;
+out:
+ vfree(pi->sechdrs);
+ vfree(pi->purgatory_buf);
+ return ret;
+}
+
+static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
+ const char *name)
+{
+ Elf_Sym *syms;
+ Elf_Shdr *sechdrs;
+ Elf_Ehdr *ehdr;
+ int i, k;
+ const char *strtab;
+
+ if (!pi->sechdrs || !pi->ehdr)
+ return NULL;
+
+ sechdrs = pi->sechdrs;
+ ehdr = pi->ehdr;
+
+ for (i = 0; i < ehdr->e_shnum; i++) {
+ if (sechdrs[i].sh_type != SHT_SYMTAB)
+ continue;
+
+ if (sechdrs[i].sh_link >= ehdr->e_shnum)
+ /* Invalid strtab section number */
+ continue;
+ strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
+ syms = (Elf_Sym *)sechdrs[i].sh_offset;
+
+ /* Go through symbols for a match */
+ for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
+ if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
+ continue;
+
+ if (strcmp(strtab + syms[k].st_name, name) != 0)
+ continue;
+
+ if (syms[k].st_shndx == SHN_UNDEF ||
+ syms[k].st_shndx >= ehdr->e_shnum) {
+ pr_debug("Symbol: %s has bad section index %d.\n",
+ name, syms[k].st_shndx);
+ return NULL;
+ }
+
+ /* Found the symbol we are looking for */
+ return &syms[k];
+ }
+ }
+
+ return NULL;
+}
+
+void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+ Elf_Sym *sym;
+ Elf_Shdr *sechdr;
+
+ sym = kexec_purgatory_find_symbol(pi, name);
+ if (!sym)
+ return ERR_PTR(-EINVAL);
+
+ sechdr = &pi->sechdrs[sym->st_shndx];
+
+ /*
+ * Returns the address where symbol will finally be loaded after
+ * kexec_load_segment()
+ */
+ return (void *)(sechdr->sh_addr + sym->st_value);
+}
+
+/*
+ * Get or set value of a symbol. If "get_value" is true, symbol value is
+ * returned in buf otherwise symbol value is set based on value in buf.
+ */
+int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
+ void *buf, unsigned int size, bool get_value)
+{
+ Elf_Sym *sym;
+ Elf_Shdr *sechdrs;
+ struct purgatory_info *pi = &image->purgatory_info;
+ char *sym_buf;
+
+ sym = kexec_purgatory_find_symbol(pi, name);
+ if (!sym)
+ return -EINVAL;
+
+ if (sym->st_size != size) {
+ pr_err("symbol %s size mismatch: expected %lu actual %u\n",
+ name, (unsigned long)sym->st_size, size);
+ return -EINVAL;
+ }
+
+ sechdrs = pi->sechdrs;
+
+ if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
+ pr_err("symbol %s is in a bss section. Cannot %s\n", name,
+ get_value ? "get" : "set");
+ return -EINVAL;
+ }
+
+ sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
+ sym->st_value;
+
+ if (get_value)
+ memcpy((void *)buf, sym_buf, size);
+ else
+ memcpy((void *)sym_buf, buf, size);
+
+ return 0;
+}
+#endif /* CONFIG_KEXEC_FILE */
/*
* Move into place and start executing a preloaded standalone
@@ -1682,7 +2745,15 @@ int kernel_kexec(void)
kexec_in_progress = true;
kernel_restart_prepare(NULL);
migrate_to_reboot_cpu();
- printk(KERN_EMERG "Starting new kernel\n");
+
+ /*
+ * migrate_to_reboot_cpu() disables CPU hotplug assuming that
+ * no further code needs to use CPU hotplug (which is true in
+ * the reboot case). However, the kexec path depends on using
+ * CPU hotplug again; so re-enable it here.
+ */
+ cpu_hotplug_enable();
+ pr_emerg("Starting new kernel\n");
machine_shutdown();
}
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 6b375af4958d..8637e041a247 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -285,10 +285,7 @@ static int wait_for_helper(void *data)
pid_t pid;
/* If SIGCLD is ignored sys_wait4 won't populate the status. */
- spin_lock_irq(&current->sighand->siglock);
- current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL;
- spin_unlock_irq(&current->sighand->siglock);
-
+ kernel_sigaction(SIGCHLD, SIG_DFL);
pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
if (pid < 0) {
sub_info->retval = pid;
@@ -498,7 +495,7 @@ int __usermodehelper_disable(enum umh_disable_depth depth)
static void helper_lock(void)
{
atomic_inc(&running_helpers);
- smp_mb__after_atomic_inc();
+ smp_mb__after_atomic();
}
static void helper_unlock(void)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ceeadfcabb76..3995f546d0f3 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -86,21 +86,8 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
return &(kretprobe_table_locks[hash].lock);
}
-/*
- * Normally, functions that we'd want to prohibit kprobes in, are marked
- * __kprobes. But, there are cases where such functions already belong to
- * a different section (__sched for preempt_schedule)
- *
- * For such cases, we now have a blacklist
- */
-static struct kprobe_blackpoint kprobe_blacklist[] = {
- {"preempt_schedule",},
- {"native_get_debugreg",},
- {"irq_entries_start",},
- {"common_interrupt",},
- {"mcount",}, /* mcount can be called from everywhere */
- {NULL} /* Terminator */
-};
+/* Blacklist -- list of struct kprobe_blacklist_entry */
+static LIST_HEAD(kprobe_blacklist);
#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
/*
@@ -151,13 +138,13 @@ struct kprobe_insn_cache kprobe_insn_slots = {
.insn_size = MAX_INSN_SIZE,
.nr_garbage = 0,
};
-static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c);
+static int collect_garbage_slots(struct kprobe_insn_cache *c);
/**
* __get_insn_slot() - Find a slot on an executable page for an instruction.
* We allocate an executable page if there's no room on existing ones.
*/
-kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
+kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
{
struct kprobe_insn_page *kip;
kprobe_opcode_t *slot = NULL;
@@ -214,7 +201,7 @@ out:
}
/* Return 1 if all garbages are collected, otherwise 0. */
-static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
+static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
{
kip->slot_used[idx] = SLOT_CLEAN;
kip->nused--;
@@ -235,7 +222,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
return 0;
}
-static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
+static int collect_garbage_slots(struct kprobe_insn_cache *c)
{
struct kprobe_insn_page *kip, *next;
@@ -257,8 +244,8 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
return 0;
}
-void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
- kprobe_opcode_t *slot, int dirty)
+void __free_insn_slot(struct kprobe_insn_cache *c,
+ kprobe_opcode_t *slot, int dirty)
{
struct kprobe_insn_page *kip;
@@ -314,7 +301,7 @@ static inline void reset_kprobe_instance(void)
* OR
* - with preemption disabled - from arch/xxx/kernel/kprobes.c
*/
-struct kprobe __kprobes *get_kprobe(void *addr)
+struct kprobe *get_kprobe(void *addr)
{
struct hlist_head *head;
struct kprobe *p;
@@ -327,8 +314,9 @@ struct kprobe __kprobes *get_kprobe(void *addr)
return NULL;
}
+NOKPROBE_SYMBOL(get_kprobe);
-static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
+static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
/* Return true if the kprobe is an aggregator */
static inline int kprobe_aggrprobe(struct kprobe *p)
@@ -360,7 +348,7 @@ static bool kprobes_allow_optimization;
* Call all pre_handler on the list, but ignores its return value.
* This must be called from arch-dep optimized caller.
*/
-void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
+void opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kprobe *kp;
@@ -372,9 +360,10 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
reset_kprobe_instance();
}
}
+NOKPROBE_SYMBOL(opt_pre_handler);
/* Free optimized instructions and optimized_kprobe */
-static __kprobes void free_aggr_kprobe(struct kprobe *p)
+static void free_aggr_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -412,7 +401,7 @@ static inline int kprobe_disarmed(struct kprobe *p)
}
/* Return true(!0) if the probe is queued on (un)optimizing lists */
-static int __kprobes kprobe_queued(struct kprobe *p)
+static int kprobe_queued(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -428,7 +417,7 @@ static int __kprobes kprobe_queued(struct kprobe *p)
* Return an optimized kprobe whose optimizing code replaces
* instructions including addr (exclude breakpoint).
*/
-static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
+static struct kprobe *get_optimized_kprobe(unsigned long addr)
{
int i;
struct kprobe *p = NULL;
@@ -460,7 +449,7 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
* Optimize (replace a breakpoint with a jump) kprobes listed on
* optimizing_list.
*/
-static __kprobes void do_optimize_kprobes(void)
+static void do_optimize_kprobes(void)
{
/* Optimization never be done when disarmed */
if (kprobes_all_disarmed || !kprobes_allow_optimization ||
@@ -488,7 +477,7 @@ static __kprobes void do_optimize_kprobes(void)
* Unoptimize (replace a jump with a breakpoint and remove the breakpoint
* if need) kprobes listed on unoptimizing_list.
*/
-static __kprobes void do_unoptimize_kprobes(void)
+static void do_unoptimize_kprobes(void)
{
struct optimized_kprobe *op, *tmp;
@@ -520,7 +509,7 @@ static __kprobes void do_unoptimize_kprobes(void)
}
/* Reclaim all kprobes on the free_list */
-static __kprobes void do_free_cleaned_kprobes(void)
+static void do_free_cleaned_kprobes(void)
{
struct optimized_kprobe *op, *tmp;
@@ -532,13 +521,13 @@ static __kprobes void do_free_cleaned_kprobes(void)
}
/* Start optimizer after OPTIMIZE_DELAY passed */
-static __kprobes void kick_kprobe_optimizer(void)
+static void kick_kprobe_optimizer(void)
{
schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
}
/* Kprobe jump optimizer */
-static __kprobes void kprobe_optimizer(struct work_struct *work)
+static void kprobe_optimizer(struct work_struct *work)
{
mutex_lock(&kprobe_mutex);
/* Lock modules while optimizing kprobes */
@@ -574,7 +563,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
}
/* Wait for completing optimization and unoptimization */
-static __kprobes void wait_for_kprobe_optimizer(void)
+static void wait_for_kprobe_optimizer(void)
{
mutex_lock(&kprobe_mutex);
@@ -593,7 +582,7 @@ static __kprobes void wait_for_kprobe_optimizer(void)
}
/* Optimize kprobe if p is ready to be optimized */
-static __kprobes void optimize_kprobe(struct kprobe *p)
+static void optimize_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -627,7 +616,7 @@ static __kprobes void optimize_kprobe(struct kprobe *p)
}
/* Short cut to direct unoptimizing */
-static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op)
+static void force_unoptimize_kprobe(struct optimized_kprobe *op)
{
get_online_cpus();
arch_unoptimize_kprobe(op);
@@ -637,7 +626,7 @@ static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op)
}
/* Unoptimize a kprobe if p is optimized */
-static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force)
+static void unoptimize_kprobe(struct kprobe *p, bool force)
{
struct optimized_kprobe *op;
@@ -697,7 +686,7 @@ static void reuse_unused_kprobe(struct kprobe *ap)
}
/* Remove optimized instructions */
-static void __kprobes kill_optimized_kprobe(struct kprobe *p)
+static void kill_optimized_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -723,7 +712,7 @@ static void __kprobes kill_optimized_kprobe(struct kprobe *p)
}
/* Try to prepare optimized instructions */
-static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
+static void prepare_optimized_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -732,7 +721,7 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
}
/* Allocate new optimized_kprobe and try to prepare optimized instructions */
-static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
+static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -747,13 +736,13 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
return &op->kp;
}
-static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
+static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
/*
* Prepare an optimized_kprobe and optimize it
* NOTE: p must be a normal registered kprobe
*/
-static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
+static void try_to_optimize_kprobe(struct kprobe *p)
{
struct kprobe *ap;
struct optimized_kprobe *op;
@@ -787,7 +776,7 @@ out:
}
#ifdef CONFIG_SYSCTL
-static void __kprobes optimize_all_kprobes(void)
+static void optimize_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
@@ -810,7 +799,7 @@ out:
mutex_unlock(&kprobe_mutex);
}
-static void __kprobes unoptimize_all_kprobes(void)
+static void unoptimize_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
@@ -861,7 +850,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
#endif /* CONFIG_SYSCTL */
/* Put a breakpoint for a probe. Must be called with text_mutex locked */
-static void __kprobes __arm_kprobe(struct kprobe *p)
+static void __arm_kprobe(struct kprobe *p)
{
struct kprobe *_p;
@@ -876,7 +865,7 @@ static void __kprobes __arm_kprobe(struct kprobe *p)
}
/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
-static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt)
+static void __disarm_kprobe(struct kprobe *p, bool reopt)
{
struct kprobe *_p;
@@ -911,13 +900,13 @@ static void reuse_unused_kprobe(struct kprobe *ap)
BUG_ON(kprobe_unused(ap));
}
-static __kprobes void free_aggr_kprobe(struct kprobe *p)
+static void free_aggr_kprobe(struct kprobe *p)
{
arch_remove_kprobe(p);
kfree(p);
}
-static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
+static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
}
@@ -931,7 +920,7 @@ static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
static int kprobe_ftrace_enabled;
/* Must ensure p->addr is really on ftrace */
-static int __kprobes prepare_kprobe(struct kprobe *p)
+static int prepare_kprobe(struct kprobe *p)
{
if (!kprobe_ftrace(p))
return arch_prepare_kprobe(p);
@@ -940,7 +929,7 @@ static int __kprobes prepare_kprobe(struct kprobe *p)
}
/* Caller must lock kprobe_mutex */
-static void __kprobes arm_kprobe_ftrace(struct kprobe *p)
+static void arm_kprobe_ftrace(struct kprobe *p)
{
int ret;
@@ -955,7 +944,7 @@ static void __kprobes arm_kprobe_ftrace(struct kprobe *p)
}
/* Caller must lock kprobe_mutex */
-static void __kprobes disarm_kprobe_ftrace(struct kprobe *p)
+static void disarm_kprobe_ftrace(struct kprobe *p)
{
int ret;
@@ -975,7 +964,7 @@ static void __kprobes disarm_kprobe_ftrace(struct kprobe *p)
#endif
/* Arm a kprobe with text_mutex */
-static void __kprobes arm_kprobe(struct kprobe *kp)
+static void arm_kprobe(struct kprobe *kp)
{
if (unlikely(kprobe_ftrace(kp))) {
arm_kprobe_ftrace(kp);
@@ -992,7 +981,7 @@ static void __kprobes arm_kprobe(struct kprobe *kp)
}
/* Disarm a kprobe with text_mutex */
-static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt)
+static void disarm_kprobe(struct kprobe *kp, bool reopt)
{
if (unlikely(kprobe_ftrace(kp))) {
disarm_kprobe_ftrace(kp);
@@ -1008,7 +997,7 @@ static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt)
* Aggregate handlers for multiple kprobes support - these handlers
* take care of invoking the individual kprobe handlers on p->list
*/
-static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
+static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kprobe *kp;
@@ -1022,9 +1011,10 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
}
return 0;
}
+NOKPROBE_SYMBOL(aggr_pre_handler);
-static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
- unsigned long flags)
+static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
+ unsigned long flags)
{
struct kprobe *kp;
@@ -1036,9 +1026,10 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
}
}
}
+NOKPROBE_SYMBOL(aggr_post_handler);
-static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
- int trapnr)
+static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
+ int trapnr)
{
struct kprobe *cur = __this_cpu_read(kprobe_instance);
@@ -1052,8 +1043,9 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
}
return 0;
}
+NOKPROBE_SYMBOL(aggr_fault_handler);
-static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
+static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kprobe *cur = __this_cpu_read(kprobe_instance);
int ret = 0;
@@ -1065,9 +1057,10 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
reset_kprobe_instance();
return ret;
}
+NOKPROBE_SYMBOL(aggr_break_handler);
/* Walks the list and increments nmissed count for multiprobe case */
-void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
+void kprobes_inc_nmissed_count(struct kprobe *p)
{
struct kprobe *kp;
if (!kprobe_aggrprobe(p)) {
@@ -1078,9 +1071,10 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
}
return;
}
+NOKPROBE_SYMBOL(kprobes_inc_nmissed_count);
-void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
- struct hlist_head *head)
+void recycle_rp_inst(struct kretprobe_instance *ri,
+ struct hlist_head *head)
{
struct kretprobe *rp = ri->rp;
@@ -1095,8 +1089,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
/* Unregistering */
hlist_add_head(&ri->hlist, head);
}
+NOKPROBE_SYMBOL(recycle_rp_inst);
-void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
+void kretprobe_hash_lock(struct task_struct *tsk,
struct hlist_head **head, unsigned long *flags)
__acquires(hlist_lock)
{
@@ -1107,17 +1102,19 @@ __acquires(hlist_lock)
hlist_lock = kretprobe_table_lock_ptr(hash);
raw_spin_lock_irqsave(hlist_lock, *flags);
}
+NOKPROBE_SYMBOL(kretprobe_hash_lock);
-static void __kprobes kretprobe_table_lock(unsigned long hash,
- unsigned long *flags)
+static void kretprobe_table_lock(unsigned long hash,
+ unsigned long *flags)
__acquires(hlist_lock)
{
raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
raw_spin_lock_irqsave(hlist_lock, *flags);
}
+NOKPROBE_SYMBOL(kretprobe_table_lock);
-void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
- unsigned long *flags)
+void kretprobe_hash_unlock(struct task_struct *tsk,
+ unsigned long *flags)
__releases(hlist_lock)
{
unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
@@ -1126,14 +1123,16 @@ __releases(hlist_lock)
hlist_lock = kretprobe_table_lock_ptr(hash);
raw_spin_unlock_irqrestore(hlist_lock, *flags);
}
+NOKPROBE_SYMBOL(kretprobe_hash_unlock);
-static void __kprobes kretprobe_table_unlock(unsigned long hash,
- unsigned long *flags)
+static void kretprobe_table_unlock(unsigned long hash,
+ unsigned long *flags)
__releases(hlist_lock)
{
raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
raw_spin_unlock_irqrestore(hlist_lock, *flags);
}
+NOKPROBE_SYMBOL(kretprobe_table_unlock);
/*
* This function is called from finish_task_switch when task tk becomes dead,
@@ -1141,7 +1140,7 @@ __releases(hlist_lock)
* with this task. These left over instances represent probed functions
* that have been called but will never return.
*/
-void __kprobes kprobe_flush_task(struct task_struct *tk)
+void kprobe_flush_task(struct task_struct *tk)
{
struct kretprobe_instance *ri;
struct hlist_head *head, empty_rp;
@@ -1166,6 +1165,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
kfree(ri);
}
}
+NOKPROBE_SYMBOL(kprobe_flush_task);
static inline void free_rp_inst(struct kretprobe *rp)
{
@@ -1178,7 +1178,7 @@ static inline void free_rp_inst(struct kretprobe *rp)
}
}
-static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
+static void cleanup_rp_inst(struct kretprobe *rp)
{
unsigned long flags, hash;
struct kretprobe_instance *ri;
@@ -1197,12 +1197,13 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
}
free_rp_inst(rp);
}
+NOKPROBE_SYMBOL(cleanup_rp_inst);
/*
* Add the new probe to ap->list. Fail if this is the
* second jprobe at the address - two jprobes can't coexist
*/
-static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
+static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
{
BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
@@ -1226,7 +1227,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
* Fill in the required fields of the "manager kprobe". Replace the
* earlier kprobe in the hlist with the manager kprobe
*/
-static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
+static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
{
/* Copy p's insn slot to ap */
copy_kprobe(p, ap);
@@ -1252,8 +1253,7 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
* This is the second or subsequent kprobe at the address - handle
* the intricacies
*/
-static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
- struct kprobe *p)
+static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
{
int ret = 0;
struct kprobe *ap = orig_p;
@@ -1324,25 +1324,29 @@ out:
return ret;
}
-static int __kprobes in_kprobes_functions(unsigned long addr)
+bool __weak arch_within_kprobe_blacklist(unsigned long addr)
{
- struct kprobe_blackpoint *kb;
+ /* The __kprobes marked functions and entry code must not be probed */
+ return addr >= (unsigned long)__kprobes_text_start &&
+ addr < (unsigned long)__kprobes_text_end;
+}
- if (addr >= (unsigned long)__kprobes_text_start &&
- addr < (unsigned long)__kprobes_text_end)
- return -EINVAL;
+static bool within_kprobe_blacklist(unsigned long addr)
+{
+ struct kprobe_blacklist_entry *ent;
+
+ if (arch_within_kprobe_blacklist(addr))
+ return true;
/*
* If there exists a kprobe_blacklist, verify and
* fail any probe registration in the prohibited area
*/
- for (kb = kprobe_blacklist; kb->name != NULL; kb++) {
- if (kb->start_addr) {
- if (addr >= kb->start_addr &&
- addr < (kb->start_addr + kb->range))
- return -EINVAL;
- }
+ list_for_each_entry(ent, &kprobe_blacklist, list) {
+ if (addr >= ent->start_addr && addr < ent->end_addr)
+ return true;
}
- return 0;
+
+ return false;
}
/*
@@ -1351,7 +1355,7 @@ static int __kprobes in_kprobes_functions(unsigned long addr)
* This returns encoded errors if it fails to look up symbol or invalid
* combination of parameters.
*/
-static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
+static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
{
kprobe_opcode_t *addr = p->addr;
@@ -1374,7 +1378,7 @@ invalid:
}
/* Check passed kprobe is valid and return kprobe in kprobe_table. */
-static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
+static struct kprobe *__get_valid_kprobe(struct kprobe *p)
{
struct kprobe *ap, *list_p;
@@ -1406,8 +1410,8 @@ static inline int check_kprobe_rereg(struct kprobe *p)
return ret;
}
-static __kprobes int check_kprobe_address_safe(struct kprobe *p,
- struct module **probed_mod)
+static int check_kprobe_address_safe(struct kprobe *p,
+ struct module **probed_mod)
{
int ret = 0;
unsigned long ftrace_addr;
@@ -1433,7 +1437,7 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p,
/* Ensure it is not in reserved area nor out of text */
if (!kernel_text_address((unsigned long) p->addr) ||
- in_kprobes_functions((unsigned long) p->addr) ||
+ within_kprobe_blacklist((unsigned long) p->addr) ||
jump_label_text_reserved(p->addr, p->addr)) {
ret = -EINVAL;
goto out;
@@ -1469,7 +1473,7 @@ out:
return ret;
}
-int __kprobes register_kprobe(struct kprobe *p)
+int register_kprobe(struct kprobe *p)
{
int ret;
struct kprobe *old_p;
@@ -1531,7 +1535,7 @@ out:
EXPORT_SYMBOL_GPL(register_kprobe);
/* Check if all probes on the aggrprobe are disabled */
-static int __kprobes aggr_kprobe_disabled(struct kprobe *ap)
+static int aggr_kprobe_disabled(struct kprobe *ap)
{
struct kprobe *kp;
@@ -1547,7 +1551,7 @@ static int __kprobes aggr_kprobe_disabled(struct kprobe *ap)
}
/* Disable one kprobe: Make sure called under kprobe_mutex is locked */
-static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
+static struct kprobe *__disable_kprobe(struct kprobe *p)
{
struct kprobe *orig_p;
@@ -1574,7 +1578,7 @@ static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
/*
* Unregister a kprobe without a scheduler synchronization.
*/
-static int __kprobes __unregister_kprobe_top(struct kprobe *p)
+static int __unregister_kprobe_top(struct kprobe *p)
{
struct kprobe *ap, *list_p;
@@ -1631,7 +1635,7 @@ disarmed:
return 0;
}
-static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
+static void __unregister_kprobe_bottom(struct kprobe *p)
{
struct kprobe *ap;
@@ -1647,7 +1651,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
/* Otherwise, do nothing. */
}
-int __kprobes register_kprobes(struct kprobe **kps, int num)
+int register_kprobes(struct kprobe **kps, int num)
{
int i, ret = 0;
@@ -1665,13 +1669,13 @@ int __kprobes register_kprobes(struct kprobe **kps, int num)
}
EXPORT_SYMBOL_GPL(register_kprobes);
-void __kprobes unregister_kprobe(struct kprobe *p)
+void unregister_kprobe(struct kprobe *p)
{
unregister_kprobes(&p, 1);
}
EXPORT_SYMBOL_GPL(unregister_kprobe);
-void __kprobes unregister_kprobes(struct kprobe **kps, int num)
+void unregister_kprobes(struct kprobe **kps, int num)
{
int i;
@@ -1700,7 +1704,7 @@ unsigned long __weak arch_deref_entry_point(void *entry)
return (unsigned long)entry;
}
-int __kprobes register_jprobes(struct jprobe **jps, int num)
+int register_jprobes(struct jprobe **jps, int num)
{
struct jprobe *jp;
int ret = 0, i;
@@ -1731,19 +1735,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
}
EXPORT_SYMBOL_GPL(register_jprobes);
-int __kprobes register_jprobe(struct jprobe *jp)
+int register_jprobe(struct jprobe *jp)
{
return register_jprobes(&jp, 1);
}
EXPORT_SYMBOL_GPL(register_jprobe);
-void __kprobes unregister_jprobe(struct jprobe *jp)
+void unregister_jprobe(struct jprobe *jp)
{
unregister_jprobes(&jp, 1);
}
EXPORT_SYMBOL_GPL(unregister_jprobe);
-void __kprobes unregister_jprobes(struct jprobe **jps, int num)
+void unregister_jprobes(struct jprobe **jps, int num)
{
int i;
@@ -1768,14 +1772,24 @@ EXPORT_SYMBOL_GPL(unregister_jprobes);
* This kprobe pre_handler is registered with every kretprobe. When probe
* hits it will set up the return probe.
*/
-static int __kprobes pre_handler_kretprobe(struct kprobe *p,
- struct pt_regs *regs)
+static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
struct kretprobe *rp = container_of(p, struct kretprobe, kp);
unsigned long hash, flags = 0;
struct kretprobe_instance *ri;
- /*TODO: consider to only swap the RA after the last pre_handler fired */
+ /*
+ * To avoid deadlocks, prohibit return probing in NMI contexts,
+ * just skip the probe and increase the (inexact) 'nmissed'
+ * statistical counter, so that the user is informed that
+ * something happened:
+ */
+ if (unlikely(in_nmi())) {
+ rp->nmissed++;
+ return 0;
+ }
+
+ /* TODO: consider to only swap the RA after the last pre_handler fired */
hash = hash_ptr(current, KPROBE_HASH_BITS);
raw_spin_lock_irqsave(&rp->lock, flags);
if (!hlist_empty(&rp->free_instances)) {
@@ -1807,8 +1821,9 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
}
return 0;
}
+NOKPROBE_SYMBOL(pre_handler_kretprobe);
-int __kprobes register_kretprobe(struct kretprobe *rp)
+int register_kretprobe(struct kretprobe *rp)
{
int ret = 0;
struct kretprobe_instance *inst;
@@ -1861,7 +1876,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
}
EXPORT_SYMBOL_GPL(register_kretprobe);
-int __kprobes register_kretprobes(struct kretprobe **rps, int num)
+int register_kretprobes(struct kretprobe **rps, int num)
{
int ret = 0, i;
@@ -1879,13 +1894,13 @@ int __kprobes register_kretprobes(struct kretprobe **rps, int num)
}
EXPORT_SYMBOL_GPL(register_kretprobes);
-void __kprobes unregister_kretprobe(struct kretprobe *rp)
+void unregister_kretprobe(struct kretprobe *rp)
{
unregister_kretprobes(&rp, 1);
}
EXPORT_SYMBOL_GPL(unregister_kretprobe);
-void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
+void unregister_kretprobes(struct kretprobe **rps, int num)
{
int i;
@@ -1908,38 +1923,38 @@ void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
EXPORT_SYMBOL_GPL(unregister_kretprobes);
#else /* CONFIG_KRETPROBES */
-int __kprobes register_kretprobe(struct kretprobe *rp)
+int register_kretprobe(struct kretprobe *rp)
{
return -ENOSYS;
}
EXPORT_SYMBOL_GPL(register_kretprobe);
-int __kprobes register_kretprobes(struct kretprobe **rps, int num)
+int register_kretprobes(struct kretprobe **rps, int num)
{
return -ENOSYS;
}
EXPORT_SYMBOL_GPL(register_kretprobes);
-void __kprobes unregister_kretprobe(struct kretprobe *rp)
+void unregister_kretprobe(struct kretprobe *rp)
{
}
EXPORT_SYMBOL_GPL(unregister_kretprobe);
-void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
+void unregister_kretprobes(struct kretprobe **rps, int num)
{
}
EXPORT_SYMBOL_GPL(unregister_kretprobes);
-static int __kprobes pre_handler_kretprobe(struct kprobe *p,
- struct pt_regs *regs)
+static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
return 0;
}
+NOKPROBE_SYMBOL(pre_handler_kretprobe);
#endif /* CONFIG_KRETPROBES */
/* Set the kprobe gone and remove its instruction buffer. */
-static void __kprobes kill_kprobe(struct kprobe *p)
+static void kill_kprobe(struct kprobe *p)
{
struct kprobe *kp;
@@ -1963,7 +1978,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
}
/* Disable one kprobe */
-int __kprobes disable_kprobe(struct kprobe *kp)
+int disable_kprobe(struct kprobe *kp)
{
int ret = 0;
@@ -1979,7 +1994,7 @@ int __kprobes disable_kprobe(struct kprobe *kp)
EXPORT_SYMBOL_GPL(disable_kprobe);
/* Enable one kprobe */
-int __kprobes enable_kprobe(struct kprobe *kp)
+int enable_kprobe(struct kprobe *kp)
{
int ret = 0;
struct kprobe *p;
@@ -2012,16 +2027,53 @@ out:
}
EXPORT_SYMBOL_GPL(enable_kprobe);
-void __kprobes dump_kprobe(struct kprobe *kp)
+void dump_kprobe(struct kprobe *kp)
{
printk(KERN_WARNING "Dumping kprobe:\n");
printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n",
kp->symbol_name, kp->addr, kp->offset);
}
+NOKPROBE_SYMBOL(dump_kprobe);
+
+/*
+ * Lookup and populate the kprobe_blacklist.
+ *
+ * Unlike the kretprobe blacklist, we'll need to determine
+ * the range of addresses that belong to the said functions,
+ * since a kprobe need not necessarily be at the beginning
+ * of a function.
+ */
+static int __init populate_kprobe_blacklist(unsigned long *start,
+ unsigned long *end)
+{
+ unsigned long *iter;
+ struct kprobe_blacklist_entry *ent;
+ unsigned long entry, offset = 0, size = 0;
+
+ for (iter = start; iter < end; iter++) {
+ entry = arch_deref_entry_point((void *)*iter);
+
+ if (!kernel_text_address(entry) ||
+ !kallsyms_lookup_size_offset(entry, &size, &offset)) {
+ pr_err("Failed to find blacklist at %p\n",
+ (void *)entry);
+ continue;
+ }
+
+ ent = kmalloc(sizeof(*ent), GFP_KERNEL);
+ if (!ent)
+ return -ENOMEM;
+ ent->start_addr = entry;
+ ent->end_addr = entry + size;
+ INIT_LIST_HEAD(&ent->list);
+ list_add_tail(&ent->list, &kprobe_blacklist);
+ }
+ return 0;
+}
/* Module notifier call back, checking kprobes on the module */
-static int __kprobes kprobes_module_callback(struct notifier_block *nb,
- unsigned long val, void *data)
+static int kprobes_module_callback(struct notifier_block *nb,
+ unsigned long val, void *data)
{
struct module *mod = data;
struct hlist_head *head;
@@ -2062,14 +2114,13 @@ static struct notifier_block kprobe_module_nb = {
.priority = 0
};
+/* Markers of _kprobe_blacklist section */
+extern unsigned long __start_kprobe_blacklist[];
+extern unsigned long __stop_kprobe_blacklist[];
+
static int __init init_kprobes(void)
{
int i, err = 0;
- unsigned long offset = 0, size = 0;
- char *modname, namebuf[KSYM_NAME_LEN];
- const char *symbol_name;
- void *addr;
- struct kprobe_blackpoint *kb;
/* FIXME allocate the probe table, currently defined statically */
/* initialize all list heads */
@@ -2079,26 +2130,11 @@ static int __init init_kprobes(void)
raw_spin_lock_init(&(kretprobe_table_locks[i].lock));
}
- /*
- * Lookup and populate the kprobe_blacklist.
- *
- * Unlike the kretprobe blacklist, we'll need to determine
- * the range of addresses that belong to the said functions,
- * since a kprobe need not necessarily be at the beginning
- * of a function.
- */
- for (kb = kprobe_blacklist; kb->name != NULL; kb++) {
- kprobe_lookup_name(kb->name, addr);
- if (!addr)
- continue;
-
- kb->start_addr = (unsigned long)addr;
- symbol_name = kallsyms_lookup(kb->start_addr,
- &size, &offset, &modname, namebuf);
- if (!symbol_name)
- kb->range = 0;
- else
- kb->range = size;
+ err = populate_kprobe_blacklist(__start_kprobe_blacklist,
+ __stop_kprobe_blacklist);
+ if (err) {
+ pr_err("kprobes: failed to populate blacklist: %d\n", err);
+ pr_err("Please take care of using kprobes.\n");
}
if (kretprobe_blacklist_size) {
@@ -2138,7 +2174,7 @@ static int __init init_kprobes(void)
}
#ifdef CONFIG_DEBUG_FS
-static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
+static void report_probe(struct seq_file *pi, struct kprobe *p,
const char *sym, int offset, char *modname, struct kprobe *pp)
{
char *kprobe_type;
@@ -2167,12 +2203,12 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
(kprobe_ftrace(pp) ? "[FTRACE]" : ""));
}
-static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
+static void *kprobe_seq_start(struct seq_file *f, loff_t *pos)
{
return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL;
}
-static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
+static void *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
{
(*pos)++;
if (*pos >= KPROBE_TABLE_SIZE)
@@ -2180,12 +2216,12 @@ static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
return pos;
}
-static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v)
+static void kprobe_seq_stop(struct seq_file *f, void *v)
{
/* Nothing to do */
}
-static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
+static int show_kprobe_addr(struct seq_file *pi, void *v)
{
struct hlist_head *head;
struct kprobe *p, *kp;
@@ -2216,7 +2252,7 @@ static const struct seq_operations kprobes_seq_ops = {
.show = show_kprobe_addr
};
-static int __kprobes kprobes_open(struct inode *inode, struct file *filp)
+static int kprobes_open(struct inode *inode, struct file *filp)
{
return seq_open(filp, &kprobes_seq_ops);
}
@@ -2228,7 +2264,47 @@ static const struct file_operations debugfs_kprobes_operations = {
.release = seq_release,
};
-static void __kprobes arm_all_kprobes(void)
+/* kprobes/blacklist -- shows which functions can not be probed */
+static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos)
+{
+ return seq_list_start(&kprobe_blacklist, *pos);
+}
+
+static void *kprobe_blacklist_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &kprobe_blacklist, pos);
+}
+
+static int kprobe_blacklist_seq_show(struct seq_file *m, void *v)
+{
+ struct kprobe_blacklist_entry *ent =
+ list_entry(v, struct kprobe_blacklist_entry, list);
+
+ seq_printf(m, "0x%p-0x%p\t%ps\n", (void *)ent->start_addr,
+ (void *)ent->end_addr, (void *)ent->start_addr);
+ return 0;
+}
+
+static const struct seq_operations kprobe_blacklist_seq_ops = {
+ .start = kprobe_blacklist_seq_start,
+ .next = kprobe_blacklist_seq_next,
+ .stop = kprobe_seq_stop, /* Reuse void function */
+ .show = kprobe_blacklist_seq_show,
+};
+
+static int kprobe_blacklist_open(struct inode *inode, struct file *filp)
+{
+ return seq_open(filp, &kprobe_blacklist_seq_ops);
+}
+
+static const struct file_operations debugfs_kprobe_blacklist_ops = {
+ .open = kprobe_blacklist_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static void arm_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
@@ -2256,7 +2332,7 @@ already_enabled:
return;
}
-static void __kprobes disarm_all_kprobes(void)
+static void disarm_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
@@ -2340,7 +2416,7 @@ static const struct file_operations fops_kp = {
.llseek = default_llseek,
};
-static int __kprobes debugfs_kprobe_init(void)
+static int __init debugfs_kprobe_init(void)
{
struct dentry *dir, *file;
unsigned int value = 1;
@@ -2351,19 +2427,24 @@ static int __kprobes debugfs_kprobe_init(void)
file = debugfs_create_file("list", 0444, dir, NULL,
&debugfs_kprobes_operations);
- if (!file) {
- debugfs_remove(dir);
- return -ENOMEM;
- }
+ if (!file)
+ goto error;
file = debugfs_create_file("enabled", 0600, dir,
&value, &fops_kp);
- if (!file) {
- debugfs_remove(dir);
- return -ENOMEM;
- }
+ if (!file)
+ goto error;
+
+ file = debugfs_create_file("blacklist", 0444, dir, NULL,
+ &debugfs_kprobe_blacklist_ops);
+ if (!file)
+ goto error;
return 0;
+
+error:
+ debugfs_remove(dir);
+ return -ENOMEM;
}
late_initcall(debugfs_kprobe_init);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index d945a949760f..6683ccef9fff 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -18,6 +18,9 @@
#include <linux/stat.h>
#include <linux/sched.h>
#include <linux/capability.h>
+#include <linux/compiler.h>
+
+#include <linux/rcupdate.h> /* rcu_expedited */
#define KERNEL_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
@@ -34,6 +37,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj,
}
KERNEL_ATTR_RO(uevent_seqnum);
+#ifdef CONFIG_UEVENT_HELPER
/* uevent helper program, used during early boot */
static ssize_t uevent_helper_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -53,7 +57,7 @@ static ssize_t uevent_helper_store(struct kobject *kobj,
return count;
}
KERNEL_ATTR_RW(uevent_helper);
-
+#endif
#ifdef CONFIG_PROFILING
static ssize_t profiling_show(struct kobject *kobj,
@@ -160,8 +164,8 @@ KERNEL_ATTR_RW(rcu_expedited);
/*
* Make /sys/kernel/notes give the raw contents of our kernel .notes section.
*/
-extern const void __start_notes __attribute__((weak));
-extern const void __stop_notes __attribute__((weak));
+extern const void __start_notes __weak;
+extern const void __stop_notes __weak;
#define notes_size (&__stop_notes - &__start_notes)
static ssize_t notes_read(struct file *filp, struct kobject *kobj,
@@ -186,7 +190,9 @@ EXPORT_SYMBOL_GPL(kernel_kobj);
static struct attribute * kernel_attrs[] = {
&fscaps_attr.attr,
&uevent_seqnum_attr.attr,
+#ifdef CONFIG_UEVENT_HELPER
&uevent_helper_attr.attr,
+#endif
#ifdef CONFIG_PROFILING
&profiling_attr.attr,
#endif
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b5ae3ee860a9..ef483220e855 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -217,7 +217,7 @@ int tsk_fork_get_node(struct task_struct *tsk)
if (tsk == kthreadd_task)
return tsk->pref_node_fork;
#endif
- return numa_node_id();
+ return NUMA_NO_NODE;
}
static void create_kthread(struct kthread_create_info *create)
@@ -262,7 +262,7 @@ static void create_kthread(struct kthread_create_info *create)
* kthread_stop() has been called). The return value should be zero
* or a negative error number; it will be passed to kthread_stop().
*
- * Returns a task_struct or ERR_PTR(-ENOMEM).
+ * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR).
*/
struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
void *data, int node,
@@ -298,7 +298,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
* that thread.
*/
if (xchg(&create->done, NULL))
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(-EINTR);
/*
* kthreadd (or new kernel thread) will call complete()
* shortly.
@@ -369,7 +369,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
{
struct task_struct *p;
- p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
+ p = kthread_create_on_node(threadfn, data, cpu_to_mem(cpu), namefmt,
cpu);
if (IS_ERR(p))
return p;
@@ -591,7 +591,7 @@ static void insert_kthread_work(struct kthread_worker *worker,
list_add_tail(&work->node, pos);
work->worker = worker;
- if (likely(worker->task))
+ if (!worker->current_work && likely(worker->task))
wake_up_process(worker->task);
}
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index a462b317f9a0..a02812743a7e 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -88,7 +88,8 @@ static void clear_global_latency_tracing(void)
}
static void __sched
-account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat)
+account_global_scheduler_latency(struct task_struct *tsk,
+ struct latency_record *lat)
{
int firstnonnull = MAXLR + 1;
int i;
@@ -255,7 +256,7 @@ static int lstats_show(struct seq_file *m, void *v)
break;
seq_printf(m, " %ps", (void *)bt);
}
- seq_printf(m, "\n");
+ seq_puts(m, "\n");
}
}
return 0;
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index baab8e5e7f66..8541bfdfd232 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,5 +1,5 @@
-obj-y += mutex.o semaphore.o rwsem.o lglock.o
+obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_lockdep.o = -pg
@@ -14,6 +14,7 @@ ifeq ($(CONFIG_PROC_FS),y)
obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
endif
obj-$(CONFIG_SMP) += spinlock.o
+obj-$(CONFIG_SMP) += lglock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
@@ -23,3 +24,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
+obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o
+obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index eb8a54783fa0..88d0d4420ad2 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -384,7 +384,9 @@ static void print_lockdep_off(const char *bug_msg)
{
printk(KERN_DEBUG "%s\n", bug_msg);
printk(KERN_DEBUG "turning off the locking correctness validator.\n");
+#ifdef CONFIG_LOCK_STAT
printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n");
+#endif
}
static int save_trace(struct stack_trace *trace)
@@ -1936,12 +1938,12 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
for (;;) {
int distance = curr->lockdep_depth - depth + 1;
- hlock = curr->held_locks + depth-1;
+ hlock = curr->held_locks + depth - 1;
/*
* Only non-recursive-read entries get new dependencies
* added:
*/
- if (hlock->read != 2) {
+ if (hlock->read != 2 && hlock->check) {
if (!check_prev_add(curr, hlock, next,
distance, trylock_loop))
return 0;
@@ -2098,7 +2100,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
* (If lookup_chain_cache() returns with 1 it acquires
* graph_lock for us)
*/
- if (!hlock->trylock && (hlock->check == 2) &&
+ if (!hlock->trylock && hlock->check &&
lookup_chain_cache(curr, hlock, chain_key)) {
/*
* Check whether last held lock:
@@ -2517,7 +2519,7 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
BUG_ON(usage_bit >= LOCK_USAGE_STATES);
- if (hlock_class(hlock)->key == __lockdep_no_validate__.subkeys)
+ if (!hlock->check)
continue;
if (!mark_lock(curr, hlock, usage_bit))
@@ -2557,7 +2559,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip)
debug_atomic_inc(hardirqs_on_events);
}
-void trace_hardirqs_on_caller(unsigned long ip)
+__visible void trace_hardirqs_on_caller(unsigned long ip)
{
time_hardirqs_on(CALLER_ADDR0, ip);
@@ -2610,7 +2612,7 @@ EXPORT_SYMBOL(trace_hardirqs_on);
/*
* Hardirqs were disabled:
*/
-void trace_hardirqs_off_caller(unsigned long ip)
+__visible void trace_hardirqs_off_caller(unsigned long ip)
{
struct task_struct *curr = current;
@@ -3055,9 +3057,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int class_idx;
u64 chain_key;
- if (!prove_locking)
- check = 1;
-
if (unlikely(!debug_locks))
return 0;
@@ -3069,8 +3068,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return 0;
- if (lock->key == &__lockdep_no_validate__)
- check = 1;
+ if (!prove_locking || lock->key == &__lockdep_no_validate__)
+ check = 0;
if (subclass < NR_LOCKDEP_CACHING_CLASSES)
class = lock->class_cache[subclass];
@@ -3138,7 +3137,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
hlock->holdtime_stamp = lockstat_clock();
#endif
- if (check == 2 && !mark_irqflags(curr, hlock))
+ if (check && !mark_irqflags(curr, hlock))
return 0;
/* mark it as used: */
@@ -4191,7 +4190,7 @@ void debug_show_held_locks(struct task_struct *task)
}
EXPORT_SYMBOL_GPL(debug_show_held_locks);
-void lockdep_sys_exit(void)
+asmlinkage __visible void lockdep_sys_exit(void)
{
struct task_struct *curr = current;
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index 4f560cfedc8f..51c4b24b6328 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -54,9 +54,9 @@ enum {
* table (if it's not there yet), and we check it for lock order
* conflicts and deadlocks.
*/
-#define MAX_LOCKDEP_ENTRIES 16384UL
+#define MAX_LOCKDEP_ENTRIES 32768UL
-#define MAX_LOCKDEP_CHAINS_BITS 15
+#define MAX_LOCKDEP_CHAINS_BITS 16
#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
@@ -65,7 +65,7 @@ enum {
* Stack-trace: tightly packed array of stack backtrace
* addresses. Protected by the hash_lock.
*/
-#define MAX_STACK_TRACE_ENTRIES 262144UL
+#define MAX_STACK_TRACE_ENTRIES 524288UL
extern struct list_head all_lock_classes;
extern struct lock_chain lock_chains[];
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
new file mode 100644
index 000000000000..0955b885d0dc
--- /dev/null
+++ b/kernel/locking/locktorture.c
@@ -0,0 +1,454 @@
+/*
+ * Module-based torture test facility for locking
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (C) IBM Corporation, 2014
+ *
+ * Author: Paul E. McKenney <paulmck@us.ibm.com>
+ * Based on kernel/rcu/torture.c.
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/freezer.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include <linux/trace_clock.h>
+#include <asm/byteorder.h>
+#include <linux/torture.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
+
+torture_param(int, nwriters_stress, -1,
+ "Number of write-locking stress-test threads");
+torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
+torture_param(int, onoff_interval, 0,
+ "Time between CPU hotplugs (s), 0=disable");
+torture_param(int, shuffle_interval, 3,
+ "Number of jiffies between shuffles, 0=disable");
+torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable.");
+torture_param(int, stat_interval, 60,
+ "Number of seconds between stats printk()s");
+torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
+torture_param(bool, verbose, true,
+ "Enable verbose debugging printk()s");
+
+static char *torture_type = "spin_lock";
+module_param(torture_type, charp, 0444);
+MODULE_PARM_DESC(torture_type,
+ "Type of lock to torture (spin_lock, spin_lock_irq, ...)");
+
+static atomic_t n_lock_torture_errors;
+
+static struct task_struct *stats_task;
+static struct task_struct **writer_tasks;
+
+static int nrealwriters_stress;
+static bool lock_is_write_held;
+
+struct lock_writer_stress_stats {
+ long n_write_lock_fail;
+ long n_write_lock_acquired;
+};
+static struct lock_writer_stress_stats *lwsa;
+
+#if defined(MODULE)
+#define LOCKTORTURE_RUNNABLE_INIT 1
+#else
+#define LOCKTORTURE_RUNNABLE_INIT 0
+#endif
+int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT;
+module_param(locktorture_runnable, int, 0444);
+MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init");
+
+/* Forward reference. */
+static void lock_torture_cleanup(void);
+
+/*
+ * Operations vector for selecting different types of tests.
+ */
+struct lock_torture_ops {
+ void (*init)(void);
+ int (*writelock)(void);
+ void (*write_delay)(struct torture_random_state *trsp);
+ void (*writeunlock)(void);
+ unsigned long flags;
+ const char *name;
+};
+
+static struct lock_torture_ops *cur_ops;
+
+/*
+ * Definitions for lock torture testing.
+ */
+
+static int torture_lock_busted_write_lock(void)
+{
+ return 0; /* BUGGY, do not use in real life!!! */
+}
+
+static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
+{
+ const unsigned long longdelay_us = 100;
+
+ /* We want a long delay occasionally to force massive contention. */
+ if (!(torture_random(trsp) %
+ (nrealwriters_stress * 2000 * longdelay_us)))
+ mdelay(longdelay_us);
+#ifdef CONFIG_PREEMPT
+ if (!(torture_random(trsp) % (nrealwriters_stress * 20000)))
+ preempt_schedule(); /* Allow test to be preempted. */
+#endif
+}
+
+static void torture_lock_busted_write_unlock(void)
+{
+ /* BUGGY, do not use in real life!!! */
+}
+
+static struct lock_torture_ops lock_busted_ops = {
+ .writelock = torture_lock_busted_write_lock,
+ .write_delay = torture_lock_busted_write_delay,
+ .writeunlock = torture_lock_busted_write_unlock,
+ .name = "lock_busted"
+};
+
+static DEFINE_SPINLOCK(torture_spinlock);
+
+static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock)
+{
+ spin_lock(&torture_spinlock);
+ return 0;
+}
+
+static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
+{
+ const unsigned long shortdelay_us = 2;
+ const unsigned long longdelay_us = 100;
+
+ /* We want a short delay mostly to emulate likely code, and
+ * we want a long delay occasionally to force massive contention.
+ */
+ if (!(torture_random(trsp) %
+ (nrealwriters_stress * 2000 * longdelay_us)))
+ mdelay(longdelay_us);
+ if (!(torture_random(trsp) %
+ (nrealwriters_stress * 2 * shortdelay_us)))
+ udelay(shortdelay_us);
+#ifdef CONFIG_PREEMPT
+ if (!(torture_random(trsp) % (nrealwriters_stress * 20000)))
+ preempt_schedule(); /* Allow test to be preempted. */
+#endif
+}
+
+static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock)
+{
+ spin_unlock(&torture_spinlock);
+}
+
+static struct lock_torture_ops spin_lock_ops = {
+ .writelock = torture_spin_lock_write_lock,
+ .write_delay = torture_spin_lock_write_delay,
+ .writeunlock = torture_spin_lock_write_unlock,
+ .name = "spin_lock"
+};
+
+static int torture_spin_lock_write_lock_irq(void)
+__acquires(torture_spinlock_irq)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&torture_spinlock, flags);
+ cur_ops->flags = flags;
+ return 0;
+}
+
+static void torture_lock_spin_write_unlock_irq(void)
+__releases(torture_spinlock)
+{
+ spin_unlock_irqrestore(&torture_spinlock, cur_ops->flags);
+}
+
+static struct lock_torture_ops spin_lock_irq_ops = {
+ .writelock = torture_spin_lock_write_lock_irq,
+ .write_delay = torture_spin_lock_write_delay,
+ .writeunlock = torture_lock_spin_write_unlock_irq,
+ .name = "spin_lock_irq"
+};
+
+/*
+ * Lock torture writer kthread. Repeatedly acquires and releases
+ * the lock, checking for duplicate acquisitions.
+ */
+static int lock_torture_writer(void *arg)
+{
+ struct lock_writer_stress_stats *lwsp = arg;
+ static DEFINE_TORTURE_RANDOM(rand);
+
+ VERBOSE_TOROUT_STRING("lock_torture_writer task started");
+ set_user_nice(current, MAX_NICE);
+
+ do {
+ if ((torture_random(&rand) & 0xfffff) == 0)
+ schedule_timeout_uninterruptible(1);
+ cur_ops->writelock();
+ if (WARN_ON_ONCE(lock_is_write_held))
+ lwsp->n_write_lock_fail++;
+ lock_is_write_held = 1;
+ lwsp->n_write_lock_acquired++;
+ cur_ops->write_delay(&rand);
+ lock_is_write_held = 0;
+ cur_ops->writeunlock();
+ stutter_wait("lock_torture_writer");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("lock_torture_writer");
+ return 0;
+}
+
+/*
+ * Create an lock-torture-statistics message in the specified buffer.
+ */
+static void lock_torture_printk(char *page)
+{
+ bool fail = 0;
+ int i;
+ long max = 0;
+ long min = lwsa[0].n_write_lock_acquired;
+ long long sum = 0;
+
+ for (i = 0; i < nrealwriters_stress; i++) {
+ if (lwsa[i].n_write_lock_fail)
+ fail = true;
+ sum += lwsa[i].n_write_lock_acquired;
+ if (max < lwsa[i].n_write_lock_fail)
+ max = lwsa[i].n_write_lock_fail;
+ if (min > lwsa[i].n_write_lock_fail)
+ min = lwsa[i].n_write_lock_fail;
+ }
+ page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG);
+ page += sprintf(page,
+ "Writes: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
+ sum, max, min, max / 2 > min ? "???" : "",
+ fail, fail ? "!!!" : "");
+ if (fail)
+ atomic_inc(&n_lock_torture_errors);
+}
+
+/*
+ * Print torture statistics. Caller must ensure that there is only one
+ * call to this function at a given time!!! This is normally accomplished
+ * by relying on the module system to only have one copy of the module
+ * loaded, and then by giving the lock_torture_stats kthread full control
+ * (or the init/cleanup functions when lock_torture_stats thread is not
+ * running).
+ */
+static void lock_torture_stats_print(void)
+{
+ int size = nrealwriters_stress * 200 + 8192;
+ char *buf;
+
+ buf = kmalloc(size, GFP_KERNEL);
+ if (!buf) {
+ pr_err("lock_torture_stats_print: Out of memory, need: %d",
+ size);
+ return;
+ }
+ lock_torture_printk(buf);
+ pr_alert("%s", buf);
+ kfree(buf);
+}
+
+/*
+ * Periodically prints torture statistics, if periodic statistics printing
+ * was specified via the stat_interval module parameter.
+ *
+ * No need to worry about fullstop here, since this one doesn't reference
+ * volatile state or register callbacks.
+ */
+static int lock_torture_stats(void *arg)
+{
+ VERBOSE_TOROUT_STRING("lock_torture_stats task started");
+ do {
+ schedule_timeout_interruptible(stat_interval * HZ);
+ lock_torture_stats_print();
+ torture_shutdown_absorb("lock_torture_stats");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("lock_torture_stats");
+ return 0;
+}
+
+static inline void
+lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
+ const char *tag)
+{
+ pr_alert("%s" TORTURE_FLAG
+ "--- %s: nwriters_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
+ torture_type, tag, nrealwriters_stress, stat_interval, verbose,
+ shuffle_interval, stutter, shutdown_secs,
+ onoff_interval, onoff_holdoff);
+}
+
+static void lock_torture_cleanup(void)
+{
+ int i;
+
+ if (torture_cleanup())
+ return;
+
+ if (writer_tasks) {
+ for (i = 0; i < nrealwriters_stress; i++)
+ torture_stop_kthread(lock_torture_writer,
+ writer_tasks[i]);
+ kfree(writer_tasks);
+ writer_tasks = NULL;
+ }
+
+ torture_stop_kthread(lock_torture_stats, stats_task);
+ lock_torture_stats_print(); /* -After- the stats thread is stopped! */
+
+ if (atomic_read(&n_lock_torture_errors))
+ lock_torture_print_module_parms(cur_ops,
+ "End of test: FAILURE");
+ else if (torture_onoff_failures())
+ lock_torture_print_module_parms(cur_ops,
+ "End of test: LOCK_HOTPLUG");
+ else
+ lock_torture_print_module_parms(cur_ops,
+ "End of test: SUCCESS");
+}
+
+static int __init lock_torture_init(void)
+{
+ int i;
+ int firsterr = 0;
+ static struct lock_torture_ops *torture_ops[] = {
+ &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops,
+ };
+
+ if (!torture_init_begin(torture_type, verbose, &locktorture_runnable))
+ return -EBUSY;
+
+ /* Process args and tell the world that the torturer is on the job. */
+ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
+ cur_ops = torture_ops[i];
+ if (strcmp(torture_type, cur_ops->name) == 0)
+ break;
+ }
+ if (i == ARRAY_SIZE(torture_ops)) {
+ pr_alert("lock-torture: invalid torture type: \"%s\"\n",
+ torture_type);
+ pr_alert("lock-torture types:");
+ for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
+ pr_alert(" %s", torture_ops[i]->name);
+ pr_alert("\n");
+ torture_init_end();
+ return -EINVAL;
+ }
+ if (cur_ops->init)
+ cur_ops->init(); /* no "goto unwind" prior to this point!!! */
+
+ if (nwriters_stress >= 0)
+ nrealwriters_stress = nwriters_stress;
+ else
+ nrealwriters_stress = 2 * num_online_cpus();
+ lock_torture_print_module_parms(cur_ops, "Start of test");
+
+ /* Initialize the statistics so that each run gets its own numbers. */
+
+ lock_is_write_held = 0;
+ lwsa = kmalloc(sizeof(*lwsa) * nrealwriters_stress, GFP_KERNEL);
+ if (lwsa == NULL) {
+ VERBOSE_TOROUT_STRING("lwsa: Out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ for (i = 0; i < nrealwriters_stress; i++) {
+ lwsa[i].n_write_lock_fail = 0;
+ lwsa[i].n_write_lock_acquired = 0;
+ }
+
+ /* Start up the kthreads. */
+
+ if (onoff_interval > 0) {
+ firsterr = torture_onoff_init(onoff_holdoff * HZ,
+ onoff_interval * HZ);
+ if (firsterr)
+ goto unwind;
+ }
+ if (shuffle_interval > 0) {
+ firsterr = torture_shuffle_init(shuffle_interval);
+ if (firsterr)
+ goto unwind;
+ }
+ if (shutdown_secs > 0) {
+ firsterr = torture_shutdown_init(shutdown_secs,
+ lock_torture_cleanup);
+ if (firsterr)
+ goto unwind;
+ }
+ if (stutter > 0) {
+ firsterr = torture_stutter_init(stutter);
+ if (firsterr)
+ goto unwind;
+ }
+
+ writer_tasks = kzalloc(nrealwriters_stress * sizeof(writer_tasks[0]),
+ GFP_KERNEL);
+ if (writer_tasks == NULL) {
+ VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ for (i = 0; i < nrealwriters_stress; i++) {
+ firsterr = torture_create_kthread(lock_torture_writer, &lwsa[i],
+ writer_tasks[i]);
+ if (firsterr)
+ goto unwind;
+ }
+ if (stat_interval > 0) {
+ firsterr = torture_create_kthread(lock_torture_stats, NULL,
+ stats_task);
+ if (firsterr)
+ goto unwind;
+ }
+ torture_init_end();
+ return 0;
+
+unwind:
+ torture_init_end();
+ lock_torture_cleanup();
+ return firsterr;
+}
+
+module_init(lock_torture_init);
+module_exit(lock_torture_cleanup);
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
new file mode 100644
index 000000000000..9887a905a762
--- /dev/null
+++ b/kernel/locking/mcs_spinlock.c
@@ -0,0 +1,208 @@
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include "mcs_spinlock.h"
+
+#ifdef CONFIG_SMP
+
+/*
+ * An MCS like lock especially tailored for optimistic spinning for sleeping
+ * lock implementations (mutex, rwsem, etc).
+ *
+ * Using a single mcs node per CPU is safe because sleeping locks should not be
+ * called from interrupt context and we have preemption disabled while
+ * spinning.
+ */
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
+
+/*
+ * We use the value 0 to represent "no CPU", thus the encoded value
+ * will be the CPU number incremented by 1.
+ */
+static inline int encode_cpu(int cpu_nr)
+{
+ return cpu_nr + 1;
+}
+
+static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
+{
+ int cpu_nr = encoded_cpu_val - 1;
+
+ return per_cpu_ptr(&osq_node, cpu_nr);
+}
+
+/*
+ * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
+ * Can return NULL in case we were the last queued and we updated @lock instead.
+ */
+static inline struct optimistic_spin_node *
+osq_wait_next(struct optimistic_spin_queue *lock,
+ struct optimistic_spin_node *node,
+ struct optimistic_spin_node *prev)
+{
+ struct optimistic_spin_node *next = NULL;
+ int curr = encode_cpu(smp_processor_id());
+ int old;
+
+ /*
+ * If there is a prev node in queue, then the 'old' value will be
+ * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
+ * we're currently last in queue, then the queue will then become empty.
+ */
+ old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
+
+ for (;;) {
+ if (atomic_read(&lock->tail) == curr &&
+ atomic_cmpxchg(&lock->tail, curr, old) == curr) {
+ /*
+ * We were the last queued, we moved @lock back. @prev
+ * will now observe @lock and will complete its
+ * unlock()/unqueue().
+ */
+ break;
+ }
+
+ /*
+ * We must xchg() the @node->next value, because if we were to
+ * leave it in, a concurrent unlock()/unqueue() from
+ * @node->next might complete Step-A and think its @prev is
+ * still valid.
+ *
+ * If the concurrent unlock()/unqueue() wins the race, we'll
+ * wait for either @lock to point to us, through its Step-B, or
+ * wait for a new @node->next from its Step-C.
+ */
+ if (node->next) {
+ next = xchg(&node->next, NULL);
+ if (next)
+ break;
+ }
+
+ cpu_relax_lowlatency();
+ }
+
+ return next;
+}
+
+bool osq_lock(struct optimistic_spin_queue *lock)
+{
+ struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
+ struct optimistic_spin_node *prev, *next;
+ int curr = encode_cpu(smp_processor_id());
+ int old;
+
+ node->locked = 0;
+ node->next = NULL;
+ node->cpu = curr;
+
+ old = atomic_xchg(&lock->tail, curr);
+ if (old == OSQ_UNLOCKED_VAL)
+ return true;
+
+ prev = decode_cpu(old);
+ node->prev = prev;
+ ACCESS_ONCE(prev->next) = node;
+
+ /*
+ * Normally @prev is untouchable after the above store; because at that
+ * moment unlock can proceed and wipe the node element from stack.
+ *
+ * However, since our nodes are static per-cpu storage, we're
+ * guaranteed their existence -- this allows us to apply
+ * cmpxchg in an attempt to undo our queueing.
+ */
+
+ while (!smp_load_acquire(&node->locked)) {
+ /*
+ * If we need to reschedule bail... so we can block.
+ */
+ if (need_resched())
+ goto unqueue;
+
+ cpu_relax_lowlatency();
+ }
+ return true;
+
+unqueue:
+ /*
+ * Step - A -- stabilize @prev
+ *
+ * Undo our @prev->next assignment; this will make @prev's
+ * unlock()/unqueue() wait for a next pointer since @lock points to us
+ * (or later).
+ */
+
+ for (;;) {
+ if (prev->next == node &&
+ cmpxchg(&prev->next, node, NULL) == node)
+ break;
+
+ /*
+ * We can only fail the cmpxchg() racing against an unlock(),
+ * in which case we should observe @node->locked becomming
+ * true.
+ */
+ if (smp_load_acquire(&node->locked))
+ return true;
+
+ cpu_relax_lowlatency();
+
+ /*
+ * Or we race against a concurrent unqueue()'s step-B, in which
+ * case its step-C will write us a new @node->prev pointer.
+ */
+ prev = ACCESS_ONCE(node->prev);
+ }
+
+ /*
+ * Step - B -- stabilize @next
+ *
+ * Similar to unlock(), wait for @node->next or move @lock from @node
+ * back to @prev.
+ */
+
+ next = osq_wait_next(lock, node, prev);
+ if (!next)
+ return false;
+
+ /*
+ * Step - C -- unlink
+ *
+ * @prev is stable because its still waiting for a new @prev->next
+ * pointer, @next is stable because our @node->next pointer is NULL and
+ * it will wait in Step-A.
+ */
+
+ ACCESS_ONCE(next->prev) = prev;
+ ACCESS_ONCE(prev->next) = next;
+
+ return false;
+}
+
+void osq_unlock(struct optimistic_spin_queue *lock)
+{
+ struct optimistic_spin_node *node, *next;
+ int curr = encode_cpu(smp_processor_id());
+
+ /*
+ * Fast path for the uncontended case.
+ */
+ if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr))
+ return;
+
+ /*
+ * Second most likely case.
+ */
+ node = this_cpu_ptr(&osq_node);
+ next = xchg(&node->next, NULL);
+ if (next) {
+ ACCESS_ONCE(next->locked) = 1;
+ return;
+ }
+
+ next = osq_wait_next(lock, node, NULL);
+ if (next)
+ ACCESS_ONCE(next->locked) = 1;
+}
+
+#endif
+
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
new file mode 100644
index 000000000000..23e89c5930e9
--- /dev/null
+++ b/kernel/locking/mcs_spinlock.h
@@ -0,0 +1,130 @@
+/*
+ * MCS lock defines
+ *
+ * This file contains the main data structure and API definitions of MCS lock.
+ *
+ * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock
+ * with the desirable properties of being fair, and with each cpu trying
+ * to acquire the lock spinning on a local variable.
+ * It avoids expensive cache bouncings that common test-and-set spin-lock
+ * implementations incur.
+ */
+#ifndef __LINUX_MCS_SPINLOCK_H
+#define __LINUX_MCS_SPINLOCK_H
+
+#include <asm/mcs_spinlock.h>
+
+struct mcs_spinlock {
+ struct mcs_spinlock *next;
+ int locked; /* 1 if lock acquired */
+};
+
+#ifndef arch_mcs_spin_lock_contended
+/*
+ * Using smp_load_acquire() provides a memory barrier that ensures
+ * subsequent operations happen after the lock is acquired.
+ */
+#define arch_mcs_spin_lock_contended(l) \
+do { \
+ while (!(smp_load_acquire(l))) \
+ cpu_relax_lowlatency(); \
+} while (0)
+#endif
+
+#ifndef arch_mcs_spin_unlock_contended
+/*
+ * smp_store_release() provides a memory barrier to ensure all
+ * operations in the critical section has been completed before
+ * unlocking.
+ */
+#define arch_mcs_spin_unlock_contended(l) \
+ smp_store_release((l), 1)
+#endif
+
+/*
+ * Note: the smp_load_acquire/smp_store_release pair is not
+ * sufficient to form a full memory barrier across
+ * cpus for many architectures (except x86) for mcs_unlock and mcs_lock.
+ * For applications that need a full barrier across multiple cpus
+ * with mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be
+ * used after mcs_lock.
+ */
+
+/*
+ * In order to acquire the lock, the caller should declare a local node and
+ * pass a reference of the node to this function in addition to the lock.
+ * If the lock has already been acquired, then this will proceed to spin
+ * on this node->locked until the previous lock holder sets the node->locked
+ * in mcs_spin_unlock().
+ *
+ * We don't inline mcs_spin_lock() so that perf can correctly account for the
+ * time spent in this lock function.
+ */
+static inline
+void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
+ struct mcs_spinlock *prev;
+
+ /* Init node */
+ node->locked = 0;
+ node->next = NULL;
+
+ prev = xchg(lock, node);
+ if (likely(prev == NULL)) {
+ /*
+ * Lock acquired, don't need to set node->locked to 1. Threads
+ * only spin on its own node->locked value for lock acquisition.
+ * However, since this thread can immediately acquire the lock
+ * and does not proceed to spin on its own node->locked, this
+ * value won't be used. If a debug mode is needed to
+ * audit lock status, then set node->locked value here.
+ */
+ return;
+ }
+ ACCESS_ONCE(prev->next) = node;
+
+ /* Wait until the lock holder passes the lock down. */
+ arch_mcs_spin_lock_contended(&node->locked);
+}
+
+/*
+ * Releases the lock. The caller should pass in the corresponding node that
+ * was used to acquire the lock.
+ */
+static inline
+void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
+ struct mcs_spinlock *next = ACCESS_ONCE(node->next);
+
+ if (likely(!next)) {
+ /*
+ * Release the lock by setting it to NULL
+ */
+ if (likely(cmpxchg(lock, node, NULL) == node))
+ return;
+ /* Wait until the next pointer is set */
+ while (!(next = ACCESS_ONCE(node->next)))
+ cpu_relax_lowlatency();
+ }
+
+ /* Pass lock to next waiter. */
+ arch_mcs_spin_unlock_contended(&next->locked);
+}
+
+/*
+ * Cancellable version of the MCS lock above.
+ *
+ * Intended for adaptive spinning of sleeping locks:
+ * mutex_lock()/rwsem_down_{read,write}() etc.
+ */
+
+struct optimistic_spin_node {
+ struct optimistic_spin_node *next, *prev;
+ int locked; /* 1 if lock acquired */
+ int cpu; /* encoded CPU # value */
+};
+
+extern bool osq_lock(struct optimistic_spin_queue *lock);
+extern void osq_unlock(struct optimistic_spin_queue *lock);
+
+#endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index faf6f5b53e77..5cf6731b98e9 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -71,18 +71,23 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
void debug_mutex_unlock(struct mutex *lock)
{
- if (unlikely(!debug_locks))
- return;
+ if (likely(debug_locks)) {
+ DEBUG_LOCKS_WARN_ON(lock->magic != lock);
- DEBUG_LOCKS_WARN_ON(lock->magic != lock);
+ if (!lock->owner)
+ DEBUG_LOCKS_WARN_ON(!lock->owner);
+ else
+ DEBUG_LOCKS_WARN_ON(lock->owner != current);
- if (!lock->owner)
- DEBUG_LOCKS_WARN_ON(!lock->owner);
- else
- DEBUG_LOCKS_WARN_ON(lock->owner != current);
+ DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
+ mutex_clear_owner(lock);
+ }
- DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
- mutex_clear_owner(lock);
+ /*
+ * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug
+ * mutexes so that we can do it here after we've verified state.
+ */
+ atomic_set(&lock->count, 1);
}
void debug_mutex_init(struct mutex *lock, const char *name,
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 4dd6e4c219de..ae712b25e492 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -25,6 +25,7 @@
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/debug_locks.h>
+#include "mcs_spinlock.h"
/*
* In the DEBUG case we are using the "NULL fastpath" for mutexes,
@@ -33,17 +34,18 @@
#ifdef CONFIG_DEBUG_MUTEXES
# include "mutex-debug.h"
# include <asm-generic/mutex-null.h>
+/*
+ * Must be 0 for the debug case so we do not do the unlock outside of the
+ * wait_lock region. debug_mutex_unlock() will do the actual unlock in this
+ * case.
+ */
+# undef __mutex_slowpath_needs_to_unlock
+# define __mutex_slowpath_needs_to_unlock() 0
#else
# include "mutex.h"
# include <asm/mutex.h>
#endif
-/*
- * A negative mutex count indicates that waiters are sleeping waiting for the
- * mutex.
- */
-#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0)
-
void
__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
{
@@ -52,7 +54,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
INIT_LIST_HEAD(&lock->wait_list);
mutex_clear_owner(lock);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- lock->spin_mlock = NULL;
+ osq_lock_init(&lock->osq);
#endif
debug_mutex_init(lock, name, key);
@@ -67,8 +69,7 @@ EXPORT_SYMBOL(__mutex_init);
* We also put the fastpath first in the kernel image, to make sure the
* branch is predicted by the CPU as default-untaken.
*/
-static __used noinline void __sched
-__mutex_lock_slowpath(atomic_t *lock_count);
+__visible void __sched __mutex_lock_slowpath(atomic_t *lock_count);
/**
* mutex_lock - acquire the mutex
@@ -111,54 +112,7 @@ EXPORT_SYMBOL(mutex_lock);
* more or less simultaneously, the spinners need to acquire a MCS lock
* first before spinning on the owner field.
*
- * We don't inline mspin_lock() so that perf can correctly account for the
- * time spent in this lock function.
*/
-struct mspin_node {
- struct mspin_node *next ;
- int locked; /* 1 if lock acquired */
-};
-#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock))
-
-static noinline
-void mspin_lock(struct mspin_node **lock, struct mspin_node *node)
-{
- struct mspin_node *prev;
-
- /* Init node */
- node->locked = 0;
- node->next = NULL;
-
- prev = xchg(lock, node);
- if (likely(prev == NULL)) {
- /* Lock acquired */
- node->locked = 1;
- return;
- }
- ACCESS_ONCE(prev->next) = node;
- smp_wmb();
- /* Wait until the lock holder passes the lock down */
- while (!ACCESS_ONCE(node->locked))
- arch_mutex_cpu_relax();
-}
-
-static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node)
-{
- struct mspin_node *next = ACCESS_ONCE(node->next);
-
- if (likely(!next)) {
- /*
- * Release the lock by setting it to NULL
- */
- if (cmpxchg(lock, node, NULL) == node)
- return;
- /* Wait until the next pointer is set */
- while (!(next = ACCESS_ONCE(node->next)))
- arch_mutex_cpu_relax();
- }
- ACCESS_ONCE(next->locked) = 1;
- smp_wmb();
-}
/*
* Mutex spinning code migrated from kernel/sched/core.c
@@ -192,7 +146,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
if (need_resched())
break;
- arch_mutex_cpu_relax();
+ cpu_relax_lowlatency();
}
rcu_read_unlock();
@@ -212,6 +166,9 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
struct task_struct *owner;
int retval = 1;
+ if (need_resched())
+ return 0;
+
rcu_read_lock();
owner = ACCESS_ONCE(lock->owner);
if (owner)
@@ -225,7 +182,8 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
}
#endif
-static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
+__visible __used noinline
+void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
/**
* mutex_unlock - release the mutex
@@ -424,12 +382,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
/*
* Optimistic spinning.
*
- * We try to spin for acquisition when we find that there are no
- * pending waiters and the lock owner is currently running on a
- * (different) CPU.
- *
- * The rationale is that if the lock owner is running, it is likely to
- * release the lock soon.
+ * We try to spin for acquisition when we find that the lock owner
+ * is currently running on a (different) CPU and while we don't
+ * need to reschedule. The rationale is that if the lock owner is
+ * running, it is likely to release the lock soon.
*
* Since this needs the lock owner, and this mutex implementation
* doesn't track the owner atomically in the lock field, we need to
@@ -446,9 +402,11 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
if (!mutex_can_spin_on_owner(lock))
goto slowpath;
+ if (!osq_lock(&lock->osq))
+ goto slowpath;
+
for (;;) {
struct task_struct *owner;
- struct mspin_node node;
if (use_ww_ctx && ww_ctx->acquired > 0) {
struct ww_mutex *ww;
@@ -463,21 +421,19 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* performed the optimistic spinning cannot be done.
*/
if (ACCESS_ONCE(ww->ctx))
- goto slowpath;
+ break;
}
/*
* If there's an owner, wait for it to either
* release the lock or go to sleep.
*/
- mspin_lock(MLOCK(lock), &node);
owner = ACCESS_ONCE(lock->owner);
- if (owner && !mutex_spin_on_owner(lock, owner)) {
- mspin_unlock(MLOCK(lock), &node);
- goto slowpath;
- }
+ if (owner && !mutex_spin_on_owner(lock, owner))
+ break;
- if ((atomic_read(&lock->count) == 1) &&
+ /* Try to acquire the mutex if it is unlocked. */
+ if (!mutex_is_locked(lock) &&
(atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
lock_acquired(&lock->dep_map, ip);
if (use_ww_ctx) {
@@ -488,11 +444,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
}
mutex_set_owner(lock);
- mspin_unlock(MLOCK(lock), &node);
+ osq_unlock(&lock->osq);
preempt_enable();
return 0;
}
- mspin_unlock(MLOCK(lock), &node);
/*
* When there's no owner, we might have preempted between the
@@ -501,7 +456,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* the owner complete.
*/
if (!owner && (need_resched() || rt_task(task)))
- goto slowpath;
+ break;
/*
* The cpu_relax() call is a compiler barrier which forces
@@ -509,14 +464,25 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* memory barriers as we'll eventually observe the right
* values at the cost of a few extra spins.
*/
- arch_mutex_cpu_relax();
+ cpu_relax_lowlatency();
}
+ osq_unlock(&lock->osq);
slowpath:
+ /*
+ * If we fell out of the spin path because of need_resched(),
+ * reschedule now, before we try-lock the mutex. This avoids getting
+ * scheduled out right after we obtained the mutex.
+ */
+ if (need_resched())
+ schedule_preempt_disabled();
#endif
spin_lock_mutex(&lock->wait_lock, flags);
- /* once more, can we acquire the lock? */
- if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, 0) == 1))
+ /*
+ * Once more, try to acquire the lock. Only try-lock the mutex if
+ * it is unlocked to reduce unnecessary xchg() operations.
+ */
+ if (!mutex_is_locked(lock) && (atomic_xchg(&lock->count, 0) == 1))
goto skip_wait;
debug_mutex_lock_common(lock, &waiter);
@@ -536,9 +502,10 @@ slowpath:
* it's unlocked. Later on, if we sleep, this is the
* operation that gives us the lock. We xchg it to -1, so
* that when we release the lock, we properly wake up the
- * other waiters:
+ * other waiters. We only attempt the xchg if the count is
+ * non-negative in order to avoid unnecessary xchg operations:
*/
- if (MUTEX_SHOW_NO_WAITER(lock) &&
+ if (atomic_read(&lock->count) >= 0 &&
(atomic_xchg(&lock->count, -1) == 1))
break;
@@ -717,10 +684,6 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
struct mutex *lock = container_of(lock_count, struct mutex, count);
unsigned long flags;
- spin_lock_mutex(&lock->wait_lock, flags);
- mutex_release(&lock->dep_map, nested, _RET_IP_);
- debug_mutex_unlock(lock);
-
/*
* some architectures leave the lock unlocked in the fastpath failure
* case, others need to leave it locked. In the later case we have to
@@ -729,6 +692,10 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
if (__mutex_slowpath_needs_to_unlock())
atomic_set(&lock->count, 1);
+ spin_lock_mutex(&lock->wait_lock, flags);
+ mutex_release(&lock->dep_map, nested, _RET_IP_);
+ debug_mutex_unlock(lock);
+
if (!list_empty(&lock->wait_list)) {
/* get the first entry from the wait-list: */
struct mutex_waiter *waiter =
@@ -746,7 +713,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
/*
* Release the lock, slowpath:
*/
-static __used noinline void
+__visible void
__mutex_unlock_slowpath(atomic_t *lock_count)
{
__mutex_unlock_common_slowpath(lock_count, 1);
@@ -803,7 +770,7 @@ int __sched mutex_lock_killable(struct mutex *lock)
}
EXPORT_SYMBOL(mutex_lock_killable);
-static __used noinline void __sched
+__visible void __sched
__mutex_lock_slowpath(atomic_t *lock_count)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
@@ -853,6 +820,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
unsigned long flags;
int prev;
+ /* No need to trylock if the mutex is locked. */
+ if (mutex_is_locked(lock))
+ return 0;
+
spin_lock_mutex(&lock->wait_lock, flags);
prev = atomic_xchg(&lock->count, -1);
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
new file mode 100644
index 000000000000..f956ede7f90d
--- /dev/null
+++ b/kernel/locking/qrwlock.c
@@ -0,0 +1,132 @@
+/*
+ * Queue read/write lock
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Authors: Waiman Long <waiman.long@hp.com>
+ */
+#include <linux/smp.h>
+#include <linux/bug.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <asm/qrwlock.h>
+
+/**
+ * rspin_until_writer_unlock - inc reader count & spin until writer is gone
+ * @lock : Pointer to queue rwlock structure
+ * @writer: Current queue rwlock writer status byte
+ *
+ * In interrupt context or at the head of the queue, the reader will just
+ * increment the reader count & wait until the writer releases the lock.
+ */
+static __always_inline void
+rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
+{
+ while ((cnts & _QW_WMASK) == _QW_LOCKED) {
+ cpu_relax_lowlatency();
+ cnts = smp_load_acquire((u32 *)&lock->cnts);
+ }
+}
+
+/**
+ * queue_read_lock_slowpath - acquire read lock of a queue rwlock
+ * @lock: Pointer to queue rwlock structure
+ */
+void queue_read_lock_slowpath(struct qrwlock *lock)
+{
+ u32 cnts;
+
+ /*
+ * Readers come here when they cannot get the lock without waiting
+ */
+ if (unlikely(in_interrupt())) {
+ /*
+ * Readers in interrupt context will spin until the lock is
+ * available without waiting in the queue.
+ */
+ cnts = smp_load_acquire((u32 *)&lock->cnts);
+ rspin_until_writer_unlock(lock, cnts);
+ return;
+ }
+ atomic_sub(_QR_BIAS, &lock->cnts);
+
+ /*
+ * Put the reader into the wait queue
+ */
+ arch_spin_lock(&lock->lock);
+
+ /*
+ * At the head of the wait queue now, wait until the writer state
+ * goes to 0 and then try to increment the reader count and get
+ * the lock. It is possible that an incoming writer may steal the
+ * lock in the interim, so it is necessary to check the writer byte
+ * to make sure that the write lock isn't taken.
+ */
+ while (atomic_read(&lock->cnts) & _QW_WMASK)
+ cpu_relax_lowlatency();
+
+ cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS;
+ rspin_until_writer_unlock(lock, cnts);
+
+ /*
+ * Signal the next one in queue to become queue head
+ */
+ arch_spin_unlock(&lock->lock);
+}
+EXPORT_SYMBOL(queue_read_lock_slowpath);
+
+/**
+ * queue_write_lock_slowpath - acquire write lock of a queue rwlock
+ * @lock : Pointer to queue rwlock structure
+ */
+void queue_write_lock_slowpath(struct qrwlock *lock)
+{
+ u32 cnts;
+
+ /* Put the writer into the wait queue */
+ arch_spin_lock(&lock->lock);
+
+ /* Try to acquire the lock directly if no reader is present */
+ if (!atomic_read(&lock->cnts) &&
+ (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0))
+ goto unlock;
+
+ /*
+ * Set the waiting flag to notify readers that a writer is pending,
+ * or wait for a previous writer to go away.
+ */
+ for (;;) {
+ cnts = atomic_read(&lock->cnts);
+ if (!(cnts & _QW_WMASK) &&
+ (atomic_cmpxchg(&lock->cnts, cnts,
+ cnts | _QW_WAITING) == cnts))
+ break;
+
+ cpu_relax_lowlatency();
+ }
+
+ /* When no more readers, set the locked flag */
+ for (;;) {
+ cnts = atomic_read(&lock->cnts);
+ if ((cnts == _QW_WAITING) &&
+ (atomic_cmpxchg(&lock->cnts, _QW_WAITING,
+ _QW_LOCKED) == _QW_WAITING))
+ break;
+
+ cpu_relax_lowlatency();
+ }
+unlock:
+ arch_spin_unlock(&lock->lock);
+}
+EXPORT_SYMBOL(queue_write_lock_slowpath);
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 49b2ed3dced8..62b6cee8ea7f 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -66,12 +66,13 @@ void rt_mutex_debug_task_free(struct task_struct *task)
* the deadlock. We print when we return. act_waiter can be NULL in
* case of a remove waiter operation.
*/
-void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
+void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
+ struct rt_mutex_waiter *act_waiter,
struct rt_mutex *lock)
{
struct task_struct *task;
- if (!debug_locks || detect || !act_waiter)
+ if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter)
return;
task = rt_mutex_owner(act_waiter->lock);
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index 14193d596d78..d0519c3432b6 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -20,14 +20,20 @@ extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
struct task_struct *powner);
extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
-extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter,
+extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
+ struct rt_mutex_waiter *waiter,
struct rt_mutex *lock);
extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
# define debug_rt_mutex_reset_waiter(w) \
do { (w)->deadlock_lock = NULL; } while (0)
-static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
- int detect)
+static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
+ enum rtmutex_chainwalk walk)
{
return (waiter != NULL);
}
+
+static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
+{
+ debug_rt_mutex_print_deadlock(w);
+}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 2e960a2bab81..a0ea2a141b3b 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -83,6 +83,47 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
owner = *p;
} while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
}
+
+/*
+ * Safe fastpath aware unlock:
+ * 1) Clear the waiters bit
+ * 2) Drop lock->wait_lock
+ * 3) Try to unlock the lock with cmpxchg
+ */
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+ __releases(lock->wait_lock)
+{
+ struct task_struct *owner = rt_mutex_owner(lock);
+
+ clear_rt_mutex_waiters(lock);
+ raw_spin_unlock(&lock->wait_lock);
+ /*
+ * If a new waiter comes in between the unlock and the cmpxchg
+ * we have two situations:
+ *
+ * unlock(wait_lock);
+ * lock(wait_lock);
+ * cmpxchg(p, owner, 0) == owner
+ * mark_rt_mutex_waiters(lock);
+ * acquire(lock);
+ * or:
+ *
+ * unlock(wait_lock);
+ * lock(wait_lock);
+ * mark_rt_mutex_waiters(lock);
+ *
+ * cmpxchg(p, owner, 0) != owner
+ * enqueue_waiter();
+ * unlock(wait_lock);
+ * lock(wait_lock);
+ * wake waiter();
+ * unlock(wait_lock);
+ * lock(wait_lock);
+ * acquire(lock);
+ */
+ return rt_mutex_cmpxchg(lock, owner, NULL);
+}
+
#else
# define rt_mutex_cmpxchg(l,c,n) (0)
static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
@@ -90,6 +131,17 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
lock->owner = (struct task_struct *)
((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
}
+
+/*
+ * Simple slow path only version: lock->owner is protected by lock->wait_lock.
+ */
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+ __releases(lock->wait_lock)
+{
+ lock->owner = NULL;
+ raw_spin_unlock(&lock->wait_lock);
+ return true;
+}
#endif
static inline int
@@ -213,6 +265,18 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
}
/*
+ * Called by sched_setscheduler() to check whether the priority change
+ * is overruled by a possible priority boosting.
+ */
+int rt_mutex_check_prio(struct task_struct *task, int newprio)
+{
+ if (!task_has_pi_waiters(task))
+ return 0;
+
+ return task_top_pi_waiter(task)->task->prio <= newprio;
+}
+
+/*
* Adjust the priority of a task, after its pi_waiters got modified.
*
* This can be both boosting and unboosting. task->pi_lock must be held.
@@ -244,41 +308,120 @@ static void rt_mutex_adjust_prio(struct task_struct *task)
}
/*
+ * Deadlock detection is conditional:
+ *
+ * If CONFIG_DEBUG_RT_MUTEXES=n, deadlock detection is only conducted
+ * if the detect argument is == RT_MUTEX_FULL_CHAINWALK.
+ *
+ * If CONFIG_DEBUG_RT_MUTEXES=y, deadlock detection is always
+ * conducted independent of the detect argument.
+ *
+ * If the waiter argument is NULL this indicates the deboost path and
+ * deadlock detection is disabled independent of the detect argument
+ * and the config settings.
+ */
+static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
+ enum rtmutex_chainwalk chwalk)
+{
+ /*
+ * This is just a wrapper function for the following call,
+ * because debug_rt_mutex_detect_deadlock() smells like a magic
+ * debug feature and I wanted to keep the cond function in the
+ * main source file along with the comments instead of having
+ * two of the same in the headers.
+ */
+ return debug_rt_mutex_detect_deadlock(waiter, chwalk);
+}
+
+/*
* Max number of times we'll walk the boosting chain:
*/
int max_lock_depth = 1024;
+static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
+{
+ return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
+}
+
/*
* Adjust the priority chain. Also used for deadlock detection.
* Decreases task's usage by one - may thus free the task.
*
- * @task: the task owning the mutex (owner) for which a chain walk is probably
- * needed
+ * @task: the task owning the mutex (owner) for which a chain walk is
+ * probably needed
* @deadlock_detect: do we have to carry out deadlock detection?
- * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck
- * things for a task that has just got its priority adjusted, and
- * is waiting on a mutex)
+ * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck
+ * things for a task that has just got its priority adjusted, and
+ * is waiting on a mutex)
+ * @next_lock: the mutex on which the owner of @orig_lock was blocked before
+ * we dropped its pi_lock. Is never dereferenced, only used for
+ * comparison to detect lock chain changes.
* @orig_waiter: rt_mutex_waiter struct for the task that has just donated
- * its priority to the mutex owner (can be NULL in the case
- * depicted above or if the top waiter is gone away and we are
- * actually deboosting the owner)
- * @top_task: the current top waiter
+ * its priority to the mutex owner (can be NULL in the case
+ * depicted above or if the top waiter is gone away and we are
+ * actually deboosting the owner)
+ * @top_task: the current top waiter
*
* Returns 0 or -EDEADLK.
+ *
+ * Chain walk basics and protection scope
+ *
+ * [R] refcount on task
+ * [P] task->pi_lock held
+ * [L] rtmutex->wait_lock held
+ *
+ * Step Description Protected by
+ * function arguments:
+ * @task [R]
+ * @orig_lock if != NULL @top_task is blocked on it
+ * @next_lock Unprotected. Cannot be
+ * dereferenced. Only used for
+ * comparison.
+ * @orig_waiter if != NULL @top_task is blocked on it
+ * @top_task current, or in case of proxy
+ * locking protected by calling
+ * code
+ * again:
+ * loop_sanity_check();
+ * retry:
+ * [1] lock(task->pi_lock); [R] acquire [P]
+ * [2] waiter = task->pi_blocked_on; [P]
+ * [3] check_exit_conditions_1(); [P]
+ * [4] lock = waiter->lock; [P]
+ * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L]
+ * unlock(task->pi_lock); release [P]
+ * goto retry;
+ * }
+ * [6] check_exit_conditions_2(); [P] + [L]
+ * [7] requeue_lock_waiter(lock, waiter); [P] + [L]
+ * [8] unlock(task->pi_lock); release [P]
+ * put_task_struct(task); release [R]
+ * [9] check_exit_conditions_3(); [L]
+ * [10] task = owner(lock); [L]
+ * get_task_struct(task); [L] acquire [R]
+ * lock(task->pi_lock); [L] acquire [P]
+ * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L]
+ * [12] check_exit_conditions_4(); [P] + [L]
+ * [13] unlock(task->pi_lock); release [P]
+ * unlock(lock->wait_lock); release [L]
+ * goto again;
*/
static int rt_mutex_adjust_prio_chain(struct task_struct *task,
- int deadlock_detect,
+ enum rtmutex_chainwalk chwalk,
struct rt_mutex *orig_lock,
+ struct rt_mutex *next_lock,
struct rt_mutex_waiter *orig_waiter,
struct task_struct *top_task)
{
- struct rt_mutex *lock;
struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
- int detect_deadlock, ret = 0, depth = 0;
+ struct rt_mutex_waiter *prerequeue_top_waiter;
+ int ret = 0, depth = 0;
+ struct rt_mutex *lock;
+ bool detect_deadlock;
unsigned long flags;
+ bool requeue = true;
- detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter,
- deadlock_detect);
+ detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
/*
* The (de)boosting is a step by step approach with a lot of
@@ -287,6 +430,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* carefully whether things change under us.
*/
again:
+ /*
+ * We limit the lock chain length for each invocation.
+ */
if (++depth > max_lock_depth) {
static int prev_max;
@@ -302,15 +448,30 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
}
put_task_struct(task);
- return deadlock_detect ? -EDEADLK : 0;
+ return -EDEADLK;
}
+
+ /*
+ * We are fully preemptible here and only hold the refcount on
+ * @task. So everything can have changed under us since the
+ * caller or our own code below (goto retry/again) dropped all
+ * locks.
+ */
retry:
/*
- * Task can not go away as we did a get_task() before !
+ * [1] Task cannot go away as we did a get_task() before !
*/
raw_spin_lock_irqsave(&task->pi_lock, flags);
+ /*
+ * [2] Get the waiter on which @task is blocked on.
+ */
waiter = task->pi_blocked_on;
+
+ /*
+ * [3] check_exit_conditions_1() protected by task->pi_lock.
+ */
+
/*
* Check whether the end of the boosting chain has been
* reached or the state of the chain has changed while we
@@ -327,82 +488,243 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
goto out_unlock_pi;
/*
+ * We dropped all locks after taking a refcount on @task, so
+ * the task might have moved on in the lock chain or even left
+ * the chain completely and blocks now on an unrelated lock or
+ * on @orig_lock.
+ *
+ * We stored the lock on which @task was blocked in @next_lock,
+ * so we can detect the chain change.
+ */
+ if (next_lock != waiter->lock)
+ goto out_unlock_pi;
+
+ /*
* Drop out, when the task has no waiters. Note,
* top_waiter can be NULL, when we are in the deboosting
* mode!
*/
- if (top_waiter && (!task_has_pi_waiters(task) ||
- top_waiter != task_top_pi_waiter(task)))
- goto out_unlock_pi;
+ if (top_waiter) {
+ if (!task_has_pi_waiters(task))
+ goto out_unlock_pi;
+ /*
+ * If deadlock detection is off, we stop here if we
+ * are not the top pi waiter of the task. If deadlock
+ * detection is enabled we continue, but stop the
+ * requeueing in the chain walk.
+ */
+ if (top_waiter != task_top_pi_waiter(task)) {
+ if (!detect_deadlock)
+ goto out_unlock_pi;
+ else
+ requeue = false;
+ }
+ }
/*
- * When deadlock detection is off then we check, if further
- * priority adjustment is necessary.
+ * If the waiter priority is the same as the task priority
+ * then there is no further priority adjustment necessary. If
+ * deadlock detection is off, we stop the chain walk. If its
+ * enabled we continue, but stop the requeueing in the chain
+ * walk.
*/
- if (!detect_deadlock && waiter->prio == task->prio)
- goto out_unlock_pi;
+ if (waiter->prio == task->prio) {
+ if (!detect_deadlock)
+ goto out_unlock_pi;
+ else
+ requeue = false;
+ }
+ /*
+ * [4] Get the next lock
+ */
lock = waiter->lock;
+ /*
+ * [5] We need to trylock here as we are holding task->pi_lock,
+ * which is the reverse lock order versus the other rtmutex
+ * operations.
+ */
if (!raw_spin_trylock(&lock->wait_lock)) {
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
cpu_relax();
goto retry;
}
- /* Deadlock detection */
+ /*
+ * [6] check_exit_conditions_2() protected by task->pi_lock and
+ * lock->wait_lock.
+ *
+ * Deadlock detection. If the lock is the same as the original
+ * lock which caused us to walk the lock chain or if the
+ * current lock is owned by the task which initiated the chain
+ * walk, we detected a deadlock.
+ */
if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
- debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
+ debug_rt_mutex_deadlock(chwalk, orig_waiter, lock);
raw_spin_unlock(&lock->wait_lock);
- ret = deadlock_detect ? -EDEADLK : 0;
+ ret = -EDEADLK;
goto out_unlock_pi;
}
- top_waiter = rt_mutex_top_waiter(lock);
+ /*
+ * If we just follow the lock chain for deadlock detection, no
+ * need to do all the requeue operations. To avoid a truckload
+ * of conditionals around the various places below, just do the
+ * minimum chain walk checks.
+ */
+ if (!requeue) {
+ /*
+ * No requeue[7] here. Just release @task [8]
+ */
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ put_task_struct(task);
- /* Requeue the waiter */
+ /*
+ * [9] check_exit_conditions_3 protected by lock->wait_lock.
+ * If there is no owner of the lock, end of chain.
+ */
+ if (!rt_mutex_owner(lock)) {
+ raw_spin_unlock(&lock->wait_lock);
+ return 0;
+ }
+
+ /* [10] Grab the next task, i.e. owner of @lock */
+ task = rt_mutex_owner(lock);
+ get_task_struct(task);
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+
+ /*
+ * No requeue [11] here. We just do deadlock detection.
+ *
+ * [12] Store whether owner is blocked
+ * itself. Decision is made after dropping the locks
+ */
+ next_lock = task_blocked_on_lock(task);
+ /*
+ * Get the top waiter for the next iteration
+ */
+ top_waiter = rt_mutex_top_waiter(lock);
+
+ /* [13] Drop locks */
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&lock->wait_lock);
+
+ /* If owner is not blocked, end of chain. */
+ if (!next_lock)
+ goto out_put_task;
+ goto again;
+ }
+
+ /*
+ * Store the current top waiter before doing the requeue
+ * operation on @lock. We need it for the boost/deboost
+ * decision below.
+ */
+ prerequeue_top_waiter = rt_mutex_top_waiter(lock);
+
+ /* [7] Requeue the waiter in the lock waiter list. */
rt_mutex_dequeue(lock, waiter);
waiter->prio = task->prio;
rt_mutex_enqueue(lock, waiter);
- /* Release the task */
+ /* [8] Release the task */
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ put_task_struct(task);
+
+ /*
+ * [9] check_exit_conditions_3 protected by lock->wait_lock.
+ *
+ * We must abort the chain walk if there is no lock owner even
+ * in the dead lock detection case, as we have nothing to
+ * follow here. This is the end of the chain we are walking.
+ */
if (!rt_mutex_owner(lock)) {
/*
- * If the requeue above changed the top waiter, then we need
- * to wake the new top waiter up to try to get the lock.
+ * If the requeue [7] above changed the top waiter,
+ * then we need to wake the new top waiter up to try
+ * to get the lock.
*/
-
- if (top_waiter != rt_mutex_top_waiter(lock))
+ if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
wake_up_process(rt_mutex_top_waiter(lock)->task);
raw_spin_unlock(&lock->wait_lock);
- goto out_put_task;
+ return 0;
}
- put_task_struct(task);
- /* Grab the next task */
+ /* [10] Grab the next task, i.e. the owner of @lock */
task = rt_mutex_owner(lock);
get_task_struct(task);
raw_spin_lock_irqsave(&task->pi_lock, flags);
+ /* [11] requeue the pi waiters if necessary */
if (waiter == rt_mutex_top_waiter(lock)) {
- /* Boost the owner */
- rt_mutex_dequeue_pi(task, top_waiter);
+ /*
+ * The waiter became the new top (highest priority)
+ * waiter on the lock. Replace the previous top waiter
+ * in the owner tasks pi waiters list with this waiter
+ * and adjust the priority of the owner.
+ */
+ rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
rt_mutex_enqueue_pi(task, waiter);
__rt_mutex_adjust_prio(task);
- } else if (top_waiter == waiter) {
- /* Deboost the owner */
+ } else if (prerequeue_top_waiter == waiter) {
+ /*
+ * The waiter was the top waiter on the lock, but is
+ * no longer the top prority waiter. Replace waiter in
+ * the owner tasks pi waiters list with the new top
+ * (highest priority) waiter and adjust the priority
+ * of the owner.
+ * The new top waiter is stored in @waiter so that
+ * @waiter == @top_waiter evaluates to true below and
+ * we continue to deboost the rest of the chain.
+ */
rt_mutex_dequeue_pi(task, waiter);
waiter = rt_mutex_top_waiter(lock);
rt_mutex_enqueue_pi(task, waiter);
__rt_mutex_adjust_prio(task);
+ } else {
+ /*
+ * Nothing changed. No need to do any priority
+ * adjustment.
+ */
}
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
-
+ /*
+ * [12] check_exit_conditions_4() protected by task->pi_lock
+ * and lock->wait_lock. The actual decisions are made after we
+ * dropped the locks.
+ *
+ * Check whether the task which owns the current lock is pi
+ * blocked itself. If yes we store a pointer to the lock for
+ * the lock chain change detection above. After we dropped
+ * task->pi_lock next_lock cannot be dereferenced anymore.
+ */
+ next_lock = task_blocked_on_lock(task);
+ /*
+ * Store the top waiter of @lock for the end of chain walk
+ * decision below.
+ */
top_waiter = rt_mutex_top_waiter(lock);
+
+ /* [13] Drop the locks */
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
raw_spin_unlock(&lock->wait_lock);
+ /*
+ * Make the actual exit decisions [12], based on the stored
+ * values.
+ *
+ * We reached the end of the lock chain. Stop right here. No
+ * point to go back just to figure that out.
+ */
+ if (!next_lock)
+ goto out_put_task;
+
+ /*
+ * If the current waiter is not the top waiter on the lock,
+ * then we can stop the chain walk here if we are not in full
+ * deadlock detection mode.
+ */
if (!detect_deadlock && waiter != top_waiter)
goto out_put_task;
@@ -421,76 +743,119 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
*
* Must be called with lock->wait_lock held.
*
- * @lock: the lock to be acquired.
- * @task: the task which wants to acquire the lock
- * @waiter: the waiter that is queued to the lock's wait list. (could be NULL)
+ * @lock: The lock to be acquired.
+ * @task: The task which wants to acquire the lock
+ * @waiter: The waiter that is queued to the lock's wait list if the
+ * callsite called task_blocked_on_lock(), otherwise NULL
*/
static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
- struct rt_mutex_waiter *waiter)
+ struct rt_mutex_waiter *waiter)
{
+ unsigned long flags;
+
/*
- * We have to be careful here if the atomic speedups are
- * enabled, such that, when
- * - no other waiter is on the lock
- * - the lock has been released since we did the cmpxchg
- * the lock can be released or taken while we are doing the
- * checks and marking the lock with RT_MUTEX_HAS_WAITERS.
+ * Before testing whether we can acquire @lock, we set the
+ * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
+ * other tasks which try to modify @lock into the slow path
+ * and they serialize on @lock->wait_lock.
+ *
+ * The RT_MUTEX_HAS_WAITERS bit can have a transitional state
+ * as explained at the top of this file if and only if:
*
- * The atomic acquire/release aware variant of
- * mark_rt_mutex_waiters uses a cmpxchg loop. After setting
- * the WAITERS bit, the atomic release / acquire can not
- * happen anymore and lock->wait_lock protects us from the
- * non-atomic case.
+ * - There is a lock owner. The caller must fixup the
+ * transient state if it does a trylock or leaves the lock
+ * function due to a signal or timeout.
*
- * Note, that this might set lock->owner =
- * RT_MUTEX_HAS_WAITERS in the case the lock is not contended
- * any more. This is fixed up when we take the ownership.
- * This is the transitional state explained at the top of this file.
+ * - @task acquires the lock and there are no other
+ * waiters. This is undone in rt_mutex_set_owner(@task) at
+ * the end of this function.
*/
mark_rt_mutex_waiters(lock);
+ /*
+ * If @lock has an owner, give up.
+ */
if (rt_mutex_owner(lock))
return 0;
/*
- * It will get the lock because of one of these conditions:
- * 1) there is no waiter
- * 2) higher priority than waiters
- * 3) it is top waiter
+ * If @waiter != NULL, @task has already enqueued the waiter
+ * into @lock waiter list. If @waiter == NULL then this is a
+ * trylock attempt.
*/
- if (rt_mutex_has_waiters(lock)) {
- if (task->prio >= rt_mutex_top_waiter(lock)->prio) {
- if (!waiter || waiter != rt_mutex_top_waiter(lock))
- return 0;
- }
- }
-
- if (waiter || rt_mutex_has_waiters(lock)) {
- unsigned long flags;
- struct rt_mutex_waiter *top;
-
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ if (waiter) {
+ /*
+ * If waiter is not the highest priority waiter of
+ * @lock, give up.
+ */
+ if (waiter != rt_mutex_top_waiter(lock))
+ return 0;
- /* remove the queued waiter. */
- if (waiter) {
- rt_mutex_dequeue(lock, waiter);
- task->pi_blocked_on = NULL;
- }
+ /*
+ * We can acquire the lock. Remove the waiter from the
+ * lock waiters list.
+ */
+ rt_mutex_dequeue(lock, waiter);
+ } else {
/*
- * We have to enqueue the top waiter(if it exists) into
- * task->pi_waiters list.
+ * If the lock has waiters already we check whether @task is
+ * eligible to take over the lock.
+ *
+ * If there are no other waiters, @task can acquire
+ * the lock. @task->pi_blocked_on is NULL, so it does
+ * not need to be dequeued.
*/
if (rt_mutex_has_waiters(lock)) {
- top = rt_mutex_top_waiter(lock);
- rt_mutex_enqueue_pi(task, top);
+ /*
+ * If @task->prio is greater than or equal to
+ * the top waiter priority (kernel view),
+ * @task lost.
+ */
+ if (task->prio >= rt_mutex_top_waiter(lock)->prio)
+ return 0;
+
+ /*
+ * The current top waiter stays enqueued. We
+ * don't have to change anything in the lock
+ * waiters order.
+ */
+ } else {
+ /*
+ * No waiters. Take the lock without the
+ * pi_lock dance.@task->pi_blocked_on is NULL
+ * and we have no waiters to enqueue in @task
+ * pi waiters list.
+ */
+ goto takeit;
}
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
}
+ /*
+ * Clear @task->pi_blocked_on. Requires protection by
+ * @task->pi_lock. Redundant operation for the @waiter == NULL
+ * case, but conditionals are more expensive than a redundant
+ * store.
+ */
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+ task->pi_blocked_on = NULL;
+ /*
+ * Finish the lock acquisition. @task is the new owner. If
+ * other waiters exist we have to insert the highest priority
+ * waiter into @task->pi_waiters list.
+ */
+ if (rt_mutex_has_waiters(lock))
+ rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+
+takeit:
/* We got the lock. */
debug_rt_mutex_lock(lock);
+ /*
+ * This either preserves the RT_MUTEX_HAS_WAITERS bit if there
+ * are still waiters or clears it.
+ */
rt_mutex_set_owner(lock, task);
rt_mutex_deadlock_account_lock(lock, task);
@@ -508,12 +873,25 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task,
- int detect_deadlock)
+ enum rtmutex_chainwalk chwalk)
{
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex_waiter *top_waiter = waiter;
- unsigned long flags;
+ struct rt_mutex *next_lock;
int chain_walk = 0, res;
+ unsigned long flags;
+
+ /*
+ * Early deadlock detection. We really don't want the task to
+ * enqueue on itself just to untangle the mess later. It's not
+ * only an optimization. We drop the locks, so another waiter
+ * can come in before the chain walk detects the deadlock. So
+ * the other will detect the deadlock and return -EDEADLOCK,
+ * which is wrong, as the other waiter is not in a deadlock
+ * situation.
+ */
+ if (owner == task)
+ return -EDEADLK;
raw_spin_lock_irqsave(&task->pi_lock, flags);
__rt_mutex_adjust_prio(task);
@@ -533,20 +911,28 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
if (!owner)
return 0;
+ raw_spin_lock_irqsave(&owner->pi_lock, flags);
if (waiter == rt_mutex_top_waiter(lock)) {
- raw_spin_lock_irqsave(&owner->pi_lock, flags);
rt_mutex_dequeue_pi(owner, top_waiter);
rt_mutex_enqueue_pi(owner, waiter);
__rt_mutex_adjust_prio(owner);
if (owner->pi_blocked_on)
chain_walk = 1;
- raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
- }
- else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
+ } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
chain_walk = 1;
+ }
+
+ /* Store the lock on which owner is blocked or NULL */
+ next_lock = task_blocked_on_lock(owner);
- if (!chain_walk)
+ raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+ /*
+ * Even if full deadlock detection is on, if the owner is not
+ * blocked itself, we can avoid finding this out in the chain
+ * walk.
+ */
+ if (!chain_walk || !next_lock)
return 0;
/*
@@ -558,8 +944,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
raw_spin_unlock(&lock->wait_lock);
- res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
- task);
+ res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
+ next_lock, waiter, task);
raw_spin_lock(&lock->wait_lock);
@@ -569,7 +955,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
/*
* Wake up the next waiter on the lock.
*
- * Remove the top waiter from the current tasks waiter list and wake it up.
+ * Remove the top waiter from the current tasks pi waiter list and
+ * wake it up.
*
* Called with lock->wait_lock held.
*/
@@ -590,10 +977,23 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
*/
rt_mutex_dequeue_pi(current, waiter);
- rt_mutex_set_owner(lock, NULL);
+ /*
+ * As we are waking up the top waiter, and the waiter stays
+ * queued on the lock until it gets the lock, this lock
+ * obviously has waiters. Just set the bit here and this has
+ * the added benefit of forcing all new tasks into the
+ * slow path making sure no task of lower priority than
+ * the top waiter can steal this lock.
+ */
+ lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+ /*
+ * It's safe to dereference waiter as it cannot go away as
+ * long as we hold lock->wait_lock. The waiter task needs to
+ * acquire it in order to dequeue the waiter.
+ */
wake_up_process(waiter->task);
}
@@ -606,40 +1006,42 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
static void remove_waiter(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter)
{
- int first = (waiter == rt_mutex_top_waiter(lock));
+ bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
+ struct rt_mutex *next_lock;
unsigned long flags;
- int chain_walk = 0;
raw_spin_lock_irqsave(&current->pi_lock, flags);
rt_mutex_dequeue(lock, waiter);
current->pi_blocked_on = NULL;
raw_spin_unlock_irqrestore(&current->pi_lock, flags);
- if (!owner)
+ /*
+ * Only update priority if the waiter was the highest priority
+ * waiter of the lock and there is an owner to update.
+ */
+ if (!owner || !is_top_waiter)
return;
- if (first) {
-
- raw_spin_lock_irqsave(&owner->pi_lock, flags);
+ raw_spin_lock_irqsave(&owner->pi_lock, flags);
- rt_mutex_dequeue_pi(owner, waiter);
+ rt_mutex_dequeue_pi(owner, waiter);
- if (rt_mutex_has_waiters(lock)) {
- struct rt_mutex_waiter *next;
+ if (rt_mutex_has_waiters(lock))
+ rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
- next = rt_mutex_top_waiter(lock);
- rt_mutex_enqueue_pi(owner, next);
- }
- __rt_mutex_adjust_prio(owner);
+ __rt_mutex_adjust_prio(owner);
- if (owner->pi_blocked_on)
- chain_walk = 1;
+ /* Store the lock on which owner is blocked or NULL */
+ next_lock = task_blocked_on_lock(owner);
- raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
- }
+ raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
- if (!chain_walk)
+ /*
+ * Don't walk the chain, if the owner task is not blocked
+ * itself.
+ */
+ if (!next_lock)
return;
/* gets dropped in rt_mutex_adjust_prio_chain()! */
@@ -647,7 +1049,8 @@ static void remove_waiter(struct rt_mutex *lock,
raw_spin_unlock(&lock->wait_lock);
- rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);
+ rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
+ next_lock, NULL, current);
raw_spin_lock(&lock->wait_lock);
}
@@ -660,6 +1063,7 @@ static void remove_waiter(struct rt_mutex *lock,
void rt_mutex_adjust_pi(struct task_struct *task)
{
struct rt_mutex_waiter *waiter;
+ struct rt_mutex *next_lock;
unsigned long flags;
raw_spin_lock_irqsave(&task->pi_lock, flags);
@@ -670,12 +1074,14 @@ void rt_mutex_adjust_pi(struct task_struct *task)
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
return;
}
-
+ next_lock = waiter->lock;
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
/* gets dropped in rt_mutex_adjust_prio_chain()! */
get_task_struct(task);
- rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
+
+ rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
+ next_lock, NULL, task);
}
/**
@@ -727,13 +1133,33 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
return ret;
}
+static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
+ struct rt_mutex_waiter *w)
+{
+ /*
+ * If the result is not -EDEADLOCK or the caller requested
+ * deadlock detection, nothing to do here.
+ */
+ if (res != -EDEADLOCK || detect_deadlock)
+ return;
+
+ /*
+ * Yell lowdly and stop the task right here.
+ */
+ rt_mutex_print_deadlock(w);
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ }
+}
+
/*
* Slow path lock function:
*/
static int __sched
rt_mutex_slowlock(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
- int detect_deadlock)
+ enum rtmutex_chainwalk chwalk)
{
struct rt_mutex_waiter waiter;
int ret = 0;
@@ -759,15 +1185,17 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
timeout->task = NULL;
}
- ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock);
+ ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
if (likely(!ret))
ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
set_current_state(TASK_RUNNING);
- if (unlikely(ret))
+ if (unlikely(ret)) {
remove_waiter(lock, &waiter);
+ rt_mutex_handle_deadlock(ret, chwalk, &waiter);
+ }
/*
* try_to_take_rt_mutex() sets the waiter bit
@@ -789,22 +1217,31 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
/*
* Slow path try-lock function:
*/
-static inline int
-rt_mutex_slowtrylock(struct rt_mutex *lock)
+static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
{
- int ret = 0;
+ int ret;
+
+ /*
+ * If the lock already has an owner we fail to get the lock.
+ * This can be done without taking the @lock->wait_lock as
+ * it is only being read, and this is a trylock anyway.
+ */
+ if (rt_mutex_owner(lock))
+ return 0;
+ /*
+ * The mutex has currently no owner. Lock the wait lock and
+ * try to acquire the lock.
+ */
raw_spin_lock(&lock->wait_lock);
- if (likely(rt_mutex_owner(lock) != current)) {
+ ret = try_to_take_rt_mutex(lock, current, NULL);
- ret = try_to_take_rt_mutex(lock, current, NULL);
- /*
- * try_to_take_rt_mutex() sets the lock waiters
- * bit unconditionally. Clean this up.
- */
- fixup_rt_mutex_waiters(lock);
- }
+ /*
+ * try_to_take_rt_mutex() sets the lock waiters bit
+ * unconditionally. Clean this up.
+ */
+ fixup_rt_mutex_waiters(lock);
raw_spin_unlock(&lock->wait_lock);
@@ -823,12 +1260,49 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
rt_mutex_deadlock_account_unlock(current);
- if (!rt_mutex_has_waiters(lock)) {
- lock->owner = NULL;
- raw_spin_unlock(&lock->wait_lock);
- return;
+ /*
+ * We must be careful here if the fast path is enabled. If we
+ * have no waiters queued we cannot set owner to NULL here
+ * because of:
+ *
+ * foo->lock->owner = NULL;
+ * rtmutex_lock(foo->lock); <- fast path
+ * free = atomic_dec_and_test(foo->refcnt);
+ * rtmutex_unlock(foo->lock); <- fast path
+ * if (free)
+ * kfree(foo);
+ * raw_spin_unlock(foo->lock->wait_lock);
+ *
+ * So for the fastpath enabled kernel:
+ *
+ * Nothing can set the waiters bit as long as we hold
+ * lock->wait_lock. So we do the following sequence:
+ *
+ * owner = rt_mutex_owner(lock);
+ * clear_rt_mutex_waiters(lock);
+ * raw_spin_unlock(&lock->wait_lock);
+ * if (cmpxchg(&lock->owner, owner, 0) == owner)
+ * return;
+ * goto retry;
+ *
+ * The fastpath disabled variant is simple as all access to
+ * lock->owner is serialized by lock->wait_lock:
+ *
+ * lock->owner = NULL;
+ * raw_spin_unlock(&lock->wait_lock);
+ */
+ while (!rt_mutex_has_waiters(lock)) {
+ /* Drops lock->wait_lock ! */
+ if (unlock_rt_mutex_safe(lock) == true)
+ return;
+ /* Relock the rtmutex and try again */
+ raw_spin_lock(&lock->wait_lock);
}
+ /*
+ * The wakeup next waiter path does not suffer from the above
+ * race. See the comments there.
+ */
wakeup_next_waiter(lock);
raw_spin_unlock(&lock->wait_lock);
@@ -845,30 +1319,31 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
*/
static inline int
rt_mutex_fastlock(struct rt_mutex *lock, int state,
- int detect_deadlock,
int (*slowfn)(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
- int detect_deadlock))
+ enum rtmutex_chainwalk chwalk))
{
- if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
rt_mutex_deadlock_account_lock(lock, current);
return 0;
} else
- return slowfn(lock, state, NULL, detect_deadlock);
+ return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
}
static inline int
rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout, int detect_deadlock,
+ struct hrtimer_sleeper *timeout,
+ enum rtmutex_chainwalk chwalk,
int (*slowfn)(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
- int detect_deadlock))
+ enum rtmutex_chainwalk chwalk))
{
- if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
+ likely(rt_mutex_cmpxchg(lock, NULL, current))) {
rt_mutex_deadlock_account_lock(lock, current);
return 0;
} else
- return slowfn(lock, state, timeout, detect_deadlock);
+ return slowfn(lock, state, timeout, chwalk);
}
static inline int
@@ -901,54 +1376,61 @@ void __sched rt_mutex_lock(struct rt_mutex *lock)
{
might_sleep();
- rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock);
+ rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
}
EXPORT_SYMBOL_GPL(rt_mutex_lock);
/**
* rt_mutex_lock_interruptible - lock a rt_mutex interruptible
*
- * @lock: the rt_mutex to be locked
- * @detect_deadlock: deadlock detection on/off
+ * @lock: the rt_mutex to be locked
*
* Returns:
- * 0 on success
- * -EINTR when interrupted by a signal
- * -EDEADLK when the lock would deadlock (when deadlock detection is on)
+ * 0 on success
+ * -EINTR when interrupted by a signal
*/
-int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
- int detect_deadlock)
+int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
{
might_sleep();
- return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE,
- detect_deadlock, rt_mutex_slowlock);
+ return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
}
EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+/*
+ * Futex variant with full deadlock detection.
+ */
+int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *timeout)
+{
+ might_sleep();
+
+ return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
+ RT_MUTEX_FULL_CHAINWALK,
+ rt_mutex_slowlock);
+}
+
/**
* rt_mutex_timed_lock - lock a rt_mutex interruptible
* the timeout structure is provided
* by the caller
*
- * @lock: the rt_mutex to be locked
+ * @lock: the rt_mutex to be locked
* @timeout: timeout structure or NULL (no timeout)
- * @detect_deadlock: deadlock detection on/off
*
* Returns:
- * 0 on success
- * -EINTR when interrupted by a signal
+ * 0 on success
+ * -EINTR when interrupted by a signal
* -ETIMEDOUT when the timeout expired
- * -EDEADLK when the lock would deadlock (when deadlock detection is on)
*/
int
-rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout,
- int detect_deadlock)
+rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
{
might_sleep();
return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
- detect_deadlock, rt_mutex_slowlock);
+ RT_MUTEX_MIN_CHAINWALK,
+ rt_mutex_slowlock);
}
EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
@@ -1054,7 +1536,6 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
* @lock: the rt_mutex to take
* @waiter: the pre-initialized rt_mutex_waiter
* @task: the task to prepare
- * @detect_deadlock: perform deadlock detection (1) or not (0)
*
* Returns:
* 0 - task blocked on lock
@@ -1065,7 +1546,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
*/
int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
- struct task_struct *task, int detect_deadlock)
+ struct task_struct *task)
{
int ret;
@@ -1076,7 +1557,9 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
return 1;
}
- ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
+ /* We enforce deadlock detection for futexes */
+ ret = task_blocks_on_rt_mutex(lock, waiter, task,
+ RT_MUTEX_FULL_CHAINWALK);
if (ret && !rt_mutex_owner(lock)) {
/*
@@ -1122,22 +1605,20 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
* rt_mutex_finish_proxy_lock() - Complete lock acquisition
* @lock: the rt_mutex we were woken on
* @to: the timeout, null if none. hrtimer should already have
- * been started.
+ * been started.
* @waiter: the pre-initialized rt_mutex_waiter
- * @detect_deadlock: perform deadlock detection (1) or not (0)
*
* Complete the lock acquisition started our behalf by another thread.
*
* Returns:
* 0 - success
- * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK
+ * <0 - error, one of -EINTR, -ETIMEDOUT
*
* Special API call for PI-futex requeue support
*/
int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
struct hrtimer_sleeper *to,
- struct rt_mutex_waiter *waiter,
- int detect_deadlock)
+ struct rt_mutex_waiter *waiter)
{
int ret;
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index a1a1dd06421d..c4060584c407 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -22,5 +22,15 @@
#define debug_rt_mutex_init(m, n) do { } while (0)
#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0)
#define debug_rt_mutex_print_deadlock(w) do { } while (0)
-#define debug_rt_mutex_detect_deadlock(w,d) (d)
#define debug_rt_mutex_reset_waiter(w) do { } while (0)
+
+static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
+{
+ WARN(1, "rtmutex deadlock detected\n");
+}
+
+static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w,
+ enum rtmutex_chainwalk walk)
+{
+ return walk == RT_MUTEX_FULL_CHAINWALK;
+}
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 7431a9c86f35..855212501407 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -102,6 +102,21 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
}
/*
+ * Constants for rt mutex functions which have a selectable deadlock
+ * detection.
+ *
+ * RT_MUTEX_MIN_CHAINWALK: Stops the lock chain walk when there are
+ * no further PI adjustments to be made.
+ *
+ * RT_MUTEX_FULL_CHAINWALK: Invoke deadlock detection with a full
+ * walk of the lock chain.
+ */
+enum rtmutex_chainwalk {
+ RT_MUTEX_MIN_CHAINWALK,
+ RT_MUTEX_FULL_CHAINWALK,
+};
+
+/*
* PI-futex support (proxy locking functions, etc.):
*/
extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
@@ -111,12 +126,11 @@ extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
struct task_struct *proxy_owner);
extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
- struct task_struct *task,
- int detect_deadlock);
+ struct task_struct *task);
extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
struct hrtimer_sleeper *to,
- struct rt_mutex_waiter *waiter,
- int detect_deadlock);
+ struct rt_mutex_waiter *waiter);
+extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
#ifdef CONFIG_DEBUG_RT_MUTEXES
# include "rtmutex-debug.h"
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 9be8a9144978..2c93571162cb 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -26,7 +26,7 @@ int rwsem_is_locked(struct rw_semaphore *sem)
unsigned long flags;
if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
- ret = (sem->activity != 0);
+ ret = (sem->count != 0);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
}
return ret;
@@ -46,7 +46,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
debug_check_no_locks_freed((void *)sem, sizeof(*sem));
lockdep_init_map(&sem->dep_map, name, key, 0);
#endif
- sem->activity = 0;
+ sem->count = 0;
raw_spin_lock_init(&sem->wait_lock);
INIT_LIST_HEAD(&sem->wait_list);
}
@@ -95,7 +95,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
waiter = list_entry(next, struct rwsem_waiter, list);
} while (waiter->type != RWSEM_WAITING_FOR_WRITE);
- sem->activity += woken;
+ sem->count += woken;
out:
return sem;
@@ -126,9 +126,9 @@ void __sched __down_read(struct rw_semaphore *sem)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
- if (sem->activity >= 0 && list_empty(&sem->wait_list)) {
+ if (sem->count >= 0 && list_empty(&sem->wait_list)) {
/* granted */
- sem->activity++;
+ sem->count++;
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
goto out;
}
@@ -170,9 +170,9 @@ int __down_read_trylock(struct rw_semaphore *sem)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
- if (sem->activity >= 0 && list_empty(&sem->wait_list)) {
+ if (sem->count >= 0 && list_empty(&sem->wait_list)) {
/* granted */
- sem->activity++;
+ sem->count++;
ret = 1;
}
@@ -206,7 +206,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
* itself into sleep and waiting for system woke it or someone
* else in the head of the wait list up.
*/
- if (sem->activity == 0)
+ if (sem->count == 0)
break;
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -214,7 +214,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
}
/* got the lock */
- sem->activity = -1;
+ sem->count = -1;
list_del(&waiter.list);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -235,9 +235,9 @@ int __down_write_trylock(struct rw_semaphore *sem)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
- if (sem->activity == 0) {
+ if (sem->count == 0) {
/* got the lock */
- sem->activity = -1;
+ sem->count = -1;
ret = 1;
}
@@ -255,7 +255,7 @@ void __up_read(struct rw_semaphore *sem)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
- if (--sem->activity == 0 && !list_empty(&sem->wait_list))
+ if (--sem->count == 0 && !list_empty(&sem->wait_list))
sem = __rwsem_wake_one_writer(sem);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -270,7 +270,7 @@ void __up_write(struct rw_semaphore *sem)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
- sem->activity = 0;
+ sem->count = 0;
if (!list_empty(&sem->wait_list))
sem = __rwsem_do_wake(sem, 1);
@@ -287,7 +287,7 @@ void __downgrade_write(struct rw_semaphore *sem)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
- sem->activity = 1;
+ sem->count = 1;
if (!list_empty(&sem->wait_list))
sem = __rwsem_do_wake(sem, 0);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 19c5fa95e0b4..d6203faf2eb1 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -5,11 +5,66 @@
*
* Writer lock-stealing by Alex Shi <alex.shi@intel.com>
* and Michel Lespinasse <walken@google.com>
+ *
+ * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
+ * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
*/
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/export.h>
+#include <linux/sched/rt.h>
+
+#include "mcs_spinlock.h"
+
+/*
+ * Guide to the rw_semaphore's count field for common values.
+ * (32-bit case illustrated, similar for 64-bit)
+ *
+ * 0x0000000X (1) X readers active or attempting lock, no writer waiting
+ * X = #active_readers + #readers attempting to lock
+ * (X*ACTIVE_BIAS)
+ *
+ * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or
+ * attempting to read lock or write lock.
+ *
+ * 0xffff000X (1) X readers active or attempting lock, with waiters for lock
+ * X = #active readers + # readers attempting lock
+ * (X*ACTIVE_BIAS + WAITING_BIAS)
+ * (2) 1 writer attempting lock, no waiters for lock
+ * X-1 = #active readers + #readers attempting lock
+ * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
+ * (3) 1 writer active, no waiters for lock
+ * X-1 = #active readers + #readers attempting lock
+ * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
+ *
+ * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock
+ * (WAITING_BIAS + ACTIVE_BIAS)
+ * (2) 1 writer active or attempting lock, no waiters for lock
+ * (ACTIVE_WRITE_BIAS)
+ *
+ * 0xffff0000 (1) There are writers or readers queued but none active
+ * or in the process of attempting lock.
+ * (WAITING_BIAS)
+ * Note: writer can attempt to steal lock for this count by adding
+ * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count
+ *
+ * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue.
+ * (ACTIVE_WRITE_BIAS + WAITING_BIAS)
+ *
+ * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking
+ * the count becomes more than 0 for successful lock acquisition,
+ * i.e. the case where there are only readers or nobody has lock.
+ * (1st and 2nd case above).
+ *
+ * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and
+ * checking the count becomes ACTIVE_WRITE_BIAS for successful lock
+ * acquisition (i.e. nobody else has lock or attempts lock). If
+ * unsuccessful, in rwsem_down_write_failed, we'll check to see if there
+ * are only waiters but none active (5th case above), and attempt to
+ * steal the lock.
+ *
+ */
/*
* Initialize an rwsem:
@@ -27,6 +82,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
sem->count = RWSEM_UNLOCKED_VALUE;
raw_spin_lock_init(&sem->wait_lock);
INIT_LIST_HEAD(&sem->wait_list);
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+ sem->owner = NULL;
+ osq_lock_init(&sem->osq);
+#endif
}
EXPORT_SYMBOL(__init_rwsem);
@@ -141,8 +200,9 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
}
/*
- * wait for the read lock to be granted
+ * Wait for the read lock to be granted
*/
+__visible
struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
{
long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
@@ -187,63 +247,221 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
return sem;
}
+static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
+{
+ if (!(count & RWSEM_ACTIVE_MASK)) {
+ /* try acquiring the write lock */
+ if (sem->count == RWSEM_WAITING_BIAS &&
+ cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
+ RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
+ if (!list_is_singular(&sem->wait_list))
+ rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+ return true;
+ }
+ }
+ return false;
+}
+
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
/*
- * wait until we successfully acquire the write lock
+ * Try to acquire write lock before the writer has been put on wait queue.
*/
+static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
+{
+ long old, count = ACCESS_ONCE(sem->count);
+
+ while (true) {
+ if (!(count == 0 || count == RWSEM_WAITING_BIAS))
+ return false;
+
+ old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
+ if (old == count)
+ return true;
+
+ count = old;
+ }
+}
+
+static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
+{
+ struct task_struct *owner;
+ bool on_cpu = false;
+
+ if (need_resched())
+ return false;
+
+ rcu_read_lock();
+ owner = ACCESS_ONCE(sem->owner);
+ if (owner)
+ on_cpu = owner->on_cpu;
+ rcu_read_unlock();
+
+ /*
+ * If sem->owner is not set, yet we have just recently entered the
+ * slowpath, then there is a possibility reader(s) may have the lock.
+ * To be safe, avoid spinning in these situations.
+ */
+ return on_cpu;
+}
+
+static inline bool owner_running(struct rw_semaphore *sem,
+ struct task_struct *owner)
+{
+ if (sem->owner != owner)
+ return false;
+
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_ checking
+ * sem->owner still matches owner, if that fails, owner might
+ * point to free()d memory, if it still matches, the rcu_read_lock()
+ * ensures the memory stays valid.
+ */
+ barrier();
+
+ return owner->on_cpu;
+}
+
+static noinline
+bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
+{
+ rcu_read_lock();
+ while (owner_running(sem, owner)) {
+ if (need_resched())
+ break;
+
+ cpu_relax_lowlatency();
+ }
+ rcu_read_unlock();
+
+ /*
+ * We break out the loop above on need_resched() or when the
+ * owner changed, which is a sign for heavy contention. Return
+ * success only when sem->owner is NULL.
+ */
+ return sem->owner == NULL;
+}
+
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+{
+ struct task_struct *owner;
+ bool taken = false;
+
+ preempt_disable();
+
+ /* sem->wait_lock should not be held when doing optimistic spinning */
+ if (!rwsem_can_spin_on_owner(sem))
+ goto done;
+
+ if (!osq_lock(&sem->osq))
+ goto done;
+
+ while (true) {
+ owner = ACCESS_ONCE(sem->owner);
+ if (owner && !rwsem_spin_on_owner(sem, owner))
+ break;
+
+ /* wait_lock will be acquired if write_lock is obtained */
+ if (rwsem_try_write_lock_unqueued(sem)) {
+ taken = true;
+ break;
+ }
+
+ /*
+ * When there's no owner, we might have preempted between the
+ * owner acquiring the lock and setting the owner field. If
+ * we're an RT task that will live-lock because we won't let
+ * the owner complete.
+ */
+ if (!owner && (need_resched() || rt_task(current)))
+ break;
+
+ /*
+ * The cpu_relax() call is a compiler barrier which forces
+ * everything in this loop to be re-loaded. We don't need
+ * memory barriers as we'll eventually observe the right
+ * values at the cost of a few extra spins.
+ */
+ cpu_relax_lowlatency();
+ }
+ osq_unlock(&sem->osq);
+done:
+ preempt_enable();
+ return taken;
+}
+
+#else
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+{
+ return false;
+}
+#endif
+
+/*
+ * Wait until we successfully acquire the write lock
+ */
+__visible
struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
{
- long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS;
+ long count;
+ bool waiting = true; /* any queued threads before us */
struct rwsem_waiter waiter;
- struct task_struct *tsk = current;
- /* set up my own style of waitqueue */
- waiter.task = tsk;
+ /* undo write bias from down_write operation, stop active locking */
+ count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
+
+ /* do optimistic spinning and steal lock if possible */
+ if (rwsem_optimistic_spin(sem))
+ return sem;
+
+ /*
+ * Optimistic spinning failed, proceed to the slowpath
+ * and block until we can acquire the sem.
+ */
+ waiter.task = current;
waiter.type = RWSEM_WAITING_FOR_WRITE;
raw_spin_lock_irq(&sem->wait_lock);
+
+ /* account for this before adding a new element to the list */
if (list_empty(&sem->wait_list))
- adjustment += RWSEM_WAITING_BIAS;
+ waiting = false;
+
list_add_tail(&waiter.list, &sem->wait_list);
/* we're now waiting on the lock, but no longer actively locking */
- count = rwsem_atomic_update(adjustment, sem);
+ if (waiting) {
+ count = ACCESS_ONCE(sem->count);
+
+ /*
+ * If there were already threads queued before us and there are
+ * no active writers, the lock must be read owned; so we try to
+ * wake any read locks that were queued ahead of us.
+ */
+ if (count > RWSEM_WAITING_BIAS)
+ sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
- /* If there were already threads queued before us and there are no
- * active writers, the lock must be read owned; so we try to wake
- * any read locks that were queued ahead of us. */
- if (count > RWSEM_WAITING_BIAS &&
- adjustment == -RWSEM_ACTIVE_WRITE_BIAS)
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
+ } else
+ count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
/* wait until we successfully acquire the lock */
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ set_current_state(TASK_UNINTERRUPTIBLE);
while (true) {
- if (!(count & RWSEM_ACTIVE_MASK)) {
- /* Try acquiring the write lock. */
- count = RWSEM_ACTIVE_WRITE_BIAS;
- if (!list_is_singular(&sem->wait_list))
- count += RWSEM_WAITING_BIAS;
-
- if (sem->count == RWSEM_WAITING_BIAS &&
- cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) ==
- RWSEM_WAITING_BIAS)
- break;
- }
-
+ if (rwsem_try_write_lock(count, sem))
+ break;
raw_spin_unlock_irq(&sem->wait_lock);
/* Block until there are no active lockers. */
do {
schedule();
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ set_current_state(TASK_UNINTERRUPTIBLE);
} while ((count = sem->count) & RWSEM_ACTIVE_MASK);
raw_spin_lock_irq(&sem->wait_lock);
}
+ __set_current_state(TASK_RUNNING);
list_del(&waiter.list);
raw_spin_unlock_irq(&sem->wait_lock);
- tsk->state = TASK_RUNNING;
return sem;
}
@@ -252,6 +470,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
* handle waking up a waiter on the semaphore
* - up_read/up_write has decremented the active part of count if we come here
*/
+__visible
struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
{
unsigned long flags;
@@ -272,6 +491,7 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
* - caller incremented waiting part of count and discovered it still negative
* - just wake up any readers at the front of the queue
*/
+__visible
struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
{
unsigned long flags;
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index cfff1435bdfb..e2d3bc7f03b4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -12,6 +12,27 @@
#include <linux/atomic.h>
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+ sem->owner = current;
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+ sem->owner = NULL;
+}
+
+#else
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+}
+#endif
+
/*
* lock for reading
*/
@@ -48,6 +69,7 @@ void __sched down_write(struct rw_semaphore *sem)
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+ rwsem_set_owner(sem);
}
EXPORT_SYMBOL(down_write);
@@ -59,8 +81,11 @@ int down_write_trylock(struct rw_semaphore *sem)
{
int ret = __down_write_trylock(sem);
- if (ret == 1)
+ if (ret == 1) {
rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
+ rwsem_set_owner(sem);
+ }
+
return ret;
}
@@ -85,6 +110,7 @@ void up_write(struct rw_semaphore *sem)
{
rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ rwsem_clear_owner(sem);
__up_write(sem);
}
@@ -99,6 +125,7 @@ void downgrade_write(struct rw_semaphore *sem)
* lockdep: a downgraded write will live on as a write
* dependency.
*/
+ rwsem_clear_owner(sem);
__downgrade_write(sem);
}
@@ -122,6 +149,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+ rwsem_set_owner(sem);
}
EXPORT_SYMBOL(_down_write_nest_lock);
@@ -141,6 +169,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+ rwsem_set_owner(sem);
}
EXPORT_SYMBOL(down_write_nested);
diff --git a/kernel/module.c b/kernel/module.c
index d24fcf29cb64..8a0dc91eddbc 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -60,7 +60,6 @@
#include <linux/jump_label.h>
#include <linux/pfn.h>
#include <linux/bsearch.h>
-#include <linux/fips.h>
#include <uapi/linux/module.h>
#include "module-internal.h"
@@ -136,7 +135,7 @@ static int param_set_bool_enable_only(const char *val,
}
static const struct kernel_param_ops param_ops_bool_enable_only = {
- .flags = KERNEL_PARAM_FL_NOARG,
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
.set = param_set_bool_enable_only,
.get = param_get_bool,
};
@@ -640,7 +639,7 @@ static int module_unload_init(struct module *mod)
INIT_LIST_HEAD(&mod->target_list);
/* Hold reference count during initialization. */
- __this_cpu_write(mod->refptr->incs, 1);
+ raw_cpu_write(mod->refptr->incs, 1);
return 0;
}
@@ -815,9 +814,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
return -EFAULT;
name[MODULE_NAME_LEN-1] = '\0';
- if (!(flags & O_NONBLOCK))
- pr_warn("waiting module removal not supported: please upgrade\n");
-
if (mutex_lock_interruptible(&module_mutex) != 0)
return -EINTR;
@@ -1013,9 +1009,11 @@ static size_t module_flags_taint(struct module *mod, char *buf)
buf[l++] = 'F';
if (mod->taints & (1 << TAINT_CRAP))
buf[l++] = 'C';
+ if (mod->taints & (1 << TAINT_UNSIGNED_MODULE))
+ buf[l++] = 'E';
/*
* TAINT_FORCED_RMMOD: could be added.
- * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
+ * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
* apply to modules.
*/
return l;
@@ -1948,6 +1946,10 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
switch (sym[i].st_shndx) {
case SHN_COMMON:
+ /* Ignore common symbols */
+ if (!strncmp(name, "__gnu_lto", 9))
+ break;
+
/* We compiled with -fno-common. These are not
supposed to happen. */
pr_debug("Common symbol: %s\n", name);
@@ -2445,9 +2447,6 @@ static int module_sig_check(struct load_info *info)
}
/* Not having a signature is only an error if we're strict. */
- if (err < 0 && fips_enabled)
- panic("Module verification failed with error %d in FIPS mode\n",
- err);
if (err == -ENOKEY && !sig_enforce)
err = 0;
@@ -3017,21 +3016,6 @@ static int do_init_module(struct module *mod)
*/
current->flags &= ~PF_USED_ASYNC;
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_COMING, mod);
-
- /* Set RO and NX regions for core */
- set_section_ro_nx(mod->module_core,
- mod->core_text_size,
- mod->core_ro_size,
- mod->core_size);
-
- /* Set RO and NX regions for init */
- set_section_ro_nx(mod->module_init,
- mod->init_text_size,
- mod->init_ro_size,
- mod->init_size);
-
do_mod_ctors(mod);
/* Start the module */
if (mod->init != NULL)
@@ -3162,9 +3146,26 @@ static int complete_formation(struct module *mod, struct load_info *info)
/* This relies on module_mutex for list integrity. */
module_bug_finalize(info->hdr, info->sechdrs, mod);
+ /* Set RO and NX regions for core */
+ set_section_ro_nx(mod->module_core,
+ mod->core_text_size,
+ mod->core_ro_size,
+ mod->core_size);
+
+ /* Set RO and NX regions for init */
+ set_section_ro_nx(mod->module_init,
+ mod->init_text_size,
+ mod->init_ro_size,
+ mod->init_size);
+
/* Mark state as coming so strong_try_module_get() ignores us,
* but kallsyms etc. can see us. */
mod->state = MODULE_STATE_COMING;
+ mutex_unlock(&module_mutex);
+
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_COMING, mod);
+ return 0;
out:
mutex_unlock(&module_mutex);
@@ -3187,6 +3188,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
{
struct module *mod;
long err;
+ char *after_dashes;
err = module_sig_check(info);
if (err)
@@ -3214,7 +3216,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
pr_notice_once("%s: module verification failed: signature "
"and/or required key missing - tainting "
"kernel\n", mod->name);
- add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK);
+ add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
}
#endif
@@ -3265,16 +3267,24 @@ static int load_module(struct load_info *info, const char __user *uargs,
dynamic_debug_setup(info->debug, info->num_debug);
+ /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
+ ftrace_module_init(mod);
+
/* Finally it's fully formed, ready to start executing. */
err = complete_formation(mod, info);
if (err)
goto ddebug_cleanup;
/* Module is ready to execute: parsing args may do that. */
- err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
- -32768, 32767, unknown_module_param_cb);
- if (err < 0)
+ after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
+ -32768, 32767, unknown_module_param_cb);
+ if (IS_ERR(after_dashes)) {
+ err = PTR_ERR(after_dashes);
goto bug_cleanup;
+ } else if (after_dashes) {
+ pr_warn("%s: parameters '%s' after `--' ignored\n",
+ mod->name, after_dashes);
+ }
/* Link in to syfs. */
err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
@@ -3294,6 +3304,11 @@ static int load_module(struct load_info *info, const char __user *uargs,
mutex_lock(&module_mutex);
module_bug_cleanup(mod);
mutex_unlock(&module_mutex);
+
+ /* we can't deallocate the module until we clear memory protection */
+ unset_module_init_ro_nx(mod);
+ unset_module_core_ro_nx(mod);
+
ddebug_cleanup:
dynamic_debug_remove(info->debug);
synchronize_sched();
@@ -3371,6 +3386,8 @@ static inline int within(unsigned long addr, void *start, unsigned long size)
*/
static inline int is_arm_mapping_symbol(const char *str)
{
+ if (str[0] == '.' && str[1] == 'L')
+ return true;
return str[0] == '$' && strchr("atd", str[1])
&& (str[2] == '\0' || str[2] == '.');
}
@@ -3434,8 +3451,7 @@ const char *module_address_lookup(unsigned long addr,
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if (within_module_init(addr, mod) ||
- within_module_core(addr, mod)) {
+ if (within_module(addr, mod)) {
if (modname)
*modname = mod->name;
ret = get_ksymbol(mod, addr, size, offset);
@@ -3459,8 +3475,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if (within_module_init(addr, mod) ||
- within_module_core(addr, mod)) {
+ if (within_module(addr, mod)) {
const char *sym;
sym = get_ksymbol(mod, addr, NULL, NULL);
@@ -3485,8 +3500,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if (within_module_init(addr, mod) ||
- within_module_core(addr, mod)) {
+ if (within_module(addr, mod)) {
const char *sym;
sym = get_ksymbol(mod, addr, size, offset);
@@ -3750,8 +3764,7 @@ struct module *__module_address(unsigned long addr)
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if (within_module_core(addr, mod)
- || within_module_init(addr, mod))
+ if (within_module(addr, mod))
return mod;
}
return NULL;
@@ -3809,12 +3822,12 @@ void print_modules(void)
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- printk(" %s%s", mod->name, module_flags(mod, buf));
+ pr_cont(" %s%s", mod->name, module_flags(mod, buf));
}
preempt_enable();
if (last_unloaded_module[0])
- printk(" [last unloaded: %s]", last_unloaded_module);
- printk("\n");
+ pr_cont(" [last unloaded: %s]", last_unloaded_module);
+ pr_cont("\n");
}
#ifdef CONFIG_MODVERSIONS
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 2d5cc4ccff7f..4803da6eab62 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -71,9 +71,9 @@ static int notifier_chain_unregister(struct notifier_block **nl,
* @returns: notifier_call_chain returns the value returned by the
* last notifier function called.
*/
-static int __kprobes notifier_call_chain(struct notifier_block **nl,
- unsigned long val, void *v,
- int nr_to_call, int *nr_calls)
+static int notifier_call_chain(struct notifier_block **nl,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
{
int ret = NOTIFY_DONE;
struct notifier_block *nb, *next_nb;
@@ -102,6 +102,7 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
}
return ret;
}
+NOKPROBE_SYMBOL(notifier_call_chain);
/*
* Atomic notifier chain routines. Registration and unregistration
@@ -172,9 +173,9 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
* Otherwise the return value is the return value
* of the last notifier function called.
*/
-int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
- unsigned long val, void *v,
- int nr_to_call, int *nr_calls)
+int __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
{
int ret;
@@ -184,13 +185,15 @@ int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
return ret;
}
EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
+NOKPROBE_SYMBOL(__atomic_notifier_call_chain);
-int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
- unsigned long val, void *v)
+int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+ unsigned long val, void *v)
{
return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
}
EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
+NOKPROBE_SYMBOL(atomic_notifier_call_chain);
/*
* Blocking notifier chain routines. All access to the chain is
@@ -309,7 +312,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
* racy then it does not matter what the result of the test
* is, we re-check the list after having taken the lock anyway:
*/
- if (rcu_dereference_raw(nh->head)) {
+ if (rcu_access_pointer(nh->head)) {
down_read(&nh->rwsem);
ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
nr_calls);
@@ -527,7 +530,7 @@ EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
static ATOMIC_NOTIFIER_HEAD(die_chain);
-int notrace __kprobes notify_die(enum die_val val, const char *str,
+int notrace notify_die(enum die_val val, const char *str,
struct pt_regs *regs, long err, int trap, int sig)
{
struct die_args args = {
@@ -540,6 +543,7 @@ int notrace __kprobes notify_die(enum die_val val, const char *str,
};
return atomic_notifier_call_chain(&die_chain, val, &args);
}
+NOKPROBE_SYMBOL(notify_die);
int register_die_notifier(struct notifier_block *nb)
{
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 8e7811086b82..ef42d0ab3115 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -204,20 +204,13 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
might_sleep();
+ task_lock(p);
ns = p->nsproxy;
+ p->nsproxy = new;
+ task_unlock(p);
- rcu_assign_pointer(p->nsproxy, new);
-
- if (ns && atomic_dec_and_test(&ns->count)) {
- /*
- * wait for others to get what they want from this nsproxy.
- *
- * cannot release this nsproxy via the call_rcu() since
- * put_mnt_ns() will want to sleep
- */
- synchronize_rcu();
+ if (ns && atomic_dec_and_test(&ns->count))
free_nsproxy(ns);
- }
}
void exit_task_namespaces(struct task_struct *p)
diff --git a/kernel/panic.c b/kernel/panic.c
index 6d6300375090..d09dc5c32c67 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -32,6 +32,7 @@ static unsigned long tainted_mask;
static int pause_on_oops;
static int pause_on_oops_flag;
static DEFINE_SPINLOCK(pause_on_oops_lock);
+static bool crash_kexec_post_notifiers;
int panic_timeout = CONFIG_PANIC_TIMEOUT;
EXPORT_SYMBOL_GPL(panic_timeout);
@@ -100,7 +101,7 @@ void panic(const char *fmt, ...)
va_start(args, fmt);
vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
- printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
+ pr_emerg("Kernel panic - not syncing: %s\n", buf);
#ifdef CONFIG_DEBUG_BUGVERBOSE
/*
* Avoid nested stack-dumping if a panic occurs during oops processing
@@ -112,9 +113,11 @@ void panic(const char *fmt, ...)
/*
* If we have crashed and we have a crash kernel loaded let it handle
* everything else.
- * Do we want to call this before we try to display a message?
+ * If we want to run this after calling panic_notifiers, pass
+ * the "crash_kexec_post_notifiers" option to the kernel.
*/
- crash_kexec(NULL);
+ if (!crash_kexec_post_notifiers)
+ crash_kexec(NULL);
/*
* Note smp_send_stop is the usual smp shutdown function, which
@@ -131,6 +134,15 @@ void panic(const char *fmt, ...)
kmsg_dump(KMSG_DUMP_PANIC);
+ /*
+ * If you doubt kdump always works fine in any situation,
+ * "crash_kexec_post_notifiers" offers you a chance to run
+ * panic_notifiers and dumping kmsg before kdump.
+ * Note: since some panic_notifiers can make crashed kernel
+ * more unstable, it can increase risks of the kdump failure too.
+ */
+ crash_kexec(NULL);
+
bust_spinlocks(0);
if (!panic_blink)
@@ -141,7 +153,7 @@ void panic(const char *fmt, ...)
* Delay timeout seconds before rebooting the machine.
* We can't use the "normal" timers since we just panicked.
*/
- printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
+ pr_emerg("Rebooting in %d seconds..", panic_timeout);
for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
touch_nmi_watchdog();
@@ -165,7 +177,7 @@ void panic(const char *fmt, ...)
extern int stop_a_enabled;
/* Make sure the user can actually press Stop-A (L1-A) */
stop_a_enabled = 1;
- printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n");
+ pr_emerg("Press Stop-A (L1-A) to return to the boot prom\n");
}
#endif
#if defined(CONFIG_S390)
@@ -176,6 +188,7 @@ void panic(const char *fmt, ...)
disabled_wait(caller);
}
#endif
+ pr_emerg("---[ end Kernel panic - not syncing: %s\n", buf);
local_irq_enable();
for (i = 0; ; i += PANIC_TIMER_STEP) {
touch_softlockup_watchdog();
@@ -199,7 +212,7 @@ struct tnt {
static const struct tnt tnts[] = {
{ TAINT_PROPRIETARY_MODULE, 'P', 'G' },
{ TAINT_FORCED_MODULE, 'F', ' ' },
- { TAINT_UNSAFE_SMP, 'S', ' ' },
+ { TAINT_CPU_OUT_OF_SPEC, 'S', ' ' },
{ TAINT_FORCED_RMMOD, 'R', ' ' },
{ TAINT_MACHINE_CHECK, 'M', ' ' },
{ TAINT_BAD_PAGE, 'B', ' ' },
@@ -210,6 +223,8 @@ static const struct tnt tnts[] = {
{ TAINT_CRAP, 'C', ' ' },
{ TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
{ TAINT_OOT_MODULE, 'O', ' ' },
+ { TAINT_UNSIGNED_MODULE, 'E', ' ' },
+ { TAINT_SOFTLOCKUP, 'L', ' ' },
};
/**
@@ -228,6 +243,7 @@ static const struct tnt tnts[] = {
* 'C' - modules from drivers/staging are loaded.
* 'I' - Working around severe firmware bug.
* 'O' - Out-of-tree module has been loaded.
+ * 'E' - Unsigned module has been loaded.
*
* The string is overwritten by the next call to print_tainted().
*/
@@ -274,8 +290,7 @@ unsigned long get_taint(void)
void add_taint(unsigned flag, enum lockdep_ok lockdep_ok)
{
if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off())
- printk(KERN_WARNING
- "Disabling lock debugging due to kernel taint\n");
+ pr_warn("Disabling lock debugging due to kernel taint\n");
set_bit(flag, &tainted_mask);
}
@@ -380,8 +395,7 @@ late_initcall(init_oops_id);
void print_oops_end_marker(void)
{
init_oops_id();
- printk(KERN_WARNING "---[ end trace %016llx ]---\n",
- (unsigned long long)oops_id);
+ pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id);
}
/*
@@ -459,7 +473,7 @@ EXPORT_SYMBOL(warn_slowpath_null);
* Called when gcc's -fstack-protector feature is used, and
* gcc detects corruption of the on-stack canary value
*/
-void __stack_chk_fail(void)
+__visible void __stack_chk_fail(void)
{
panic("stack-protector: Kernel stack is corrupted in: %p\n",
__builtin_return_address(0));
@@ -471,6 +485,13 @@ EXPORT_SYMBOL(__stack_chk_fail);
core_param(panic, panic_timeout, int, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
+static int __init setup_crash_kexec_post_notifiers(char *s)
+{
+ crash_kexec_post_notifiers = true;
+ return 0;
+}
+early_param("crash_kexec_post_notifiers", setup_crash_kexec_post_notifiers);
+
static int __init oops_setup(char *s)
{
if (!s)
diff --git a/kernel/params.c b/kernel/params.c
index b00142e7f3ba..041b5899d5e2 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -83,6 +83,15 @@ bool parameq(const char *a, const char *b)
return parameqn(a, b, strlen(a)+1);
}
+static void param_check_unsafe(const struct kernel_param *kp)
+{
+ if (kp->flags & KERNEL_PARAM_FL_UNSAFE) {
+ pr_warn("Setting dangerous option %s - tainting kernel\n",
+ kp->name);
+ add_taint(TAINT_USER, LOCKDEP_STILL_OK);
+ }
+}
+
static int parse_one(char *param,
char *val,
const char *doing,
@@ -104,11 +113,12 @@ static int parse_one(char *param,
return 0;
/* No one handled NULL, so do it here. */
if (!val &&
- !(params[i].ops->flags & KERNEL_PARAM_FL_NOARG))
+ !(params[i].ops->flags & KERNEL_PARAM_OPS_FL_NOARG))
return -EINVAL;
pr_debug("handling %s with %p\n", param,
params[i].ops->set);
mutex_lock(&param_lock);
+ param_check_unsafe(&params[i]);
err = params[i].ops->set(val, &params[i]);
mutex_unlock(&param_lock);
return err;
@@ -177,13 +187,13 @@ static char *next_arg(char *args, char **param, char **val)
}
/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
-int parse_args(const char *doing,
- char *args,
- const struct kernel_param *params,
- unsigned num,
- s16 min_level,
- s16 max_level,
- int (*unknown)(char *param, char *val, const char *doing))
+char *parse_args(const char *doing,
+ char *args,
+ const struct kernel_param *params,
+ unsigned num,
+ s16 min_level,
+ s16 max_level,
+ int (*unknown)(char *param, char *val, const char *doing))
{
char *param, *val;
@@ -198,6 +208,9 @@ int parse_args(const char *doing,
int irq_was_disabled;
args = next_arg(args, &param, &val);
+ /* Stop at -- */
+ if (!val && strcmp(param, "--") == 0)
+ return args;
irq_was_disabled = irqs_disabled();
ret = parse_one(param, val, doing, params, num,
min_level, max_level, unknown);
@@ -208,22 +221,22 @@ int parse_args(const char *doing,
switch (ret) {
case -ENOENT:
pr_err("%s: Unknown parameter `%s'\n", doing, param);
- return ret;
+ return ERR_PTR(ret);
case -ENOSPC:
pr_err("%s: `%s' too large for parameter `%s'\n",
doing, val ?: "", param);
- return ret;
+ return ERR_PTR(ret);
case 0:
break;
default:
pr_err("%s: `%s' invalid for parameter `%s'\n",
doing, val ?: "", param);
- return ret;
+ return ERR_PTR(ret);
}
}
/* All parsed OK. */
- return 0;
+ return NULL;
}
/* Lazy bastard, eh? */
@@ -253,6 +266,7 @@ STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
+STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
int param_set_charp(const char *val, const struct kernel_param *kp)
{
@@ -314,7 +328,7 @@ int param_get_bool(char *buffer, const struct kernel_param *kp)
EXPORT_SYMBOL(param_get_bool);
struct kernel_param_ops param_ops_bool = {
- .flags = KERNEL_PARAM_FL_NOARG,
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
.set = param_set_bool,
.get = param_get_bool,
};
@@ -365,7 +379,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp)
EXPORT_SYMBOL(param_set_bint);
struct kernel_param_ops param_ops_bint = {
- .flags = KERNEL_PARAM_FL_NOARG,
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
.set = param_set_bint,
.get = param_get_int,
};
@@ -548,6 +562,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
return -EPERM;
mutex_lock(&param_lock);
+ param_check_unsafe(attribute->param);
err = attribute->param->ops->set(buf, attribute->param);
mutex_unlock(&param_lock);
if (!err)
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 06c62de9c711..db95d8eb761b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -318,7 +318,9 @@ static void *pidns_get(struct task_struct *task)
struct pid_namespace *ns;
rcu_read_lock();
- ns = get_pid_ns(task_active_pid_ns(task));
+ ns = task_active_pid_ns(task);
+ if (ns)
+ get_pid_ns(ns);
rcu_read_unlock();
return ns;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 2fac9cc79b3d..e4e4121fa327 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -253,12 +253,8 @@ config APM_EMULATION
anything, try disabling/enabling this option (or disabling/enabling
APM in your BIOS).
-config ARCH_HAS_OPP
- bool
-
config PM_OPP
- bool "Operating Performance Point (OPP) Layer library"
- depends on ARCH_HAS_OPP
+ bool
---help---
SOCs have a standard set of tuples consisting of frequency and
voltage pairs that the device will support per voltage domain. This
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 37170d4dd9a6..a9dfa79b6bab 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -28,14 +28,16 @@
#include <linux/syscore_ops.h>
#include <linux/ctype.h>
#include <linux/genhd.h>
+#include <trace/events/power.h>
#include "power.h"
static int nocompress;
static int noresume;
+static int nohibernate;
static int resume_wait;
-static int resume_delay;
+static unsigned int resume_delay;
static char resume_file[256] = CONFIG_PM_STD_PARTITION;
dev_t swsusp_resume_device;
sector_t swsusp_resume_block;
@@ -61,6 +63,11 @@ bool freezer_test_done;
static const struct platform_hibernation_ops *hibernation_ops;
+bool hibernation_available(void)
+{
+ return (nohibernate == 0);
+}
+
/**
* hibernation_set_ops - Set the global hibernate operations.
* @ops: Hibernation operations to use in subsequent hibernation transitions.
@@ -228,19 +235,23 @@ static void platform_recover(int platform_mode)
void swsusp_show_speed(struct timeval *start, struct timeval *stop,
unsigned nr_pages, char *msg)
{
- s64 elapsed_centisecs64;
- int centisecs;
- int k;
- int kps;
+ u64 elapsed_centisecs64;
+ unsigned int centisecs;
+ unsigned int k;
+ unsigned int kps;
elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
+ /*
+ * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time,
+ * it is obvious enough for what went wrong.
+ */
do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
centisecs = elapsed_centisecs64;
if (centisecs == 0)
centisecs = 1; /* avoid div-by-zero */
k = nr_pages * (PAGE_SIZE / 1024);
kps = (k * 100) / centisecs;
- printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
+ printk(KERN_INFO "PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n",
msg, k,
centisecs / 100, centisecs % 100,
kps / 1000, (kps % 1000) / 10);
@@ -288,7 +299,9 @@ static int create_image(int platform_mode)
in_suspend = 1;
save_processor_state();
+ trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true);
error = swsusp_arch_suspend();
+ trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
if (error)
printk(KERN_ERR "PM: Error %d creating hibernation image\n",
error);
@@ -358,7 +371,6 @@ int hibernation_snapshot(int platform_mode)
}
suspend_console();
- ftrace_stop();
pm_restrict_gfp_mask();
error = dpm_suspend(PMSG_FREEZE);
@@ -384,7 +396,6 @@ int hibernation_snapshot(int platform_mode)
if (error || !in_suspend)
pm_restore_gfp_mask();
- ftrace_start();
resume_console();
dpm_complete(msg);
@@ -487,7 +498,6 @@ int hibernation_restore(int platform_mode)
pm_prepare_console();
suspend_console();
- ftrace_stop();
pm_restrict_gfp_mask();
error = dpm_suspend_start(PMSG_QUIESCE);
if (!error) {
@@ -495,7 +505,6 @@ int hibernation_restore(int platform_mode)
dpm_resume_end(PMSG_RECOVER);
}
pm_restore_gfp_mask();
- ftrace_start();
resume_console();
pm_restore_console();
return error;
@@ -522,7 +531,6 @@ int hibernation_platform_enter(void)
entering_platform_hibernation = true;
suspend_console();
- ftrace_stop();
error = dpm_suspend_start(PMSG_HIBERNATE);
if (error) {
if (hibernation_ops->recover)
@@ -566,7 +574,6 @@ int hibernation_platform_enter(void)
Resume_devices:
entering_platform_hibernation = false;
dpm_resume_end(PMSG_RESTORE);
- ftrace_start();
resume_console();
Close:
@@ -595,7 +602,8 @@ static void power_down(void)
case HIBERNATION_PLATFORM:
hibernation_platform_enter();
case HIBERNATION_SHUTDOWN:
- kernel_power_off();
+ if (pm_power_off)
+ kernel_power_off();
break;
#ifdef CONFIG_SUSPEND
case HIBERNATION_SUSPEND:
@@ -623,7 +631,8 @@ static void power_down(void)
* corruption after resume.
*/
printk(KERN_CRIT "PM: Please power down manually\n");
- while(1);
+ while (1)
+ cpu_relax();
}
/**
@@ -633,6 +642,11 @@ int hibernate(void)
{
int error;
+ if (!hibernation_available()) {
+ pr_debug("PM: Hibernation not available.\n");
+ return -EPERM;
+ }
+
lock_system_sleep();
/* The snapshot device should not be opened while we're running */
if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
@@ -725,7 +739,7 @@ static int software_resume(void)
/*
* If the user said "noresume".. bail out early.
*/
- if (noresume)
+ if (noresume || !hibernation_available())
return 0;
/*
@@ -891,6 +905,9 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
int i;
char *start = buf;
+ if (!hibernation_available())
+ return sprintf(buf, "[disabled]\n");
+
for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
if (!hibernation_modes[i])
continue;
@@ -925,6 +942,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
char *p;
int mode = HIBERNATION_INVALID;
+ if (!hibernation_available())
+ return -EPERM;
+
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
@@ -973,16 +993,20 @@ static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr,
static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
- unsigned int maj, min;
dev_t res;
- int ret = -EINVAL;
+ int len = n;
+ char *name;
- if (sscanf(buf, "%u:%u", &maj, &min) != 2)
- goto out;
+ if (len && buf[len-1] == '\n')
+ len--;
+ name = kstrndup(buf, len, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
- res = MKDEV(maj,min);
- if (maj != MAJOR(res) || min != MINOR(res))
- goto out;
+ res = name_to_dev_t(name);
+ kfree(name);
+ if (!res)
+ return -EINVAL;
lock_system_sleep();
swsusp_resume_device = res;
@@ -990,9 +1014,7 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
printk(KERN_INFO "PM: Starting manual resume from disk\n");
noresume = 0;
software_resume();
- ret = n;
- out:
- return ret;
+ return n;
}
power_attr(resume);
@@ -1090,6 +1112,10 @@ static int __init hibernate_setup(char *str)
noresume = 1;
else if (!strncmp(str, "nocompress", 10))
nocompress = 1;
+ else if (!strncmp(str, "no", 2)) {
+ noresume = 1;
+ nohibernate = 1;
+ }
return 1;
}
@@ -1107,13 +1133,30 @@ static int __init resumewait_setup(char *str)
static int __init resumedelay_setup(char *str)
{
- resume_delay = simple_strtoul(str, NULL, 0);
+ int rc = kstrtouint(str, 0, &resume_delay);
+
+ if (rc)
+ return rc;
+ return 1;
+}
+
+static int __init nohibernate_setup(char *str)
+{
+ noresume = 1;
+ nohibernate = 1;
return 1;
}
+static int __init kaslr_nohibernate_setup(char *str)
+{
+ return nohibernate_setup(str);
+}
+
__setup("noresume", noresume_setup);
__setup("resume_offset=", resume_offset_setup);
__setup("resume=", resume_setup);
__setup("hibernate=", hibernate_setup);
__setup("resumewait", resumewait_setup);
__setup("resumedelay=", resumedelay_setup);
+__setup("nohibernate", nohibernate_setup);
+__setup("kaslr", kaslr_nohibernate_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 1d1bf630e6e9..9a59d042ea84 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -279,42 +279,39 @@ static inline void pm_print_times_init(void) {}
struct kobject *power_kobj;
/**
- * state - control system power state.
+ * state - control system sleep states.
*
- * show() returns what states are supported, which is hard-coded to
- * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and
- * 'disk' (Suspend-to-Disk).
+ * show() returns available sleep state labels, which may be "mem", "standby",
+ * "freeze" and "disk" (hibernation). See Documentation/power/states.txt for a
+ * description of what they mean.
*
- * store() accepts one of those strings, translates it into the
- * proper enumerated value, and initiates a suspend transition.
+ * store() accepts one of those strings, translates it into the proper
+ * enumerated value, and initiates a suspend transition.
*/
static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
char *s = buf;
#ifdef CONFIG_SUSPEND
- int i;
+ suspend_state_t i;
- for (i = 0; i < PM_SUSPEND_MAX; i++) {
- if (pm_states[i] && valid_state(i))
+ for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
+ if (pm_states[i])
s += sprintf(s,"%s ", pm_states[i]);
- }
+
#endif
-#ifdef CONFIG_HIBERNATION
- s += sprintf(s, "%s\n", "disk");
-#else
+ if (hibernation_available())
+ s += sprintf(s, "disk ");
if (s != buf)
/* convert the last space to a newline */
*(s-1) = '\n';
-#endif
return (s - buf);
}
static suspend_state_t decode_state(const char *buf, size_t n)
{
#ifdef CONFIG_SUSPEND
- suspend_state_t state = PM_SUSPEND_MIN;
- const char * const *s;
+ suspend_state_t state;
#endif
char *p;
int len;
@@ -327,9 +324,12 @@ static suspend_state_t decode_state(const char *buf, size_t n)
return PM_SUSPEND_MAX;
#ifdef CONFIG_SUSPEND
- for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++)
- if (*s && len == strlen(*s) && !strncmp(buf, *s, len))
+ for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) {
+ const char *label = pm_states[state];
+
+ if (label && len == strlen(label) && !strncmp(buf, label, len))
return state;
+ }
#endif
return PM_SUSPEND_ON;
@@ -447,8 +447,8 @@ static ssize_t autosleep_show(struct kobject *kobj,
#ifdef CONFIG_SUSPEND
if (state < PM_SUSPEND_MAX)
- return sprintf(buf, "%s\n", valid_state(state) ?
- pm_states[state] : "error");
+ return sprintf(buf, "%s\n", pm_states[state] ?
+ pm_states[state] : "error");
#endif
#ifdef CONFIG_HIBERNATION
return sprintf(buf, "disk\n");
@@ -616,7 +616,6 @@ static struct attribute_group attr_group = {
.attrs = g,
};
-#ifdef CONFIG_PM_RUNTIME
struct workqueue_struct *pm_wq;
EXPORT_SYMBOL_GPL(pm_wq);
@@ -626,9 +625,6 @@ static int __init pm_start_workqueue(void)
return pm_wq ? 0 : -ENOMEM;
}
-#else
-static inline int pm_start_workqueue(void) { return 0; }
-#endif
static int __init pm_init(void)
{
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 7d4b7ffb3c1d..2df883a9d3cb 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -2,6 +2,7 @@
#include <linux/suspend_ioctls.h>
#include <linux/utsname.h>
#include <linux/freezer.h>
+#include <linux/compiler.h>
struct swsusp_info {
struct new_utsname uts;
@@ -11,7 +12,7 @@ struct swsusp_info {
unsigned long image_pages;
unsigned long pages;
unsigned long size;
-} __attribute__((aligned(PAGE_SIZE)));
+} __aligned(PAGE_SIZE);
#ifdef CONFIG_HIBERNATION
/* kernel/power/snapshot.c */
@@ -49,6 +50,8 @@ static inline char *check_image_kernel(struct swsusp_info *info)
*/
#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
+asmlinkage int swsusp_save(void);
+
/* kernel/power/hibernate.c */
extern bool freezer_test_done;
@@ -176,16 +179,15 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,
#ifdef CONFIG_SUSPEND
/* kernel/power/suspend.c */
-extern const char *const pm_states[];
+extern const char *pm_labels[];
+extern const char *pm_states[];
-extern bool valid_state(suspend_state_t state);
extern int suspend_devices_and_enter(suspend_state_t state);
#else /* !CONFIG_SUSPEND */
static inline int suspend_devices_and_enter(suspend_state_t state)
{
return -ENOSYS;
}
-static inline bool valid_state(suspend_state_t state) { return false; }
#endif /* !CONFIG_SUSPEND */
#ifdef CONFIG_PM_TEST_SUSPEND
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 06ec8869dbf1..4ee194eb524b 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -17,6 +17,7 @@
#include <linux/delay.h>
#include <linux/workqueue.h>
#include <linux/kmod.h>
+#include <trace/events/power.h>
/*
* Timeout for stopping processes
@@ -175,6 +176,7 @@ void thaw_processes(void)
struct task_struct *g, *p;
struct task_struct *curr = current;
+ trace_suspend_resume(TPS("thaw_processes"), 0, true);
if (pm_freezing)
atomic_dec(&system_freezing_cnt);
pm_freezing = false;
@@ -184,6 +186,7 @@ void thaw_processes(void)
printk("Restarting tasks ... ");
+ __usermodehelper_set_disable_depth(UMH_FREEZING);
thaw_workqueues();
read_lock(&tasklist_lock);
@@ -201,6 +204,7 @@ void thaw_processes(void)
schedule();
printk("done.\n");
+ trace_suspend_resume(TPS("thaw_processes"), 0, false);
}
void thaw_kernel_threads(void)
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 8dff9b48075a..884b77058864 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -66,6 +66,7 @@ static struct pm_qos_constraints cpu_dma_constraints = {
.list = PLIST_HEAD_INIT(cpu_dma_constraints.list),
.target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
.default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
+ .no_constraint_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
.type = PM_QOS_MIN,
.notifiers = &cpu_dma_lat_notifier,
};
@@ -79,6 +80,7 @@ static struct pm_qos_constraints network_lat_constraints = {
.list = PLIST_HEAD_INIT(network_lat_constraints.list),
.target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
.default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
+ .no_constraint_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
.type = PM_QOS_MIN,
.notifiers = &network_lat_notifier,
};
@@ -93,6 +95,7 @@ static struct pm_qos_constraints network_tput_constraints = {
.list = PLIST_HEAD_INIT(network_tput_constraints.list),
.target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
.default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
+ .no_constraint_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
.type = PM_QOS_MAX,
.notifiers = &network_throughput_notifier,
};
@@ -128,7 +131,7 @@ static const struct file_operations pm_qos_power_fops = {
static inline int pm_qos_get_value(struct pm_qos_constraints *c)
{
if (plist_head_empty(&c->list))
- return c->default_value;
+ return c->no_constraint_value;
switch (c->type) {
case PM_QOS_MIN:
@@ -170,6 +173,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
{
unsigned long flags;
int prev_value, curr_value, new_value;
+ int ret;
spin_lock_irqsave(&pm_qos_lock, flags);
prev_value = pm_qos_get_value(c);
@@ -205,13 +209,15 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
trace_pm_qos_update_target(action, prev_value, curr_value);
if (prev_value != curr_value) {
- blocking_notifier_call_chain(c->notifiers,
- (unsigned long)curr_value,
- NULL);
- return 1;
+ ret = 1;
+ if (c->notifiers)
+ blocking_notifier_call_chain(c->notifiers,
+ (unsigned long)curr_value,
+ NULL);
} else {
- return 0;
+ ret = 0;
}
+ return ret;
}
/**
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d9f61a145802..f1604d8cf489 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -27,6 +27,7 @@
#include <linux/highmem.h>
#include <linux/list.h>
#include <linux/slab.h>
+#include <linux/compiler.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -155,7 +156,7 @@ static inline void free_image_page(void *addr, int clear_nosave_free)
struct linked_page {
struct linked_page *next;
char data[LINKED_PAGE_DATA_SIZE];
-} __attribute__((packed));
+} __packed;
static inline void
free_list_of_pages(struct linked_page *list, int clear_page_nosave)
@@ -247,33 +248,61 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
* information is stored (in the form of a block of bitmap)
* It also contains the pfns that correspond to the start and end of
* the represented memory area.
+ *
+ * The memory bitmap is organized as a radix tree to guarantee fast random
+ * access to the bits. There is one radix tree for each zone (as returned
+ * from create_mem_extents).
+ *
+ * One radix tree is represented by one struct mem_zone_bm_rtree. There are
+ * two linked lists for the nodes of the tree, one for the inner nodes and
+ * one for the leave nodes. The linked leave nodes are used for fast linear
+ * access of the memory bitmap.
+ *
+ * The struct rtree_node represents one node of the radix tree.
*/
#define BM_END_OF_MAP (~0UL)
#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE)
+#define BM_BLOCK_SHIFT (PAGE_SHIFT + 3)
+#define BM_BLOCK_MASK ((1UL << BM_BLOCK_SHIFT) - 1)
-struct bm_block {
- struct list_head hook; /* hook into a list of bitmap blocks */
- unsigned long start_pfn; /* pfn represented by the first bit */
- unsigned long end_pfn; /* pfn represented by the last bit plus 1 */
- unsigned long *data; /* bitmap representing pages */
+/*
+ * struct rtree_node is a wrapper struct to link the nodes
+ * of the rtree together for easy linear iteration over
+ * bits and easy freeing
+ */
+struct rtree_node {
+ struct list_head list;
+ unsigned long *data;
};
-static inline unsigned long bm_block_bits(struct bm_block *bb)
-{
- return bb->end_pfn - bb->start_pfn;
-}
+/*
+ * struct mem_zone_bm_rtree represents a bitmap used for one
+ * populated memory zone.
+ */
+struct mem_zone_bm_rtree {
+ struct list_head list; /* Link Zones together */
+ struct list_head nodes; /* Radix Tree inner nodes */
+ struct list_head leaves; /* Radix Tree leaves */
+ unsigned long start_pfn; /* Zone start page frame */
+ unsigned long end_pfn; /* Zone end page frame + 1 */
+ struct rtree_node *rtree; /* Radix Tree Root */
+ int levels; /* Number of Radix Tree Levels */
+ unsigned int blocks; /* Number of Bitmap Blocks */
+};
/* strcut bm_position is used for browsing memory bitmaps */
struct bm_position {
- struct bm_block *block;
- int bit;
+ struct mem_zone_bm_rtree *zone;
+ struct rtree_node *node;
+ unsigned long node_pfn;
+ int node_bit;
};
struct memory_bitmap {
- struct list_head blocks; /* list of bitmap blocks */
+ struct list_head zones;
struct linked_page *p_list; /* list of pages used to store zone
* bitmap objects and bitmap block
* objects
@@ -283,38 +312,178 @@ struct memory_bitmap {
/* Functions that operate on memory bitmaps */
-static void memory_bm_position_reset(struct memory_bitmap *bm)
+#define BM_ENTRIES_PER_LEVEL (PAGE_SIZE / sizeof(unsigned long))
+#if BITS_PER_LONG == 32
+#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 2)
+#else
+#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 3)
+#endif
+#define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1)
+
+/*
+ * alloc_rtree_node - Allocate a new node and add it to the radix tree.
+ *
+ * This function is used to allocate inner nodes as well as the
+ * leave nodes of the radix tree. It also adds the node to the
+ * corresponding linked list passed in by the *list parameter.
+ */
+static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,
+ struct chain_allocator *ca,
+ struct list_head *list)
{
- bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook);
- bm->cur.bit = 0;
-}
+ struct rtree_node *node;
-static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
+ node = chain_alloc(ca, sizeof(struct rtree_node));
+ if (!node)
+ return NULL;
-/**
- * create_bm_block_list - create a list of block bitmap objects
- * @pages - number of pages to track
- * @list - list to put the allocated blocks into
- * @ca - chain allocator to be used for allocating memory
+ node->data = get_image_page(gfp_mask, safe_needed);
+ if (!node->data)
+ return NULL;
+
+ list_add_tail(&node->list, list);
+
+ return node;
+}
+
+/*
+ * add_rtree_block - Add a new leave node to the radix tree
+ *
+ * The leave nodes need to be allocated in order to keep the leaves
+ * linked list in order. This is guaranteed by the zone->blocks
+ * counter.
*/
-static int create_bm_block_list(unsigned long pages,
- struct list_head *list,
- struct chain_allocator *ca)
+static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,
+ int safe_needed, struct chain_allocator *ca)
{
- unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK);
+ struct rtree_node *node, *block, **dst;
+ unsigned int levels_needed, block_nr;
+ int i;
+
+ block_nr = zone->blocks;
+ levels_needed = 0;
- while (nr_blocks-- > 0) {
- struct bm_block *bb;
+ /* How many levels do we need for this block nr? */
+ while (block_nr) {
+ levels_needed += 1;
+ block_nr >>= BM_RTREE_LEVEL_SHIFT;
+ }
- bb = chain_alloc(ca, sizeof(struct bm_block));
- if (!bb)
+ /* Make sure the rtree has enough levels */
+ for (i = zone->levels; i < levels_needed; i++) {
+ node = alloc_rtree_node(gfp_mask, safe_needed, ca,
+ &zone->nodes);
+ if (!node)
return -ENOMEM;
- list_add(&bb->hook, list);
+
+ node->data[0] = (unsigned long)zone->rtree;
+ zone->rtree = node;
+ zone->levels += 1;
}
+ /* Allocate new block */
+ block = alloc_rtree_node(gfp_mask, safe_needed, ca, &zone->leaves);
+ if (!block)
+ return -ENOMEM;
+
+ /* Now walk the rtree to insert the block */
+ node = zone->rtree;
+ dst = &zone->rtree;
+ block_nr = zone->blocks;
+ for (i = zone->levels; i > 0; i--) {
+ int index;
+
+ if (!node) {
+ node = alloc_rtree_node(gfp_mask, safe_needed, ca,
+ &zone->nodes);
+ if (!node)
+ return -ENOMEM;
+ *dst = node;
+ }
+
+ index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT);
+ index &= BM_RTREE_LEVEL_MASK;
+ dst = (struct rtree_node **)&((*dst)->data[index]);
+ node = *dst;
+ }
+
+ zone->blocks += 1;
+ *dst = block;
+
return 0;
}
+static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
+ int clear_nosave_free);
+
+/*
+ * create_zone_bm_rtree - create a radix tree for one zone
+ *
+ * Allocated the mem_zone_bm_rtree structure and initializes it.
+ * This function also allocated and builds the radix tree for the
+ * zone.
+ */
+static struct mem_zone_bm_rtree *
+create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed,
+ struct chain_allocator *ca,
+ unsigned long start, unsigned long end)
+{
+ struct mem_zone_bm_rtree *zone;
+ unsigned int i, nr_blocks;
+ unsigned long pages;
+
+ pages = end - start;
+ zone = chain_alloc(ca, sizeof(struct mem_zone_bm_rtree));
+ if (!zone)
+ return NULL;
+
+ INIT_LIST_HEAD(&zone->nodes);
+ INIT_LIST_HEAD(&zone->leaves);
+ zone->start_pfn = start;
+ zone->end_pfn = end;
+ nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK);
+
+ for (i = 0; i < nr_blocks; i++) {
+ if (add_rtree_block(zone, gfp_mask, safe_needed, ca)) {
+ free_zone_bm_rtree(zone, PG_UNSAFE_CLEAR);
+ return NULL;
+ }
+ }
+
+ return zone;
+}
+
+/*
+ * free_zone_bm_rtree - Free the memory of the radix tree
+ *
+ * Free all node pages of the radix tree. The mem_zone_bm_rtree
+ * structure itself is not freed here nor are the rtree_node
+ * structs.
+ */
+static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
+ int clear_nosave_free)
+{
+ struct rtree_node *node;
+
+ list_for_each_entry(node, &zone->nodes, list)
+ free_image_page(node->data, clear_nosave_free);
+
+ list_for_each_entry(node, &zone->leaves, list)
+ free_image_page(node->data, clear_nosave_free);
+}
+
+static void memory_bm_position_reset(struct memory_bitmap *bm)
+{
+ bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree,
+ list);
+ bm->cur.node = list_entry(bm->cur.zone->leaves.next,
+ struct rtree_node, list);
+ bm->cur.node_pfn = 0;
+ bm->cur.node_bit = 0;
+}
+
+static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
+
struct mem_extent {
struct list_head hook;
unsigned long start;
@@ -406,40 +575,22 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
int error;
chain_init(&ca, gfp_mask, safe_needed);
- INIT_LIST_HEAD(&bm->blocks);
+ INIT_LIST_HEAD(&bm->zones);
error = create_mem_extents(&mem_extents, gfp_mask);
if (error)
return error;
list_for_each_entry(ext, &mem_extents, hook) {
- struct bm_block *bb;
- unsigned long pfn = ext->start;
- unsigned long pages = ext->end - ext->start;
-
- bb = list_entry(bm->blocks.prev, struct bm_block, hook);
+ struct mem_zone_bm_rtree *zone;
- error = create_bm_block_list(pages, bm->blocks.prev, &ca);
- if (error)
+ zone = create_zone_bm_rtree(gfp_mask, safe_needed, &ca,
+ ext->start, ext->end);
+ if (!zone) {
+ error = -ENOMEM;
goto Error;
-
- list_for_each_entry_continue(bb, &bm->blocks, hook) {
- bb->data = get_image_page(gfp_mask, safe_needed);
- if (!bb->data) {
- error = -ENOMEM;
- goto Error;
- }
-
- bb->start_pfn = pfn;
- if (pages >= BM_BITS_PER_BLOCK) {
- pfn += BM_BITS_PER_BLOCK;
- pages -= BM_BITS_PER_BLOCK;
- } else {
- /* This is executed only once in the loop */
- pfn += pages;
- }
- bb->end_pfn = pfn;
}
+ list_add_tail(&zone->list, &bm->zones);
}
bm->p_list = ca.chain;
@@ -459,51 +610,83 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
*/
static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
{
- struct bm_block *bb;
+ struct mem_zone_bm_rtree *zone;
- list_for_each_entry(bb, &bm->blocks, hook)
- if (bb->data)
- free_image_page(bb->data, clear_nosave_free);
+ list_for_each_entry(zone, &bm->zones, list)
+ free_zone_bm_rtree(zone, clear_nosave_free);
free_list_of_pages(bm->p_list, clear_nosave_free);
- INIT_LIST_HEAD(&bm->blocks);
+ INIT_LIST_HEAD(&bm->zones);
}
/**
- * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds
- * to given pfn. The cur_zone_bm member of @bm and the cur_block member
- * of @bm->cur_zone_bm are updated.
+ * memory_bm_find_bit - Find the bit for pfn in the memory
+ * bitmap
+ *
+ * Find the bit in the bitmap @bm that corresponds to given pfn.
+ * The cur.zone, cur.block and cur.node_pfn member of @bm are
+ * updated.
+ * It walks the radix tree to find the page which contains the bit for
+ * pfn and returns the bit position in **addr and *bit_nr.
*/
static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
- void **addr, unsigned int *bit_nr)
+ void **addr, unsigned int *bit_nr)
{
- struct bm_block *bb;
+ struct mem_zone_bm_rtree *curr, *zone;
+ struct rtree_node *node;
+ int i, block_nr;
+ zone = bm->cur.zone;
+
+ if (pfn >= zone->start_pfn && pfn < zone->end_pfn)
+ goto zone_found;
+
+ zone = NULL;
+
+ /* Find the right zone */
+ list_for_each_entry(curr, &bm->zones, list) {
+ if (pfn >= curr->start_pfn && pfn < curr->end_pfn) {
+ zone = curr;
+ break;
+ }
+ }
+
+ if (!zone)
+ return -EFAULT;
+
+zone_found:
/*
- * Check if the pfn corresponds to the current bitmap block and find
- * the block where it fits if this is not the case.
+ * We have a zone. Now walk the radix tree to find the leave
+ * node for our pfn.
*/
- bb = bm->cur.block;
- if (pfn < bb->start_pfn)
- list_for_each_entry_continue_reverse(bb, &bm->blocks, hook)
- if (pfn >= bb->start_pfn)
- break;
- if (pfn >= bb->end_pfn)
- list_for_each_entry_continue(bb, &bm->blocks, hook)
- if (pfn >= bb->start_pfn && pfn < bb->end_pfn)
- break;
+ node = bm->cur.node;
+ if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
+ goto node_found;
- if (&bb->hook == &bm->blocks)
- return -EFAULT;
+ node = zone->rtree;
+ block_nr = (pfn - zone->start_pfn) >> BM_BLOCK_SHIFT;
+
+ for (i = zone->levels; i > 0; i--) {
+ int index;
+
+ index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT);
+ index &= BM_RTREE_LEVEL_MASK;
+ BUG_ON(node->data[index] == 0);
+ node = (struct rtree_node *)node->data[index];
+ }
+
+node_found:
+ /* Update last position */
+ bm->cur.zone = zone;
+ bm->cur.node = node;
+ bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK;
+
+ /* Set return values */
+ *addr = node->data;
+ *bit_nr = (pfn - zone->start_pfn) & BM_BLOCK_MASK;
- /* The block has been found */
- bm->cur.block = bb;
- pfn -= bb->start_pfn;
- bm->cur.bit = pfn + 1;
- *bit_nr = pfn;
- *addr = bb->data;
return 0;
}
@@ -527,6 +710,7 @@ static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)
error = memory_bm_find_bit(bm, pfn, &addr, &bit);
if (!error)
set_bit(bit, addr);
+
return error;
}
@@ -560,38 +744,70 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
return !memory_bm_find_bit(bm, pfn, &addr, &bit);
}
-/**
- * memory_bm_next_pfn - find the pfn that corresponds to the next set bit
- * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is
- * returned.
+/*
+ * rtree_next_node - Jumps to the next leave node
+ *
+ * Sets the position to the beginning of the next node in the
+ * memory bitmap. This is either the next node in the current
+ * zone's radix tree or the first node in the radix tree of the
+ * next zone.
*
- * It is required to run memory_bm_position_reset() before the first call to
- * this function.
+ * Returns true if there is a next node, false otherwise.
*/
+static bool rtree_next_node(struct memory_bitmap *bm)
+{
+ bm->cur.node = list_entry(bm->cur.node->list.next,
+ struct rtree_node, list);
+ if (&bm->cur.node->list != &bm->cur.zone->leaves) {
+ bm->cur.node_pfn += BM_BITS_PER_BLOCK;
+ bm->cur.node_bit = 0;
+ touch_softlockup_watchdog();
+ return true;
+ }
+ /* No more nodes, goto next zone */
+ bm->cur.zone = list_entry(bm->cur.zone->list.next,
+ struct mem_zone_bm_rtree, list);
+ if (&bm->cur.zone->list != &bm->zones) {
+ bm->cur.node = list_entry(bm->cur.zone->leaves.next,
+ struct rtree_node, list);
+ bm->cur.node_pfn = 0;
+ bm->cur.node_bit = 0;
+ return true;
+ }
+
+ /* No more zones */
+ return false;
+}
+
+/**
+ * memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm
+ *
+ * Starting from the last returned position this function searches
+ * for the next set bit in the memory bitmap and returns its
+ * number. If no more bit is set BM_END_OF_MAP is returned.
+ *
+ * It is required to run memory_bm_position_reset() before the
+ * first call to this function.
+ */
static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
{
- struct bm_block *bb;
+ unsigned long bits, pfn, pages;
int bit;
- bb = bm->cur.block;
do {
- bit = bm->cur.bit;
- bit = find_next_bit(bb->data, bm_block_bits(bb), bit);
- if (bit < bm_block_bits(bb))
- goto Return_pfn;
-
- bb = list_entry(bb->hook.next, struct bm_block, hook);
- bm->cur.block = bb;
- bm->cur.bit = 0;
- } while (&bb->hook != &bm->blocks);
+ pages = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn;
+ bits = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK);
+ bit = find_next_bit(bm->cur.node->data, bits,
+ bm->cur.node_bit);
+ if (bit < bits) {
+ pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit;
+ bm->cur.node_bit = bit + 1;
+ return pfn;
+ }
+ } while (rtree_next_node(bm));
- memory_bm_position_reset(bm);
return BM_END_OF_MAP;
-
- Return_pfn:
- bm->cur.bit = bit + 1;
- return bb->start_pfn + bit;
}
/**
@@ -730,6 +946,25 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
}
}
+static bool is_nosave_page(unsigned long pfn)
+{
+ struct nosave_region *region;
+
+ list_for_each_entry(region, &nosave_regions, list) {
+ if (pfn >= region->start_pfn && pfn < region->end_pfn) {
+ pr_err("PM: %#010llx in e820 nosave region: "
+ "[mem %#010llx-%#010llx]\n",
+ (unsigned long long) pfn << PAGE_SHIFT,
+ (unsigned long long) region->start_pfn << PAGE_SHIFT,
+ ((unsigned long long) region->end_pfn << PAGE_SHIFT)
+ - 1);
+ return true;
+ }
+ }
+
+ return false;
+}
+
/**
* create_basic_memory_bitmaps - create bitmaps needed for marking page
* frames that should not be saved and free page frames. The pointers
@@ -815,12 +1050,17 @@ void free_basic_memory_bitmaps(void)
unsigned int snapshot_additional_pages(struct zone *zone)
{
- unsigned int res;
+ unsigned int rtree, nodes;
+
+ rtree = nodes = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
+ rtree += DIV_ROUND_UP(rtree * sizeof(struct rtree_node),
+ LINKED_PAGE_DATA_SIZE);
+ while (nodes > 1) {
+ nodes = DIV_ROUND_UP(nodes, BM_ENTRIES_PER_LEVEL);
+ rtree += nodes;
+ }
- res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
- res += DIV_ROUND_UP(res * sizeof(struct bm_block),
- LINKED_PAGE_DATA_SIZE);
- return 2 * res;
+ return 2 * rtree;
}
#ifdef CONFIG_HIGHMEM
@@ -1268,7 +1508,7 @@ static void free_unnecessary_pages(void)
* [number of saveable pages] - [number of pages that can be freed in theory]
*
* where the second term is the sum of (1) reclaimable slab pages, (2) active
- * and (3) inactive anonymouns pages, (4) active and (5) inactive file pages,
+ * and (3) inactive anonymous pages, (4) active and (5) inactive file pages,
* minus mapped file pages.
*/
static unsigned long minimum_image_size(unsigned long saveable)
@@ -1585,7 +1825,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
return -ENOMEM;
}
-asmlinkage int swsusp_save(void)
+asmlinkage __visible int swsusp_save(void)
{
unsigned int nr_pages, nr_highmem;
@@ -1774,7 +2014,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
do {
pfn = memory_bm_next_pfn(bm);
if (likely(pfn != BM_END_OF_MAP)) {
- if (likely(pfn_valid(pfn)))
+ if (likely(pfn_valid(pfn)) && !is_nosave_page(pfn))
swsusp_set_page_free(pfn_to_page(pfn));
else
return -EFAULT;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 62ee437b5c7e..18c62195660f 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -14,6 +14,7 @@
#include <linux/init.h>
#include <linux/console.h>
#include <linux/cpu.h>
+#include <linux/cpuidle.h>
#include <linux/syscalls.h>
#include <linux/gfp.h>
#include <linux/io.h>
@@ -26,25 +27,25 @@
#include <linux/syscore_ops.h>
#include <linux/ftrace.h>
#include <trace/events/power.h>
+#include <linux/compiler.h>
#include "power.h"
-const char *const pm_states[PM_SUSPEND_MAX] = {
- [PM_SUSPEND_FREEZE] = "freeze",
- [PM_SUSPEND_STANDBY] = "standby",
- [PM_SUSPEND_MEM] = "mem",
-};
+const char *pm_labels[] = { "mem", "standby", "freeze", NULL };
+const char *pm_states[PM_SUSPEND_MAX];
static const struct platform_suspend_ops *suspend_ops;
+static const struct platform_freeze_ops *freeze_ops;
+static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
+static bool suspend_freeze_wake;
-static bool need_suspend_ops(suspend_state_t state)
+void freeze_set_ops(const struct platform_freeze_ops *ops)
{
- return !!(state > PM_SUSPEND_FREEZE);
+ lock_system_sleep();
+ freeze_ops = ops;
+ unlock_system_sleep();
}
-static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
-static bool suspend_freeze_wake;
-
static void freeze_begin(void)
{
suspend_freeze_wake = false;
@@ -52,7 +53,11 @@ static void freeze_begin(void)
static void freeze_enter(void)
{
+ cpuidle_use_deepest_state(true);
+ cpuidle_resume();
wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
+ cpuidle_pause();
+ cpuidle_use_deepest_state(false);
}
void freeze_wake(void)
@@ -62,42 +67,59 @@ void freeze_wake(void)
}
EXPORT_SYMBOL_GPL(freeze_wake);
+static bool valid_state(suspend_state_t state)
+{
+ /*
+ * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low level
+ * support and need to be valid to the low level
+ * implementation, no valid callback implies that none are valid.
+ */
+ return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
+}
+
+/*
+ * If this is set, the "mem" label always corresponds to the deepest sleep state
+ * available, the "standby" label corresponds to the second deepest sleep state
+ * available (if any), and the "freeze" label corresponds to the remaining
+ * available sleep state (if there is one).
+ */
+static bool relative_states;
+
+static int __init sleep_states_setup(char *str)
+{
+ relative_states = !strncmp(str, "1", 1);
+ pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2];
+ return 1;
+}
+
+__setup("relative_sleep_states=", sleep_states_setup);
+
/**
* suspend_set_ops - Set the global suspend method table.
* @ops: Suspend operations to use.
*/
void suspend_set_ops(const struct platform_suspend_ops *ops)
{
+ suspend_state_t i;
+ int j = 0;
+
lock_system_sleep();
+
suspend_ops = ops;
+ for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--)
+ if (valid_state(i)) {
+ pm_states[i] = pm_labels[j++];
+ } else if (!relative_states) {
+ pm_states[i] = NULL;
+ j++;
+ }
+
+ pm_states[PM_SUSPEND_FREEZE] = pm_labels[j];
+
unlock_system_sleep();
}
EXPORT_SYMBOL_GPL(suspend_set_ops);
-bool valid_state(suspend_state_t state)
-{
- if (state == PM_SUSPEND_FREEZE) {
-#ifdef CONFIG_PM_DEBUG
- if (pm_test_level != TEST_NONE &&
- pm_test_level != TEST_FREEZER &&
- pm_test_level != TEST_DEVICES &&
- pm_test_level != TEST_PLATFORM) {
- printk(KERN_WARNING "Unsupported pm_test mode for "
- "freeze state, please choose "
- "none/freezer/devices/platform.\n");
- return false;
- }
-#endif
- return true;
- }
- /*
- * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel
- * support and need to be valid to the lowlevel
- * implementation, no valid callback implies that none are valid.
- */
- return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
-}
-
/**
* suspend_valid_only_mem - Generic memory-only valid callback.
*
@@ -111,6 +133,65 @@ int suspend_valid_only_mem(suspend_state_t state)
}
EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
+static bool sleep_state_supported(suspend_state_t state)
+{
+ return state == PM_SUSPEND_FREEZE || (suspend_ops && suspend_ops->enter);
+}
+
+static int platform_suspend_prepare(suspend_state_t state)
+{
+ return state != PM_SUSPEND_FREEZE && suspend_ops->prepare ?
+ suspend_ops->prepare() : 0;
+}
+
+static int platform_suspend_prepare_late(suspend_state_t state)
+{
+ return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ?
+ suspend_ops->prepare_late() : 0;
+}
+
+static void platform_suspend_wake(suspend_state_t state)
+{
+ if (state != PM_SUSPEND_FREEZE && suspend_ops->wake)
+ suspend_ops->wake();
+}
+
+static void platform_suspend_finish(suspend_state_t state)
+{
+ if (state != PM_SUSPEND_FREEZE && suspend_ops->finish)
+ suspend_ops->finish();
+}
+
+static int platform_suspend_begin(suspend_state_t state)
+{
+ if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin)
+ return freeze_ops->begin();
+ else if (suspend_ops->begin)
+ return suspend_ops->begin(state);
+ else
+ return 0;
+}
+
+static void platform_suspend_end(suspend_state_t state)
+{
+ if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
+ freeze_ops->end();
+ else if (suspend_ops->end)
+ suspend_ops->end();
+}
+
+static void platform_suspend_recover(suspend_state_t state)
+{
+ if (state != PM_SUSPEND_FREEZE && suspend_ops->recover)
+ suspend_ops->recover();
+}
+
+static bool platform_suspend_again(suspend_state_t state)
+{
+ return state != PM_SUSPEND_FREEZE && suspend_ops->suspend_again ?
+ suspend_ops->suspend_again() : false;
+}
+
static int suspend_test(int level)
{
#ifdef CONFIG_PM_DEBUG
@@ -134,7 +215,7 @@ static int suspend_prepare(suspend_state_t state)
{
int error;
- if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter))
+ if (!sleep_state_supported(state))
return -EPERM;
pm_prepare_console();
@@ -143,7 +224,9 @@ static int suspend_prepare(suspend_state_t state)
if (error)
goto Finish;
+ trace_suspend_resume(TPS("freeze_processes"), 0, true);
error = suspend_freeze_processes();
+ trace_suspend_resume(TPS("freeze_processes"), 0, false);
if (!error)
return 0;
@@ -156,13 +239,13 @@ static int suspend_prepare(suspend_state_t state)
}
/* default implementation */
-void __attribute__ ((weak)) arch_suspend_disable_irqs(void)
+void __weak arch_suspend_disable_irqs(void)
{
local_irq_disable();
}
/* default implementation */
-void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
+void __weak arch_suspend_enable_irqs(void)
{
local_irq_enable();
}
@@ -178,23 +261,18 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
{
int error;
- if (need_suspend_ops(state) && suspend_ops->prepare) {
- error = suspend_ops->prepare();
- if (error)
- goto Platform_finish;
- }
+ error = platform_suspend_prepare(state);
+ if (error)
+ goto Platform_finish;
error = dpm_suspend_end(PMSG_SUSPEND);
if (error) {
printk(KERN_ERR "PM: Some devices failed to power down\n");
goto Platform_finish;
}
-
- if (need_suspend_ops(state) && suspend_ops->prepare_late) {
- error = suspend_ops->prepare_late();
- if (error)
- goto Platform_wake;
- }
+ error = platform_suspend_prepare_late(state);
+ if (error)
+ goto Platform_wake;
if (suspend_test(TEST_PLATFORM))
goto Platform_wake;
@@ -206,11 +284,12 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
* all the devices are suspended.
*/
if (state == PM_SUSPEND_FREEZE) {
+ trace_suspend_resume(TPS("machine_suspend"), state, true);
freeze_enter();
+ trace_suspend_resume(TPS("machine_suspend"), state, false);
goto Platform_wake;
}
- ftrace_stop();
error = disable_nonboot_cpus();
if (error || suspend_test(TEST_CPUS))
goto Enable_cpus;
@@ -222,7 +301,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
if (!error) {
*wakeup = pm_wakeup_pending();
if (!(suspend_test(TEST_CORE) || *wakeup)) {
+ trace_suspend_resume(TPS("machine_suspend"),
+ state, true);
error = suspend_ops->enter(state);
+ trace_suspend_resume(TPS("machine_suspend"),
+ state, false);
events_check_enabled = false;
}
syscore_resume();
@@ -233,18 +316,13 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
Enable_cpus:
enable_nonboot_cpus();
- ftrace_start();
Platform_wake:
- if (need_suspend_ops(state) && suspend_ops->wake)
- suspend_ops->wake();
-
+ platform_suspend_wake(state);
dpm_resume_start(PMSG_RESUME);
Platform_finish:
- if (need_suspend_ops(state) && suspend_ops->finish)
- suspend_ops->finish();
-
+ platform_suspend_finish(state);
return error;
}
@@ -257,15 +335,13 @@ int suspend_devices_and_enter(suspend_state_t state)
int error;
bool wakeup = false;
- if (need_suspend_ops(state) && !suspend_ops)
+ if (!sleep_state_supported(state))
return -ENOSYS;
- trace_machine_suspend(state);
- if (need_suspend_ops(state) && suspend_ops->begin) {
- error = suspend_ops->begin(state);
- if (error)
- goto Close;
- }
+ error = platform_suspend_begin(state);
+ if (error)
+ goto Close;
+
suspend_console();
suspend_test_start();
error = dpm_suspend_start(PMSG_SUSPEND);
@@ -279,23 +355,20 @@ int suspend_devices_and_enter(suspend_state_t state)
do {
error = suspend_enter(state, &wakeup);
- } while (!error && !wakeup && need_suspend_ops(state)
- && suspend_ops->suspend_again && suspend_ops->suspend_again());
+ } while (!error && !wakeup && platform_suspend_again(state));
Resume_devices:
suspend_test_start();
dpm_resume_end(PMSG_RESUME);
suspend_test_finish("resume devices");
resume_console();
+
Close:
- if (need_suspend_ops(state) && suspend_ops->end)
- suspend_ops->end();
- trace_machine_suspend(PWR_EVENT_EXIT);
+ platform_suspend_end(state);
return error;
Recover_platform:
- if (need_suspend_ops(state) && suspend_ops->recover)
- suspend_ops->recover();
+ platform_suspend_recover(state);
goto Resume_devices;
}
@@ -324,18 +397,29 @@ static int enter_state(suspend_state_t state)
{
int error;
- if (!valid_state(state))
- return -ENODEV;
-
+ trace_suspend_resume(TPS("suspend_enter"), state, true);
+ if (state == PM_SUSPEND_FREEZE) {
+#ifdef CONFIG_PM_DEBUG
+ if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) {
+ pr_warning("PM: Unsupported test mode for freeze state,"
+ "please choose none/freezer/devices/platform.\n");
+ return -EAGAIN;
+ }
+#endif
+ } else if (!valid_state(state)) {
+ return -EINVAL;
+ }
if (!mutex_trylock(&pm_mutex))
return -EBUSY;
if (state == PM_SUSPEND_FREEZE)
freeze_begin();
+ trace_suspend_resume(TPS("sync_filesystems"), 0, true);
printk(KERN_INFO "PM: Syncing filesystems ... ");
sys_sync();
printk("done.\n");
+ trace_suspend_resume(TPS("sync_filesystems"), 0, false);
pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
error = suspend_prepare(state);
@@ -345,6 +429,7 @@ static int enter_state(suspend_state_t state)
if (suspend_test(TEST_FREEZER))
goto Finish;
+ trace_suspend_resume(TPS("suspend_enter"), state, false);
pr_debug("PM: Entering %s sleep\n", pm_states[state]);
pm_restrict_gfp_mask();
error = suspend_devices_and_enter(state);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 9b2a1d58558d..bd91bc177c93 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -129,25 +129,23 @@ static int __init has_wakealarm(struct device *dev, const void *data)
* at startup time. They're normally disabled, for faster boot and because
* we can't know which states really work on this particular system.
*/
-static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
+static const char *test_state_label __initdata;
static char warn_bad_state[] __initdata =
KERN_WARNING "PM: can't test '%s' suspend state\n";
static int __init setup_test_suspend(char *value)
{
- unsigned i;
+ int i;
/* "=mem" ==> "mem" */
value++;
- for (i = 0; i < PM_SUSPEND_MAX; i++) {
- if (!pm_states[i])
- continue;
- if (strcmp(pm_states[i], value) != 0)
- continue;
- test_state = (__force suspend_state_t) i;
- return 0;
- }
+ for (i = 0; pm_labels[i]; i++)
+ if (!strcmp(pm_labels[i], value)) {
+ test_state_label = pm_labels[i];
+ return 0;
+ }
+
printk(warn_bad_state, value);
return 0;
}
@@ -160,13 +158,21 @@ static int __init test_suspend(void)
struct rtc_device *rtc = NULL;
struct device *dev;
+ suspend_state_t test_state;
/* PM is initialized by now; is that state testable? */
- if (test_state == PM_SUSPEND_ON)
- goto done;
- if (!valid_state(test_state)) {
- printk(warn_bad_state, pm_states[test_state]);
- goto done;
+ if (!test_state_label)
+ return 0;
+
+ for (test_state = PM_SUSPEND_MIN; test_state < PM_SUSPEND_MAX; test_state++) {
+ const char *state_label = pm_states[test_state];
+
+ if (state_label && !strcmp(test_state_label, state_label))
+ break;
+ }
+ if (test_state == PM_SUSPEND_MAX) {
+ printk(warn_bad_state, test_state_label);
+ return 0;
}
/* RTCs have initialized by now too ... can we use one? */
@@ -175,13 +181,12 @@ static int __init test_suspend(void)
rtc = rtc_class_open(dev_name(dev));
if (!rtc) {
printk(warn_no_rtc);
- goto done;
+ return 0;
}
/* go for it */
test_wakealarm(rtc, test_state);
rtc_class_close(rtc);
-done:
return 0;
}
late_initcall(test_suspend);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 7c33ed200410..aaa3261dea5d 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -101,7 +101,7 @@ struct swsusp_header {
unsigned int flags; /* Flags to pass to the "boot" kernel */
char orig_sig[10];
char sig[10];
-} __attribute__((packed));
+} __packed;
static struct swsusp_header *swsusp_header;
@@ -567,7 +567,7 @@ static int lzo_compress_threadfn(void *data)
/**
* save_image_lzo - Save the suspend image data compressed with LZO.
- * @handle: Swap mam handle to use for saving the image.
+ * @handle: Swap map handle to use for saving the image.
* @snapshot: Image to read data from.
* @nr_to_write: Number of pages to save.
*/
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 98d357584cd6..526e8911460a 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -49,6 +49,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)
struct snapshot_data *data;
int error;
+ if (!hibernation_available())
+ return -EPERM;
+
lock_system_sleep();
if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index 8f50de394d22..019069c84ff6 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -18,6 +18,8 @@
#include <linux/rbtree.h>
#include <linux/slab.h>
+#include "power.h"
+
static DEFINE_MUTEX(wakelocks_lock);
struct wakelock {
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 4dae9cbe9259..1ce770687ea8 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -45,6 +45,7 @@
#include <linux/poll.h>
#include <linux/irq_work.h>
#include <linux/utsname.h>
+#include <linux/ctype.h>
#include <asm/uaccess.h>
@@ -54,20 +55,16 @@
#include "console_cmdline.h"
#include "braille.h"
-/* printk's without a loglevel use this.. */
-#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
-
-/* We show everything that is MORE important than this.. */
-#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
-#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
-
int console_printk[4] = {
- DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */
- DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */
- MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */
- DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
+ CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
+ MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */
+ CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */
+ CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
};
+/* Deferred messaged from sched code are marked by this special level */
+#define SCHED_MESSAGE_LOGLEVEL -2
+
/*
* Low level drivers may need that to know if they can schedule in
* their unblank() callback or not. So let's export it.
@@ -91,12 +88,35 @@ static struct lockdep_map console_lock_dep_map = {
#endif
/*
+ * Helper macros to handle lockdep when locking/unlocking console_sem. We use
+ * macros instead of functions so that _RET_IP_ contains useful information.
+ */
+#define down_console_sem() do { \
+ down(&console_sem);\
+ mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\
+} while (0)
+
+static int __down_trylock_console_sem(unsigned long ip)
+{
+ if (down_trylock(&console_sem))
+ return 1;
+ mutex_acquire(&console_lock_dep_map, 0, 1, ip);
+ return 0;
+}
+#define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_)
+
+#define up_console_sem() do { \
+ mutex_release(&console_lock_dep_map, 1, _RET_IP_);\
+ up(&console_sem);\
+} while (0)
+
+/*
* This is used for debugging the mess that is the VT code by
* keeping track if we have the console semaphore held. It's
* definitely not the perfect debug tool (we don't know if _WE_
- * hold it are racing, but it helps tracking those weird code
- * path in the console code where we end up in places I want
- * locked without the console sempahore held
+ * hold it and are racing, but it helps tracking those weird code
+ * paths in the console code where we end up in places I want
+ * locked without the console sempahore held).
*/
static int console_locked, console_suspended;
@@ -127,8 +147,8 @@ static int console_may_schedule;
* the overall length of the record.
*
* The heads to the first and last entry in the buffer, as well as the
- * sequence numbers of these both entries are maintained when messages
- * are stored..
+ * sequence numbers of these entries are maintained when messages are
+ * stored.
*
* If the heads indicate available messages, the length in the header
* tells the start next message. A length == 0 for the next message
@@ -206,8 +226,9 @@ struct printk_log {
};
/*
- * The logbuf_lock protects kmsg buffer, indices, counters. It is also
- * used in interesting ways to provide interlocking in console_unlock();
+ * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
+ * within the scheduler's rq lock. It must be released before calling
+ * console_unlock() or anything else that might wake up a process.
*/
static DEFINE_RAW_SPINLOCK(logbuf_lock);
@@ -237,7 +258,7 @@ static u64 clear_seq;
static u32 clear_idx;
#define PREFIX_MAX 32
-#define LOG_LINE_MAX 1024 - PREFIX_MAX
+#define LOG_LINE_MAX (1024 - PREFIX_MAX)
/* record buffer */
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
@@ -246,12 +267,22 @@ static u32 clear_idx;
#define LOG_ALIGN __alignof__(struct printk_log)
#endif
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
+#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT)
static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
static char *log_buf = __log_buf;
static u32 log_buf_len = __LOG_BUF_LEN;
-/* cpu currently holding logbuf_lock */
-static volatile unsigned int logbuf_cpu = UINT_MAX;
+/* Return log buffer address */
+char *log_buf_addr_get(void)
+{
+ return log_buf;
+}
+
+/* Return log buffer size */
+u32 log_buf_len_get(void)
+{
+ return log_buf_len;
+}
/* human readable text of the record */
static char *log_text(const struct printk_log *msg)
@@ -297,37 +328,109 @@ static u32 log_next(u32 idx)
return idx + msg->len;
}
-/* insert record into the buffer, discard old ones, update heads */
-static void log_store(int facility, int level,
- enum log_flags flags, u64 ts_nsec,
- const char *dict, u16 dict_len,
- const char *text, u16 text_len)
+/*
+ * Check whether there is enough free space for the given message.
+ *
+ * The same values of first_idx and next_idx mean that the buffer
+ * is either empty or full.
+ *
+ * If the buffer is empty, we must respect the position of the indexes.
+ * They cannot be reset to the beginning of the buffer.
+ */
+static int logbuf_has_space(u32 msg_size, bool empty)
{
- struct printk_log *msg;
- u32 size, pad_len;
+ u32 free;
- /* number of '\0' padding bytes to next message */
- size = sizeof(struct printk_log) + text_len + dict_len;
- pad_len = (-size) & (LOG_ALIGN - 1);
- size += pad_len;
+ if (log_next_idx > log_first_idx || empty)
+ free = max(log_buf_len - log_next_idx, log_first_idx);
+ else
+ free = log_first_idx - log_next_idx;
+ /*
+ * We need space also for an empty header that signalizes wrapping
+ * of the buffer.
+ */
+ return free >= msg_size + sizeof(struct printk_log);
+}
+
+static int log_make_free_space(u32 msg_size)
+{
while (log_first_seq < log_next_seq) {
- u32 free;
+ if (logbuf_has_space(msg_size, false))
+ return 0;
+ /* drop old messages until we have enough contiguous space */
+ log_first_idx = log_next(log_first_idx);
+ log_first_seq++;
+ }
- if (log_next_idx > log_first_idx)
- free = max(log_buf_len - log_next_idx, log_first_idx);
- else
- free = log_first_idx - log_next_idx;
+ /* sequence numbers are equal, so the log buffer is empty */
+ if (logbuf_has_space(msg_size, true))
+ return 0;
- if (free > size + sizeof(struct printk_log))
- break;
+ return -ENOMEM;
+}
- /* drop old messages until we have enough contiuous space */
- log_first_idx = log_next(log_first_idx);
- log_first_seq++;
+/* compute the message size including the padding bytes */
+static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len)
+{
+ u32 size;
+
+ size = sizeof(struct printk_log) + text_len + dict_len;
+ *pad_len = (-size) & (LOG_ALIGN - 1);
+ size += *pad_len;
+
+ return size;
+}
+
+/*
+ * Define how much of the log buffer we could take at maximum. The value
+ * must be greater than two. Note that only half of the buffer is available
+ * when the index points to the middle.
+ */
+#define MAX_LOG_TAKE_PART 4
+static const char trunc_msg[] = "<truncated>";
+
+static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len,
+ u16 *dict_len, u32 *pad_len)
+{
+ /*
+ * The message should not take the whole buffer. Otherwise, it might
+ * get removed too soon.
+ */
+ u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART;
+ if (*text_len > max_text_len)
+ *text_len = max_text_len;
+ /* enable the warning message */
+ *trunc_msg_len = strlen(trunc_msg);
+ /* disable the "dict" completely */
+ *dict_len = 0;
+ /* compute the size again, count also the warning message */
+ return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len);
+}
+
+/* insert record into the buffer, discard old ones, update heads */
+static int log_store(int facility, int level,
+ enum log_flags flags, u64 ts_nsec,
+ const char *dict, u16 dict_len,
+ const char *text, u16 text_len)
+{
+ struct printk_log *msg;
+ u32 size, pad_len;
+ u16 trunc_msg_len = 0;
+
+ /* number of '\0' padding bytes to next message */
+ size = msg_used_size(text_len, dict_len, &pad_len);
+
+ if (log_make_free_space(size)) {
+ /* truncate the message if it is too long for empty buffer */
+ size = truncate_msg(&text_len, &trunc_msg_len,
+ &dict_len, &pad_len);
+ /* survive when the log buffer is too small for trunc_msg */
+ if (log_make_free_space(size))
+ return 0;
}
- if (log_next_idx + size + sizeof(struct printk_log) >= log_buf_len) {
+ if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) {
/*
* This message + an additional empty header does not fit
* at the end of the buffer. Add an empty header with len == 0
@@ -341,6 +444,10 @@ static void log_store(int facility, int level,
msg = (struct printk_log *)(log_buf + log_next_idx);
memcpy(log_text(msg), text, text_len);
msg->text_len = text_len;
+ if (trunc_msg_len) {
+ memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len);
+ msg->text_len += trunc_msg_len;
+ }
memcpy(log_dict(msg), dict, dict_len);
msg->dict_len = dict_len;
msg->facility = facility;
@@ -351,18 +458,16 @@ static void log_store(int facility, int level,
else
msg->ts_nsec = local_clock();
memset(log_dict(msg) + dict_len, 0, pad_len);
- msg->len = sizeof(struct printk_log) + text_len + dict_len + pad_len;
+ msg->len = size;
/* insert message */
log_next_idx += msg->len;
log_next_seq++;
+
+ return msg->text_len;
}
-#ifdef CONFIG_SECURITY_DMESG_RESTRICT
-int dmesg_restrict = 1;
-#else
-int dmesg_restrict;
-#endif
+int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT);
static int syslog_action_restricted(int type)
{
@@ -733,34 +838,74 @@ void log_buf_kexec_setup(void)
/* requested log_buf_len from kernel cmdline */
static unsigned long __initdata new_log_buf_len;
-/* save requested log_buf_len since it's too early to process it */
-static int __init log_buf_len_setup(char *str)
+/* we practice scaling the ring buffer by powers of 2 */
+static void __init log_buf_len_update(unsigned size)
{
- unsigned size = memparse(str, &str);
-
if (size)
size = roundup_pow_of_two(size);
if (size > log_buf_len)
new_log_buf_len = size;
+}
+
+/* save requested log_buf_len since it's too early to process it */
+static int __init log_buf_len_setup(char *str)
+{
+ unsigned size = memparse(str, &str);
+
+ log_buf_len_update(size);
return 0;
}
early_param("log_buf_len", log_buf_len_setup);
+static void __init log_buf_add_cpu(void)
+{
+ unsigned int cpu_extra;
+
+ /*
+ * archs should set up cpu_possible_bits properly with
+ * set_cpu_possible() after setup_arch() but just in
+ * case lets ensure this is valid.
+ */
+ if (num_possible_cpus() == 1)
+ return;
+
+ cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN;
+
+ /* by default this will only continue through for large > 64 CPUs */
+ if (cpu_extra <= __LOG_BUF_LEN / 2)
+ return;
+
+ pr_info("log_buf_len individual max cpu contribution: %d bytes\n",
+ __LOG_CPU_MAX_BUF_LEN);
+ pr_info("log_buf_len total cpu_extra contributions: %d bytes\n",
+ cpu_extra);
+ pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN);
+
+ log_buf_len_update(cpu_extra + __LOG_BUF_LEN);
+}
+
void __init setup_log_buf(int early)
{
unsigned long flags;
char *new_log_buf;
int free;
+ if (log_buf != __log_buf)
+ return;
+
+ if (!early && !new_log_buf_len)
+ log_buf_add_cpu();
+
if (!new_log_buf_len)
return;
if (early) {
new_log_buf =
- memblock_virt_alloc(new_log_buf_len, PAGE_SIZE);
+ memblock_virt_alloc(new_log_buf_len, LOG_ALIGN);
} else {
- new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0);
+ new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len,
+ LOG_ALIGN);
}
if (unlikely(!new_log_buf)) {
@@ -777,7 +922,7 @@ void __init setup_log_buf(int early)
memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
raw_spin_unlock_irqrestore(&logbuf_lock, flags);
- pr_info("log_buf_len: %d\n", log_buf_len);
+ pr_info("log_buf_len: %d bytes\n", log_buf_len);
pr_info("early log buf free: %d(%d%%)\n",
free, (free * 100) / __LOG_BUF_LEN);
}
@@ -786,7 +931,7 @@ static bool __read_mostly ignore_loglevel;
static int __init ignore_loglevel_setup(char *str)
{
- ignore_loglevel = 1;
+ ignore_loglevel = true;
pr_info("debug: ignoring loglevel setting.\n");
return 0;
@@ -852,11 +997,7 @@ static inline void boot_delay_msec(int level)
}
#endif
-#if defined(CONFIG_PRINTK_TIME)
-static bool printk_time = 1;
-#else
-static bool printk_time;
-#endif
+static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME);
module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
static size_t print_time(u64 ts, char *buf)
@@ -1215,7 +1356,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
* for pending data, not the size; return the count of
* records, not the length.
*/
- error = log_next_idx - syslog_idx;
+ error = log_next_seq - syslog_seq;
} else {
u64 seq = syslog_seq;
u32 idx = syslog_idx;
@@ -1303,7 +1444,10 @@ static void zap_locks(void)
sema_init(&console_sem, 1);
}
-/* Check if we have any console registered that can be called early in boot. */
+/*
+ * Check if we have any console that is capable of printing while cpu is
+ * booting or shutting down. Requires console_sem.
+ */
static int have_callable_console(void)
{
struct console *con;
@@ -1318,10 +1462,9 @@ static int have_callable_console(void)
/*
* Can we actually use the console at this time on this cpu?
*
- * Console drivers may assume that per-cpu resources have
- * been allocated. So unless they're explicitly marked as
- * being able to cope (CON_ANYTIME) don't call them until
- * this CPU is officially up.
+ * Console drivers may assume that per-cpu resources have been allocated. So
+ * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
+ * call them until this CPU is officially up.
*/
static inline int can_use_console(unsigned int cpu)
{
@@ -1333,36 +1476,24 @@ static inline int can_use_console(unsigned int cpu)
* messages from a 'printk'. Return true (and with the
* console_lock held, and 'console_locked' set) if it
* is successful, false otherwise.
- *
- * This gets called with the 'logbuf_lock' spinlock held and
- * interrupts disabled. It should return with 'lockbuf_lock'
- * released but interrupts still disabled.
*/
-static int console_trylock_for_printk(unsigned int cpu)
- __releases(&logbuf_lock)
+static int console_trylock_for_printk(void)
{
- int retval = 0, wake = 0;
-
- if (console_trylock()) {
- retval = 1;
+ unsigned int cpu = smp_processor_id();
- /*
- * If we can't use the console, we need to release
- * the console semaphore by hand to avoid flushing
- * the buffer. We need to hold the console semaphore
- * in order to do this test safely.
- */
- if (!can_use_console(cpu)) {
- console_locked = 0;
- wake = 1;
- retval = 0;
- }
+ if (!console_trylock())
+ return 0;
+ /*
+ * If we can't use the console, we need to release the console
+ * semaphore by hand to avoid flushing the buffer. We need to hold the
+ * console semaphore in order to do this test safely.
+ */
+ if (!can_use_console(cpu)) {
+ console_locked = 0;
+ up_console_sem();
+ return 0;
}
- logbuf_cpu = UINT_MAX;
- raw_spin_unlock(&logbuf_lock);
- if (wake)
- up(&console_sem);
- return retval;
+ return 1;
}
int printk_delay_msec __read_mostly;
@@ -1392,7 +1523,7 @@ static struct cont {
struct task_struct *owner; /* task of first print*/
u64 ts_nsec; /* time of first print */
u8 level; /* log level of first message */
- u8 facility; /* log level of first message */
+ u8 facility; /* log facility of first message */
enum log_flags flags; /* prefix, newline flags */
bool flushed:1; /* buffer sealed and committed */
} cont;
@@ -1490,11 +1621,19 @@ asmlinkage int vprintk_emit(int facility, int level,
static int recursion_bug;
static char textbuf[LOG_LINE_MAX];
char *text = textbuf;
- size_t text_len;
+ size_t text_len = 0;
enum log_flags lflags = 0;
unsigned long flags;
int this_cpu;
int printed_len = 0;
+ bool in_sched = false;
+ /* cpu currently holding logbuf_lock in this function */
+ static volatile unsigned int logbuf_cpu = UINT_MAX;
+
+ if (level == SCHED_MESSAGE_LOGLEVEL) {
+ level = -1;
+ in_sched = true;
+ }
boot_delay_msec(level);
printk_delay();
@@ -1516,7 +1655,8 @@ asmlinkage int vprintk_emit(int facility, int level,
*/
if (!oops_in_progress && !lockdep_recursing(current)) {
recursion_bug = 1;
- goto out_restore_irqs;
+ local_irq_restore(flags);
+ return 0;
}
zap_locks();
}
@@ -1525,22 +1665,27 @@ asmlinkage int vprintk_emit(int facility, int level,
raw_spin_lock(&logbuf_lock);
logbuf_cpu = this_cpu;
- if (recursion_bug) {
+ if (unlikely(recursion_bug)) {
static const char recursion_msg[] =
"BUG: recent printk recursion!";
recursion_bug = 0;
- printed_len += strlen(recursion_msg);
/* emit KERN_CRIT message */
- log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
- NULL, 0, recursion_msg, printed_len);
+ printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
+ NULL, 0, recursion_msg,
+ strlen(recursion_msg));
}
/*
* The printf needs to come first; we need the syslog
* prefix which might be passed-in as a parameter.
*/
- text_len = vscnprintf(text, sizeof(textbuf), fmt, args);
+ if (in_sched)
+ text_len = scnprintf(text, sizeof(textbuf),
+ KERN_WARNING "[sched_delayed] ");
+
+ text_len += vscnprintf(text + text_len,
+ sizeof(textbuf) - text_len, fmt, args);
/* mark and strip a trailing newline */
if (text_len && text[text_len-1] == '\n') {
@@ -1560,9 +1705,12 @@ asmlinkage int vprintk_emit(int facility, int level,
level = kern_level - '0';
case 'd': /* KERN_DEFAULT */
lflags |= LOG_PREFIX;
- case 'c': /* KERN_CONT */
- break;
}
+ /*
+ * No need to check length here because vscnprintf
+ * put '\0' at the end of the string. Only valid and
+ * newly printed level is detected.
+ */
text_len -= end_of_header - text;
text = (char *)end_of_header;
}
@@ -1583,9 +1731,12 @@ asmlinkage int vprintk_emit(int facility, int level,
cont_flush(LOG_NEWLINE);
/* buffer line if possible, otherwise store it right away */
- if (!cont_add(facility, level, text, text_len))
- log_store(facility, level, lflags | LOG_CONT, 0,
- dict, dictlen, text, text_len);
+ if (cont_add(facility, level, text, text_len))
+ printed_len += text_len;
+ else
+ printed_len += log_store(facility, level,
+ lflags | LOG_CONT, 0,
+ dict, dictlen, text, text_len);
} else {
bool stored = false;
@@ -1604,27 +1755,39 @@ asmlinkage int vprintk_emit(int facility, int level,
cont_flush(LOG_NEWLINE);
}
- if (!stored)
- log_store(facility, level, lflags, 0,
- dict, dictlen, text, text_len);
+ if (stored)
+ printed_len += text_len;
+ else
+ printed_len += log_store(facility, level, lflags, 0,
+ dict, dictlen, text, text_len);
}
- printed_len += text_len;
-
- /*
- * Try to acquire and then immediately release the console semaphore.
- * The release will print out buffers and wake up /dev/kmsg and syslog()
- * users.
- *
- * The console_trylock_for_printk() function will release 'logbuf_lock'
- * regardless of whether it actually gets the console semaphore or not.
- */
- if (console_trylock_for_printk(this_cpu))
- console_unlock();
+ logbuf_cpu = UINT_MAX;
+ raw_spin_unlock(&logbuf_lock);
lockdep_on();
-out_restore_irqs:
local_irq_restore(flags);
+ /* If called from the scheduler, we can not call up(). */
+ if (!in_sched) {
+ lockdep_off();
+ /*
+ * Disable preemption to avoid being preempted while holding
+ * console_sem which would prevent anyone from printing to
+ * console
+ */
+ preempt_disable();
+
+ /*
+ * Try to acquire and then immediately release the console
+ * semaphore. The release will print out buffers and wake up
+ * /dev/kmsg and syslog() users.
+ */
+ if (console_trylock_for_printk())
+ console_unlock();
+ preempt_enable();
+ lockdep_on();
+ }
+
return printed_len;
}
EXPORT_SYMBOL(vprintk_emit);
@@ -1671,7 +1834,7 @@ EXPORT_SYMBOL(printk_emit);
*
* See the vsnprintf() documentation for format string extensions over C99.
*/
-asmlinkage int printk(const char *fmt, ...)
+asmlinkage __visible int printk(const char *fmt, ...)
{
va_list args;
int r;
@@ -1696,7 +1859,7 @@ EXPORT_SYMBOL(printk);
#define LOG_LINE_MAX 0
#define PREFIX_MAX 0
-#define LOG_LINE_MAX 0
+
static u64 syslog_seq;
static u32 syslog_idx;
static u64 console_seq;
@@ -1734,7 +1897,7 @@ void early_vprintk(const char *fmt, va_list ap)
}
}
-asmlinkage void early_printk(const char *fmt, ...)
+asmlinkage __visible void early_printk(const char *fmt, ...)
{
va_list ap;
@@ -1775,11 +1938,12 @@ static int __add_preferred_console(char *name, int idx, char *options,
return 0;
}
/*
- * Set up a list of consoles. Called from init/main.c
+ * Set up a console. Called via do_early_param() in init/main.c
+ * for each "console=" parameter in the boot command line.
*/
static int __init console_setup(char *str)
{
- char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */
+ char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */
char *s, *options, *brl_options = NULL;
int idx;
@@ -1796,7 +1960,8 @@ static int __init console_setup(char *str)
strncpy(buf, str, sizeof(buf) - 1);
}
buf[sizeof(buf) - 1] = 0;
- if ((options = strchr(str, ',')) != NULL)
+ options = strchr(str, ',');
+ if (options)
*(options++) = 0;
#ifdef __sparc__
if (!strcmp(str, "ttya"))
@@ -1805,7 +1970,7 @@ static int __init console_setup(char *str)
strcpy(buf, "ttyS1");
#endif
for (s = buf; *s; s++)
- if ((*s >= '0' && *s <= '9') || *s == ',')
+ if (isdigit(*s) || *s == ',')
break;
idx = simple_strtoul(s, NULL, 10);
*s = 0;
@@ -1844,7 +2009,6 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
i++, c++)
if (strcmp(c->name, name) == 0 && c->index == idx) {
strlcpy(c->name, name_new, sizeof(c->name));
- c->name[sizeof(c->name) - 1] = 0;
c->options = options;
c->index = idx_new;
return i;
@@ -1853,12 +2017,12 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
return -1;
}
-bool console_suspend_enabled = 1;
+bool console_suspend_enabled = true;
EXPORT_SYMBOL(console_suspend_enabled);
static int __init console_suspend_disable(char *str)
{
- console_suspend_enabled = 0;
+ console_suspend_enabled = false;
return 1;
}
__setup("no_console_suspend", console_suspend_disable);
@@ -1879,14 +2043,14 @@ void suspend_console(void)
printk("Suspending console(s) (use no_console_suspend to debug)\n");
console_lock();
console_suspended = 1;
- up(&console_sem);
+ up_console_sem();
}
void resume_console(void)
{
if (!console_suspend_enabled)
return;
- down(&console_sem);
+ down_console_sem();
console_suspended = 0;
console_unlock();
}
@@ -1928,34 +2092,32 @@ void console_lock(void)
{
might_sleep();
- down(&console_sem);
+ down_console_sem();
if (console_suspended)
return;
console_locked = 1;
console_may_schedule = 1;
- mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);
}
EXPORT_SYMBOL(console_lock);
/**
* console_trylock - try to lock the console system for exclusive use.
*
- * Tried to acquire a lock which guarantees that the caller has
- * exclusive access to the console system and the console_drivers list.
+ * Try to acquire a lock which guarantees that the caller has exclusive
+ * access to the console system and the console_drivers list.
*
* returns 1 on success, and 0 on failure to acquire the lock.
*/
int console_trylock(void)
{
- if (down_trylock(&console_sem))
+ if (down_trylock_console_sem())
return 0;
if (console_suspended) {
- up(&console_sem);
+ up_console_sem();
return 0;
}
console_locked = 1;
console_may_schedule = 0;
- mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_);
return 1;
}
EXPORT_SYMBOL(console_trylock);
@@ -2017,7 +2179,7 @@ void console_unlock(void)
bool retry;
if (console_suspended) {
- up(&console_sem);
+ up_console_sem();
return;
}
@@ -2038,10 +2200,15 @@ again:
}
if (console_seq < log_first_seq) {
+ len = sprintf(text, "** %u printk messages dropped ** ",
+ (unsigned)(log_first_seq - console_seq));
+
/* messages are gone, move to first one */
console_seq = log_first_seq;
console_idx = log_first_idx;
console_prev = 0;
+ } else {
+ len = 0;
}
skip:
if (console_seq == log_next_seq)
@@ -2066,8 +2233,8 @@ skip:
}
level = msg->level;
- len = msg_print_text(msg, console_prev, false,
- text, sizeof(text));
+ len += msg_print_text(msg, console_prev, false,
+ text + len, sizeof(text) - len);
console_idx = log_next(console_idx);
console_seq++;
console_prev = msg->flags;
@@ -2079,7 +2246,6 @@ skip:
local_irq_restore(flags);
}
console_locked = 0;
- mutex_release(&console_lock_dep_map, 1, _RET_IP_);
/* Release the exclusive_console once it is used */
if (unlikely(exclusive_console))
@@ -2087,7 +2253,7 @@ skip:
raw_spin_unlock(&logbuf_lock);
- up(&console_sem);
+ up_console_sem();
/*
* Someone could have filled up the buffer again, so re-check if there's
@@ -2132,7 +2298,7 @@ void console_unblank(void)
* oops_in_progress is set to 1..
*/
if (oops_in_progress) {
- if (down_trylock(&console_sem) != 0)
+ if (down_trylock_console_sem() != 0)
return;
} else
console_lock();
@@ -2408,6 +2574,7 @@ int unregister_console(struct console *console)
if (console_drivers != NULL && console->flags & CON_CONSDEV)
console_drivers->flags |= CON_CONSDEV;
+ console->flags &= ~CON_ENABLED;
console_unlock();
console_sysfs_notify();
return res;
@@ -2432,21 +2599,19 @@ late_initcall(printk_late_init);
/*
* Delayed printk version, for scheduler-internal messages:
*/
-#define PRINTK_BUF_SIZE 512
-
#define PRINTK_PENDING_WAKEUP 0x01
-#define PRINTK_PENDING_SCHED 0x02
+#define PRINTK_PENDING_OUTPUT 0x02
static DEFINE_PER_CPU(int, printk_pending);
-static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
static void wake_up_klogd_work_func(struct irq_work *irq_work)
{
int pending = __this_cpu_xchg(printk_pending, 0);
- if (pending & PRINTK_PENDING_SCHED) {
- char *buf = __get_cpu_var(printk_sched_buf);
- pr_warn("[sched_delayed] %s", buf);
+ if (pending & PRINTK_PENDING_OUTPUT) {
+ /* If trylock fails, someone else is doing the printing */
+ if (console_trylock())
+ console_unlock();
}
if (pending & PRINTK_PENDING_WAKEUP)
@@ -2468,23 +2633,19 @@ void wake_up_klogd(void)
preempt_enable();
}
-int printk_sched(const char *fmt, ...)
+int printk_deferred(const char *fmt, ...)
{
- unsigned long flags;
va_list args;
- char *buf;
int r;
- local_irq_save(flags);
- buf = __get_cpu_var(printk_sched_buf);
-
+ preempt_disable();
va_start(args, fmt);
- r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args);
+ r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args);
va_end(args);
- __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED);
+ __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
- local_irq_restore(flags);
+ preempt_enable();
return r;
}
@@ -2515,14 +2676,13 @@ EXPORT_SYMBOL(__printk_ratelimit);
bool printk_timed_ratelimit(unsigned long *caller_jiffies,
unsigned int interval_msecs)
{
- if (*caller_jiffies == 0
- || !time_in_range(jiffies, *caller_jiffies,
- *caller_jiffies
- + msecs_to_jiffies(interval_msecs))) {
- *caller_jiffies = jiffies;
- return true;
- }
- return false;
+ unsigned long elapsed = jiffies - *caller_jiffies;
+
+ if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs))
+ return false;
+
+ *caller_jiffies = jiffies;
+ return true;
}
EXPORT_SYMBOL(printk_timed_ratelimit);
diff --git a/kernel/profile.c b/kernel/profile.c
index ebdd9c1a86b4..54bf5ba26420 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -52,9 +52,9 @@ static DEFINE_MUTEX(profile_flip_mutex);
int profile_setup(char *str)
{
- static char schedstr[] = "schedule";
- static char sleepstr[] = "sleep";
- static char kvmstr[] = "kvm";
+ static const char schedstr[] = "schedule";
+ static const char sleepstr[] = "sleep";
+ static const char kvmstr[] = "kvm";
int par;
if (!strncmp(str, sleepstr, strlen(sleepstr))) {
@@ -64,12 +64,10 @@ int profile_setup(char *str)
str += strlen(sleepstr) + 1;
if (get_option(&str, &par))
prof_shift = par;
- printk(KERN_INFO
- "kernel sleep profiling enabled (shift: %ld)\n",
+ pr_info("kernel sleep profiling enabled (shift: %ld)\n",
prof_shift);
#else
- printk(KERN_WARNING
- "kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
+ pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
#endif /* CONFIG_SCHEDSTATS */
} else if (!strncmp(str, schedstr, strlen(schedstr))) {
prof_on = SCHED_PROFILING;
@@ -77,8 +75,7 @@ int profile_setup(char *str)
str += strlen(schedstr) + 1;
if (get_option(&str, &par))
prof_shift = par;
- printk(KERN_INFO
- "kernel schedule profiling enabled (shift: %ld)\n",
+ pr_info("kernel schedule profiling enabled (shift: %ld)\n",
prof_shift);
} else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
prof_on = KVM_PROFILING;
@@ -86,13 +83,12 @@ int profile_setup(char *str)
str += strlen(kvmstr) + 1;
if (get_option(&str, &par))
prof_shift = par;
- printk(KERN_INFO
- "kernel KVM profiling enabled (shift: %ld)\n",
+ pr_info("kernel KVM profiling enabled (shift: %ld)\n",
prof_shift);
} else if (get_option(&str, &par)) {
prof_shift = par;
prof_on = CPU_PROFILING;
- printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n",
+ pr_info("kernel profiling enabled (shift: %ld)\n",
prof_shift);
}
return 1;
@@ -591,18 +587,28 @@ out_cleanup:
int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
{
struct proc_dir_entry *entry;
+ int err = 0;
if (!prof_on)
return 0;
- if (create_hash_tables())
- return -ENOMEM;
+
+ cpu_notifier_register_begin();
+
+ if (create_hash_tables()) {
+ err = -ENOMEM;
+ goto out;
+ }
+
entry = proc_create("profile", S_IWUSR | S_IRUGO,
NULL, &proc_profile_operations);
if (!entry)
- return 0;
+ goto out;
proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
- hotcpu_notifier(profile_cpu_callback, 0);
- return 0;
+ __hotcpu_notifier(profile_cpu_callback, 0);
+
+out:
+ cpu_notifier_register_done();
+ return err;
}
-module_init(create_proc_profile);
+subsys_initcall(create_proc_profile);
#endif /* CONFIG_PROC_FS */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1f4bcb3cc21c..54e75226c2c4 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -28,12 +28,6 @@
#include <linux/compat.h>
-static int ptrace_trapping_sleep_fn(void *flags)
-{
- schedule();
- return 0;
-}
-
/*
* ptrace a task: make the debugger its new parent and
* move it to the ptrace list.
@@ -371,7 +365,7 @@ unlock_creds:
out:
if (!retval) {
wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT,
- ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE);
+ TASK_UNINTERRUPTIBLE);
proc_ptrace_connector(task, PTRACE_ATTACH);
}
@@ -1180,8 +1174,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
return ret;
}
-asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
- compat_long_t addr, compat_long_t data)
+COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid,
+ compat_long_t, addr, compat_long_t, data)
{
struct task_struct *child;
long ret;
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 01e9ec37a3e3..807ccfbf69b3 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,5 +1,5 @@
obj-y += update.o srcu.o
-obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o
+obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_TREE_RCU) += tree.o
obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o
obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 79c3877e9c5b..ff1a6de62f17 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -12,8 +12,8 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright IBM Corporation, 2011
*
@@ -23,6 +23,7 @@
#ifndef __LINUX_RCU_H
#define __LINUX_RCU_H
+#include <trace/events/rcu.h>
#ifdef CONFIG_RCU_TRACE
#define RCU_TRACE(stmt) stmt
#else /* #ifdef CONFIG_RCU_TRACE */
@@ -98,6 +99,10 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
void kfree(const void *);
+/*
+ * Reclaim the specified callback, either by invoking it (non-lazy case)
+ * or freeing it directly (lazy case). Return true if lazy, false otherwise.
+ */
static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
{
unsigned long offset = (unsigned long)head->func;
@@ -107,17 +112,15 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
kfree((void *)head - offset);
rcu_lock_release(&rcu_callback_map);
- return 1;
+ return true;
} else {
RCU_TRACE(trace_rcu_invoke_callback(rn, head));
head->func(head);
rcu_lock_release(&rcu_callback_map);
- return 0;
+ return false;
}
}
-extern int rcu_expedited;
-
#ifdef CONFIG_RCU_STALL_COMMON
extern int rcu_cpu_stall_suppress;
diff --git a/kernel/rcu/torture.c b/kernel/rcu/rcutorture.c
index 732f8ae3086a..948a7693748e 100644
--- a/kernel/rcu/torture.c
+++ b/kernel/rcu/rcutorture.c
@@ -12,13 +12,13 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright (C) IBM Corporation, 2005, 2006
*
* Authors: Paul E. McKenney <paulmck@us.ibm.com>
- * Josh Triplett <josh@freedesktop.org>
+ * Josh Triplett <josh@joshtriplett.org>
*
* See also: Documentation/RCU/torture.txt
*/
@@ -48,110 +48,60 @@
#include <linux/slab.h>
#include <linux/trace_clock.h>
#include <asm/byteorder.h>
+#include <linux/torture.h>
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
+
+
+torture_param(int, fqs_duration, 0,
+ "Duration of fqs bursts (us), 0 to disable");
+torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
+torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)");
+torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives");
+torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
+torture_param(bool, gp_normal, false,
+ "Use normal (non-expedited) GP wait primitives");
+torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
+torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
+torture_param(int, n_barrier_cbs, 0,
+ "# of callbacks/kthreads for barrier testing");
+torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads");
+torture_param(int, nreaders, -1, "Number of RCU reader threads");
+torture_param(int, object_debug, 0,
+ "Enable debug-object double call_rcu() testing");
+torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
+torture_param(int, onoff_interval, 0,
+ "Time between CPU hotplugs (s), 0=disable");
+torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles");
+torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
+torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
+torture_param(int, stall_cpu_holdoff, 10,
+ "Time to wait before starting stall (s).");
+torture_param(int, stat_interval, 60,
+ "Number of seconds between stats printk()s");
+torture_param(int, stutter, 5, "Number of seconds to run/halt test");
+torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
+torture_param(int, test_boost_duration, 4,
+ "Duration of each boost test, seconds.");
+torture_param(int, test_boost_interval, 7,
+ "Interval between boost tests, seconds.");
+torture_param(bool, test_no_idle_hz, true,
+ "Test support for tickless idle CPUs");
+torture_param(bool, verbose, true,
+ "Enable verbose debugging printk()s");
-MODULE_ALIAS("rcutorture");
-#ifdef MODULE_PARAM_PREFIX
-#undef MODULE_PARAM_PREFIX
-#endif
-#define MODULE_PARAM_PREFIX "rcutorture."
-
-static int fqs_duration;
-module_param(fqs_duration, int, 0444);
-MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable");
-static int fqs_holdoff;
-module_param(fqs_holdoff, int, 0444);
-MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
-static int fqs_stutter = 3;
-module_param(fqs_stutter, int, 0444);
-MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
-static bool gp_exp;
-module_param(gp_exp, bool, 0444);
-MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives");
-static bool gp_normal;
-module_param(gp_normal, bool, 0444);
-MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives");
-static int irqreader = 1;
-module_param(irqreader, int, 0444);
-MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
-static int n_barrier_cbs;
-module_param(n_barrier_cbs, int, 0444);
-MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing");
-static int nfakewriters = 4;
-module_param(nfakewriters, int, 0444);
-MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
-static int nreaders = -1;
-module_param(nreaders, int, 0444);
-MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
-static int object_debug;
-module_param(object_debug, int, 0444);
-MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing");
-static int onoff_holdoff;
-module_param(onoff_holdoff, int, 0444);
-MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)");
-static int onoff_interval;
-module_param(onoff_interval, int, 0444);
-MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
-static int shuffle_interval = 3;
-module_param(shuffle_interval, int, 0444);
-MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
-static int shutdown_secs;
-module_param(shutdown_secs, int, 0444);
-MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable.");
-static int stall_cpu;
-module_param(stall_cpu, int, 0444);
-MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable.");
-static int stall_cpu_holdoff = 10;
-module_param(stall_cpu_holdoff, int, 0444);
-MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s).");
-static int stat_interval = 60;
-module_param(stat_interval, int, 0644);
-MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
-static int stutter = 5;
-module_param(stutter, int, 0444);
-MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
-static int test_boost = 1;
-module_param(test_boost, int, 0444);
-MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
-static int test_boost_duration = 4;
-module_param(test_boost_duration, int, 0444);
-MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
-static int test_boost_interval = 7;
-module_param(test_boost_interval, int, 0444);
-MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
-static bool test_no_idle_hz = true;
-module_param(test_no_idle_hz, bool, 0444);
-MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
static char *torture_type = "rcu";
module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)");
-static bool verbose;
-module_param(verbose, bool, 0444);
-MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
-
-#define TORTURE_FLAG "-torture:"
-#define PRINTK_STRING(s) \
- do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0)
-#define VERBOSE_PRINTK_STRING(s) \
- do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0)
-#define VERBOSE_PRINTK_ERRSTRING(s) \
- do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
static int nrealreaders;
static struct task_struct *writer_task;
static struct task_struct **fakewriter_tasks;
static struct task_struct **reader_tasks;
static struct task_struct *stats_task;
-static struct task_struct *shuffler_task;
-static struct task_struct *stutter_task;
static struct task_struct *fqs_task;
static struct task_struct *boost_tasks[NR_CPUS];
-static struct task_struct *shutdown_task;
-#ifdef CONFIG_HOTPLUG_CPU
-static struct task_struct *onoff_task;
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
static struct task_struct *stall_task;
static struct task_struct **barrier_cbs_tasks;
static struct task_struct *barrier_task;
@@ -170,10 +120,10 @@ static struct rcu_torture __rcu *rcu_torture_current;
static unsigned long rcu_torture_current_version;
static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
static DEFINE_SPINLOCK(rcu_torture_lock);
-static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
- { 0 };
-static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) =
- { 0 };
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
+ rcu_torture_count) = { 0 };
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
+ rcu_torture_batch) = { 0 };
static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
static atomic_t n_rcu_torture_alloc;
static atomic_t n_rcu_torture_alloc_fail;
@@ -186,22 +136,21 @@ static long n_rcu_torture_boost_rterror;
static long n_rcu_torture_boost_failure;
static long n_rcu_torture_boosts;
static long n_rcu_torture_timers;
-static long n_offline_attempts;
-static long n_offline_successes;
-static unsigned long sum_offline;
-static int min_offline = -1;
-static int max_offline;
-static long n_online_attempts;
-static long n_online_successes;
-static unsigned long sum_online;
-static int min_online = -1;
-static int max_online;
static long n_barrier_attempts;
static long n_barrier_successes;
static struct list_head rcu_torture_removed;
-static cpumask_var_t shuffle_tmp_mask;
-static int stutter_pause_test;
+static int rcu_torture_writer_state;
+#define RTWS_FIXED_DELAY 0
+#define RTWS_DELAY 1
+#define RTWS_REPLACE 2
+#define RTWS_DEF_FREE 3
+#define RTWS_EXP_SYNC 4
+#define RTWS_COND_GET 5
+#define RTWS_COND_SYNC 6
+#define RTWS_SYNC 7
+#define RTWS_STUTTER 8
+#define RTWS_STOPPING 9
#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
#define RCUTORTURE_RUNNABLE_INIT 1
@@ -232,7 +181,6 @@ static u64 notrace rcu_trace_clock_local(void)
}
#endif /* #else #ifdef CONFIG_RCU_TRACE */
-static unsigned long shutdown_time; /* jiffies to system shutdown. */
static unsigned long boost_starttime; /* jiffies of next boost test start. */
DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
/* and boost task create/destroy. */
@@ -242,51 +190,6 @@ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
-/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
-
-#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
-#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */
-#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */
-static int fullstop = FULLSTOP_RMMOD;
-/*
- * Protect fullstop transitions and spawning of kthreads.
- */
-static DEFINE_MUTEX(fullstop_mutex);
-
-/* Forward reference. */
-static void rcu_torture_cleanup(void);
-
-/*
- * Detect and respond to a system shutdown.
- */
-static int
-rcutorture_shutdown_notify(struct notifier_block *unused1,
- unsigned long unused2, void *unused3)
-{
- mutex_lock(&fullstop_mutex);
- if (fullstop == FULLSTOP_DONTSTOP)
- fullstop = FULLSTOP_SHUTDOWN;
- else
- pr_warn(/* but going down anyway, so... */
- "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
- mutex_unlock(&fullstop_mutex);
- return NOTIFY_DONE;
-}
-
-/*
- * Absorb kthreads into a kernel function that won't return, so that
- * they won't ever access module text or data again.
- */
-static void rcutorture_shutdown_absorb(const char *title)
-{
- if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
- pr_notice(
- "rcutorture thread %s parking due to system shutdown\n",
- title);
- schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
- }
-}
-
/*
* Allocate an element from the rcu_tortures pool.
*/
@@ -320,57 +223,22 @@ rcu_torture_free(struct rcu_torture *p)
spin_unlock_bh(&rcu_torture_lock);
}
-struct rcu_random_state {
- unsigned long rrs_state;
- long rrs_count;
-};
-
-#define RCU_RANDOM_MULT 39916801 /* prime */
-#define RCU_RANDOM_ADD 479001701 /* prime */
-#define RCU_RANDOM_REFRESH 10000
-
-#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 }
-
-/*
- * Crude but fast random-number generator. Uses a linear congruential
- * generator, with occasional help from cpu_clock().
- */
-static unsigned long
-rcu_random(struct rcu_random_state *rrsp)
-{
- if (--rrsp->rrs_count < 0) {
- rrsp->rrs_state += (unsigned long)local_clock();
- rrsp->rrs_count = RCU_RANDOM_REFRESH;
- }
- rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
- return swahw32(rrsp->rrs_state);
-}
-
-static void
-rcu_stutter_wait(const char *title)
-{
- while (stutter_pause_test || !rcutorture_runnable) {
- if (rcutorture_runnable)
- schedule_timeout_interruptible(1);
- else
- schedule_timeout_interruptible(round_jiffies_relative(HZ));
- rcutorture_shutdown_absorb(title);
- }
-}
-
/*
* Operations vector for selecting different types of tests.
*/
struct rcu_torture_ops {
+ int ttype;
void (*init)(void);
int (*readlock)(void);
- void (*read_delay)(struct rcu_random_state *rrsp);
+ void (*read_delay)(struct torture_random_state *rrsp);
void (*readunlock)(int idx);
int (*completed)(void);
void (*deferred_free)(struct rcu_torture *p);
void (*sync)(void);
void (*exp_sync)(void);
+ unsigned long (*get_state)(void);
+ void (*cond_sync)(unsigned long oldstate);
void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
void (*cb_barrier)(void);
void (*fqs)(void);
@@ -392,7 +260,7 @@ static int rcu_torture_read_lock(void) __acquires(RCU)
return 0;
}
-static void rcu_read_delay(struct rcu_random_state *rrsp)
+static void rcu_read_delay(struct torture_random_state *rrsp)
{
const unsigned long shortdelay_us = 200;
const unsigned long longdelay_ms = 50;
@@ -401,12 +269,13 @@ static void rcu_read_delay(struct rcu_random_state *rrsp)
* period, and we want a long delay occasionally to trigger
* force_quiescent_state. */
- if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms)))
+ if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms)))
mdelay(longdelay_ms);
- if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
+ if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
udelay(shortdelay_us);
#ifdef CONFIG_PREEMPT
- if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000)))
+ if (!preempt_count() &&
+ !(torture_random(rrsp) % (nrealreaders * 20000)))
preempt_schedule(); /* No QS if preempt_disable() in effect */
#endif
}
@@ -421,27 +290,59 @@ static int rcu_torture_completed(void)
return rcu_batches_completed();
}
-static void
-rcu_torture_cb(struct rcu_head *p)
+/*
+ * Update callback in the pipe. This should be invoked after a grace period.
+ */
+static bool
+rcu_torture_pipe_update_one(struct rcu_torture *rp)
{
int i;
- struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
- if (fullstop != FULLSTOP_DONTSTOP) {
- /* Test is ending, just drop callbacks on the floor. */
- /* The next initialization will pick up the pieces. */
- return;
- }
i = rp->rtort_pipe_count;
if (i > RCU_TORTURE_PIPE_LEN)
i = RCU_TORTURE_PIPE_LEN;
atomic_inc(&rcu_torture_wcount[i]);
if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
rp->rtort_mbtest = 0;
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Update all callbacks in the pipe. Suitable for synchronous grace-period
+ * primitives.
+ */
+static void
+rcu_torture_pipe_update(struct rcu_torture *old_rp)
+{
+ struct rcu_torture *rp;
+ struct rcu_torture *rp1;
+
+ if (old_rp)
+ list_add(&old_rp->rtort_free, &rcu_torture_removed);
+ list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) {
+ if (rcu_torture_pipe_update_one(rp)) {
+ list_del(&rp->rtort_free);
+ rcu_torture_free(rp);
+ }
+ }
+}
+
+static void
+rcu_torture_cb(struct rcu_head *p)
+{
+ struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
+
+ if (torture_must_stop_irq()) {
+ /* Test is ending, just drop callbacks on the floor. */
+ /* The next initialization will pick up the pieces. */
+ return;
+ }
+ if (rcu_torture_pipe_update_one(rp))
rcu_torture_free(rp);
- } else {
+ else
cur_ops->deferred_free(rp);
- }
}
static int rcu_no_completed(void)
@@ -460,6 +361,7 @@ static void rcu_sync_torture_init(void)
}
static struct rcu_torture_ops rcu_ops = {
+ .ttype = RCU_FLAVOR,
.init = rcu_sync_torture_init,
.readlock = rcu_torture_read_lock,
.read_delay = rcu_read_delay,
@@ -468,6 +370,8 @@ static struct rcu_torture_ops rcu_ops = {
.deferred_free = rcu_torture_deferred_free,
.sync = synchronize_rcu,
.exp_sync = synchronize_rcu_expedited,
+ .get_state = get_state_synchronize_rcu,
+ .cond_sync = cond_synchronize_rcu,
.call = call_rcu,
.cb_barrier = rcu_barrier,
.fqs = rcu_force_quiescent_state,
@@ -503,6 +407,7 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
}
static struct rcu_torture_ops rcu_bh_ops = {
+ .ttype = RCU_BH_FLAVOR,
.init = rcu_sync_torture_init,
.readlock = rcu_bh_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
@@ -520,6 +425,49 @@ static struct rcu_torture_ops rcu_bh_ops = {
};
/*
+ * Don't even think about trying any of these in real life!!!
+ * The names includes "busted", and they really means it!
+ * The only purpose of these functions is to provide a buggy RCU
+ * implementation to make sure that rcutorture correctly emits
+ * buggy-RCU error messages.
+ */
+static void rcu_busted_torture_deferred_free(struct rcu_torture *p)
+{
+ /* This is a deliberate bug for testing purposes only! */
+ rcu_torture_cb(&p->rtort_rcu);
+}
+
+static void synchronize_rcu_busted(void)
+{
+ /* This is a deliberate bug for testing purposes only! */
+}
+
+static void
+call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+ /* This is a deliberate bug for testing purposes only! */
+ func(head);
+}
+
+static struct rcu_torture_ops rcu_busted_ops = {
+ .ttype = INVALID_RCU_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = rcu_torture_read_lock,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = rcu_torture_read_unlock,
+ .completed = rcu_no_completed,
+ .deferred_free = rcu_busted_torture_deferred_free,
+ .sync = synchronize_rcu_busted,
+ .exp_sync = synchronize_rcu_busted,
+ .call = call_rcu_busted,
+ .cb_barrier = NULL,
+ .fqs = NULL,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "rcu_busted"
+};
+
+/*
* Definitions for srcu torture testing.
*/
@@ -530,7 +478,7 @@ static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
return srcu_read_lock(&srcu_ctl);
}
-static void srcu_read_delay(struct rcu_random_state *rrsp)
+static void srcu_read_delay(struct torture_random_state *rrsp)
{
long delay;
const long uspertick = 1000000 / HZ;
@@ -538,7 +486,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
/* We want there to be long-running readers, but not all the time. */
- delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick);
+ delay = torture_random(rrsp) %
+ (nrealreaders * 2 * longdelay * uspertick);
if (!delay)
schedule_timeout_interruptible(longdelay);
else
@@ -584,9 +533,11 @@ static void srcu_torture_stats(char *page)
page += sprintf(page, "%s%s per-CPU(idx=%d):",
torture_type, TORTURE_FLAG, idx);
for_each_possible_cpu(cpu) {
- page += sprintf(page, " %d(%lu,%lu)", cpu,
- per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx],
- per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]);
+ long c0, c1;
+
+ c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx];
+ c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx];
+ page += sprintf(page, " %d(%ld,%ld)", cpu, c0, c1);
}
sprintf(page, "\n");
}
@@ -597,6 +548,7 @@ static void srcu_torture_synchronize_expedited(void)
}
static struct rcu_torture_ops srcu_ops = {
+ .ttype = SRCU_FLAVOR,
.init = rcu_sync_torture_init,
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
@@ -632,6 +584,7 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
}
static struct rcu_torture_ops sched_ops = {
+ .ttype = RCU_SCHED_FLAVOR,
.init = rcu_sync_torture_init,
.readlock = sched_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
@@ -677,12 +630,12 @@ static int rcu_torture_boost(void *arg)
struct rcu_boost_inflight rbi = { .inflight = 0 };
struct sched_param sp;
- VERBOSE_PRINTK_STRING("rcu_torture_boost started");
+ VERBOSE_TOROUT_STRING("rcu_torture_boost started");
/* Set real-time priority. */
sp.sched_priority = 1;
if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
- VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!");
+ VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!");
n_rcu_torture_boost_rterror++;
}
@@ -693,9 +646,8 @@ static int rcu_torture_boost(void *arg)
oldstarttime = boost_starttime;
while (ULONG_CMP_LT(jiffies, oldstarttime)) {
schedule_timeout_interruptible(oldstarttime - jiffies);
- rcu_stutter_wait("rcu_torture_boost");
- if (kthread_should_stop() ||
- fullstop != FULLSTOP_DONTSTOP)
+ stutter_wait("rcu_torture_boost");
+ if (torture_must_stop())
goto checkwait;
}
@@ -710,15 +662,14 @@ static int rcu_torture_boost(void *arg)
call_rcu(&rbi.rcu, rcu_torture_boost_cb);
if (jiffies - call_rcu_time >
test_boost_duration * HZ - HZ / 2) {
- VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed");
+ VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed");
n_rcu_torture_boost_failure++;
}
call_rcu_time = jiffies;
}
cond_resched();
- rcu_stutter_wait("rcu_torture_boost");
- if (kthread_should_stop() ||
- fullstop != FULLSTOP_DONTSTOP)
+ stutter_wait("rcu_torture_boost");
+ if (torture_must_stop())
goto checkwait;
}
@@ -742,16 +693,17 @@ static int rcu_torture_boost(void *arg)
}
/* Go do the stutter. */
-checkwait: rcu_stutter_wait("rcu_torture_boost");
- } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
+checkwait: stutter_wait("rcu_torture_boost");
+ } while (!torture_must_stop());
/* Clean up and exit. */
- VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
- rcutorture_shutdown_absorb("rcu_torture_boost");
- while (!kthread_should_stop() || rbi.inflight)
+ while (!kthread_should_stop() || rbi.inflight) {
+ torture_shutdown_absorb("rcu_torture_boost");
schedule_timeout_uninterruptible(1);
+ }
smp_mb(); /* order accesses to ->inflight before stack-frame death. */
destroy_rcu_head_on_stack(&rbi.rcu);
+ torture_kthread_stopping("rcu_torture_boost");
return 0;
}
@@ -766,7 +718,7 @@ rcu_torture_fqs(void *arg)
unsigned long fqs_resume_time;
int fqs_burst_remaining;
- VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
+ VERBOSE_TOROUT_STRING("rcu_torture_fqs task started");
do {
fqs_resume_time = jiffies + fqs_stutter * HZ;
while (ULONG_CMP_LT(jiffies, fqs_resume_time) &&
@@ -780,12 +732,9 @@ rcu_torture_fqs(void *arg)
udelay(fqs_holdoff);
fqs_burst_remaining -= fqs_holdoff;
}
- rcu_stutter_wait("rcu_torture_fqs");
- } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
- VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping");
- rcutorture_shutdown_absorb("rcu_torture_fqs");
- while (!kthread_should_stop())
- schedule_timeout_uninterruptible(1);
+ stutter_wait("rcu_torture_fqs");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_torture_fqs");
return 0;
}
@@ -797,23 +746,59 @@ rcu_torture_fqs(void *arg)
static int
rcu_torture_writer(void *arg)
{
- bool exp;
+ unsigned long gp_snap;
+ bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
+ bool gp_sync1 = gp_sync;
int i;
struct rcu_torture *rp;
- struct rcu_torture *rp1;
struct rcu_torture *old_rp;
- static DEFINE_RCU_RANDOM(rand);
-
- VERBOSE_PRINTK_STRING("rcu_torture_writer task started");
- set_user_nice(current, 19);
+ static DEFINE_TORTURE_RANDOM(rand);
+ int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC,
+ RTWS_COND_GET, RTWS_SYNC };
+ int nsynctypes = 0;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
+
+ /* Initialize synctype[] array. If none set, take default. */
+ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync)
+ gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true;
+ if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync)
+ synctype[nsynctypes++] = RTWS_COND_GET;
+ else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync))
+ pr_alert("rcu_torture_writer: gp_cond without primitives.\n");
+ if (gp_exp1 && cur_ops->exp_sync)
+ synctype[nsynctypes++] = RTWS_EXP_SYNC;
+ else if (gp_exp && !cur_ops->exp_sync)
+ pr_alert("rcu_torture_writer: gp_exp without primitives.\n");
+ if (gp_normal1 && cur_ops->deferred_free)
+ synctype[nsynctypes++] = RTWS_DEF_FREE;
+ else if (gp_normal && !cur_ops->deferred_free)
+ pr_alert("rcu_torture_writer: gp_normal without primitives.\n");
+ if (gp_sync1 && cur_ops->sync)
+ synctype[nsynctypes++] = RTWS_SYNC;
+ else if (gp_sync && !cur_ops->sync)
+ pr_alert("rcu_torture_writer: gp_sync without primitives.\n");
+ if (WARN_ONCE(nsynctypes == 0,
+ "rcu_torture_writer: No update-side primitives.\n")) {
+ /*
+ * No updates primitives, so don't try updating.
+ * The resulting test won't be testing much, hence the
+ * above WARN_ONCE().
+ */
+ rcu_torture_writer_state = RTWS_STOPPING;
+ torture_kthread_stopping("rcu_torture_writer");
+ }
do {
+ rcu_torture_writer_state = RTWS_FIXED_DELAY;
schedule_timeout_uninterruptible(1);
rp = rcu_torture_alloc();
if (rp == NULL)
continue;
rp->rtort_pipe_count = 0;
- udelay(rcu_random(&rand) & 0x3ff);
+ rcu_torture_writer_state = RTWS_DELAY;
+ udelay(torture_random(&rand) & 0x3ff);
+ rcu_torture_writer_state = RTWS_REPLACE;
old_rp = rcu_dereference_check(rcu_torture_current,
current == writer_task);
rp->rtort_mbtest = 1;
@@ -825,39 +810,43 @@ rcu_torture_writer(void *arg)
i = RCU_TORTURE_PIPE_LEN;
atomic_inc(&rcu_torture_wcount[i]);
old_rp->rtort_pipe_count++;
- if (gp_normal == gp_exp)
- exp = !!(rcu_random(&rand) & 0x80);
- else
- exp = gp_exp;
- if (!exp) {
+ switch (synctype[torture_random(&rand) % nsynctypes]) {
+ case RTWS_DEF_FREE:
+ rcu_torture_writer_state = RTWS_DEF_FREE;
cur_ops->deferred_free(old_rp);
- } else {
+ break;
+ case RTWS_EXP_SYNC:
+ rcu_torture_writer_state = RTWS_EXP_SYNC;
cur_ops->exp_sync();
- list_add(&old_rp->rtort_free,
- &rcu_torture_removed);
- list_for_each_entry_safe(rp, rp1,
- &rcu_torture_removed,
- rtort_free) {
- i = rp->rtort_pipe_count;
- if (i > RCU_TORTURE_PIPE_LEN)
- i = RCU_TORTURE_PIPE_LEN;
- atomic_inc(&rcu_torture_wcount[i]);
- if (++rp->rtort_pipe_count >=
- RCU_TORTURE_PIPE_LEN) {
- rp->rtort_mbtest = 0;
- list_del(&rp->rtort_free);
- rcu_torture_free(rp);
- }
- }
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_COND_GET:
+ rcu_torture_writer_state = RTWS_COND_GET;
+ gp_snap = cur_ops->get_state();
+ i = torture_random(&rand) % 16;
+ if (i != 0)
+ schedule_timeout_interruptible(i);
+ udelay(torture_random(&rand) % 1000);
+ rcu_torture_writer_state = RTWS_COND_SYNC;
+ cur_ops->cond_sync(gp_snap);
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_SYNC:
+ rcu_torture_writer_state = RTWS_SYNC;
+ cur_ops->sync();
+ rcu_torture_pipe_update(old_rp);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
}
}
rcutorture_record_progress(++rcu_torture_current_version);
- rcu_stutter_wait("rcu_torture_writer");
- } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
- VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
- rcutorture_shutdown_absorb("rcu_torture_writer");
- while (!kthread_should_stop())
- schedule_timeout_uninterruptible(1);
+ rcu_torture_writer_state = RTWS_STUTTER;
+ stutter_wait("rcu_torture_writer");
+ } while (!torture_must_stop());
+ rcu_torture_writer_state = RTWS_STOPPING;
+ torture_kthread_stopping("rcu_torture_writer");
return 0;
}
@@ -868,19 +857,19 @@ rcu_torture_writer(void *arg)
static int
rcu_torture_fakewriter(void *arg)
{
- DEFINE_RCU_RANDOM(rand);
+ DEFINE_TORTURE_RANDOM(rand);
- VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started");
- set_user_nice(current, 19);
+ VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started");
+ set_user_nice(current, MAX_NICE);
do {
- schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
- udelay(rcu_random(&rand) & 0x3ff);
+ schedule_timeout_uninterruptible(1 + torture_random(&rand)%10);
+ udelay(torture_random(&rand) & 0x3ff);
if (cur_ops->cb_barrier != NULL &&
- rcu_random(&rand) % (nfakewriters * 8) == 0) {
+ torture_random(&rand) % (nfakewriters * 8) == 0) {
cur_ops->cb_barrier();
} else if (gp_normal == gp_exp) {
- if (rcu_random(&rand) & 0x80)
+ if (torture_random(&rand) & 0x80)
cur_ops->sync();
else
cur_ops->exp_sync();
@@ -889,17 +878,14 @@ rcu_torture_fakewriter(void *arg)
} else {
cur_ops->exp_sync();
}
- rcu_stutter_wait("rcu_torture_fakewriter");
- } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
+ stutter_wait("rcu_torture_fakewriter");
+ } while (!torture_must_stop());
- VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
- rcutorture_shutdown_absorb("rcu_torture_fakewriter");
- while (!kthread_should_stop())
- schedule_timeout_uninterruptible(1);
+ torture_kthread_stopping("rcu_torture_fakewriter");
return 0;
}
-void rcutorture_trace_dump(void)
+static void rcutorture_trace_dump(void)
{
static atomic_t beenhere = ATOMIC_INIT(0);
@@ -921,7 +907,7 @@ static void rcu_torture_timer(unsigned long unused)
int idx;
int completed;
int completed_end;
- static DEFINE_RCU_RANDOM(rand);
+ static DEFINE_TORTURE_RANDOM(rand);
static DEFINE_SPINLOCK(rand_lock);
struct rcu_torture *p;
int pipe_count;
@@ -980,14 +966,14 @@ rcu_torture_reader(void *arg)
int completed;
int completed_end;
int idx;
- DEFINE_RCU_RANDOM(rand);
+ DEFINE_TORTURE_RANDOM(rand);
struct rcu_torture *p;
int pipe_count;
struct timer_list t;
unsigned long long ts;
- VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
- set_user_nice(current, 19);
+ VERBOSE_TOROUT_STRING("rcu_torture_reader task started");
+ set_user_nice(current, MAX_NICE);
if (irqreader && cur_ops->irq_capable)
setup_timer_on_stack(&t, rcu_torture_timer, 0);
@@ -1033,15 +1019,14 @@ rcu_torture_reader(void *arg)
__this_cpu_inc(rcu_torture_batch[completed]);
preempt_enable();
cur_ops->readunlock(idx);
- schedule();
- rcu_stutter_wait("rcu_torture_reader");
- } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
- VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
- rcutorture_shutdown_absorb("rcu_torture_reader");
- if (irqreader && cur_ops->irq_capable)
+ cond_resched();
+ stutter_wait("rcu_torture_reader");
+ } while (!torture_must_stop());
+ if (irqreader && cur_ops->irq_capable) {
del_timer_sync(&t);
- while (!kthread_should_stop())
- schedule_timeout_uninterruptible(1);
+ destroy_timer_on_stack(&t);
+ }
+ torture_kthread_stopping("rcu_torture_reader");
return 0;
}
@@ -1055,6 +1040,7 @@ rcu_torture_printk(char *page)
int i;
long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
+ static unsigned long rtcv_snap = ULONG_MAX;
for_each_possible_cpu(cpu) {
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
@@ -1083,13 +1069,7 @@ rcu_torture_printk(char *page)
n_rcu_torture_boost_failure,
n_rcu_torture_boosts,
n_rcu_torture_timers);
- page += sprintf(page,
- "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
- n_online_successes, n_online_attempts,
- n_offline_successes, n_offline_attempts,
- min_online, max_online,
- min_offline, max_offline,
- sum_online, sum_offline, HZ);
+ page = torture_onoff_stats(page);
page += sprintf(page, "barrier: %ld/%ld:%ld",
n_barrier_successes,
n_barrier_attempts,
@@ -1121,6 +1101,22 @@ rcu_torture_printk(char *page)
page += sprintf(page, "\n");
if (cur_ops->stats)
cur_ops->stats(page);
+ if (rtcv_snap == rcu_torture_current_version &&
+ rcu_torture_current != NULL) {
+ int __maybe_unused flags;
+ unsigned long __maybe_unused gpnum;
+ unsigned long __maybe_unused completed;
+
+ rcutorture_get_gp_data(cur_ops->ttype,
+ &flags, &gpnum, &completed);
+ page += sprintf(page,
+ "??? Writer stall state %d g%lu c%lu f%#x\n",
+ rcu_torture_writer_state,
+ gpnum, completed, flags);
+ show_rcu_gp_kthreads();
+ rcutorture_trace_dump();
+ }
+ rtcv_snap = rcu_torture_current_version;
}
/*
@@ -1150,123 +1146,17 @@ rcu_torture_stats_print(void)
/*
* Periodically prints torture statistics, if periodic statistics printing
* was specified via the stat_interval module parameter.
- *
- * No need to worry about fullstop here, since this one doesn't reference
- * volatile state or register callbacks.
*/
static int
rcu_torture_stats(void *arg)
{
- VERBOSE_PRINTK_STRING("rcu_torture_stats task started");
+ VERBOSE_TOROUT_STRING("rcu_torture_stats task started");
do {
schedule_timeout_interruptible(stat_interval * HZ);
rcu_torture_stats_print();
- rcutorture_shutdown_absorb("rcu_torture_stats");
- } while (!kthread_should_stop());
- VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping");
- return 0;
-}
-
-static int rcu_idle_cpu; /* Force all torture tasks off this CPU */
-
-/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case
- * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs.
- */
-static void rcu_torture_shuffle_tasks(void)
-{
- int i;
-
- cpumask_setall(shuffle_tmp_mask);
- get_online_cpus();
-
- /* No point in shuffling if there is only one online CPU (ex: UP) */
- if (num_online_cpus() == 1) {
- put_online_cpus();
- return;
- }
-
- if (rcu_idle_cpu != -1)
- cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask);
-
- set_cpus_allowed_ptr(current, shuffle_tmp_mask);
-
- if (reader_tasks) {
- for (i = 0; i < nrealreaders; i++)
- if (reader_tasks[i])
- set_cpus_allowed_ptr(reader_tasks[i],
- shuffle_tmp_mask);
- }
- if (fakewriter_tasks) {
- for (i = 0; i < nfakewriters; i++)
- if (fakewriter_tasks[i])
- set_cpus_allowed_ptr(fakewriter_tasks[i],
- shuffle_tmp_mask);
- }
- if (writer_task)
- set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask);
- if (stats_task)
- set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask);
- if (stutter_task)
- set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask);
- if (fqs_task)
- set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask);
- if (shutdown_task)
- set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask);
-#ifdef CONFIG_HOTPLUG_CPU
- if (onoff_task)
- set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask);
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
- if (stall_task)
- set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask);
- if (barrier_cbs_tasks)
- for (i = 0; i < n_barrier_cbs; i++)
- if (barrier_cbs_tasks[i])
- set_cpus_allowed_ptr(barrier_cbs_tasks[i],
- shuffle_tmp_mask);
- if (barrier_task)
- set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask);
-
- if (rcu_idle_cpu == -1)
- rcu_idle_cpu = num_online_cpus() - 1;
- else
- rcu_idle_cpu--;
-
- put_online_cpus();
-}
-
-/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
- * system to become idle at a time and cut off its timer ticks. This is meant
- * to test the support for such tickless idle CPU in RCU.
- */
-static int
-rcu_torture_shuffle(void *arg)
-{
- VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started");
- do {
- schedule_timeout_interruptible(shuffle_interval * HZ);
- rcu_torture_shuffle_tasks();
- rcutorture_shutdown_absorb("rcu_torture_shuffle");
- } while (!kthread_should_stop());
- VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping");
- return 0;
-}
-
-/* Cause the rcutorture test to "stutter", starting and stopping all
- * threads periodically.
- */
-static int
-rcu_torture_stutter(void *arg)
-{
- VERBOSE_PRINTK_STRING("rcu_torture_stutter task started");
- do {
- schedule_timeout_interruptible(stutter * HZ);
- stutter_pause_test = 1;
- if (!kthread_should_stop())
- schedule_timeout_interruptible(stutter * HZ);
- stutter_pause_test = 0;
- rcutorture_shutdown_absorb("rcu_torture_stutter");
- } while (!kthread_should_stop());
- VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping");
+ torture_shutdown_absorb("rcu_torture_stats");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_torture_stats");
return 0;
}
@@ -1293,10 +1183,6 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
onoff_interval, onoff_holdoff);
}
-static struct notifier_block rcutorture_shutdown_nb = {
- .notifier_call = rcutorture_shutdown_notify,
-};
-
static void rcutorture_booster_cleanup(int cpu)
{
struct task_struct *t;
@@ -1304,14 +1190,12 @@ static void rcutorture_booster_cleanup(int cpu)
if (boost_tasks[cpu] == NULL)
return;
mutex_lock(&boost_mutex);
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
t = boost_tasks[cpu];
boost_tasks[cpu] = NULL;
mutex_unlock(&boost_mutex);
/* This must be outside of the mutex, otherwise deadlock! */
- kthread_stop(t);
- boost_tasks[cpu] = NULL;
+ torture_stop_kthread(rcu_torture_boost, t);
}
static int rcutorture_booster_init(int cpu)
@@ -1323,13 +1207,13 @@ static int rcutorture_booster_init(int cpu)
/* Don't allow time recalculation while creating a new task. */
mutex_lock(&boost_mutex);
- VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
+ VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task");
boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL,
cpu_to_node(cpu),
"rcu_torture_boost");
if (IS_ERR(boost_tasks[cpu])) {
retval = PTR_ERR(boost_tasks[cpu]);
- VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
+ VERBOSE_TOROUT_STRING("rcu_torture_boost task create failed");
n_rcu_torture_boost_ktrerror++;
boost_tasks[cpu] = NULL;
mutex_unlock(&boost_mutex);
@@ -1342,175 +1226,6 @@ static int rcutorture_booster_init(int cpu)
}
/*
- * Cause the rcutorture test to shutdown the system after the test has
- * run for the time specified by the shutdown_secs module parameter.
- */
-static int
-rcu_torture_shutdown(void *arg)
-{
- long delta;
- unsigned long jiffies_snap;
-
- VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started");
- jiffies_snap = ACCESS_ONCE(jiffies);
- while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
- !kthread_should_stop()) {
- delta = shutdown_time - jiffies_snap;
- if (verbose)
- pr_alert("%s" TORTURE_FLAG
- "rcu_torture_shutdown task: %lu jiffies remaining\n",
- torture_type, delta);
- schedule_timeout_interruptible(delta);
- jiffies_snap = ACCESS_ONCE(jiffies);
- }
- if (kthread_should_stop()) {
- VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping");
- return 0;
- }
-
- /* OK, shut down the system. */
-
- VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system");
- shutdown_task = NULL; /* Avoid self-kill deadlock. */
- rcu_torture_cleanup(); /* Get the success/failure message. */
- kernel_power_off(); /* Shut down the system. */
- return 0;
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Execute random CPU-hotplug operations at the interval specified
- * by the onoff_interval.
- */
-static int
-rcu_torture_onoff(void *arg)
-{
- int cpu;
- unsigned long delta;
- int maxcpu = -1;
- DEFINE_RCU_RANDOM(rand);
- int ret;
- unsigned long starttime;
-
- VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
- for_each_online_cpu(cpu)
- maxcpu = cpu;
- WARN_ON(maxcpu < 0);
- if (onoff_holdoff > 0) {
- VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff");
- schedule_timeout_interruptible(onoff_holdoff * HZ);
- VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff");
- }
- while (!kthread_should_stop()) {
- cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);
- if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
- if (verbose)
- pr_alert("%s" TORTURE_FLAG
- "rcu_torture_onoff task: offlining %d\n",
- torture_type, cpu);
- starttime = jiffies;
- n_offline_attempts++;
- ret = cpu_down(cpu);
- if (ret) {
- if (verbose)
- pr_alert("%s" TORTURE_FLAG
- "rcu_torture_onoff task: offline %d failed: errno %d\n",
- torture_type, cpu, ret);
- } else {
- if (verbose)
- pr_alert("%s" TORTURE_FLAG
- "rcu_torture_onoff task: offlined %d\n",
- torture_type, cpu);
- n_offline_successes++;
- delta = jiffies - starttime;
- sum_offline += delta;
- if (min_offline < 0) {
- min_offline = delta;
- max_offline = delta;
- }
- if (min_offline > delta)
- min_offline = delta;
- if (max_offline < delta)
- max_offline = delta;
- }
- } else if (cpu_is_hotpluggable(cpu)) {
- if (verbose)
- pr_alert("%s" TORTURE_FLAG
- "rcu_torture_onoff task: onlining %d\n",
- torture_type, cpu);
- starttime = jiffies;
- n_online_attempts++;
- ret = cpu_up(cpu);
- if (ret) {
- if (verbose)
- pr_alert("%s" TORTURE_FLAG
- "rcu_torture_onoff task: online %d failed: errno %d\n",
- torture_type, cpu, ret);
- } else {
- if (verbose)
- pr_alert("%s" TORTURE_FLAG
- "rcu_torture_onoff task: onlined %d\n",
- torture_type, cpu);
- n_online_successes++;
- delta = jiffies - starttime;
- sum_online += delta;
- if (min_online < 0) {
- min_online = delta;
- max_online = delta;
- }
- if (min_online > delta)
- min_online = delta;
- if (max_online < delta)
- max_online = delta;
- }
- }
- schedule_timeout_interruptible(onoff_interval * HZ);
- }
- VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping");
- return 0;
-}
-
-static int
-rcu_torture_onoff_init(void)
-{
- int ret;
-
- if (onoff_interval <= 0)
- return 0;
- onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff");
- if (IS_ERR(onoff_task)) {
- ret = PTR_ERR(onoff_task);
- onoff_task = NULL;
- return ret;
- }
- return 0;
-}
-
-static void rcu_torture_onoff_cleanup(void)
-{
- if (onoff_task == NULL)
- return;
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
- kthread_stop(onoff_task);
- onoff_task = NULL;
-}
-
-#else /* #ifdef CONFIG_HOTPLUG_CPU */
-
-static int
-rcu_torture_onoff_init(void)
-{
- return 0;
-}
-
-static void rcu_torture_onoff_cleanup(void)
-{
-}
-
-#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
-
-/*
* CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
* induces a CPU stall for the time specified by stall_cpu.
*/
@@ -1518,11 +1233,11 @@ static int rcu_torture_stall(void *args)
{
unsigned long stop_at;
- VERBOSE_PRINTK_STRING("rcu_torture_stall task started");
+ VERBOSE_TOROUT_STRING("rcu_torture_stall task started");
if (stall_cpu_holdoff > 0) {
- VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff");
+ VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff");
schedule_timeout_interruptible(stall_cpu_holdoff * HZ);
- VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff");
+ VERBOSE_TOROUT_STRING("rcu_torture_stall end holdoff");
}
if (!kthread_should_stop()) {
stop_at = get_seconds() + stall_cpu;
@@ -1536,7 +1251,7 @@ static int rcu_torture_stall(void *args)
rcu_read_unlock();
pr_alert("rcu_torture_stall end.\n");
}
- rcutorture_shutdown_absorb("rcu_torture_stall");
+ torture_shutdown_absorb("rcu_torture_stall");
while (!kthread_should_stop())
schedule_timeout_interruptible(10 * HZ);
return 0;
@@ -1545,31 +1260,13 @@ static int rcu_torture_stall(void *args)
/* Spawn CPU-stall kthread, if stall_cpu specified. */
static int __init rcu_torture_stall_init(void)
{
- int ret;
-
if (stall_cpu <= 0)
return 0;
- stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall");
- if (IS_ERR(stall_task)) {
- ret = PTR_ERR(stall_task);
- stall_task = NULL;
- return ret;
- }
- return 0;
-}
-
-/* Clean up after the CPU-stall kthread, if one was spawned. */
-static void rcu_torture_stall_cleanup(void)
-{
- if (stall_task == NULL)
- return;
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task.");
- kthread_stop(stall_task);
- stall_task = NULL;
+ return torture_create_kthread(rcu_torture_stall, NULL, stall_task);
}
/* Callback function for RCU barrier testing. */
-void rcu_torture_barrier_cbf(struct rcu_head *rcu)
+static void rcu_torture_barrier_cbf(struct rcu_head *rcu)
{
atomic_inc(&barrier_cbs_invoked);
}
@@ -1583,28 +1280,24 @@ static int rcu_torture_barrier_cbs(void *arg)
struct rcu_head rcu;
init_rcu_head_on_stack(&rcu);
- VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started");
- set_user_nice(current, 19);
+ VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task started");
+ set_user_nice(current, MAX_NICE);
do {
wait_event(barrier_cbs_wq[myid],
(newphase =
ACCESS_ONCE(barrier_phase)) != lastphase ||
- kthread_should_stop() ||
- fullstop != FULLSTOP_DONTSTOP);
+ torture_must_stop());
lastphase = newphase;
smp_mb(); /* ensure barrier_phase load before ->call(). */
- if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
+ if (torture_must_stop())
break;
cur_ops->call(&rcu, rcu_torture_barrier_cbf);
if (atomic_dec_and_test(&barrier_cbs_count))
wake_up(&barrier_wq);
- } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
- VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping");
- rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
- while (!kthread_should_stop())
- schedule_timeout_interruptible(1);
+ } while (!torture_must_stop());
cur_ops->cb_barrier();
destroy_rcu_head_on_stack(&rcu);
+ torture_kthread_stopping("rcu_torture_barrier_cbs");
return 0;
}
@@ -1613,7 +1306,7 @@ static int rcu_torture_barrier(void *arg)
{
int i;
- VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting");
+ VERBOSE_TOROUT_STRING("rcu_torture_barrier task starting");
do {
atomic_set(&barrier_cbs_invoked, 0);
atomic_set(&barrier_cbs_count, n_barrier_cbs);
@@ -1623,9 +1316,8 @@ static int rcu_torture_barrier(void *arg)
wake_up(&barrier_cbs_wq[i]);
wait_event(barrier_wq,
atomic_read(&barrier_cbs_count) == 0 ||
- kthread_should_stop() ||
- fullstop != FULLSTOP_DONTSTOP);
- if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
+ torture_must_stop());
+ if (torture_must_stop())
break;
n_barrier_attempts++;
cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */
@@ -1635,11 +1327,8 @@ static int rcu_torture_barrier(void *arg)
}
n_barrier_successes++;
schedule_timeout_interruptible(HZ / 10);
- } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
- VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping");
- rcutorture_shutdown_absorb("rcu_torture_barrier");
- while (!kthread_should_stop())
- schedule_timeout_interruptible(1);
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_torture_barrier");
return 0;
}
@@ -1672,24 +1361,13 @@ static int rcu_torture_barrier_init(void)
return -ENOMEM;
for (i = 0; i < n_barrier_cbs; i++) {
init_waitqueue_head(&barrier_cbs_wq[i]);
- barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs,
- (void *)(long)i,
- "rcu_torture_barrier_cbs");
- if (IS_ERR(barrier_cbs_tasks[i])) {
- ret = PTR_ERR(barrier_cbs_tasks[i]);
- VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs");
- barrier_cbs_tasks[i] = NULL;
+ ret = torture_create_kthread(rcu_torture_barrier_cbs,
+ (void *)(long)i,
+ barrier_cbs_tasks[i]);
+ if (ret)
return ret;
- }
}
- barrier_task = kthread_run(rcu_torture_barrier, NULL,
- "rcu_torture_barrier");
- if (IS_ERR(barrier_task)) {
- ret = PTR_ERR(barrier_task);
- VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier");
- barrier_task = NULL;
- }
- return 0;
+ return torture_create_kthread(rcu_torture_barrier, NULL, barrier_task);
}
/* Clean up after RCU barrier testing. */
@@ -1697,19 +1375,11 @@ static void rcu_torture_barrier_cleanup(void)
{
int i;
- if (barrier_task != NULL) {
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task");
- kthread_stop(barrier_task);
- barrier_task = NULL;
- }
+ torture_stop_kthread(rcu_torture_barrier, barrier_task);
if (barrier_cbs_tasks != NULL) {
- for (i = 0; i < n_barrier_cbs; i++) {
- if (barrier_cbs_tasks[i] != NULL) {
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task");
- kthread_stop(barrier_cbs_tasks[i]);
- barrier_cbs_tasks[i] = NULL;
- }
- }
+ for (i = 0; i < n_barrier_cbs; i++)
+ torture_stop_kthread(rcu_torture_barrier_cbs,
+ barrier_cbs_tasks[i]);
kfree(barrier_cbs_tasks);
barrier_cbs_tasks = NULL;
}
@@ -1747,90 +1417,42 @@ rcu_torture_cleanup(void)
{
int i;
- mutex_lock(&fullstop_mutex);
rcutorture_record_test_transition();
- if (fullstop == FULLSTOP_SHUTDOWN) {
- pr_warn(/* but going down anyway, so... */
- "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
- mutex_unlock(&fullstop_mutex);
- schedule_timeout_uninterruptible(10);
+ if (torture_cleanup()) {
if (cur_ops->cb_barrier != NULL)
cur_ops->cb_barrier();
return;
}
- fullstop = FULLSTOP_RMMOD;
- mutex_unlock(&fullstop_mutex);
- unregister_reboot_notifier(&rcutorture_shutdown_nb);
- rcu_torture_barrier_cleanup();
- rcu_torture_stall_cleanup();
- if (stutter_task) {
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
- kthread_stop(stutter_task);
- }
- stutter_task = NULL;
- if (shuffler_task) {
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
- kthread_stop(shuffler_task);
- free_cpumask_var(shuffle_tmp_mask);
- }
- shuffler_task = NULL;
- if (writer_task) {
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
- kthread_stop(writer_task);
- }
- writer_task = NULL;
+ rcu_torture_barrier_cleanup();
+ torture_stop_kthread(rcu_torture_stall, stall_task);
+ torture_stop_kthread(rcu_torture_writer, writer_task);
if (reader_tasks) {
- for (i = 0; i < nrealreaders; i++) {
- if (reader_tasks[i]) {
- VERBOSE_PRINTK_STRING(
- "Stopping rcu_torture_reader task");
- kthread_stop(reader_tasks[i]);
- }
- reader_tasks[i] = NULL;
- }
+ for (i = 0; i < nrealreaders; i++)
+ torture_stop_kthread(rcu_torture_reader,
+ reader_tasks[i]);
kfree(reader_tasks);
- reader_tasks = NULL;
}
rcu_torture_current = NULL;
if (fakewriter_tasks) {
for (i = 0; i < nfakewriters; i++) {
- if (fakewriter_tasks[i]) {
- VERBOSE_PRINTK_STRING(
- "Stopping rcu_torture_fakewriter task");
- kthread_stop(fakewriter_tasks[i]);
- }
- fakewriter_tasks[i] = NULL;
+ torture_stop_kthread(rcu_torture_fakewriter,
+ fakewriter_tasks[i]);
}
kfree(fakewriter_tasks);
fakewriter_tasks = NULL;
}
- if (stats_task) {
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
- kthread_stop(stats_task);
- }
- stats_task = NULL;
-
- if (fqs_task) {
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task");
- kthread_stop(fqs_task);
- }
- fqs_task = NULL;
+ torture_stop_kthread(rcu_torture_stats, stats_task);
+ torture_stop_kthread(rcu_torture_fqs, fqs_task);
if ((test_boost == 1 && cur_ops->can_boost) ||
test_boost == 2) {
unregister_cpu_notifier(&rcutorture_cpu_nb);
for_each_possible_cpu(i)
rcutorture_booster_cleanup(i);
}
- if (shutdown_task != NULL) {
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
- kthread_stop(shutdown_task);
- }
- shutdown_task = NULL;
- rcu_torture_onoff_cleanup();
/* Wait for all RCU callbacks to fire. */
@@ -1841,8 +1463,7 @@ rcu_torture_cleanup(void)
if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
- else if (n_online_successes != n_online_attempts ||
- n_offline_successes != n_offline_attempts)
+ else if (torture_onoff_failures())
rcu_torture_print_module_parms(cur_ops,
"End of test: RCU_HOTPLUG");
else
@@ -1911,12 +1532,12 @@ rcu_torture_init(void)
int i;
int cpu;
int firsterr = 0;
- int retval;
static struct rcu_torture_ops *torture_ops[] = {
- &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops,
+ &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops,
};
- mutex_lock(&fullstop_mutex);
+ if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable))
+ return -EBUSY;
/* Process args and tell the world that the torturer is on the job. */
for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
@@ -1931,7 +1552,7 @@ rcu_torture_init(void)
for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
pr_alert(" %s", torture_ops[i]->name);
pr_alert("\n");
- mutex_unlock(&fullstop_mutex);
+ torture_init_end();
return -EINVAL;
}
if (cur_ops->fqs == NULL && fqs_duration != 0) {
@@ -1941,12 +1562,14 @@ rcu_torture_init(void)
if (cur_ops->init)
cur_ops->init(); /* no "goto unwind" prior to this point!!! */
- if (nreaders >= 0)
+ if (nreaders >= 0) {
nrealreaders = nreaders;
- else
- nrealreaders = 2 * num_online_cpus();
+ } else {
+ nrealreaders = num_online_cpus() - 1;
+ if (nrealreaders <= 0)
+ nrealreaders = 1;
+ }
rcu_torture_print_module_parms(cur_ops, "Start of test");
- fullstop = FULLSTOP_DONTSTOP;
/* Set up the freelist. */
@@ -1982,108 +1605,62 @@ rcu_torture_init(void)
/* Start up the kthreads. */
- VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task");
- writer_task = kthread_create(rcu_torture_writer, NULL,
- "rcu_torture_writer");
- if (IS_ERR(writer_task)) {
- firsterr = PTR_ERR(writer_task);
- VERBOSE_PRINTK_ERRSTRING("Failed to create writer");
- writer_task = NULL;
+ firsterr = torture_create_kthread(rcu_torture_writer, NULL,
+ writer_task);
+ if (firsterr)
goto unwind;
- }
- wake_up_process(writer_task);
fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
GFP_KERNEL);
if (fakewriter_tasks == NULL) {
- VERBOSE_PRINTK_ERRSTRING("out of memory");
+ VERBOSE_TOROUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
for (i = 0; i < nfakewriters; i++) {
- VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task");
- fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL,
- "rcu_torture_fakewriter");
- if (IS_ERR(fakewriter_tasks[i])) {
- firsterr = PTR_ERR(fakewriter_tasks[i]);
- VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter");
- fakewriter_tasks[i] = NULL;
+ firsterr = torture_create_kthread(rcu_torture_fakewriter,
+ NULL, fakewriter_tasks[i]);
+ if (firsterr)
goto unwind;
- }
}
reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]),
GFP_KERNEL);
if (reader_tasks == NULL) {
- VERBOSE_PRINTK_ERRSTRING("out of memory");
+ VERBOSE_TOROUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
for (i = 0; i < nrealreaders; i++) {
- VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task");
- reader_tasks[i] = kthread_run(rcu_torture_reader, NULL,
- "rcu_torture_reader");
- if (IS_ERR(reader_tasks[i])) {
- firsterr = PTR_ERR(reader_tasks[i]);
- VERBOSE_PRINTK_ERRSTRING("Failed to create reader");
- reader_tasks[i] = NULL;
+ firsterr = torture_create_kthread(rcu_torture_reader, NULL,
+ reader_tasks[i]);
+ if (firsterr)
goto unwind;
- }
}
if (stat_interval > 0) {
- VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task");
- stats_task = kthread_run(rcu_torture_stats, NULL,
- "rcu_torture_stats");
- if (IS_ERR(stats_task)) {
- firsterr = PTR_ERR(stats_task);
- VERBOSE_PRINTK_ERRSTRING("Failed to create stats");
- stats_task = NULL;
+ firsterr = torture_create_kthread(rcu_torture_stats, NULL,
+ stats_task);
+ if (firsterr)
goto unwind;
- }
}
if (test_no_idle_hz) {
- rcu_idle_cpu = num_online_cpus() - 1;
-
- if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
- firsterr = -ENOMEM;
- VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask");
- goto unwind;
- }
-
- /* Create the shuffler thread */
- shuffler_task = kthread_run(rcu_torture_shuffle, NULL,
- "rcu_torture_shuffle");
- if (IS_ERR(shuffler_task)) {
- free_cpumask_var(shuffle_tmp_mask);
- firsterr = PTR_ERR(shuffler_task);
- VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler");
- shuffler_task = NULL;
+ firsterr = torture_shuffle_init(shuffle_interval * HZ);
+ if (firsterr)
goto unwind;
- }
}
if (stutter < 0)
stutter = 0;
if (stutter) {
- /* Create the stutter thread */
- stutter_task = kthread_run(rcu_torture_stutter, NULL,
- "rcu_torture_stutter");
- if (IS_ERR(stutter_task)) {
- firsterr = PTR_ERR(stutter_task);
- VERBOSE_PRINTK_ERRSTRING("Failed to create stutter");
- stutter_task = NULL;
+ firsterr = torture_stutter_init(stutter * HZ);
+ if (firsterr)
goto unwind;
- }
}
if (fqs_duration < 0)
fqs_duration = 0;
if (fqs_duration) {
- /* Create the stutter thread */
- fqs_task = kthread_run(rcu_torture_fqs, NULL,
- "rcu_torture_fqs");
- if (IS_ERR(fqs_task)) {
- firsterr = PTR_ERR(fqs_task);
- VERBOSE_PRINTK_ERRSTRING("Failed to create fqs");
- fqs_task = NULL;
+ /* Create the fqs thread */
+ firsterr = torture_create_kthread(rcu_torture_fqs, NULL,
+ fqs_task);
+ if (firsterr)
goto unwind;
- }
}
if (test_boost_interval < 1)
test_boost_interval = 1;
@@ -2097,49 +1674,31 @@ rcu_torture_init(void)
for_each_possible_cpu(i) {
if (cpu_is_offline(i))
continue; /* Heuristic: CPU can go offline. */
- retval = rcutorture_booster_init(i);
- if (retval < 0) {
- firsterr = retval;
+ firsterr = rcutorture_booster_init(i);
+ if (firsterr)
goto unwind;
- }
- }
- }
- if (shutdown_secs > 0) {
- shutdown_time = jiffies + shutdown_secs * HZ;
- shutdown_task = kthread_create(rcu_torture_shutdown, NULL,
- "rcu_torture_shutdown");
- if (IS_ERR(shutdown_task)) {
- firsterr = PTR_ERR(shutdown_task);
- VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown");
- shutdown_task = NULL;
- goto unwind;
}
- wake_up_process(shutdown_task);
}
- i = rcu_torture_onoff_init();
- if (i != 0) {
- firsterr = i;
+ firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
+ if (firsterr)
goto unwind;
- }
- register_reboot_notifier(&rcutorture_shutdown_nb);
- i = rcu_torture_stall_init();
- if (i != 0) {
- firsterr = i;
+ firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ);
+ if (firsterr)
goto unwind;
- }
- retval = rcu_torture_barrier_init();
- if (retval != 0) {
- firsterr = retval;
+ firsterr = rcu_torture_stall_init();
+ if (firsterr)
+ goto unwind;
+ firsterr = rcu_torture_barrier_init();
+ if (firsterr)
goto unwind;
- }
if (object_debug)
rcu_test_debug_objects();
rcutorture_record_test_transition();
- mutex_unlock(&fullstop_mutex);
+ torture_init_end();
return 0;
unwind:
- mutex_unlock(&fullstop_mutex);
+ torture_init_end();
rcu_torture_cleanup();
return firsterr;
}
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index 3318d8284384..e037f3eb2f7b 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -12,8 +12,8 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright (C) IBM Corporation, 2006
* Copyright (C) Fujitsu, 2012
@@ -36,8 +36,6 @@
#include <linux/delay.h>
#include <linux/srcu.h>
-#include <trace/events/rcu.h>
-
#include "rcu.h"
/*
@@ -300,9 +298,9 @@ int __srcu_read_lock(struct srcu_struct *sp)
idx = ACCESS_ONCE(sp->completed) & 0x1;
preempt_disable();
- ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
+ __this_cpu_inc(sp->per_cpu_ref->c[idx]);
smp_mb(); /* B */ /* Avoid leaking the critical section. */
- ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
+ __this_cpu_inc(sp->per_cpu_ref->seq[idx]);
preempt_enable();
return idx;
}
@@ -398,7 +396,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
rcu_batch_queue(&sp->batch_queue, head);
if (!sp->running) {
sp->running = true;
- schedule_delayed_work(&sp->work, 0);
+ queue_delayed_work(system_power_efficient_wq, &sp->work, 0);
}
spin_unlock_irqrestore(&sp->queue_lock, flags);
}
@@ -674,7 +672,8 @@ static void srcu_reschedule(struct srcu_struct *sp)
}
if (pending)
- schedule_delayed_work(&sp->work, SRCU_INTERVAL);
+ queue_delayed_work(system_power_efficient_wq,
+ &sp->work, SRCU_INTERVAL);
}
/*
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 1254f312d024..d9efcc13008c 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -12,8 +12,8 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright IBM Corporation, 2008
*
@@ -37,10 +37,6 @@
#include <linux/prefetch.h>
#include <linux/ftrace_event.h>
-#ifdef CONFIG_RCU_TRACE
-#include <trace/events/rcu.h>
-#endif /* #else #ifdef CONFIG_RCU_TRACE */
-
#include "rcu.h"
/* Forward declarations for tiny_plugin.h. */
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 280d06cae352..858c56569127 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -14,8 +14,8 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright (c) 2010 Linaro
*
@@ -144,7 +144,7 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
return;
rcp->ticks_this_gp++;
j = jiffies;
- js = rcp->jiffies_stall;
+ js = ACCESS_ONCE(rcp->jiffies_stall);
if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
@@ -152,17 +152,17 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
dump_stack();
}
if (*rcp->curtail && ULONG_CMP_GE(j, js))
- rcp->jiffies_stall = jiffies +
+ ACCESS_ONCE(rcp->jiffies_stall) = jiffies +
3 * rcu_jiffies_till_stall_check() + 3;
else if (ULONG_CMP_GE(j, js))
- rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
+ ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
}
static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
{
rcp->ticks_this_gp = 0;
rcp->gp_start = jiffies;
- rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
+ ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
}
static void check_cpu_stalls(void)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b3d116cd072d..1b70cb6fbe3c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -12,8 +12,8 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright IBM Corporation, 2008
*
@@ -58,8 +58,6 @@
#include <linux/suspend.h>
#include "tree.h"
-#include <trace/events/rcu.h>
-
#include "rcu.h"
MODULE_ALIAS("rcutree");
@@ -103,7 +101,7 @@ DEFINE_PER_CPU(struct rcu_data, sname##_data)
RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
-static struct rcu_state *rcu_state;
+static struct rcu_state *rcu_state_p;
LIST_HEAD(rcu_struct_flavors);
/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
@@ -208,6 +206,70 @@ void rcu_bh_qs(int cpu)
rdp->passed_quiesce = 1;
}
+static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
+
+static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
+ .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
+ .dynticks = ATOMIC_INIT(1),
+#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
+ .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
+ .dynticks_idle = ATOMIC_INIT(1),
+#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+};
+
+/*
+ * Let the RCU core know that this CPU has gone through the scheduler,
+ * which is a quiescent state. This is called when the need for a
+ * quiescent state is urgent, so we burn an atomic operation and full
+ * memory barriers to let the RCU core know about it, regardless of what
+ * this CPU might (or might not) do in the near future.
+ *
+ * We inform the RCU core by emulating a zero-duration dyntick-idle
+ * period, which we in turn do by incrementing the ->dynticks counter
+ * by two.
+ */
+static void rcu_momentary_dyntick_idle(void)
+{
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_dynticks *rdtp;
+ int resched_mask;
+ struct rcu_state *rsp;
+
+ local_irq_save(flags);
+
+ /*
+ * Yes, we can lose flag-setting operations. This is OK, because
+ * the flag will be set again after some delay.
+ */
+ resched_mask = raw_cpu_read(rcu_sched_qs_mask);
+ raw_cpu_write(rcu_sched_qs_mask, 0);
+
+ /* Find the flavor that needs a quiescent state. */
+ for_each_rcu_flavor(rsp) {
+ rdp = raw_cpu_ptr(rsp->rda);
+ if (!(resched_mask & rsp->flavor_mask))
+ continue;
+ smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
+ if (ACCESS_ONCE(rdp->mynode->completed) !=
+ ACCESS_ONCE(rdp->cond_resched_completed))
+ continue;
+
+ /*
+ * Pretend to be momentarily idle for the quiescent state.
+ * This allows the grace-period kthread to record the
+ * quiescent state, with no need for this CPU to do anything
+ * further.
+ */
+ rdtp = this_cpu_ptr(&rcu_dynticks);
+ smp_mb__before_atomic(); /* Earlier stuff before QS. */
+ atomic_add(2, &rdtp->dynticks); /* QS. */
+ smp_mb__after_atomic(); /* Later stuff after QS. */
+ break;
+ }
+ local_irq_restore(flags);
+}
+
/*
* Note a context switch. This is a quiescent state for RCU-sched,
* and requires special handling for preemptible RCU.
@@ -218,19 +280,12 @@ void rcu_note_context_switch(int cpu)
trace_rcu_utilization(TPS("Start context switch"));
rcu_sched_qs(cpu);
rcu_preempt_note_context_switch(cpu);
+ if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
+ rcu_momentary_dyntick_idle();
trace_rcu_utilization(TPS("End context switch"));
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
-static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
- .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
- .dynticks = ATOMIC_INIT(1),
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
- .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
- .dynticks_idle = ATOMIC_INIT(1),
-#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-};
-
static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
static long qhimark = 10000; /* If this many pending, ignore blimit. */
static long qlowmark = 100; /* Once only this many pending, use blimit. */
@@ -245,7 +300,14 @@ static ulong jiffies_till_next_fqs = ULONG_MAX;
module_param(jiffies_till_first_fqs, ulong, 0644);
module_param(jiffies_till_next_fqs, ulong, 0644);
-static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
+/*
+ * How long the grace period must be before we start recruiting
+ * quiescent-state help from rcu_note_context_switch().
+ */
+static ulong jiffies_till_sched_qs = HZ / 20;
+module_param(jiffies_till_sched_qs, ulong, 0644);
+
+static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp);
static void force_qs_rnp(struct rcu_state *rsp,
int (*f)(struct rcu_data *rsp, bool *isidle,
@@ -273,6 +335,15 @@ long rcu_batches_completed_bh(void)
EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
/*
+ * Force a quiescent state.
+ */
+void rcu_force_quiescent_state(void)
+{
+ force_quiescent_state(rcu_state_p);
+}
+EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
+
+/*
* Force a quiescent state for RCU BH.
*/
void rcu_bh_force_quiescent_state(void)
@@ -282,6 +353,21 @@ void rcu_bh_force_quiescent_state(void)
EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
/*
+ * Show the state of the grace-period kthreads.
+ */
+void show_rcu_gp_kthreads(void)
+{
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp) {
+ pr_info("%s: wait state: %d ->state: %#lx\n",
+ rsp->name, rsp->gp_state, rsp->gp_kthread->state);
+ /* sched_show_task(rsp->gp_kthread); */
+ }
+}
+EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
+
+/*
* Record the number of times rcutorture tests have been initiated and
* terminated. This information allows the debugfs tracing stats to be
* correlated to the rcutorture messages, even when the rcutorture module
@@ -296,6 +382,39 @@ void rcutorture_record_test_transition(void)
EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);
/*
+ * Send along grace-period-related data for rcutorture diagnostics.
+ */
+void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
+ unsigned long *gpnum, unsigned long *completed)
+{
+ struct rcu_state *rsp = NULL;
+
+ switch (test_type) {
+ case RCU_FLAVOR:
+ rsp = rcu_state_p;
+ break;
+ case RCU_BH_FLAVOR:
+ rsp = &rcu_bh_state;
+ break;
+ case RCU_SCHED_FLAVOR:
+ rsp = &rcu_sched_state;
+ break;
+ default:
+ break;
+ }
+ if (rsp != NULL) {
+ *flags = ACCESS_ONCE(rsp->gp_flags);
+ *gpnum = ACCESS_ONCE(rsp->gpnum);
+ *completed = ACCESS_ONCE(rsp->completed);
+ return;
+ }
+ *flags = 0;
+ *gpnum = 0;
+ *completed = 0;
+}
+EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
+
+/*
* Record the number of writer passes through the current rcutorture test.
* This is also used to correlate debugfs tracing stats with the rcutorture
* messages.
@@ -326,6 +445,28 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
}
/*
+ * Return the root node of the specified rcu_state structure.
+ */
+static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
+{
+ return &rsp->node[0];
+}
+
+/*
+ * Is there any need for future grace periods?
+ * Interrupts must be disabled. If the caller does not hold the root
+ * rnp_node structure's ->lock, the results are advisory only.
+ */
+static int rcu_future_needs_gp(struct rcu_state *rsp)
+{
+ struct rcu_node *rnp = rcu_get_root(rsp);
+ int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1;
+ int *fp = &rnp->need_future_gp[idx];
+
+ return ACCESS_ONCE(*fp);
+}
+
+/*
* Does the current CPU require a not-yet-started grace period?
* The caller must have disabled interrupts to prevent races with
* normal callback registry.
@@ -337,7 +478,7 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
if (rcu_gp_in_progress(rsp))
return 0; /* No, a grace period is already in progress. */
- if (rcu_nocb_needs_gp(rsp))
+ if (rcu_future_needs_gp(rsp))
return 1; /* Yes, a no-CBs CPU needs one. */
if (!rdp->nxttail[RCU_NEXT_TAIL])
return 0; /* No, this is a no-CBs (or offline) CPU. */
@@ -352,14 +493,6 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
}
/*
- * Return the root node of the specified rcu_state structure.
- */
-static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
-{
- return &rsp->node[0];
-}
-
-/*
* rcu_eqs_enter_common - current CPU is moving towards extended quiescent state
*
* If the new value of the ->dynticks_nesting counter now is zero,
@@ -389,9 +522,9 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
}
rcu_prepare_for_idle(smp_processor_id());
/* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
- smp_mb__before_atomic_inc(); /* See above. */
+ smp_mb__before_atomic(); /* See above. */
atomic_inc(&rdtp->dynticks);
- smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
+ smp_mb__after_atomic(); /* Force ordering with next sojourn. */
WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
/*
@@ -509,10 +642,10 @@ void rcu_irq_exit(void)
static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
int user)
{
- smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
+ smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */
atomic_inc(&rdtp->dynticks);
/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
- smp_mb__after_atomic_inc(); /* See above. */
+ smp_mb__after_atomic(); /* See above. */
WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
rcu_cleanup_after_idle(smp_processor_id());
trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
@@ -637,10 +770,10 @@ void rcu_nmi_enter(void)
(atomic_read(&rdtp->dynticks) & 0x1))
return;
rdtp->dynticks_nmi_nesting++;
- smp_mb__before_atomic_inc(); /* Force delay from prior write. */
+ smp_mb__before_atomic(); /* Force delay from prior write. */
atomic_inc(&rdtp->dynticks);
/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
- smp_mb__after_atomic_inc(); /* See above. */
+ smp_mb__after_atomic(); /* See above. */
WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
}
@@ -659,9 +792,9 @@ void rcu_nmi_exit(void)
--rdtp->dynticks_nmi_nesting != 0)
return;
/* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
- smp_mb__before_atomic_inc(); /* See above. */
+ smp_mb__before_atomic(); /* See above. */
atomic_inc(&rdtp->dynticks);
- smp_mb__after_atomic_inc(); /* Force delay to next write. */
+ smp_mb__after_atomic(); /* Force delay to next write. */
WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
}
@@ -760,7 +893,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
{
rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
rcu_sysidle_check_cpu(rdp, isidle, maxj);
- return (rdp->dynticks_snap & 0x1) == 0;
+ if ((rdp->dynticks_snap & 0x1) == 0) {
+ trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
+ return 1;
+ } else {
+ return 0;
+ }
}
/*
@@ -779,6 +917,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
bool *isidle, unsigned long *maxj)
{
unsigned int curr;
+ int *rcrmp;
unsigned int snap;
curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
@@ -819,27 +958,43 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
}
/*
- * There is a possibility that a CPU in adaptive-ticks state
- * might run in the kernel with the scheduling-clock tick disabled
- * for an extended time period. Invoke rcu_kick_nohz_cpu() to
- * force the CPU to restart the scheduling-clock tick in this
- * CPU is in this state.
- */
- rcu_kick_nohz_cpu(rdp->cpu);
-
- /*
- * Alternatively, the CPU might be running in the kernel
- * for an extended period of time without a quiescent state.
- * Attempt to force the CPU through the scheduler to gain the
- * needed quiescent state, but only if the grace period has gone
- * on for an uncommonly long time. If there are many stuck CPUs,
- * we will beat on the first one until it gets unstuck, then move
- * to the next. Only do this for the primary flavor of RCU.
+ * A CPU running for an extended time within the kernel can
+ * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode,
+ * even context-switching back and forth between a pair of
+ * in-kernel CPU-bound tasks cannot advance grace periods.
+ * So if the grace period is old enough, make the CPU pay attention.
+ * Note that the unsynchronized assignments to the per-CPU
+ * rcu_sched_qs_mask variable are safe. Yes, setting of
+ * bits can be lost, but they will be set again on the next
+ * force-quiescent-state pass. So lost bit sets do not result
+ * in incorrect behavior, merely in a grace period lasting
+ * a few jiffies longer than it might otherwise. Because
+ * there are at most four threads involved, and because the
+ * updates are only once every few jiffies, the probability of
+ * lossage (and thus of slight grace-period extension) is
+ * quite low.
+ *
+ * Note that if the jiffies_till_sched_qs boot/sysfs parameter
+ * is set too high, we override with half of the RCU CPU stall
+ * warning delay.
*/
- if (rdp->rsp == rcu_state &&
- ULONG_CMP_GE(ACCESS_ONCE(jiffies), rdp->rsp->jiffies_resched)) {
- rdp->rsp->jiffies_resched += 5;
- resched_cpu(rdp->cpu);
+ rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu);
+ if (ULONG_CMP_GE(jiffies,
+ rdp->rsp->gp_start + jiffies_till_sched_qs) ||
+ ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
+ if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
+ ACCESS_ONCE(rdp->cond_resched_completed) =
+ ACCESS_ONCE(rdp->mynode->completed);
+ smp_mb(); /* ->cond_resched_completed before *rcrmp. */
+ ACCESS_ONCE(*rcrmp) =
+ ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
+ resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
+ rdp->rsp->jiffies_resched += 5; /* Enable beating. */
+ } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
+ /* Time to beat on that CPU again! */
+ resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
+ rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
+ }
}
return 0;
@@ -847,21 +1002,18 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
static void record_gp_stall_check_time(struct rcu_state *rsp)
{
- unsigned long j = ACCESS_ONCE(jiffies);
+ unsigned long j = jiffies;
unsigned long j1;
rsp->gp_start = j;
smp_wmb(); /* Record start time before stall time. */
j1 = rcu_jiffies_till_stall_check();
- rsp->jiffies_stall = j + j1;
+ ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
rsp->jiffies_resched = j + j1 / 2;
}
/*
- * Dump stacks of all tasks running on stalled CPUs. This is a fallback
- * for architectures that do not implement trigger_all_cpu_backtrace().
- * The NMI-triggered stack traces are more accurate because they are
- * printed by the target CPU.
+ * Dump stacks of all tasks running on stalled CPUs.
*/
static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
{
@@ -892,12 +1044,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
/* Only let one CPU complain about others per time interval. */
raw_spin_lock_irqsave(&rnp->lock, flags);
- delta = jiffies - rsp->jiffies_stall;
+ delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
- rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+ ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
/*
@@ -934,12 +1086,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
print_cpu_stall_info_end();
for_each_possible_cpu(cpu)
totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
- pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n",
+ pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
smp_processor_id(), (long)(jiffies - rsp->gp_start),
- rsp->gpnum, rsp->completed, totqlen);
+ (long)rsp->gpnum, (long)rsp->completed, totqlen);
if (ndetected == 0)
pr_err("INFO: Stall ended before state dump start\n");
- else if (!trigger_all_cpu_backtrace())
+ else
rcu_dump_cpu_stacks(rsp);
/* Complain about tasks blocking the grace period. */
@@ -949,12 +1101,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
force_quiescent_state(rsp); /* Kick them all. */
}
-/*
- * This function really isn't for public consumption, but RCU is special in
- * that context switches can allow the state machine to make progress.
- */
-extern void resched_cpu(int cpu);
-
static void print_cpu_stall(struct rcu_state *rsp)
{
int cpu;
@@ -973,14 +1119,14 @@ static void print_cpu_stall(struct rcu_state *rsp)
print_cpu_stall_info_end();
for_each_possible_cpu(cpu)
totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
- pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n",
- jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen);
- if (!trigger_all_cpu_backtrace())
- dump_stack();
+ pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
+ jiffies - rsp->gp_start,
+ (long)rsp->gpnum, (long)rsp->completed, totqlen);
+ rcu_dump_cpu_stacks(rsp);
raw_spin_lock_irqsave(&rnp->lock, flags);
- if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
- rsp->jiffies_stall = jiffies +
+ if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall)))
+ ACCESS_ONCE(rsp->jiffies_stall) = jiffies +
3 * rcu_jiffies_till_stall_check() + 3;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1005,7 +1151,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp))
return;
- j = ACCESS_ONCE(jiffies);
+ j = jiffies;
/*
* Lots of memory barriers to reject false positives.
@@ -1064,7 +1210,7 @@ void rcu_cpu_stall_reset(void)
struct rcu_state *rsp;
for_each_rcu_flavor(rsp)
- rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
+ ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2;
}
/*
@@ -1125,15 +1271,18 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
/*
* Start some future grace period, as needed to handle newly arrived
* callbacks. The required future grace periods are recorded in each
- * rcu_node structure's ->need_future_gp field.
+ * rcu_node structure's ->need_future_gp field. Returns true if there
+ * is reason to awaken the grace-period kthread.
*
* The caller must hold the specified rcu_node structure's ->lock.
*/
-static unsigned long __maybe_unused
-rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
+static bool __maybe_unused
+rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
+ unsigned long *c_out)
{
unsigned long c;
int i;
+ bool ret = false;
struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
/*
@@ -1144,7 +1293,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf"));
if (rnp->need_future_gp[c & 0x1]) {
trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf"));
- return c;
+ goto out;
}
/*
@@ -1152,13 +1301,19 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
* believe that a grace period is in progress, then we must wait
* for the one following, which is in "c". Because our request
* will be noticed at the end of the current grace period, we don't
- * need to explicitly start one.
+ * need to explicitly start one. We only do the lockless check
+ * of rnp_root's fields if the current rcu_node structure thinks
+ * there is no grace period in flight, and because we hold rnp->lock,
+ * the only possible change is when rnp_root's two fields are
+ * equal, in which case rnp_root->gpnum might be concurrently
+ * incremented. But that is OK, as it will just result in our
+ * doing some extra useless work.
*/
if (rnp->gpnum != rnp->completed ||
- ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
+ ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) {
rnp->need_future_gp[c & 0x1]++;
trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
- return c;
+ goto out;
}
/*
@@ -1199,12 +1354,15 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot"));
} else {
trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot"));
- rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
+ ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
}
unlock_out:
if (rnp != rnp_root)
raw_spin_unlock(&rnp_root->lock);
- return c;
+out:
+ if (c_out != NULL)
+ *c_out = c;
+ return ret;
}
/*
@@ -1228,25 +1386,43 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
}
/*
+ * Awaken the grace-period kthread for the specified flavor of RCU.
+ * Don't do a self-awaken, and don't bother awakening when there is
+ * nothing for the grace-period kthread to do (as in several CPUs
+ * raced to awaken, and we lost), and finally don't try to awaken
+ * a kthread that has not yet been created.
+ */
+static void rcu_gp_kthread_wake(struct rcu_state *rsp)
+{
+ if (current == rsp->gp_kthread ||
+ !ACCESS_ONCE(rsp->gp_flags) ||
+ !rsp->gp_kthread)
+ return;
+ wake_up(&rsp->gp_wq);
+}
+
+/*
* If there is room, assign a ->completed number to any callbacks on
* this CPU that have not already been assigned. Also accelerate any
* callbacks that were previously assigned a ->completed number that has
* since proven to be too conservative, which can happen if callbacks get
* assigned a ->completed number while RCU is idle, but with reference to
* a non-root rcu_node structure. This function is idempotent, so it does
- * not hurt to call it repeatedly.
+ * not hurt to call it repeatedly. Returns an flag saying that we should
+ * awaken the RCU grace-period kthread.
*
* The caller must hold rnp->lock with interrupts disabled.
*/
-static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
+static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
unsigned long c;
int i;
+ bool ret;
/* If the CPU has no callbacks, nothing to do. */
if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
- return;
+ return false;
/*
* Starting from the sublist containing the callbacks most
@@ -1275,7 +1451,7 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
* be grouped into.
*/
if (++i >= RCU_NEXT_TAIL)
- return;
+ return false;
/*
* Assign all subsequent callbacks' ->completed number to the next
@@ -1287,13 +1463,14 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
rdp->nxtcompleted[i] = c;
}
/* Record any needed additional grace periods. */
- rcu_start_future_gp(rnp, rdp);
+ ret = rcu_start_future_gp(rnp, rdp, NULL);
/* Trace depending on how much we were able to accelerate. */
if (!*rdp->nxttail[RCU_WAIT_TAIL])
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB"));
else
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB"));
+ return ret;
}
/*
@@ -1302,17 +1479,18 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
* assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL
* sublist. This function is idempotent, so it does not hurt to
* invoke it repeatedly. As long as it is not invoked -too- often...
+ * Returns true if the RCU grace-period kthread needs to be awakened.
*
* The caller must hold rnp->lock with interrupts disabled.
*/
-static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
+static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
int i, j;
/* If the CPU has no callbacks, nothing to do. */
if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
- return;
+ return false;
/*
* Find all callbacks whose ->completed numbers indicate that they
@@ -1336,26 +1514,30 @@ static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
}
/* Classify any remaining callbacks. */
- rcu_accelerate_cbs(rsp, rnp, rdp);
+ return rcu_accelerate_cbs(rsp, rnp, rdp);
}
/*
* Update CPU-local rcu_data state to record the beginnings and ends of
* grace periods. The caller must hold the ->lock of the leaf rcu_node
* structure corresponding to the current CPU, and must have irqs disabled.
+ * Returns true if the grace-period kthread needs to be awakened.
*/
-static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
+static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp)
{
+ bool ret;
+
/* Handle the ends of any preceding grace periods first. */
if (rdp->completed == rnp->completed) {
/* No grace period end, so just accelerate recent callbacks. */
- rcu_accelerate_cbs(rsp, rnp, rdp);
+ ret = rcu_accelerate_cbs(rsp, rnp, rdp);
} else {
/* Advance callbacks. */
- rcu_advance_cbs(rsp, rnp, rdp);
+ ret = rcu_advance_cbs(rsp, rnp, rdp);
/* Remember that we saw this grace-period completion. */
rdp->completed = rnp->completed;
@@ -1374,11 +1556,13 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc
rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
zero_cpu_stall_ticks(rdp);
}
+ return ret;
}
static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
{
unsigned long flags;
+ bool needwake;
struct rcu_node *rnp;
local_irq_save(flags);
@@ -1390,8 +1574,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
return;
}
smp_mb__after_unlock_lock();
- __note_gp_changes(rsp, rnp, rdp);
+ needwake = __note_gp_changes(rsp, rnp, rdp);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
}
/*
@@ -1405,12 +1591,12 @@ static int rcu_gp_init(struct rcu_state *rsp)
rcu_bind_gp_kthread();
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
- if (rsp->gp_flags == 0) {
+ if (!ACCESS_ONCE(rsp->gp_flags)) {
/* Spurious wakeup, tell caller to go back to sleep. */
raw_spin_unlock_irq(&rnp->lock);
return 0;
}
- rsp->gp_flags = 0; /* Clear all flags: New grace period. */
+ ACCESS_ONCE(rsp->gp_flags) = 0; /* Clear all flags: New grace period. */
if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) {
/*
@@ -1423,13 +1609,14 @@ static int rcu_gp_init(struct rcu_state *rsp)
/* Advance to a new grace period and initialize state. */
record_gp_stall_check_time(rsp);
- smp_wmb(); /* Record GP times before starting GP. */
- rsp->gpnum++;
+ /* Record GP times before starting GP, hence smp_store_release(). */
+ smp_store_release(&rsp->gpnum, rsp->gpnum + 1);
trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
raw_spin_unlock_irq(&rnp->lock);
/* Exclude any concurrent CPU-hotplug operations. */
mutex_lock(&rsp->onoff_mutex);
+ smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */
/*
* Set the quiescent-state-needed bits in all the rcu_node
@@ -1454,17 +1641,12 @@ static int rcu_gp_init(struct rcu_state *rsp)
WARN_ON_ONCE(rnp->completed != rsp->completed);
ACCESS_ONCE(rnp->completed) = rsp->completed;
if (rnp == rdp->mynode)
- __note_gp_changes(rsp, rnp, rdp);
+ (void)__note_gp_changes(rsp, rnp, rdp);
rcu_preempt_boost_start_gp(rnp);
trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
rnp->level, rnp->grplo,
rnp->grphi, rnp->qsmask);
raw_spin_unlock_irq(&rnp->lock);
-#ifdef CONFIG_PROVE_RCU_DELAY
- if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 &&
- system_state == SYSTEM_RUNNING)
- udelay(200);
-#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
cond_resched();
}
@@ -1502,7 +1684,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
- rsp->gp_flags &= ~RCU_GP_FLAG_FQS;
+ ACCESS_ONCE(rsp->gp_flags) &= ~RCU_GP_FLAG_FQS;
raw_spin_unlock_irq(&rnp->lock);
}
return fqs_state;
@@ -1514,6 +1696,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
static void rcu_gp_cleanup(struct rcu_state *rsp)
{
unsigned long gp_duration;
+ bool needgp = false;
int nocb = 0;
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
@@ -1549,7 +1732,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
ACCESS_ONCE(rnp->completed) = rsp->gpnum;
rdp = this_cpu_ptr(rsp->rda);
if (rnp == rdp->mynode)
- __note_gp_changes(rsp, rnp, rdp);
+ needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
/* smp_mb() provided by prior unlock-lock pair. */
nocb += rcu_future_gp_cleanup(rsp, rnp);
raw_spin_unlock_irq(&rnp->lock);
@@ -1557,16 +1740,18 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
}
rnp = rcu_get_root(rsp);
raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock();
+ smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */
rcu_nocb_gp_set(rnp, nocb);
- rsp->completed = rsp->gpnum; /* Declare grace period done. */
+ /* Declare grace period done. */
+ ACCESS_ONCE(rsp->completed) = rsp->gpnum;
trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
rsp->fqs_state = RCU_GP_IDLE;
rdp = this_cpu_ptr(rsp->rda);
- rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */
- if (cpu_needs_another_gp(rsp, rdp)) {
- rsp->gp_flags = RCU_GP_FLAG_INIT;
+ /* Advance CBs to reduce false positives below. */
+ needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
+ if (needgp || cpu_needs_another_gp(rsp, rdp)) {
+ ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
trace_rcu_grace_period(rsp->name,
ACCESS_ONCE(rsp->gpnum),
TPS("newreq"));
@@ -1593,6 +1778,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
trace_rcu_grace_period(rsp->name,
ACCESS_ONCE(rsp->gpnum),
TPS("reqwait"));
+ rsp->gp_state = RCU_GP_WAIT_GPS;
wait_event_interruptible(rsp->gp_wq,
ACCESS_ONCE(rsp->gp_flags) &
RCU_GP_FLAG_INIT);
@@ -1620,6 +1806,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
trace_rcu_grace_period(rsp->name,
ACCESS_ONCE(rsp->gpnum),
TPS("fqswait"));
+ rsp->gp_state = RCU_GP_WAIT_FQS;
ret = wait_event_interruptible_timeout(rsp->gp_wq,
((gf = ACCESS_ONCE(rsp->gp_flags)) &
RCU_GP_FLAG_FQS) ||
@@ -1665,14 +1852,6 @@ static int __noreturn rcu_gp_kthread(void *arg)
}
}
-static void rsp_wakeup(struct irq_work *work)
-{
- struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work);
-
- /* Wake up rcu_gp_kthread() to start the grace period. */
- wake_up(&rsp->gp_wq);
-}
-
/*
* Start a new RCU grace period if warranted, re-initializing the hierarchy
* in preparation for detecting the next grace period. The caller must hold
@@ -1681,8 +1860,10 @@ static void rsp_wakeup(struct irq_work *work)
* Note that it is legal for a dying CPU (which is marked as offline) to
* invoke this function. This can happen when the dying CPU reports its
* quiescent state.
+ *
+ * Returns true if the grace-period kthread must be awakened.
*/
-static void
+static bool
rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
@@ -1693,20 +1874,18 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
* or a grace period is already in progress.
* Either way, don't start a new grace period.
*/
- return;
+ return false;
}
- rsp->gp_flags = RCU_GP_FLAG_INIT;
+ ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
TPS("newreq"));
/*
* We can't do wakeups while holding the rnp->lock, as that
* could cause possible deadlocks with the rq->lock. Defer
- * the wakeup to interrupt context. And don't bother waking
- * up the running kthread.
+ * the wakeup to our caller.
*/
- if (current != rsp->gp_kthread)
- irq_work_queue(&rsp->wakeup_work);
+ return true;
}
/*
@@ -1715,12 +1894,14 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
* is invoked indirectly from rcu_advance_cbs(), which would result in
* endless recursion -- or would do so if it wasn't for the self-deadlock
* that is encountered beforehand.
+ *
+ * Returns true if the grace-period kthread needs to be awakened.
*/
-static void
-rcu_start_gp(struct rcu_state *rsp)
+static bool rcu_start_gp(struct rcu_state *rsp)
{
struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
struct rcu_node *rnp = rcu_get_root(rsp);
+ bool ret = false;
/*
* If there is no grace period in progress right now, any
@@ -1730,8 +1911,9 @@ rcu_start_gp(struct rcu_state *rsp)
* resulting in pointless grace periods. So, advance callbacks
* then start the grace period!
*/
- rcu_advance_cbs(rsp, rnp, rdp);
- rcu_start_gp_advanced(rsp, rnp, rdp);
+ ret = rcu_advance_cbs(rsp, rnp, rdp) || ret;
+ ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret;
+ return ret;
}
/*
@@ -1820,6 +2002,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
{
unsigned long flags;
unsigned long mask;
+ bool needwake;
struct rcu_node *rnp;
rnp = rdp->mynode;
@@ -1848,9 +2031,11 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
* This GP can't end until cpu checks in, so all of our
* callbacks can be processed during the next GP.
*/
- rcu_accelerate_cbs(rsp, rnp, rdp);
+ needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
}
}
@@ -1951,7 +2136,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
{
int i;
- struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
+ struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
/* No-CBs CPUs are handled specially. */
if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
@@ -2159,7 +2344,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
}
smp_mb(); /* List handling before counting for rcu_barrier(). */
rdp->qlen_lazy -= count_lazy;
- ACCESS_ONCE(rdp->qlen) -= count;
+ ACCESS_ONCE(rdp->qlen) = rdp->qlen - count;
rdp->n_cbs_invoked += count;
/* Reinstate batch limit if we have worked down the excess. */
@@ -2297,7 +2482,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
struct rcu_node *rnp_old = NULL;
/* Funnel through hierarchy to reduce memory contention. */
- rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;
+ rnp = __this_cpu_read(rsp->rda->mynode);
for (; rnp != NULL; rnp = rnp->parent) {
ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
!raw_spin_trylock(&rnp->fqslock);
@@ -2320,7 +2505,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
return; /* Someone beat us to it. */
}
- rsp->gp_flags |= RCU_GP_FLAG_FQS;
+ ACCESS_ONCE(rsp->gp_flags) |= RCU_GP_FLAG_FQS;
raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */
}
@@ -2334,7 +2519,8 @@ static void
__rcu_process_callbacks(struct rcu_state *rsp)
{
unsigned long flags;
- struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
+ bool needwake;
+ struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
WARN_ON_ONCE(rdp->beenonline == 0);
@@ -2345,8 +2531,10 @@ __rcu_process_callbacks(struct rcu_state *rsp)
local_irq_save(flags);
if (cpu_needs_another_gp(rsp, rdp)) {
raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
- rcu_start_gp(rsp);
+ needwake = rcu_start_gp(rsp);
raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
} else {
local_irq_restore(flags);
}
@@ -2404,6 +2592,8 @@ static void invoke_rcu_core(void)
static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
struct rcu_head *head, unsigned long flags)
{
+ bool needwake;
+
/*
* If called from an extended quiescent state, invoke the RCU
* core in order to force a re-evaluation of RCU's idleness.
@@ -2433,8 +2623,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
raw_spin_lock(&rnp_root->lock);
smp_mb__after_unlock_lock();
- rcu_start_gp(rsp);
+ needwake = rcu_start_gp(rsp);
raw_spin_unlock(&rnp_root->lock);
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
} else {
/* Give the grace period a kick. */
rdp->blimit = LONG_MAX;
@@ -2467,7 +2659,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
unsigned long flags;
struct rcu_data *rdp;
- WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */
+ WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */
if (debug_rcu_head_queue(head)) {
/* Probable double call_rcu(), so leak the callback. */
ACCESS_ONCE(head->func) = rcu_leak_callback;
@@ -2498,7 +2690,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
local_irq_restore(flags);
return;
}
- ACCESS_ONCE(rdp->qlen)++;
+ ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1;
if (lazy)
rdp->qlen_lazy++;
else
@@ -2537,6 +2729,20 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
EXPORT_SYMBOL_GPL(call_rcu_bh);
/*
+ * Queue an RCU callback for lazy invocation after a grace period.
+ * This will likely be later named something like "call_rcu_lazy()",
+ * but this change will require some way of tagging the lazy RCU
+ * callbacks in the list of pending callbacks. Until then, this
+ * function may only be called from __kfree_rcu().
+ */
+void kfree_call_rcu(struct rcu_head *head,
+ void (*func)(struct rcu_head *rcu))
+{
+ __call_rcu(head, func, rcu_state_p, -1, 1);
+}
+EXPORT_SYMBOL_GPL(kfree_call_rcu);
+
+/*
* Because a context switch is a grace period for RCU-sched and RCU-bh,
* any blocking grace-period wait automatically implies a grace period
* if there is only one CPU online at any point time during execution
@@ -2639,6 +2845,58 @@ void synchronize_rcu_bh(void)
}
EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
+/**
+ * get_state_synchronize_rcu - Snapshot current RCU state
+ *
+ * Returns a cookie that is used by a later call to cond_synchronize_rcu()
+ * to determine whether or not a full grace period has elapsed in the
+ * meantime.
+ */
+unsigned long get_state_synchronize_rcu(void)
+{
+ /*
+ * Any prior manipulation of RCU-protected data must happen
+ * before the load from ->gpnum.
+ */
+ smp_mb(); /* ^^^ */
+
+ /*
+ * Make sure this load happens before the purportedly
+ * time-consuming work between get_state_synchronize_rcu()
+ * and cond_synchronize_rcu().
+ */
+ return smp_load_acquire(&rcu_state_p->gpnum);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
+
+/**
+ * cond_synchronize_rcu - Conditionally wait for an RCU grace period
+ *
+ * @oldstate: return value from earlier call to get_state_synchronize_rcu()
+ *
+ * If a full RCU grace period has elapsed since the earlier call to
+ * get_state_synchronize_rcu(), just return. Otherwise, invoke
+ * synchronize_rcu() to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account. But
+ * counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for one additional grace period should be just fine.
+ */
+void cond_synchronize_rcu(unsigned long oldstate)
+{
+ unsigned long newstate;
+
+ /*
+ * Ensure that this load happens before any RCU-destructive
+ * actions the caller might carry out after we return.
+ */
+ newstate = smp_load_acquire(&rcu_state_p->completed);
+ if (ULONG_CMP_GE(oldstate, newstate))
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
+
static int synchronize_sched_expedited_cpu_stop(void *data)
{
/*
@@ -2738,7 +2996,7 @@ void synchronize_sched_expedited(void)
s = atomic_long_read(&rsp->expedited_done);
if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
/* ensure test happens before caller kfree */
- smp_mb__before_atomic_inc(); /* ^^^ */
+ smp_mb__before_atomic(); /* ^^^ */
atomic_long_inc(&rsp->expedited_workdone1);
return;
}
@@ -2756,7 +3014,7 @@ void synchronize_sched_expedited(void)
s = atomic_long_read(&rsp->expedited_done);
if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
/* ensure test happens before caller kfree */
- smp_mb__before_atomic_inc(); /* ^^^ */
+ smp_mb__before_atomic(); /* ^^^ */
atomic_long_inc(&rsp->expedited_workdone2);
return;
}
@@ -2785,7 +3043,7 @@ void synchronize_sched_expedited(void)
s = atomic_long_read(&rsp->expedited_done);
if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
/* ensure test happens before caller kfree */
- smp_mb__before_atomic_inc(); /* ^^^ */
+ smp_mb__before_atomic(); /* ^^^ */
atomic_long_inc(&rsp->expedited_done_lost);
break;
}
@@ -2880,7 +3138,7 @@ static int rcu_pending(int cpu)
* non-NULL, store an indication of whether all callbacks are lazy.
* (If there are no callbacks, all of them are deemed to be lazy.)
*/
-static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
+static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
{
bool al = true;
bool hc = false;
@@ -2936,7 +3194,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
static void rcu_barrier_func(void *type)
{
struct rcu_state *rsp = type;
- struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
+ struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
_rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
atomic_inc(&rsp->barrier_cpu_count);
@@ -2996,7 +3254,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
* ACCESS_ONCE() to prevent the compiler from speculating
* the increment to precede the early-exit check.
*/
- ACCESS_ONCE(rsp->n_barrier_done)++;
+ ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
_rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
@@ -3046,7 +3304,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
/* Increment ->n_barrier_done to prevent duplicate work. */
smp_mb(); /* Keep increment after above mechanism. */
- ACCESS_ONCE(rsp->n_barrier_done)++;
+ ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
_rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
smp_mb(); /* Keep increment before caller's subsequent code. */
@@ -3108,7 +3366,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
* that this CPU cannot possibly have any RCU callbacks in flight yet.
*/
static void
-rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
+rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
{
unsigned long flags;
unsigned long mask;
@@ -3121,7 +3379,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave(&rnp->lock, flags);
rdp->beenonline = 1; /* We have now been online. */
- rdp->preemptible = preemptible;
rdp->qlen_last_fqs_check = 0;
rdp->n_force_qs_snap = rsp->n_force_qs;
rdp->blimit = blimit;
@@ -3165,8 +3422,7 @@ static void rcu_prepare_cpu(int cpu)
struct rcu_state *rsp;
for_each_rcu_flavor(rsp)
- rcu_init_percpu_data(cpu, rsp,
- strcmp(rsp->name, "rcu_preempt") == 0);
+ rcu_init_percpu_data(cpu, rsp);
}
/*
@@ -3176,7 +3432,7 @@ static int rcu_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
- struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
+ struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
struct rcu_node *rnp = rdp->mynode;
struct rcu_state *rsp;
@@ -3305,14 +3561,17 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
static void __init rcu_init_one(struct rcu_state *rsp,
struct rcu_data __percpu *rda)
{
- static char *buf[] = { "rcu_node_0",
- "rcu_node_1",
- "rcu_node_2",
- "rcu_node_3" }; /* Match MAX_RCU_LVLS */
- static char *fqs[] = { "rcu_node_fqs_0",
- "rcu_node_fqs_1",
- "rcu_node_fqs_2",
- "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
+ static const char * const buf[] = {
+ "rcu_node_0",
+ "rcu_node_1",
+ "rcu_node_2",
+ "rcu_node_3" }; /* Match MAX_RCU_LVLS */
+ static const char * const fqs[] = {
+ "rcu_node_fqs_0",
+ "rcu_node_fqs_1",
+ "rcu_node_fqs_2",
+ "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
+ static u8 fl_mask = 0x1;
int cpustride = 1;
int i;
int j;
@@ -3331,6 +3590,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
for (i = 1; i < rcu_num_lvls; i++)
rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
rcu_init_levelspread(rsp);
+ rsp->flavor_mask = fl_mask;
+ fl_mask <<= 1;
/* Initialize the elements themselves, starting from the leaves. */
@@ -3350,8 +3611,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
rnp->qsmaskinit = 0;
rnp->grplo = j * cpustride;
rnp->grphi = (j + 1) * cpustride - 1;
- if (rnp->grphi >= NR_CPUS)
- rnp->grphi = NR_CPUS - 1;
+ if (rnp->grphi >= nr_cpu_ids)
+ rnp->grphi = nr_cpu_ids - 1;
if (i == 0) {
rnp->grpnum = 0;
rnp->grpmask = 0;
@@ -3370,7 +3631,6 @@ static void __init rcu_init_one(struct rcu_state *rsp,
rsp->rda = rda;
init_waitqueue_head(&rsp->gp_wq);
- init_irq_work(&rsp->wakeup_work, rsp_wakeup);
rnp = rsp->level[rcu_num_lvls - 1];
for_each_possible_cpu(i) {
while (i > rnp->grphi)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 8c19873f1ac9..6a86eb7bac45 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -13,8 +13,8 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright IBM Corporation, 2008
*
@@ -172,6 +172,14 @@ struct rcu_node {
/* queued on this rcu_node structure that */
/* are blocking the current grace period, */
/* there can be no such task. */
+ struct completion boost_completion;
+ /* Used to ensure that the rt_mutex used */
+ /* to carry out the boosting is fully */
+ /* released with no future boostee accesses */
+ /* before that rt_mutex is re-initialized. */
+ struct rt_mutex boost_mtx;
+ /* Used only for the priority-boosting */
+ /* side effect, not as a lock. */
unsigned long boost_time;
/* When to start boosting (jiffies). */
struct task_struct *boost_kthread_task;
@@ -252,7 +260,6 @@ struct rcu_data {
bool passed_quiesce; /* User-mode/idle loop etc. */
bool qs_pending; /* Core waits for quiesc state. */
bool beenonline; /* CPU online at least once. */
- bool preemptible; /* Preemptible RCU? */
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
#ifdef CONFIG_RCU_CPU_STALL_INFO
@@ -308,6 +315,9 @@ struct rcu_data {
/* 4) reasons this CPU needed to be kicked by force_quiescent_state */
unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
unsigned long offline_fqs; /* Kicked due to being offline. */
+ unsigned long cond_resched_completed;
+ /* Grace period that needs help */
+ /* from cond_resched(). */
/* 5) __rcu_pending() statistics. */
unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
@@ -332,11 +342,29 @@ struct rcu_data {
struct rcu_head **nocb_tail;
atomic_long_t nocb_q_count; /* # CBs waiting for kthread */
atomic_long_t nocb_q_count_lazy; /* (approximate). */
+ struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
+ struct rcu_head **nocb_follower_tail;
+ atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */
+ atomic_long_t nocb_follower_count_lazy; /* (approximate). */
int nocb_p_count; /* # CBs being invoked by kthread */
int nocb_p_count_lazy; /* (approximate). */
wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
struct task_struct *nocb_kthread;
bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
+
+ /* The following fields are used by the leader, hence own cacheline. */
+ struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
+ /* CBs waiting for GP. */
+ struct rcu_head **nocb_gp_tail;
+ long nocb_gp_count;
+ long nocb_gp_count_lazy;
+ bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */
+ struct rcu_data *nocb_next_follower;
+ /* Next follower in wakeup chain. */
+
+ /* The following fields are used by the follower, hence new cachline. */
+ struct rcu_data *nocb_leader ____cacheline_internodealigned_in_smp;
+ /* Leader CPU takes GP-end wakeups. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
/* 8) RCU CPU stall data. */
@@ -393,6 +421,7 @@ struct rcu_state {
struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
+ u8 flavor_mask; /* bit in flavor mask. */
struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
void (*func)(struct rcu_head *head));
@@ -406,7 +435,8 @@ struct rcu_state {
unsigned long completed; /* # of last completed gp. */
struct task_struct *gp_kthread; /* Task for grace periods. */
wait_queue_head_t gp_wq; /* Where GP task waits. */
- int gp_flags; /* Commands for GP task. */
+ short gp_flags; /* Commands for GP task. */
+ short gp_state; /* GP kthread sleep state. */
/* End of fields guarded by root rcu_node's lock. */
@@ -462,13 +492,17 @@ struct rcu_state {
const char *name; /* Name of structure. */
char abbr; /* Abbreviated name. */
struct list_head flavors; /* List of RCU flavors. */
- struct irq_work wakeup_work; /* Postponed wakeups */
};
/* Values for rcu_state structure's gp_flags field. */
#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */
#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */
+/* Values for rcu_state structure's gp_flags field. */
+#define RCU_GP_WAIT_INIT 0 /* Initial state. */
+#define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */
+#define RCU_GP_WAIT_FQS 2 /* Wait for force-quiescent-state time. */
+
extern struct list_head rcu_struct_flavors;
/* Sequence through rcu_state structures for each RCU flavor. */
@@ -547,7 +581,6 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
static void print_cpu_stall_info_end(void);
static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static void increment_cpu_stall_ticks(void);
-static int rcu_nocb_needs_gp(struct rcu_state *rsp);
static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
static void rcu_init_one_nocb(struct rcu_node *rnp);
@@ -560,7 +593,7 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
-static void rcu_kick_nohz_cpu(int cpu);
+static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
static bool init_nocb_callback_list(struct rcu_data *rdp);
static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
@@ -580,8 +613,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
/* Sum up queue lengths for tracing. */
static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
{
- *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count;
- *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy;
+ *ql = atomic_long_read(&rdp->nocb_q_count) +
+ rdp->nocb_p_count +
+ atomic_long_read(&rdp->nocb_follower_count) +
+ rdp->nocb_p_count + rdp->nocb_gp_count;
+ *qll = atomic_long_read(&rdp->nocb_q_count_lazy) +
+ rdp->nocb_p_count_lazy +
+ atomic_long_read(&rdp->nocb_follower_count_lazy) +
+ rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy;
}
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 6e2ef4b2b920..a7997e272564 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -14,8 +14,8 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright Red Hat, 2009
* Copyright IBM Corporation, 2009
@@ -33,6 +33,7 @@
#define RCU_KTHREAD_PRIO 1
#ifdef CONFIG_RCU_BOOST
+#include "../locking/rtmutex_common.h"
#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
#else
#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
@@ -116,7 +117,7 @@ static void __init rcu_bootup_announce_oddness(void)
#ifdef CONFIG_TREE_PREEMPT_RCU
RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
-static struct rcu_state *rcu_state = &rcu_preempt_state;
+static struct rcu_state *rcu_state_p = &rcu_preempt_state;
static int rcu_preempted_readers_exp(struct rcu_node *rnp);
@@ -149,15 +150,6 @@ long rcu_batches_completed(void)
EXPORT_SYMBOL_GPL(rcu_batches_completed);
/*
- * Force a quiescent state for preemptible RCU.
- */
-void rcu_force_quiescent_state(void)
-{
- force_quiescent_state(&rcu_preempt_state);
-}
-EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
-
-/*
* Record a preemptible-RCU quiescent state for the specified CPU. Note
* that this just means that the task currently running on the CPU is
* not in a quiescent state. There might be any number of tasks blocked
@@ -345,7 +337,7 @@ void rcu_read_unlock_special(struct task_struct *t)
unsigned long flags;
struct list_head *np;
#ifdef CONFIG_RCU_BOOST
- struct rt_mutex *rbmp = NULL;
+ bool drop_boost_mutex = false;
#endif /* #ifdef CONFIG_RCU_BOOST */
struct rcu_node *rnp;
int special;
@@ -407,11 +399,8 @@ void rcu_read_unlock_special(struct task_struct *t)
#ifdef CONFIG_RCU_BOOST
if (&t->rcu_node_entry == rnp->boost_tasks)
rnp->boost_tasks = np;
- /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */
- if (t->rcu_boost_mutex) {
- rbmp = t->rcu_boost_mutex;
- t->rcu_boost_mutex = NULL;
- }
+ /* Snapshot ->boost_mtx ownership with rcu_node lock held. */
+ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
#endif /* #ifdef CONFIG_RCU_BOOST */
/*
@@ -436,8 +425,10 @@ void rcu_read_unlock_special(struct task_struct *t)
#ifdef CONFIG_RCU_BOOST
/* Unboost if we were boosted. */
- if (rbmp)
- rt_mutex_unlock(rbmp);
+ if (drop_boost_mutex) {
+ rt_mutex_unlock(&rnp->boost_mtx);
+ complete(&rnp->boost_completion);
+ }
#endif /* #ifdef CONFIG_RCU_BOOST */
/*
@@ -688,20 +679,6 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
}
EXPORT_SYMBOL_GPL(call_rcu);
-/*
- * Queue an RCU callback for lazy invocation after a grace period.
- * This will likely be later named something like "call_rcu_lazy()",
- * but this change will require some way of tagging the lazy RCU
- * callbacks in the list of pending callbacks. Until then, this
- * function may only be called from __kfree_rcu().
- */
-void kfree_call_rcu(struct rcu_head *head,
- void (*func)(struct rcu_head *rcu))
-{
- __call_rcu(head, func, &rcu_preempt_state, -1, 1);
-}
-EXPORT_SYMBOL_GPL(kfree_call_rcu);
-
/**
* synchronize_rcu - wait until a grace period has elapsed.
*
@@ -970,7 +947,7 @@ void exit_rcu(void)
#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
-static struct rcu_state *rcu_state = &rcu_sched_state;
+static struct rcu_state *rcu_state_p = &rcu_sched_state;
/*
* Tell them what RCU they are running.
@@ -991,16 +968,6 @@ long rcu_batches_completed(void)
EXPORT_SYMBOL_GPL(rcu_batches_completed);
/*
- * Force a quiescent state for RCU, which, because there is no preemptible
- * RCU, becomes the same as rcu-sched.
- */
-void rcu_force_quiescent_state(void)
-{
- rcu_sched_force_quiescent_state();
-}
-EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
-
-/*
* Because preemptible RCU does not exist, we never have to check for
* CPUs being in quiescent states.
*/
@@ -1021,6 +988,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
/* Because preemptible RCU does not exist, no quieting of tasks. */
static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
{
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
@@ -1080,22 +1048,6 @@ static void rcu_preempt_check_callbacks(int cpu)
}
/*
- * Queue an RCU callback for lazy invocation after a grace period.
- * This will likely be later named something like "call_rcu_lazy()",
- * but this change will require some way of tagging the lazy RCU
- * callbacks in the list of pending callbacks. Until then, this
- * function may only be called from __kfree_rcu().
- *
- * Because there is no preemptible RCU, we use RCU-sched instead.
- */
-void kfree_call_rcu(struct rcu_head *head,
- void (*func)(struct rcu_head *rcu))
-{
- __call_rcu(head, func, &rcu_sched_state, -1, 1);
-}
-EXPORT_SYMBOL_GPL(kfree_call_rcu);
-
-/*
* Wait for an rcu-preempt grace period, but make it happen quickly.
* But because preemptible RCU does not exist, map to rcu-sched.
*/
@@ -1198,7 +1150,6 @@ static void rcu_wake_cond(struct task_struct *t, int status)
static int rcu_boost(struct rcu_node *rnp)
{
unsigned long flags;
- struct rt_mutex mtx;
struct task_struct *t;
struct list_head *tb;
@@ -1249,11 +1200,15 @@ static int rcu_boost(struct rcu_node *rnp)
* section.
*/
t = container_of(tb, struct task_struct, rcu_node_entry);
- rt_mutex_init_proxy_locked(&mtx, t);
- t->rcu_boost_mutex = &mtx;
+ rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
+ init_completion(&rnp->boost_completion);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
- rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
- rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
+ /* Lock only for side effect: boosts task t's priority. */
+ rt_mutex_lock(&rnp->boost_mtx);
+ rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
+
+ /* Wait for boostee to be done w/boost_mtx before reinitializing. */
+ wait_for_completion(&rnp->boost_completion);
return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
ACCESS_ONCE(rnp->boost_tasks) != NULL;
@@ -1305,6 +1260,7 @@ static int rcu_boost_kthread(void *arg)
* about it going away.
*/
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
{
struct task_struct *t;
@@ -1517,11 +1473,11 @@ static int __init rcu_spawn_kthreads(void)
for_each_possible_cpu(cpu)
per_cpu(rcu_cpu_has_work, cpu) = 0;
BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
- rnp = rcu_get_root(rcu_state);
- (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
+ rnp = rcu_get_root(rcu_state_p);
+ (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
if (NUM_RCU_NODES > 1) {
- rcu_for_each_leaf_node(rcu_state, rnp)
- (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
+ rcu_for_each_leaf_node(rcu_state_p, rnp)
+ (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
}
return 0;
}
@@ -1529,17 +1485,18 @@ early_initcall(rcu_spawn_kthreads);
static void rcu_prepare_kthreads(int cpu)
{
- struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
+ struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
struct rcu_node *rnp = rdp->mynode;
/* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
if (rcu_scheduler_fully_active)
- (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
+ (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
}
#else /* #ifdef CONFIG_RCU_BOOST */
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
{
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
@@ -1586,11 +1543,13 @@ static void rcu_prepare_kthreads(int cpu)
* Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs
* any flavor of RCU.
*/
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
{
*delta_jiffies = ULONG_MAX;
return rcu_cpu_has_callbacks(cpu, NULL);
}
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
/*
* Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
@@ -1656,7 +1615,7 @@ extern int tick_nohz_active;
* only if it has been awhile since the last time we did so. Afterwards,
* if there are any callbacks ready for immediate invocation, return true.
*/
-static bool rcu_try_advance_all_cbs(void)
+static bool __maybe_unused rcu_try_advance_all_cbs(void)
{
bool cbs_ready = false;
struct rcu_data *rdp;
@@ -1696,6 +1655,7 @@ static bool rcu_try_advance_all_cbs(void)
*
* The caller must have disabled interrupts.
*/
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
int rcu_needs_cpu(int cpu, unsigned long *dj)
{
struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
@@ -1726,6 +1686,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
}
return 0;
}
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
/*
* Prepare a CPU for idle from an RCU perspective. The first major task
@@ -1739,6 +1700,8 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
*/
static void rcu_prepare_for_idle(int cpu)
{
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
+ bool needwake;
struct rcu_data *rdp;
struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
struct rcu_node *rnp;
@@ -1787,9 +1750,12 @@ static void rcu_prepare_for_idle(int cpu)
rnp = rdp->mynode;
raw_spin_lock(&rnp->lock); /* irqs already disabled. */
smp_mb__after_unlock_lock();
- rcu_accelerate_cbs(rsp, rnp, rdp);
+ needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
}
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
}
/*
@@ -1799,11 +1765,12 @@ static void rcu_prepare_for_idle(int cpu)
*/
static void rcu_cleanup_after_idle(int cpu)
{
-
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
if (rcu_is_nocb_cpu(cpu))
return;
if (rcu_try_advance_all_cbs())
invoke_rcu_core();
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
}
/*
@@ -1848,7 +1815,7 @@ static void rcu_oom_notify_cpu(void *unused)
struct rcu_data *rdp;
for_each_rcu_flavor(rsp) {
- rdp = __this_cpu_ptr(rsp->rda);
+ rdp = raw_cpu_ptr(rsp->rda);
if (rdp->qlen_lazy != 0) {
atomic_inc(&oom_callback_count);
rsp->call(&rdp->oom_head, rcu_oom_callback);
@@ -1990,7 +1957,7 @@ static void increment_cpu_stall_ticks(void)
struct rcu_state *rsp;
for_each_rcu_flavor(rsp)
- __this_cpu_ptr(rsp->rda)->ticks_this_gp++;
+ raw_cpu_inc(rsp->rda->ticks_this_gp);
}
#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
@@ -2061,19 +2028,6 @@ static int __init parse_rcu_nocb_poll(char *arg)
early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
/*
- * Do any no-CBs CPUs need another grace period?
- *
- * Interrupts must be disabled. If the caller does not hold the root
- * rnp_node structure's ->lock, the results are advisory only.
- */
-static int rcu_nocb_needs_gp(struct rcu_state *rsp)
-{
- struct rcu_node *rnp = rcu_get_root(rsp);
-
- return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1];
-}
-
-/*
* Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
* grace period.
*/
@@ -2101,13 +2055,31 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
init_waitqueue_head(&rnp->nocb_gp_wq[1]);
}
-/* Is the specified CPU a no-CPUs CPU? */
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
+/* Is the specified CPU a no-CBs CPU? */
bool rcu_is_nocb_cpu(int cpu)
{
if (have_rcu_nocb_mask)
return cpumask_test_cpu(cpu, rcu_nocb_mask);
return false;
}
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
+
+/*
+ * Kick the leader kthread for this NOCB group.
+ */
+static void wake_nocb_leader(struct rcu_data *rdp, bool force)
+{
+ struct rcu_data *rdp_leader = rdp->nocb_leader;
+
+ if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
+ return;
+ if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) {
+ /* Prior xchg orders against prior callback enqueue. */
+ ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false;
+ wake_up(&rdp_leader->nocb_wq);
+ }
+}
/*
* Enqueue the specified string of rcu_head structures onto the specified
@@ -2143,7 +2115,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
len = atomic_long_read(&rdp->nocb_q_count);
if (old_rhpp == &rdp->nocb_head) {
if (!irqs_disabled_flags(flags)) {
- wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */
+ /* ... if queue was empty ... */
+ wake_nocb_leader(rdp, false);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeEmpty"));
} else {
@@ -2153,7 +2126,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
}
rdp->qlen_last_fqs_check = 0;
} else if (len > rdp->qlen_last_fqs_check + qhimark) {
- wake_up_process(t); /* ... or if many callbacks queued. */
+ /* ... or if many callbacks queued. */
+ wake_nocb_leader(rdp, true);
rdp->qlen_last_fqs_check = LONG_MAX / 2;
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf"));
} else {
@@ -2234,12 +2208,15 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
unsigned long c;
bool d;
unsigned long flags;
+ bool needwake;
struct rcu_node *rnp = rdp->mynode;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
- c = rcu_start_future_gp(rnp, rdp);
+ needwake = rcu_start_future_gp(rnp, rdp, &c);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ if (needwake)
+ rcu_gp_kthread_wake(rdp->rsp);
/*
* Wait for the grace period. Do so interruptibly to avoid messing
@@ -2260,13 +2237,150 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
}
/*
+ * Leaders come here to wait for additional callbacks to show up.
+ * This function does not return until callbacks appear.
+ */
+static void nocb_leader_wait(struct rcu_data *my_rdp)
+{
+ bool firsttime = true;
+ bool gotcbs;
+ struct rcu_data *rdp;
+ struct rcu_head **tail;
+
+wait_again:
+
+ /* Wait for callbacks to appear. */
+ if (!rcu_nocb_poll) {
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
+ wait_event_interruptible(my_rdp->nocb_wq,
+ !ACCESS_ONCE(my_rdp->nocb_leader_sleep));
+ /* Memory barrier handled by smp_mb() calls below and repoll. */
+ } else if (firsttime) {
+ firsttime = false; /* Don't drown trace log with "Poll"! */
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll");
+ }
+
+ /*
+ * Each pass through the following loop checks a follower for CBs.
+ * We are our own first follower. Any CBs found are moved to
+ * nocb_gp_head, where they await a grace period.
+ */
+ gotcbs = false;
+ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
+ rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head);
+ if (!rdp->nocb_gp_head)
+ continue; /* No CBs here, try next follower. */
+
+ /* Move callbacks to wait-for-GP list, which is empty. */
+ ACCESS_ONCE(rdp->nocb_head) = NULL;
+ rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
+ rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0);
+ rdp->nocb_gp_count_lazy =
+ atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
+ gotcbs = true;
+ }
+
+ /*
+ * If there were no callbacks, sleep a bit, rescan after a
+ * memory barrier, and go retry.
+ */
+ if (unlikely(!gotcbs)) {
+ if (!rcu_nocb_poll)
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
+ "WokeEmpty");
+ flush_signals(current);
+ schedule_timeout_interruptible(1);
+
+ /* Rescan in case we were a victim of memory ordering. */
+ my_rdp->nocb_leader_sleep = true;
+ smp_mb(); /* Ensure _sleep true before scan. */
+ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
+ if (ACCESS_ONCE(rdp->nocb_head)) {
+ /* Found CB, so short-circuit next wait. */
+ my_rdp->nocb_leader_sleep = false;
+ break;
+ }
+ goto wait_again;
+ }
+
+ /* Wait for one grace period. */
+ rcu_nocb_wait_gp(my_rdp);
+
+ /*
+ * We left ->nocb_leader_sleep unset to reduce cache thrashing.
+ * We set it now, but recheck for new callbacks while
+ * traversing our follower list.
+ */
+ my_rdp->nocb_leader_sleep = true;
+ smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */
+
+ /* Each pass through the following loop wakes a follower, if needed. */
+ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
+ if (ACCESS_ONCE(rdp->nocb_head))
+ my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
+ if (!rdp->nocb_gp_head)
+ continue; /* No CBs, so no need to wake follower. */
+
+ /* Append callbacks to follower's "done" list. */
+ tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
+ *tail = rdp->nocb_gp_head;
+ atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count);
+ atomic_long_add(rdp->nocb_gp_count_lazy,
+ &rdp->nocb_follower_count_lazy);
+ if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
+ /*
+ * List was empty, wake up the follower.
+ * Memory barriers supplied by atomic_long_add().
+ */
+ wake_up(&rdp->nocb_wq);
+ }
+ }
+
+ /* If we (the leader) don't have CBs, go wait some more. */
+ if (!my_rdp->nocb_follower_head)
+ goto wait_again;
+}
+
+/*
+ * Followers come here to wait for additional callbacks to show up.
+ * This function does not return until callbacks appear.
+ */
+static void nocb_follower_wait(struct rcu_data *rdp)
+{
+ bool firsttime = true;
+
+ for (;;) {
+ if (!rcu_nocb_poll) {
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ "FollowerSleep");
+ wait_event_interruptible(rdp->nocb_wq,
+ ACCESS_ONCE(rdp->nocb_follower_head));
+ } else if (firsttime) {
+ /* Don't drown trace log with "Poll"! */
+ firsttime = false;
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll");
+ }
+ if (smp_load_acquire(&rdp->nocb_follower_head)) {
+ /* ^^^ Ensure CB invocation follows _head test. */
+ return;
+ }
+ if (!rcu_nocb_poll)
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ "WokeEmpty");
+ flush_signals(current);
+ schedule_timeout_interruptible(1);
+ }
+}
+
+/*
* Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes
- * callbacks queued by the corresponding no-CBs CPU.
+ * callbacks queued by the corresponding no-CBs CPU, however, there is
+ * an optional leader-follower relationship so that the grace-period
+ * kthreads don't have to do quite so many wakeups.
*/
static int rcu_nocb_kthread(void *arg)
{
int c, cl;
- bool firsttime = 1;
struct rcu_head *list;
struct rcu_head *next;
struct rcu_head **tail;
@@ -2274,41 +2388,22 @@ static int rcu_nocb_kthread(void *arg)
/* Each pass through this loop invokes one batch of callbacks */
for (;;) {
- /* If not polling, wait for next batch of callbacks. */
- if (!rcu_nocb_poll) {
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("Sleep"));
- wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
- /* Memory barrier provide by xchg() below. */
- } else if (firsttime) {
- firsttime = 0;
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("Poll"));
- }
- list = ACCESS_ONCE(rdp->nocb_head);
- if (!list) {
- if (!rcu_nocb_poll)
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WokeEmpty"));
- schedule_timeout_interruptible(1);
- flush_signals(current);
- continue;
- }
- firsttime = 1;
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WokeNonEmpty"));
-
- /*
- * Extract queued callbacks, update counts, and wait
- * for a grace period to elapse.
- */
- ACCESS_ONCE(rdp->nocb_head) = NULL;
- tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
- c = atomic_long_xchg(&rdp->nocb_q_count, 0);
- cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
- ACCESS_ONCE(rdp->nocb_p_count) += c;
- ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
- rcu_nocb_wait_gp(rdp);
+ /* Wait for callbacks. */
+ if (rdp->nocb_leader == rdp)
+ nocb_leader_wait(rdp);
+ else
+ nocb_follower_wait(rdp);
+
+ /* Pull the ready-to-invoke callbacks onto local list. */
+ list = ACCESS_ONCE(rdp->nocb_follower_head);
+ BUG_ON(!list);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
+ ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
+ tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
+ c = atomic_long_xchg(&rdp->nocb_follower_count, 0);
+ cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0);
+ rdp->nocb_p_count += c;
+ rdp->nocb_p_count_lazy += cl;
/* Each pass through the following loop invokes a callback. */
trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
@@ -2352,7 +2447,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
if (!rcu_nocb_need_deferred_wakeup(rdp))
return;
ACCESS_ONCE(rdp->nocb_defer_wakeup) = false;
- wake_up(&rdp->nocb_wq);
+ wake_nocb_leader(rdp, false);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty"));
}
@@ -2361,19 +2456,57 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
rdp->nocb_tail = &rdp->nocb_head;
init_waitqueue_head(&rdp->nocb_wq);
+ rdp->nocb_follower_tail = &rdp->nocb_follower_head;
}
-/* Create a kthread for each RCU flavor for each no-CBs CPU. */
+/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */
+static int rcu_nocb_leader_stride = -1;
+module_param(rcu_nocb_leader_stride, int, 0444);
+
+/*
+ * Create a kthread for each RCU flavor for each no-CBs CPU.
+ * Also initialize leader-follower relationships.
+ */
static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
{
int cpu;
+ int ls = rcu_nocb_leader_stride;
+ int nl = 0; /* Next leader. */
struct rcu_data *rdp;
+ struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */
+ struct rcu_data *rdp_prev = NULL;
struct task_struct *t;
if (rcu_nocb_mask == NULL)
return;
+#if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL)
+ if (tick_nohz_full_running)
+ cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
+#endif /* #if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) */
+ if (ls == -1) {
+ ls = int_sqrt(nr_cpu_ids);
+ rcu_nocb_leader_stride = ls;
+ }
+
+ /*
+ * Each pass through this loop sets up one rcu_data structure and
+ * spawns one rcu_nocb_kthread().
+ */
for_each_cpu(cpu, rcu_nocb_mask) {
rdp = per_cpu_ptr(rsp->rda, cpu);
+ if (rdp->cpu >= nl) {
+ /* New leader, set up for followers & next leader. */
+ nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
+ rdp->nocb_leader = rdp;
+ rdp_leader = rdp;
+ } else {
+ /* Another follower, link to previous leader. */
+ rdp->nocb_leader = rdp_leader;
+ rdp_prev->nocb_next_follower = rdp;
+ }
+ rdp_prev = rdp;
+
+ /* Spawn the kthread for this CPU. */
t = kthread_run(rcu_nocb_kthread, rdp,
"rcuo%c/%d", rsp->abbr, cpu);
BUG_ON(IS_ERR(t));
@@ -2393,11 +2526,6 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
-static int rcu_nocb_needs_gp(struct rcu_state *rsp)
-{
- return 0;
-}
-
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
{
}
@@ -2456,7 +2584,7 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
* if an adaptive-ticks CPU is failing to respond to the current grace
* period and has not be idle from an RCU perspective, kick it.
*/
-static void rcu_kick_nohz_cpu(int cpu)
+static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
{
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_cpu(cpu))
@@ -2514,9 +2642,9 @@ static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
/* Record start of fully idle period. */
j = jiffies;
ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
- smp_mb__before_atomic_inc();
+ smp_mb__before_atomic();
atomic_inc(&rdtp->dynticks_idle);
- smp_mb__after_atomic_inc();
+ smp_mb__after_atomic();
WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1);
}
@@ -2581,9 +2709,9 @@ static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
}
/* Record end of idle period. */
- smp_mb__before_atomic_inc();
+ smp_mb__before_atomic();
atomic_inc(&rdtp->dynticks_idle);
- smp_mb__after_atomic_inc();
+ smp_mb__after_atomic();
WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
/*
@@ -2648,20 +2776,6 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp)
}
/*
- * Bind the grace-period kthread for the sysidle flavor of RCU to the
- * timekeeping CPU.
- */
-static void rcu_bind_gp_kthread(void)
-{
- int cpu = ACCESS_ONCE(tick_do_timer_cpu);
-
- if (cpu < 0 || cpu >= nr_cpu_ids)
- return;
- if (raw_smp_processor_id() != cpu)
- set_cpus_allowed_ptr(current, cpumask_of(cpu));
-}
-
-/*
* Return a delay in jiffies based on the number of CPUs, rcu_node
* leaf fanout, and jiffies tick rate. The idea is to allow larger
* systems more time to transition to full-idle state in order to
@@ -2725,7 +2839,8 @@ static void rcu_sysidle(unsigned long j)
static void rcu_sysidle_cancel(void)
{
smp_mb();
- ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
+ if (full_sysidle_state > RCU_SYSIDLE_SHORT)
+ ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
}
/*
@@ -2871,10 +2986,6 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp)
return false;
}
-static void rcu_bind_gp_kthread(void)
-{
-}
-
static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
unsigned long maxj)
{
@@ -2893,7 +3004,7 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
* CPU unless the grace period has extended for too long.
*
* This code relies on the fact that all NO_HZ_FULL CPUs are also
- * CONFIG_RCU_NOCB_CPUs.
+ * CONFIG_RCU_NOCB_CPU CPUs.
*/
static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
{
@@ -2905,3 +3016,23 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
#endif /* #ifdef CONFIG_NO_HZ_FULL */
return 0;
}
+
+/*
+ * Bind the grace-period kthread for the sysidle flavor of RCU to the
+ * timekeeping CPU.
+ */
+static void rcu_bind_gp_kthread(void)
+{
+ int __maybe_unused cpu;
+
+ if (!tick_nohz_full_enabled())
+ return;
+#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
+ cpu = tick_do_timer_cpu;
+ if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu)
+ set_cpus_allowed_ptr(current, cpumask_of(cpu));
+#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+ if (!is_housekeeping_cpu(raw_smp_processor_id()))
+ housekeeping_affine(current);
+#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+}
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 4def475336d4..5cdc62e1beeb 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -12,8 +12,8 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright IBM Corporation, 2008
*
@@ -273,7 +273,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
rsp->n_force_qs, rsp->n_force_qs_ngp,
rsp->n_force_qs - rsp->n_force_qs_ngp,
- rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen);
+ ACCESS_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen);
for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
if (rnp->level != level) {
seq_puts(m, "\n");
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index c54609faf233..4056d7992a6c 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -12,8 +12,8 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright IBM Corporation, 2001
*
@@ -49,7 +49,6 @@
#include <linux/module.h>
#define CREATE_TRACE_POINTS
-#include <trace/events/rcu.h>
#include "rcu.h"
@@ -91,9 +90,6 @@ void __rcu_read_unlock(void)
} else {
barrier(); /* critical section before exit code. */
t->rcu_read_lock_nesting = INT_MIN;
-#ifdef CONFIG_PROVE_RCU_DELAY
- udelay(10); /* Make preemption more probable. */
-#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
barrier(); /* assign before ->rcu_read_unlock_special load */
if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
rcu_read_unlock_special(t);
@@ -201,12 +197,12 @@ void wait_rcu_gp(call_rcu_func_t crf)
EXPORT_SYMBOL_GPL(wait_rcu_gp);
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
-static inline void debug_init_rcu_head(struct rcu_head *head)
+void init_rcu_head(struct rcu_head *head)
{
debug_object_init(head, &rcuhead_debug_descr);
}
-static inline void debug_rcu_head_free(struct rcu_head *head)
+void destroy_rcu_head(struct rcu_head *head)
{
debug_object_free(head, &rcuhead_debug_descr);
}
@@ -321,6 +317,18 @@ int rcu_jiffies_till_stall_check(void)
return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
}
+void rcu_sysrq_start(void)
+{
+ if (!rcu_cpu_stall_suppress)
+ rcu_cpu_stall_suppress = 2;
+}
+
+void rcu_sysrq_end(void)
+{
+ if (rcu_cpu_stall_suppress == 2)
+ rcu_cpu_stall_suppress = 0;
+}
+
static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
{
rcu_cpu_stall_suppress = 1;
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 662c83fc16b7..a3a9e240fcdb 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -388,15 +388,22 @@ static int __init reboot_setup(char *str)
break;
case 's':
- if (isdigit(*(str+1)))
- reboot_cpu = simple_strtoul(str+1, NULL, 0);
- else if (str[1] == 'm' && str[2] == 'p' &&
- isdigit(*(str+3)))
- reboot_cpu = simple_strtoul(str+3, NULL, 0);
- else
+ {
+ int rc;
+
+ if (isdigit(*(str+1))) {
+ rc = kstrtoint(str+1, 0, &reboot_cpu);
+ if (rc)
+ return rc;
+ } else if (str[1] == 'm' && str[2] == 'p' &&
+ isdigit(*(str+3))) {
+ rc = kstrtoint(str+3, 0, &reboot_cpu);
+ if (rc)
+ return rc;
+ } else
reboot_mode = REBOOT_SOFT;
break;
-
+ }
case 'g':
reboot_mode = REBOOT_GPIO;
break;
diff --git a/kernel/relay.c b/kernel/relay.c
index 5001c9887db1..5a56d3c8dc03 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -227,7 +227,7 @@ static void relay_destroy_buf(struct rchan_buf *buf)
* relay_remove_buf - remove a channel buffer
* @kref: target kernel reference that contains the relay buffer
*
- * Removes the file from the fileystem, which also frees the
+ * Removes the file from the filesystem, which also frees the
* rchan_buf_struct and the channel buffer. Should only be called from
* kref_put().
*/
@@ -1195,8 +1195,6 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
static const struct pipe_buf_operations relay_pipe_buf_ops = {
.can_merge = 0,
- .map = generic_pipe_buf_map,
- .unmap = generic_pipe_buf_unmap,
.confirm = generic_pipe_buf_confirm,
.release = relay_pipe_buf_release,
.steal = generic_pipe_buf_steal,
@@ -1253,7 +1251,7 @@ static ssize_t subbuf_splice_actor(struct file *in,
subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
pidx = (read_start / PAGE_SIZE) % subbuf_pages;
poff = read_start & ~PAGE_MASK;
- nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers);
+ nr_pages = min_t(unsigned int, subbuf_pages, spd.nr_pages_max);
for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
unsigned int this_len, this_end, private;
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 4aa8a305aede..e791130f85a7 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -22,8 +22,18 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
counter->parent = parent;
}
-int res_counter_charge_locked(struct res_counter *counter, unsigned long val,
- bool force)
+static u64 res_counter_uncharge_locked(struct res_counter *counter,
+ unsigned long val)
+{
+ if (WARN_ON(counter->usage < val))
+ val = counter->usage;
+
+ counter->usage -= val;
+ return counter->usage;
+}
+
+static int res_counter_charge_locked(struct res_counter *counter,
+ unsigned long val, bool force)
{
int ret = 0;
@@ -86,15 +96,6 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
return __res_counter_charge(counter, val, limit_fail_at, true);
}
-u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
-{
- if (WARN_ON(counter->usage < val))
- val = counter->usage;
-
- counter->usage -= val;
- return counter->usage;
-}
-
u64 res_counter_uncharge_until(struct res_counter *counter,
struct res_counter *top,
unsigned long val)
@@ -185,8 +186,11 @@ int res_counter_memparse_write_strategy(const char *buf,
/* return RES_COUNTER_MAX(unlimited) if "-1" is specified */
if (*buf == '-') {
- res = simple_strtoull(buf + 1, &end, 10);
- if (res != 1 || *end != '\0')
+ int rc = kstrtoull(buf + 1, 10, &res);
+
+ if (rc)
+ return rc;
+ if (res != 1)
return -EINVAL;
*resp = RES_COUNTER_MAX;
return 0;
diff --git a/kernel/resource.c b/kernel/resource.c
index 3f285dce9347..60c5a3856ab7 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -59,10 +59,12 @@ static DEFINE_RWLOCK(resource_lock);
static struct resource *bootmem_resource_free;
static DEFINE_SPINLOCK(bootmem_resource_lock);
-static void *r_next(struct seq_file *m, void *v, loff_t *pos)
+static struct resource *next_resource(struct resource *p, bool sibling_only)
{
- struct resource *p = v;
- (*pos)++;
+ /* Caller wants to traverse through siblings only */
+ if (sibling_only)
+ return p->sibling;
+
if (p->child)
return p->child;
while (!p->sibling && p->parent)
@@ -70,6 +72,13 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos)
return p->sibling;
}
+static void *r_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct resource *p = v;
+ (*pos)++;
+ return (void *)next_resource(p, false);
+}
+
#ifdef CONFIG_PROC_FS
enum { MAX_IORES_LEVEL = 5 };
@@ -322,16 +331,19 @@ int release_resource(struct resource *old)
EXPORT_SYMBOL(release_resource);
-#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
/*
- * Finds the lowest memory reosurce exists within [res->start.res->end)
+ * Finds the lowest iomem reosurce exists with-in [res->start.res->end)
* the caller must specify res->start, res->end, res->flags and "name".
* If found, returns 0, res is overwritten, if not found, returns -1.
+ * This walks through whole tree and not just first level children
+ * until and unless first_level_children_only is true.
*/
-static int find_next_system_ram(struct resource *res, char *name)
+static int find_next_iomem_res(struct resource *res, char *name,
+ bool first_level_children_only)
{
resource_size_t start, end;
struct resource *p;
+ bool sibling_only = false;
BUG_ON(!res);
@@ -339,9 +351,12 @@ static int find_next_system_ram(struct resource *res, char *name)
end = res->end;
BUG_ON(start >= end);
+ if (first_level_children_only)
+ sibling_only = true;
+
read_lock(&resource_lock);
- for (p = iomem_resource.child; p ; p = p->sibling) {
- /* system ram is just marked as IORESOURCE_MEM */
+
+ for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) {
if (p->flags != res->flags)
continue;
if (name && strcmp(p->name, name))
@@ -353,6 +368,7 @@ static int find_next_system_ram(struct resource *res, char *name)
if ((p->end >= start) && (p->start < end))
break;
}
+
read_unlock(&resource_lock);
if (!p)
return -1;
@@ -365,6 +381,70 @@ static int find_next_system_ram(struct resource *res, char *name)
}
/*
+ * Walks through iomem resources and calls func() with matching resource
+ * ranges. This walks through whole tree and not just first level children.
+ * All the memory ranges which overlap start,end and also match flags and
+ * name are valid candidates.
+ *
+ * @name: name of resource
+ * @flags: resource flags
+ * @start: start addr
+ * @end: end addr
+ */
+int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end,
+ void *arg, int (*func)(u64, u64, void *))
+{
+ struct resource res;
+ u64 orig_end;
+ int ret = -1;
+
+ res.start = start;
+ res.end = end;
+ res.flags = flags;
+ orig_end = res.end;
+ while ((res.start < res.end) &&
+ (!find_next_iomem_res(&res, name, false))) {
+ ret = (*func)(res.start, res.end, arg);
+ if (ret)
+ break;
+ res.start = res.end + 1;
+ res.end = orig_end;
+ }
+ return ret;
+}
+
+/*
+ * This function calls callback against all memory range of "System RAM"
+ * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
+ * Now, this function is only for "System RAM". This function deals with
+ * full ranges and not pfn. If resources are not pfn aligned, dealing
+ * with pfn can truncate ranges.
+ */
+int walk_system_ram_res(u64 start, u64 end, void *arg,
+ int (*func)(u64, u64, void *))
+{
+ struct resource res;
+ u64 orig_end;
+ int ret = -1;
+
+ res.start = start;
+ res.end = end;
+ res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ orig_end = res.end;
+ while ((res.start < res.end) &&
+ (!find_next_iomem_res(&res, "System RAM", true))) {
+ ret = (*func)(res.start, res.end, arg);
+ if (ret)
+ break;
+ res.start = res.end + 1;
+ res.end = orig_end;
+ }
+ return ret;
+}
+
+#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
+
+/*
* This function calls callback against all memory range of "System RAM"
* which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
* Now, this function is only for "System RAM".
@@ -382,7 +462,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
orig_end = res.end;
while ((res.start < res.end) &&
- (find_next_system_ram(&res, "System RAM") >= 0)) {
+ (find_next_iomem_res(&res, "System RAM", true) >= 0)) {
pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
end_pfn = (res.end + 1) >> PAGE_SHIFT;
if (end_pfn > pfn)
@@ -432,11 +512,6 @@ static void resource_clip(struct resource *res, resource_size_t min,
res->end = max;
}
-static bool resource_contains(struct resource *res1, struct resource *res2)
-{
- return res1->start <= res2->start && res1->end >= res2->end;
-}
-
/*
* Find empty slot in the resource tree with the given range and
* alignment constraints
@@ -471,10 +546,11 @@ static int __find_resource(struct resource *root, struct resource *old,
arch_remove_reservations(&tmp);
/* Check for overflow after ALIGN() */
- avail = *new;
avail.start = ALIGN(tmp.start, constraint->align);
avail.end = tmp.end;
+ avail.flags = new->flags & ~IORESOURCE_UNSET;
if (avail.start >= tmp.start) {
+ alloc.flags = avail.flags;
alloc.start = constraint->alignf(constraint->alignf_data, &avail,
size, constraint->align);
alloc.end = alloc.start + size - 1;
@@ -515,7 +591,7 @@ static int find_resource(struct resource *root, struct resource *new,
* @newsize: new size of the resource descriptor
* @constraint: the size and alignment constraints to be met.
*/
-int reallocate_resource(struct resource *root, struct resource *old,
+static int reallocate_resource(struct resource *root, struct resource *old,
resource_size_t newsize,
struct resource_constraint *constraint)
{
@@ -949,8 +1025,8 @@ struct resource * __request_region(struct resource *parent,
res->name = name;
res->start = start;
res->end = start + n - 1;
- res->flags = IORESOURCE_BUSY;
- res->flags |= flags;
+ res->flags = resource_type(parent);
+ res->flags |= IORESOURCE_BUSY | flags;
write_lock(&resource_lock);
@@ -1292,13 +1368,10 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
if (p->flags & IORESOURCE_BUSY)
continue;
- printk(KERN_WARNING "resource map sanity check conflict: "
- "0x%llx 0x%llx 0x%llx 0x%llx %s\n",
+ printk(KERN_WARNING "resource sanity check: requesting [mem %#010llx-%#010llx], which spans more than %s %pR\n",
(unsigned long long)addr,
(unsigned long long)(addr + size - 1),
- (unsigned long long)p->start,
- (unsigned long long)p->end,
- p->name);
+ p->name, p);
err = -1;
break;
}
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 9a95c8c2af2a..ab32b7b0db5c 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -13,7 +13,7 @@ endif
obj-y += core.o proc.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-y += wait.o completion.o
+obj-y += wait.o completion.o idle.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 4a073539c58e..e73efba98301 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -203,7 +203,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
struct autogroup *ag;
int err;
- if (nice < -20 || nice > 19)
+ if (nice < MIN_NICE || nice > MAX_NICE)
return -EINVAL;
err = security_task_setnice(current, nice);
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index b30a2924ef14..3ef6451e972e 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -60,13 +60,14 @@
#include <linux/sched.h>
#include <linux/static_key.h>
#include <linux/workqueue.h>
+#include <linux/compiler.h>
/*
* Scheduler clock - returns current time in nanosec units.
* This is default implementation.
* Architectures and sub-architectures can override this.
*/
-unsigned long long __attribute__((weak)) sched_clock(void)
+unsigned long long __weak sched_clock(void)
{
return (unsigned long long)(jiffies - INITIAL_JIFFIES)
* (NSEC_PER_SEC / HZ);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f5c6635b806c..ec1a286684a5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -73,6 +73,7 @@
#include <linux/init_task.h>
#include <linux/binfmts.h>
#include <linux/context_tracking.h>
+#include <linux/compiler.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -89,6 +90,22 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
+#ifdef smp_mb__before_atomic
+void __smp_mb__before_atomic(void)
+{
+ smp_mb__before_atomic();
+}
+EXPORT_SYMBOL(__smp_mb__before_atomic);
+#endif
+
+#ifdef smp_mb__after_atomic
+void __smp_mb__after_atomic(void)
+{
+ smp_mb__after_atomic();
+}
+EXPORT_SYMBOL(__smp_mb__after_atomic);
+#endif
+
void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
unsigned long delta;
@@ -122,6 +139,8 @@ void update_rq_clock(struct rq *rq)
return;
delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+ if (delta < 0)
+ return;
rq->clock += delta;
update_rq_clock_task(rq, delta);
}
@@ -226,6 +245,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
char buf[64];
char *cmp;
int i;
+ struct inode *inode;
if (cnt > 63)
cnt = 63;
@@ -236,7 +256,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
buf[cnt] = 0;
cmp = strstrip(buf);
+ /* Ensure the static_key remains in a consistent state */
+ inode = file_inode(filp);
+ mutex_lock(&inode->i_mutex);
i = sched_feat_set(cmp);
+ mutex_unlock(&inode->i_mutex);
if (i == __SCHED_FEAT_NR)
return -EINVAL;
@@ -432,7 +456,7 @@ void hrtick_start(struct rq *rq, u64 delay)
if (rq == this_rq()) {
__hrtick_restart(rq);
} else if (!rq->hrtick_csd_pending) {
- __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
+ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
rq->hrtick_csd_pending = 1;
}
}
@@ -505,33 +529,99 @@ static inline void init_hrtick(void)
#endif /* CONFIG_SCHED_HRTICK */
/*
- * resched_task - mark a task 'to be rescheduled now'.
+ * cmpxchg based fetch_or, macro so it works for different integer types
+ */
+#define fetch_or(ptr, val) \
+({ typeof(*(ptr)) __old, __val = *(ptr); \
+ for (;;) { \
+ __old = cmpxchg((ptr), __val, __val | (val)); \
+ if (__old == __val) \
+ break; \
+ __val = __old; \
+ } \
+ __old; \
+})
+
+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
+/*
+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
+ * this avoids any races wrt polling state changes and thereby avoids
+ * spurious IPIs.
+ */
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+ struct thread_info *ti = task_thread_info(p);
+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+}
+
+/*
+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
+ *
+ * If this returns true, then the idle task promises to call
+ * sched_ttwu_pending() and reschedule soon.
+ */
+static bool set_nr_if_polling(struct task_struct *p)
+{
+ struct thread_info *ti = task_thread_info(p);
+ typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+
+ for (;;) {
+ if (!(val & _TIF_POLLING_NRFLAG))
+ return false;
+ if (val & _TIF_NEED_RESCHED)
+ return true;
+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
+ if (old == val)
+ break;
+ val = old;
+ }
+ return true;
+}
+
+#else
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+ set_tsk_need_resched(p);
+ return true;
+}
+
+#ifdef CONFIG_SMP
+static bool set_nr_if_polling(struct task_struct *p)
+{
+ return false;
+}
+#endif
+#endif
+
+/*
+ * resched_curr - mark rq's current task 'to be rescheduled now'.
*
* On UP this means the setting of the need_resched flag, on SMP it
* might also involve a cross-CPU call to trigger the scheduler on
* the target CPU.
*/
-void resched_task(struct task_struct *p)
+void resched_curr(struct rq *rq)
{
+ struct task_struct *curr = rq->curr;
int cpu;
- lockdep_assert_held(&task_rq(p)->lock);
+ lockdep_assert_held(&rq->lock);
- if (test_tsk_need_resched(p))
+ if (test_tsk_need_resched(curr))
return;
- set_tsk_need_resched(p);
+ cpu = cpu_of(rq);
- cpu = task_cpu(p);
if (cpu == smp_processor_id()) {
+ set_tsk_need_resched(curr);
set_preempt_need_resched();
return;
}
- /* NEED_RESCHED must be visible before we test polling */
- smp_mb();
- if (!tsk_is_polling(p))
+ if (set_nr_and_not_polling(curr))
smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
}
void resched_cpu(int cpu)
@@ -541,7 +631,7 @@ void resched_cpu(int cpu)
if (!raw_spin_trylock_irqsave(&rq->lock, flags))
return;
- resched_task(cpu_curr(cpu));
+ resched_curr(rq);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -555,12 +645,15 @@ void resched_cpu(int cpu)
* selecting an idle cpu will add more delays to the timers than intended
* (as that cpu's timer base may not be uptodate wrt jiffies etc).
*/
-int get_nohz_timer_target(void)
+int get_nohz_timer_target(int pinned)
{
int cpu = smp_processor_id();
int i;
struct sched_domain *sd;
+ if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
+ return cpu;
+
rcu_read_lock();
for_each_domain(cpu, sd) {
for_each_cpu(i, sched_domain_span(sd)) {
@@ -591,35 +684,24 @@ static void wake_up_idle_cpu(int cpu)
if (cpu == smp_processor_id())
return;
- /*
- * This is safe, as this function is called with the timer
- * wheel base lock of (cpu) held. When the CPU is on the way
- * to idle and has not yet set rq->curr to idle then it will
- * be serialized on the timer wheel base lock and take the new
- * timer into account automatically.
- */
- if (rq->curr != rq->idle)
- return;
-
- /*
- * We can set TIF_RESCHED on the idle task of the other CPU
- * lockless. The worst case is that the other CPU runs the
- * idle task through an additional NOOP schedule()
- */
- set_tsk_need_resched(rq->idle);
-
- /* NEED_RESCHED must be visible before we test polling */
- smp_mb();
- if (!tsk_is_polling(rq->idle))
+ if (set_nr_and_not_polling(rq->idle))
smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
}
static bool wake_up_full_nohz_cpu(int cpu)
{
+ /*
+ * We just need the target to call irq_exit() and re-evaluate
+ * the next tick. The nohz full kick at least implies that.
+ * If needed we can still optimize that later with an
+ * empty IRQ.
+ */
if (tick_nohz_full_cpu(cpu)) {
if (cpu != smp_processor_id() ||
tick_nohz_tick_stopped())
- smp_send_reschedule(cpu);
+ tick_nohz_full_kick_cpu(cpu);
return true;
}
@@ -662,18 +744,15 @@ static inline bool got_nohz_idle_kick(void)
#ifdef CONFIG_NO_HZ_FULL
bool sched_can_stop_tick(void)
{
- struct rq *rq;
-
- rq = this_rq();
-
- /* Make sure rq->nr_running update is visible after the IPI */
- smp_rmb();
-
- /* More than one running task need preemption */
- if (rq->nr_running > 1)
- return false;
+ /*
+ * More than one running task need preemption.
+ * nr_running update is assumed to be visible
+ * after IPI is sent from wakers.
+ */
+ if (this_rq()->nr_running > 1)
+ return false;
- return true;
+ return true;
}
#endif /* CONFIG_NO_HZ_FULL */
@@ -823,19 +902,13 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((&paravirt_steal_rq_enabled))) {
- u64 st;
-
steal = paravirt_steal_clock(cpu_of(rq));
steal -= rq->prev_steal_time_rq;
if (unlikely(steal > delta))
steal = delta;
- st = steal_ticks(steal);
- steal = st * TICK_NSEC;
-
rq->prev_steal_time_rq += steal;
-
delta -= steal;
}
#endif
@@ -843,7 +916,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->clock_task += delta;
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
- if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+ if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
sched_rt_avg_update(rq, irq_delta + steal);
#endif
}
@@ -960,7 +1033,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
if (class == rq->curr->sched_class)
break;
if (class == p->sched_class) {
- resched_task(rq->curr);
+ resched_curr(rq);
break;
}
}
@@ -1322,7 +1395,7 @@ out:
* leave kernel.
*/
if (p->mm && printk_ratelimit()) {
- printk_sched("process %d (%s) no longer affine to cpu%d\n",
+ printk_deferred("process %d (%s) no longer affine to cpu%d\n",
task_pid_nr(p), p->comm, cpu);
}
}
@@ -1476,13 +1549,17 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
}
#ifdef CONFIG_SMP
-static void sched_ttwu_pending(void)
+void sched_ttwu_pending(void)
{
struct rq *rq = this_rq();
struct llist_node *llist = llist_del_all(&rq->wake_list);
struct task_struct *p;
+ unsigned long flags;
- raw_spin_lock(&rq->lock);
+ if (!llist)
+ return;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
while (llist) {
p = llist_entry(llist, struct task_struct, wake_entry);
@@ -1490,7 +1567,7 @@ static void sched_ttwu_pending(void)
ttwu_do_activate(rq, p, 0);
}
- raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
}
void scheduler_ipi(void)
@@ -1502,9 +1579,7 @@ void scheduler_ipi(void)
*/
preempt_fold_need_resched();
- if (llist_empty(&this_rq()->wake_list)
- && !tick_nohz_full_cpu(smp_processor_id())
- && !got_nohz_idle_kick())
+ if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
return;
/*
@@ -1521,7 +1596,6 @@ void scheduler_ipi(void)
* somewhat pessimize the simple resched case.
*/
irq_enter();
- tick_nohz_full_check();
sched_ttwu_pending();
/*
@@ -1536,8 +1610,14 @@ void scheduler_ipi(void)
static void ttwu_queue_remote(struct task_struct *p, int cpu)
{
- if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
- smp_send_reschedule(cpu);
+ struct rq *rq = cpu_rq(cpu);
+
+ if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
+ if (!set_nr_if_polling(rq->idle))
+ smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
+ }
}
bool cpus_share_cache(int this_cpu, int that_cpu)
@@ -1745,8 +1825,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
p->numa_work.next = &p->numa_work;
- p->numa_faults = NULL;
- p->numa_faults_buffer = NULL;
+ p->numa_faults_memory = NULL;
+ p->numa_faults_buffer_memory = NULL;
+ p->last_task_numa_placement = 0;
+ p->last_sum_exec_runtime = 0;
INIT_LIST_HEAD(&p->numa_entry);
p->numa_group = NULL;
@@ -2149,8 +2231,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
if (mm)
mmdrop(mm);
if (unlikely(prev_state == TASK_DEAD)) {
- task_numa_free(prev);
-
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
@@ -2167,13 +2247,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
#ifdef CONFIG_SMP
-/* assumes rq->lock is held */
-static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
-{
- if (prev->sched_class->pre_schedule)
- prev->sched_class->pre_schedule(rq, prev);
-}
-
/* rq->lock is NOT held, but preemption is disabled */
static inline void post_schedule(struct rq *rq)
{
@@ -2191,10 +2264,6 @@ static inline void post_schedule(struct rq *rq)
#else
-static inline void pre_schedule(struct rq *rq, struct task_struct *p)
-{
-}
-
static inline void post_schedule(struct rq *rq)
{
}
@@ -2205,7 +2274,7 @@ static inline void post_schedule(struct rq *rq)
* schedule_tail - first thing a freshly forked thread must call.
* @prev: the thread we just switched away from.
*/
-asmlinkage void schedule_tail(struct task_struct *prev)
+asmlinkage __visible void schedule_tail(struct task_struct *prev)
__releases(rq->lock)
{
struct rq *rq = this_rq();
@@ -2324,6 +2393,13 @@ unsigned long nr_iowait_cpu(int cpu)
return atomic_read(&this->nr_iowait);
}
+void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
+{
+ struct rq *this = this_rq();
+ *nr_waiters = atomic_read(&this->nr_iowait);
+ *load = this->cpu_load[0];
+}
+
#ifdef CONFIG_SMP
/*
@@ -2370,7 +2446,12 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
{
u64 ns = 0;
- if (task_current(rq, p)) {
+ /*
+ * Must be ->curr _and_ ->on_rq. If dequeued, we would
+ * project cycles that may never be accounted to this
+ * thread, breaking clock_gettime().
+ */
+ if (task_current(rq, p) && p->on_rq) {
update_rq_clock(rq);
ns = rq_clock_task(rq) - p->se.exec_start;
if ((s64)ns < 0)
@@ -2413,8 +2494,10 @@ unsigned long long task_sched_runtime(struct task_struct *p)
* If we race with it leaving cpu, we'll take a lock. So we're correct.
* If we race with it entering cpu, unaccounted time is 0. This is
* indistinguishable from the read occurring a few cycles earlier.
+ * If we see ->on_cpu without ->on_rq, the task is leaving, and has
+ * been accounted, so we're correct here as well.
*/
- if (!p->on_cpu)
+ if (!p->on_cpu || !p->on_rq)
return p->se.sum_exec_runtime;
#endif
@@ -2493,7 +2576,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_PREEMPT_TRACER))
-void __kprobes preempt_count_add(int val)
+void preempt_count_add(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
/*
@@ -2510,12 +2593,18 @@ void __kprobes preempt_count_add(int val)
DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
PREEMPT_MASK - 10);
#endif
- if (preempt_count() == val)
- trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+ if (preempt_count() == val) {
+ unsigned long ip = get_parent_ip(CALLER_ADDR1);
+#ifdef CONFIG_DEBUG_PREEMPT
+ current->preempt_disable_ip = ip;
+#endif
+ trace_preempt_off(CALLER_ADDR0, ip);
+ }
}
EXPORT_SYMBOL(preempt_count_add);
+NOKPROBE_SYMBOL(preempt_count_add);
-void __kprobes preempt_count_sub(int val)
+void preempt_count_sub(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
/*
@@ -2536,6 +2625,7 @@ void __kprobes preempt_count_sub(int val)
__preempt_count_sub(val);
}
EXPORT_SYMBOL(preempt_count_sub);
+NOKPROBE_SYMBOL(preempt_count_sub);
#endif
@@ -2554,6 +2644,13 @@ static noinline void __schedule_bug(struct task_struct *prev)
print_modules();
if (irqs_disabled())
print_irqtrace_events(prev);
+#ifdef CONFIG_DEBUG_PREEMPT
+ if (in_atomic_preempt_off()) {
+ pr_err("Preemption disabled at:");
+ print_ip_sym(current->preempt_disable_ip);
+ pr_cont("\n");
+ }
+#endif
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
@@ -2577,36 +2674,40 @@ static inline void schedule_debug(struct task_struct *prev)
schedstat_inc(this_rq(), sched_count);
}
-static void put_prev_task(struct rq *rq, struct task_struct *prev)
-{
- if (prev->on_rq || rq->skip_clock_update < 0)
- update_rq_clock(rq);
- prev->sched_class->put_prev_task(rq, prev);
-}
-
/*
* Pick up the highest-prio task:
*/
static inline struct task_struct *
-pick_next_task(struct rq *rq)
+pick_next_task(struct rq *rq, struct task_struct *prev)
{
- const struct sched_class *class;
+ const struct sched_class *class = &fair_sched_class;
struct task_struct *p;
/*
* Optimization: we know that if all tasks are in
* the fair class we can call that function directly:
*/
- if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
- p = fair_sched_class.pick_next_task(rq);
- if (likely(p))
- return p;
+ if (likely(prev->sched_class == class &&
+ rq->nr_running == rq->cfs.h_nr_running)) {
+ p = fair_sched_class.pick_next_task(rq, prev);
+ if (unlikely(p == RETRY_TASK))
+ goto again;
+
+ /* assumes fair_sched_class->next == idle_sched_class */
+ if (unlikely(!p))
+ p = idle_sched_class.pick_next_task(rq, prev);
+
+ return p;
}
+again:
for_each_class(class) {
- p = class->pick_next_task(rq);
- if (p)
+ p = class->pick_next_task(rq, prev);
+ if (p) {
+ if (unlikely(p == RETRY_TASK))
+ goto again;
return p;
+ }
}
BUG(); /* the idle class will always have a runnable task */
@@ -2700,13 +2801,10 @@ need_resched:
switch_count = &prev->nvcsw;
}
- pre_schedule(rq, prev);
-
- if (unlikely(!rq->nr_running))
- idle_balance(cpu, rq);
+ if (prev->on_rq || rq->skip_clock_update < 0)
+ update_rq_clock(rq);
- put_prev_task(rq, prev);
- next = pick_next_task(rq);
+ next = pick_next_task(rq, prev);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
rq->skip_clock_update = 0;
@@ -2747,7 +2845,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
blk_schedule_flush_plug(tsk);
}
-asmlinkage void __sched schedule(void)
+asmlinkage __visible void __sched schedule(void)
{
struct task_struct *tsk = current;
@@ -2757,7 +2855,7 @@ asmlinkage void __sched schedule(void)
EXPORT_SYMBOL(schedule);
#ifdef CONFIG_CONTEXT_TRACKING
-asmlinkage void __sched schedule_user(void)
+asmlinkage __visible void __sched schedule_user(void)
{
/*
* If we come here after a random call to set_need_resched(),
@@ -2789,7 +2887,7 @@ void __sched schedule_preempt_disabled(void)
* off of preempt_enable. Kernel preemptions off return from interrupt
* occur there and call schedule directly.
*/
-asmlinkage void __sched notrace preempt_schedule(void)
+asmlinkage __visible void __sched notrace preempt_schedule(void)
{
/*
* If there is a non-zero preempt_count or interrupts are disabled,
@@ -2810,6 +2908,7 @@ asmlinkage void __sched notrace preempt_schedule(void)
barrier();
} while (need_resched());
}
+NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
#endif /* CONFIG_PREEMPT */
@@ -2819,7 +2918,7 @@ EXPORT_SYMBOL(preempt_schedule);
* Note, that this is called and return with irqs disabled. This will
* protect us against recursive calling from irq.
*/
-asmlinkage void __sched preempt_schedule_irq(void)
+asmlinkage __visible void __sched preempt_schedule_irq(void)
{
enum ctx_state prev_state;
@@ -2852,52 +2951,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
}
EXPORT_SYMBOL(default_wake_function);
-static long __sched
-sleep_on_common(wait_queue_head_t *q, int state, long timeout)
-{
- unsigned long flags;
- wait_queue_t wait;
-
- init_waitqueue_entry(&wait, current);
-
- __set_current_state(state);
-
- spin_lock_irqsave(&q->lock, flags);
- __add_wait_queue(q, &wait);
- spin_unlock(&q->lock);
- timeout = schedule_timeout(timeout);
- spin_lock_irq(&q->lock);
- __remove_wait_queue(q, &wait);
- spin_unlock_irqrestore(&q->lock, flags);
-
- return timeout;
-}
-
-void __sched interruptible_sleep_on(wait_queue_head_t *q)
-{
- sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
-}
-EXPORT_SYMBOL(interruptible_sleep_on);
-
-long __sched
-interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
-{
- return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
-}
-EXPORT_SYMBOL(interruptible_sleep_on_timeout);
-
-void __sched sleep_on(wait_queue_head_t *q)
-{
- sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
-}
-EXPORT_SYMBOL(sleep_on);
-
-long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
-{
- return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
-}
-EXPORT_SYMBOL(sleep_on_timeout);
-
#ifdef CONFIG_RT_MUTEXES
/*
@@ -2908,7 +2961,8 @@ EXPORT_SYMBOL(sleep_on_timeout);
* This function changes the 'effective' priority of a task. It does
* not touch ->normal_prio like __setscheduler().
*
- * Used by the rt_mutex code to implement priority inheritance logic.
+ * Used by the rt_mutex code to implement priority inheritance
+ * logic. Call site only calls if the priority of the task changed.
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
@@ -2939,7 +2993,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
}
trace_sched_pi_setprio(p, prio);
- p->pi_top_task = rt_mutex_get_top_task(p);
oldprio = p->prio;
prev_class = p->sched_class;
on_rq = p->on_rq;
@@ -2959,8 +3012,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
* running task
*/
if (dl_prio(prio)) {
- if (!dl_prio(p->normal_prio) || (p->pi_top_task &&
- dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) {
+ struct task_struct *pi_task = rt_mutex_get_top_task(p);
+ if (!dl_prio(p->normal_prio) ||
+ (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
p->dl.dl_throttled = 0;
enqueue_flag = ENQUEUE_REPLENISH;
@@ -2998,7 +3052,7 @@ void set_user_nice(struct task_struct *p, long nice)
unsigned long flags;
struct rq *rq;
- if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
return;
/*
* We have to be careful, if called from sys_setpriority(),
@@ -3032,7 +3086,7 @@ void set_user_nice(struct task_struct *p, long nice)
* lowered its priority, then reschedule its CPU:
*/
if (delta < 0 || (delta > 0 && task_running(rq, p)))
- resched_task(rq->curr);
+ resched_curr(rq);
}
out_unlock:
task_rq_unlock(rq, p, &flags);
@@ -3047,7 +3101,7 @@ EXPORT_SYMBOL(set_user_nice);
int can_nice(const struct task_struct *p, const int nice)
{
/* convert nice value [19,-20] to rlimit style value [1,40] */
- int nice_rlim = 20 - nice;
+ int nice_rlim = nice_to_rlimit(nice);
return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
capable(CAP_SYS_NICE));
@@ -3071,17 +3125,10 @@ SYSCALL_DEFINE1(nice, int, increment)
* We don't have to worry. Conceptually one call occurs first
* and we have a single winner.
*/
- if (increment < -40)
- increment = -40;
- if (increment > 40)
- increment = 40;
-
- nice = TASK_NICE(current) + increment;
- if (nice < -20)
- nice = -20;
- if (nice > 19)
- nice = 19;
+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
+ nice = task_nice(current) + increment;
+ nice = clamp_val(nice, MIN_NICE, MAX_NICE);
if (increment < 0 && !can_nice(current, nice))
return -EPERM;
@@ -3109,18 +3156,6 @@ int task_prio(const struct task_struct *p)
}
/**
- * task_nice - return the nice value of a given task.
- * @p: the task in question.
- *
- * Return: The nice value [ -20 ... 0 ... 19 ].
- */
-int task_nice(const struct task_struct *p)
-{
- return TASK_NICE(p);
-}
-EXPORT_SYMBOL(task_nice);
-
-/**
* idle_cpu - is a given cpu idle currently?
* @cpu: the processor in question.
*
@@ -3187,15 +3222,21 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
dl_se->dl_throttled = 0;
dl_se->dl_new = 1;
+ dl_se->dl_yielded = 0;
}
-/* Actually do priority change: must hold pi & rq lock. */
-static void __setscheduler(struct rq *rq, struct task_struct *p,
- const struct sched_attr *attr)
+/*
+ * sched_setparam() passes in -1 for its policy, to let the functions
+ * it calls know not to change it.
+ */
+#define SETPARAM_POLICY -1
+
+static void __setscheduler_params(struct task_struct *p,
+ const struct sched_attr *attr)
{
int policy = attr->sched_policy;
- if (policy == -1) /* setparam */
+ if (policy == SETPARAM_POLICY)
policy = p->policy;
p->policy = policy;
@@ -3211,9 +3252,21 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
* getparam()/getattr() don't report silly values for !rt tasks.
*/
p->rt_priority = attr->sched_priority;
-
p->normal_prio = normal_prio(p);
- p->prio = rt_mutex_getprio(p);
+ set_load_weight(p);
+}
+
+/* Actually do priority change: must hold pi & rq lock. */
+static void __setscheduler(struct rq *rq, struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ __setscheduler_params(p, attr);
+
+ /*
+ * If we get here, there was no pi waiters boosting the
+ * task. It is safe to use the normal prio.
+ */
+ p->prio = normal_prio(p);
if (dl_prio(p->prio))
p->sched_class = &dl_sched_class;
@@ -3221,8 +3274,6 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
p->sched_class = &rt_sched_class;
else
p->sched_class = &fair_sched_class;
-
- set_load_weight(p);
}
static void
@@ -3242,17 +3293,40 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr)
* We ask for the deadline not being zero, and greater or equal
* than the runtime, as well as the period of being zero or
* greater than deadline. Furthermore, we have to be sure that
- * user parameters are above the internal resolution (1us); we
- * check sched_runtime only since it is always the smaller one.
+ * user parameters are above the internal resolution of 1us (we
+ * check sched_runtime only since it is always the smaller one) and
+ * below 2^63 ns (we have to check both sched_deadline and
+ * sched_period, as the latter can be zero).
*/
static bool
__checkparam_dl(const struct sched_attr *attr)
{
- return attr && attr->sched_deadline != 0 &&
- (attr->sched_period == 0 ||
- (s64)(attr->sched_period - attr->sched_deadline) >= 0) &&
- (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 &&
- attr->sched_runtime >= (2 << (DL_SCALE - 1));
+ /* deadline != 0 */
+ if (attr->sched_deadline == 0)
+ return false;
+
+ /*
+ * Since we truncate DL_SCALE bits, make sure we're at least
+ * that big.
+ */
+ if (attr->sched_runtime < (1ULL << DL_SCALE))
+ return false;
+
+ /*
+ * Since we use the MSB for wrap-around and sign issues, make
+ * sure it's not set (mind that period can be equal to zero).
+ */
+ if (attr->sched_deadline & (1ULL << 63) ||
+ attr->sched_period & (1ULL << 63))
+ return false;
+
+ /* runtime <= deadline <= period (if period != 0) */
+ if ((attr->sched_period != 0 &&
+ attr->sched_period < attr->sched_deadline) ||
+ attr->sched_deadline < attr->sched_runtime)
+ return false;
+
+ return true;
}
/*
@@ -3275,6 +3349,8 @@ static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user)
{
+ int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
+ MAX_RT_PRIO - 1 - attr->sched_priority;
int retval, oldprio, oldpolicy = -1, on_rq, running;
int policy = attr->sched_policy;
unsigned long flags;
@@ -3319,7 +3395,7 @@ recheck:
*/
if (user && !capable(CAP_SYS_NICE)) {
if (fair_policy(policy)) {
- if (attr->sched_nice < TASK_NICE(p) &&
+ if (attr->sched_nice < task_nice(p) &&
!can_nice(p, attr->sched_nice))
return -EPERM;
}
@@ -3352,7 +3428,7 @@ recheck:
* SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
*/
if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
- if (!can_nice(p, TASK_NICE(p)))
+ if (!can_nice(p, task_nice(p)))
return -EPERM;
}
@@ -3389,16 +3465,18 @@ recheck:
}
/*
- * If not changing anything there's no need to proceed further:
+ * If not changing anything there's no need to proceed further,
+ * but store a possible modification of reset_on_fork.
*/
if (unlikely(policy == p->policy)) {
- if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p))
+ if (fair_policy(policy) && attr->sched_nice != task_nice(p))
goto change;
if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
goto change;
if (dl_policy(policy))
goto change;
+ p->sched_reset_on_fork = reset_on_fork;
task_rq_unlock(rq, p, &flags);
return 0;
}
@@ -3452,6 +3530,24 @@ change:
return -EBUSY;
}
+ p->sched_reset_on_fork = reset_on_fork;
+ oldprio = p->prio;
+
+ /*
+ * Special case for priority boosted tasks.
+ *
+ * If the new priority is lower or equal (user space view)
+ * than the current (boosted) priority, we just store the new
+ * normal parameters and do not touch the scheduler class and
+ * the runqueue. This will be done when the task deboost
+ * itself.
+ */
+ if (rt_mutex_check_prio(p, newprio)) {
+ __setscheduler_params(p, attr);
+ task_rq_unlock(rq, p, &flags);
+ return 0;
+ }
+
on_rq = p->on_rq;
running = task_current(rq, p);
if (on_rq)
@@ -3459,16 +3555,18 @@ change:
if (running)
p->sched_class->put_prev_task(rq, p);
- p->sched_reset_on_fork = reset_on_fork;
-
- oldprio = p->prio;
prev_class = p->sched_class;
__setscheduler(rq, p, attr);
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq)
- enqueue_task(rq, p, 0);
+ if (on_rq) {
+ /*
+ * We enqueue to tail when the priority of a task is
+ * increased (user space view).
+ */
+ enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+ }
check_class_changed(rq, p, prev_class, oldprio);
task_rq_unlock(rq, p, &flags);
@@ -3487,10 +3585,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
.sched_nice = PRIO_TO_NICE(p->static_prio),
};
- /*
- * Fixup the legacy SCHED_RESET_ON_FORK hack
- */
- if (policy & SCHED_RESET_ON_FORK) {
+ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
+ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
policy &= ~SCHED_RESET_ON_FORK;
attr.sched_policy = policy;
@@ -3624,15 +3720,13 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
* XXX: do we want to be lenient like existing syscalls; or do we want
* to be strict and return an error on out-of-bounds values?
*/
- attr->sched_nice = clamp(attr->sched_nice, -20, 19);
+ attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
-out:
- return ret;
+ return 0;
err_size:
put_user(sizeof(*attr), &uattr->size);
- ret = -E2BIG;
- goto out;
+ return -E2BIG;
}
/**
@@ -3662,13 +3756,14 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
*/
SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
{
- return do_sched_setscheduler(pid, -1, param);
+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
}
/**
* sys_sched_setattr - same as above, but with extended sched_attr
* @pid: the pid in question.
* @uattr: structure containing the extended parameters.
+ * @flags: for future extension.
*/
SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
unsigned int, flags)
@@ -3680,8 +3775,12 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
if (!uattr || pid < 0 || flags)
return -EINVAL;
- if (sched_copy_attr(uattr, &attr))
- return -EFAULT;
+ retval = sched_copy_attr(uattr, &attr);
+ if (retval)
+ return retval;
+
+ if ((int)attr.sched_policy < 0)
+ return -EINVAL;
rcu_read_lock();
retval = -ESRCH;
@@ -3731,7 +3830,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
*/
SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
{
- struct sched_param lp;
+ struct sched_param lp = { .sched_priority = 0 };
struct task_struct *p;
int retval;
@@ -3748,11 +3847,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
if (retval)
goto out_unlock;
- if (task_has_dl_policy(p)) {
- retval = -EINVAL;
- goto out_unlock;
- }
- lp.sched_priority = p->rt_priority;
+ if (task_has_rt_policy(p))
+ lp.sched_priority = p->rt_priority;
rcu_read_unlock();
/*
@@ -3790,7 +3886,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
for (; addr < end; addr++) {
if (*addr)
- goto err_size;
+ return -EFBIG;
}
attr->size = usize;
@@ -3800,12 +3896,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
if (ret)
return -EFAULT;
-out:
- return ret;
-
-err_size:
- ret = -E2BIG;
- goto out;
+ return 0;
}
/**
@@ -3813,6 +3904,7 @@ err_size:
* @pid: the pid in question.
* @uattr: structure containing the extended parameters.
* @size: sizeof(attr) for fwd/bwd comp.
+ * @flags: for future extension.
*/
SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
unsigned int, size, unsigned int, flags)
@@ -3845,7 +3937,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
else if (task_has_rt_policy(p))
attr.sched_priority = p->rt_priority;
else
- attr.sched_nice = TASK_NICE(p);
+ attr.sched_nice = task_nice(p);
rcu_read_unlock();
@@ -4175,7 +4267,7 @@ EXPORT_SYMBOL(yield);
* false (0) if we failed to boost the target.
* -ESRCH if there's no task to yield to.
*/
-bool __sched yield_to(struct task_struct *p, bool preempt)
+int __sched yield_to(struct task_struct *p, bool preempt)
{
struct task_struct *curr = current;
struct rq *rq, *p_rq;
@@ -4219,7 +4311,7 @@ again:
* fairness.
*/
if (preempt && rq != p_rq)
- resched_task(p_rq->curr);
+ resched_curr(p_rq);
}
out_unlock:
@@ -4483,6 +4575,7 @@ void init_idle(struct task_struct *idle, int cpu)
rcu_read_unlock();
rq->curr = rq->idle = idle;
+ idle->on_rq = 1;
#if defined(CONFIG_SMP)
idle->on_cpu = 1;
#endif
@@ -4702,8 +4795,10 @@ void idle_task_exit(void)
BUG_ON(cpu_online(smp_processor_id()));
- if (mm != &init_mm)
+ if (mm != &init_mm) {
switch_mm(mm, &init_mm, current);
+ finish_arch_post_lock_switch();
+ }
mmdrop(mm);
}
@@ -4721,6 +4816,22 @@ static void calc_load_migrate(struct rq *rq)
atomic_long_add(delta, &calc_load_tasks);
}
+static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+{
+}
+
+static const struct sched_class fake_sched_class = {
+ .put_prev_task = put_prev_task_fake,
+};
+
+static struct task_struct fake_task = {
+ /*
+ * Avoid pull_{rt,dl}_task()
+ */
+ .prio = MAX_PRIO + 1,
+ .sched_class = &fake_sched_class,
+};
+
/*
* Migrate all tasks from the rq, sleeping tasks will be migrated by
* try_to_wake_up()->select_task_rq().
@@ -4761,7 +4872,7 @@ static void migrate_tasks(unsigned int dead_cpu)
if (rq->nr_running == 1)
break;
- next = pick_next_task(rq);
+ next = pick_next_task(rq, &fake_task);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
@@ -4851,7 +4962,7 @@ set_table_entry(struct ctl_table *entry,
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
- struct ctl_table *table = sd_alloc_ctl_entry(13);
+ struct ctl_table *table = sd_alloc_ctl_entry(14);
if (table == NULL)
return NULL;
@@ -4879,9 +4990,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[10], "flags", &sd->flags,
sizeof(int), 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[11], "name", sd->name,
+ set_table_entry(&table[11], "max_newidle_lb_cost",
+ &sd->max_newidle_lb_cost,
+ sizeof(long), 0644, proc_doulongvec_minmax, false);
+ set_table_entry(&table[12], "name", sd->name,
CORENAME_MAX_SIZE, 0444, proc_dostring, false);
- /* &table[12] is terminator */
+ /* &table[13] is terminator */
return table;
}
@@ -5047,11 +5161,20 @@ static struct notifier_block migration_notifier = {
.priority = CPU_PRI_MIGRATION,
};
+static void __cpuinit set_cpu_rq_start_time(void)
+{
+ int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+ rq->age_stamp = sched_clock_cpu(cpu);
+}
+
static int sched_cpu_active(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_STARTING:
+ set_cpu_rq_start_time();
+ return NOTIFY_OK;
case CPU_DOWN_FAILED:
set_cpu_active((long)hcpu, true);
return NOTIFY_OK;
@@ -5170,14 +5293,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
}
/*
- * Even though we initialize ->power to something semi-sane,
- * we leave power_orig unset. This allows us to detect if
+ * Even though we initialize ->capacity to something semi-sane,
+ * we leave capacity_orig unset. This allows us to detect if
* domain iteration is still funny without causing /0 traps.
*/
- if (!group->sgp->power_orig) {
+ if (!group->sgc->capacity_orig) {
printk(KERN_CONT "\n");
- printk(KERN_ERR "ERROR: domain->cpu_power not "
- "set\n");
+ printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
break;
}
@@ -5199,9 +5321,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
printk(KERN_CONT " %s", str);
- if (group->sgp->power != SCHED_POWER_SCALE) {
- printk(KERN_CONT " (cpu_power = %d)",
- group->sgp->power);
+ if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
+ printk(KERN_CONT " (cpu_capacity = %d)",
+ group->sgc->capacity);
}
group = group->next;
@@ -5259,8 +5381,9 @@ static int sd_degenerate(struct sched_domain *sd)
SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
- SD_SHARE_CPUPOWER |
- SD_SHARE_PKG_RESOURCES)) {
+ SD_SHARE_CPUCAPACITY |
+ SD_SHARE_PKG_RESOURCES |
+ SD_SHARE_POWERDOMAIN)) {
if (sd->groups != sd->groups->next)
return 0;
}
@@ -5289,9 +5412,10 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
- SD_SHARE_CPUPOWER |
+ SD_SHARE_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
- SD_PREFER_SIBLING);
+ SD_PREFER_SIBLING |
+ SD_SHARE_POWERDOMAIN);
if (nr_node_ids == 1)
pflags &= ~SD_SERIALIZE;
}
@@ -5413,7 +5537,7 @@ static struct root_domain *alloc_rootdomain(void)
return rd;
}
-static void free_sched_groups(struct sched_group *sg, int free_sgp)
+static void free_sched_groups(struct sched_group *sg, int free_sgc)
{
struct sched_group *tmp, *first;
@@ -5424,8 +5548,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgp)
do {
tmp = sg->next;
- if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
- kfree(sg->sgp);
+ if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
+ kfree(sg->sgc);
kfree(sg);
sg = tmp;
@@ -5443,7 +5567,7 @@ static void free_sched_domain(struct rcu_head *rcu)
if (sd->flags & SD_OVERLAP) {
free_sched_groups(sd->groups, 1);
} else if (atomic_dec_and_test(&sd->groups->ref)) {
- kfree(sd->groups->sgp);
+ kfree(sd->groups->sgc);
kfree(sd->groups);
}
kfree(sd);
@@ -5565,17 +5689,6 @@ static int __init isolated_cpu_setup(char *str)
__setup("isolcpus=", isolated_cpu_setup);
-static const struct cpumask *cpu_cpu_mask(int cpu)
-{
- return cpumask_of_node(cpu_to_node(cpu));
-}
-
-struct sd_data {
- struct sched_domain **__percpu sd;
- struct sched_group **__percpu sg;
- struct sched_group_power **__percpu sgp;
-};
-
struct s_data {
struct sched_domain ** __percpu sd;
struct root_domain *rd;
@@ -5588,21 +5701,6 @@ enum s_alloc {
sa_none,
};
-struct sched_domain_topology_level;
-
-typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
-typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
-
-#define SDTL_OVERLAP 0x01
-
-struct sched_domain_topology_level {
- sched_domain_init_f init;
- sched_domain_mask_f mask;
- int flags;
- int numa_level;
- struct sd_data data;
-};
-
/*
* Build an iteration mask that can exclude certain CPUs from the upwards
* domain traversal.
@@ -5680,17 +5778,17 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
cpumask_or(covered, covered, sg_span);
- sg->sgp = *per_cpu_ptr(sdd->sgp, i);
- if (atomic_inc_return(&sg->sgp->ref) == 1)
+ sg->sgc = *per_cpu_ptr(sdd->sgc, i);
+ if (atomic_inc_return(&sg->sgc->ref) == 1)
build_group_mask(sd, sg);
/*
- * Initialize sgp->power such that even if we mess up the
+ * Initialize sgc->capacity such that even if we mess up the
* domains and no possible iteration will get us here, we won't
* die on a /0 trap.
*/
- sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
- sg->sgp->power_orig = sg->sgp->power;
+ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+ sg->sgc->capacity_orig = sg->sgc->capacity;
/*
* Make sure the first group of this domain contains the
@@ -5728,8 +5826,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
if (sg) {
*sg = *per_cpu_ptr(sdd->sg, cpu);
- (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
- atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
+ (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+ atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */
}
return cpu;
@@ -5738,7 +5836,7 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
/*
* build_sched_groups will build a circular linked list of the groups
* covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_power to 0.
+ * and ->cpu_capacity to 0.
*
* Assumes the sched_domain tree is fully constructed
*/
@@ -5770,8 +5868,6 @@ build_sched_groups(struct sched_domain *sd, int cpu)
continue;
group = get_group(i, sdd, &sg);
- cpumask_clear(sched_group_cpus(sg));
- sg->sgp->power = 0;
cpumask_setall(sched_group_mask(sg));
for_each_cpu(j, span) {
@@ -5794,16 +5890,16 @@ build_sched_groups(struct sched_domain *sd, int cpu)
}
/*
- * Initialize sched groups cpu_power.
+ * Initialize sched groups cpu_capacity.
*
- * cpu_power indicates the capacity of sched group, which is used while
+ * cpu_capacity indicates the capacity of sched group, which is used while
* distributing the load between different sched groups in a sched domain.
- * Typically cpu_power for all the groups in a sched domain will be same unless
- * there are asymmetries in the topology. If there are asymmetries, group
- * having more cpu_power will pickup more load compared to the group having
- * less cpu_power.
+ * Typically cpu_capacity for all the groups in a sched domain will be same
+ * unless there are asymmetries in the topology. If there are asymmetries,
+ * group having more cpu_capacity will pickup more load compared to the
+ * group having less cpu_capacity.
*/
-static void init_sched_groups_power(int cpu, struct sched_domain *sd)
+static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
struct sched_group *sg = sd->groups;
@@ -5817,13 +5913,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
if (cpu != group_balance_cpu(sg))
return;
- update_group_power(sd, cpu);
- atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
-}
-
-int __weak arch_sd_sibling_asym_packing(void)
-{
- return 0*SD_ASYM_PACKING;
+ update_group_capacity(sd, cpu);
+ atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
}
/*
@@ -5831,34 +5922,6 @@ int __weak arch_sd_sibling_asym_packing(void)
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
*/
-#ifdef CONFIG_SCHED_DEBUG
-# define SD_INIT_NAME(sd, type) sd->name = #type
-#else
-# define SD_INIT_NAME(sd, type) do { } while (0)
-#endif
-
-#define SD_INIT_FUNC(type) \
-static noinline struct sched_domain * \
-sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
-{ \
- struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
- *sd = SD_##type##_INIT; \
- SD_INIT_NAME(sd, type); \
- sd->private = &tl->data; \
- return sd; \
-}
-
-SD_INIT_FUNC(CPU)
-#ifdef CONFIG_SCHED_SMT
- SD_INIT_FUNC(SIBLING)
-#endif
-#ifdef CONFIG_SCHED_MC
- SD_INIT_FUNC(MC)
-#endif
-#ifdef CONFIG_SCHED_BOOK
- SD_INIT_FUNC(BOOK)
-#endif
-
static int default_relax_domain_level = -1;
int sched_domain_level_max;
@@ -5942,101 +6005,158 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
*per_cpu_ptr(sdd->sg, cpu) = NULL;
- if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
- *per_cpu_ptr(sdd->sgp, cpu) = NULL;
+ if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
+ *per_cpu_ptr(sdd->sgc, cpu) = NULL;
}
-#ifdef CONFIG_SCHED_SMT
-static const struct cpumask *cpu_smt_mask(int cpu)
-{
- return topology_thread_cpumask(cpu);
-}
-#endif
-
-/*
- * Topology list, bottom-up.
- */
-static struct sched_domain_topology_level default_topology[] = {
-#ifdef CONFIG_SCHED_SMT
- { sd_init_SIBLING, cpu_smt_mask, },
-#endif
-#ifdef CONFIG_SCHED_MC
- { sd_init_MC, cpu_coregroup_mask, },
-#endif
-#ifdef CONFIG_SCHED_BOOK
- { sd_init_BOOK, cpu_book_mask, },
-#endif
- { sd_init_CPU, cpu_cpu_mask, },
- { NULL, },
-};
-
-static struct sched_domain_topology_level *sched_domain_topology = default_topology;
-
-#define for_each_sd_topology(tl) \
- for (tl = sched_domain_topology; tl->init; tl++)
-
#ifdef CONFIG_NUMA
-
static int sched_domains_numa_levels;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
static int sched_domains_curr_level;
+#endif
-static inline int sd_local_flags(int level)
-{
- if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
- return 0;
-
- return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
-}
+/*
+ * SD_flags allowed in topology descriptions.
+ *
+ * SD_SHARE_CPUCAPACITY - describes SMT topologies
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
+ * SD_NUMA - describes NUMA topologies
+ * SD_SHARE_POWERDOMAIN - describes shared power domain
+ *
+ * Odd one out:
+ * SD_ASYM_PACKING - describes SMT quirks
+ */
+#define TOPOLOGY_SD_FLAGS \
+ (SD_SHARE_CPUCAPACITY | \
+ SD_SHARE_PKG_RESOURCES | \
+ SD_NUMA | \
+ SD_ASYM_PACKING | \
+ SD_SHARE_POWERDOMAIN)
static struct sched_domain *
-sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+sd_init(struct sched_domain_topology_level *tl, int cpu)
{
struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
- int level = tl->numa_level;
- int sd_weight = cpumask_weight(
- sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+ int sd_weight, sd_flags = 0;
+
+#ifdef CONFIG_NUMA
+ /*
+ * Ugly hack to pass state to sd_numa_mask()...
+ */
+ sched_domains_curr_level = tl->numa_level;
+#endif
+
+ sd_weight = cpumask_weight(tl->mask(cpu));
+
+ if (tl->sd_flags)
+ sd_flags = (*tl->sd_flags)();
+ if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
+ "wrong sd_flags in topology description\n"))
+ sd_flags &= ~TOPOLOGY_SD_FLAGS;
*sd = (struct sched_domain){
.min_interval = sd_weight,
.max_interval = 2*sd_weight,
.busy_factor = 32,
.imbalance_pct = 125,
- .cache_nice_tries = 2,
- .busy_idx = 3,
- .idle_idx = 2,
+
+ .cache_nice_tries = 0,
+ .busy_idx = 0,
+ .idle_idx = 0,
.newidle_idx = 0,
.wake_idx = 0,
.forkexec_idx = 0,
.flags = 1*SD_LOAD_BALANCE
| 1*SD_BALANCE_NEWIDLE
- | 0*SD_BALANCE_EXEC
- | 0*SD_BALANCE_FORK
+ | 1*SD_BALANCE_EXEC
+ | 1*SD_BALANCE_FORK
| 0*SD_BALANCE_WAKE
- | 0*SD_WAKE_AFFINE
- | 0*SD_SHARE_CPUPOWER
+ | 1*SD_WAKE_AFFINE
+ | 0*SD_SHARE_CPUCAPACITY
| 0*SD_SHARE_PKG_RESOURCES
- | 1*SD_SERIALIZE
+ | 0*SD_SERIALIZE
| 0*SD_PREFER_SIBLING
- | 1*SD_NUMA
- | sd_local_flags(level)
+ | 0*SD_NUMA
+ | sd_flags
,
+
.last_balance = jiffies,
.balance_interval = sd_weight,
+ .smt_gain = 0,
+ .max_newidle_lb_cost = 0,
+ .next_decay_max_lb_cost = jiffies,
+#ifdef CONFIG_SCHED_DEBUG
+ .name = tl->name,
+#endif
};
- SD_INIT_NAME(sd, NUMA);
- sd->private = &tl->data;
/*
- * Ugly hack to pass state to sd_numa_mask()...
+ * Convert topological properties into behaviour.
*/
- sched_domains_curr_level = tl->numa_level;
+
+ if (sd->flags & SD_SHARE_CPUCAPACITY) {
+ sd->imbalance_pct = 110;
+ sd->smt_gain = 1178; /* ~15% */
+
+ } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+ sd->imbalance_pct = 117;
+ sd->cache_nice_tries = 1;
+ sd->busy_idx = 2;
+
+#ifdef CONFIG_NUMA
+ } else if (sd->flags & SD_NUMA) {
+ sd->cache_nice_tries = 2;
+ sd->busy_idx = 3;
+ sd->idle_idx = 2;
+
+ sd->flags |= SD_SERIALIZE;
+ if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+ sd->flags &= ~(SD_BALANCE_EXEC |
+ SD_BALANCE_FORK |
+ SD_WAKE_AFFINE);
+ }
+
+#endif
+ } else {
+ sd->flags |= SD_PREFER_SIBLING;
+ sd->cache_nice_tries = 1;
+ sd->busy_idx = 2;
+ sd->idle_idx = 1;
+ }
+
+ sd->private = &tl->data;
return sd;
}
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { NULL, },
+};
+
+struct sched_domain_topology_level *sched_domain_topology = default_topology;
+
+#define for_each_sd_topology(tl) \
+ for (tl = sched_domain_topology; tl->mask; tl++)
+
+void set_sched_topology(struct sched_domain_topology_level *tl)
+{
+ sched_domain_topology = tl;
+}
+
+#ifdef CONFIG_NUMA
+
static const struct cpumask *sd_numa_mask(int cpu)
{
return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
@@ -6180,7 +6300,10 @@ static void sched_init_numa(void)
}
}
- tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+ /* Compute default topology size */
+ for (i = 0; sched_domain_topology[i].mask; i++);
+
+ tl = kzalloc((i + level + 1) *
sizeof(struct sched_domain_topology_level), GFP_KERNEL);
if (!tl)
return;
@@ -6188,18 +6311,19 @@ static void sched_init_numa(void)
/*
* Copy the default topology bits..
*/
- for (i = 0; default_topology[i].init; i++)
- tl[i] = default_topology[i];
+ for (i = 0; sched_domain_topology[i].mask; i++)
+ tl[i] = sched_domain_topology[i];
/*
* .. and append 'j' levels of NUMA goodness.
*/
for (j = 0; j < level; i++, j++) {
tl[i] = (struct sched_domain_topology_level){
- .init = sd_numa_init,
.mask = sd_numa_mask,
+ .sd_flags = cpu_numa_flags,
.flags = SDTL_OVERLAP,
.numa_level = j,
+ SD_INIT_NAME(NUMA)
};
}
@@ -6284,14 +6408,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sdd->sg)
return -ENOMEM;
- sdd->sgp = alloc_percpu(struct sched_group_power *);
- if (!sdd->sgp)
+ sdd->sgc = alloc_percpu(struct sched_group_capacity *);
+ if (!sdd->sgc)
return -ENOMEM;
for_each_cpu(j, cpu_map) {
struct sched_domain *sd;
struct sched_group *sg;
- struct sched_group_power *sgp;
+ struct sched_group_capacity *sgc;
sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
@@ -6309,12 +6433,12 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
*per_cpu_ptr(sdd->sg, j) = sg;
- sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
+ sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
- if (!sgp)
+ if (!sgc)
return -ENOMEM;
- *per_cpu_ptr(sdd->sgp, j) = sgp;
+ *per_cpu_ptr(sdd->sgc, j) = sgc;
}
}
@@ -6341,15 +6465,15 @@ static void __sdt_free(const struct cpumask *cpu_map)
if (sdd->sg)
kfree(*per_cpu_ptr(sdd->sg, j));
- if (sdd->sgp)
- kfree(*per_cpu_ptr(sdd->sgp, j));
+ if (sdd->sgc)
+ kfree(*per_cpu_ptr(sdd->sgc, j));
}
free_percpu(sdd->sd);
sdd->sd = NULL;
free_percpu(sdd->sg);
sdd->sg = NULL;
- free_percpu(sdd->sgp);
- sdd->sgp = NULL;
+ free_percpu(sdd->sgc);
+ sdd->sgc = NULL;
}
}
@@ -6357,7 +6481,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int cpu)
{
- struct sched_domain *sd = tl->init(tl, cpu);
+ struct sched_domain *sd = sd_init(tl, cpu);
if (!sd)
return child;
@@ -6367,6 +6491,20 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
sched_domain_level_max = max(sched_domain_level_max, sd->level);
child->parent = sd;
sd->child = child;
+
+ if (!cpumask_subset(sched_domain_span(child),
+ sched_domain_span(sd))) {
+ pr_err("BUG: arch topology borken\n");
+#ifdef CONFIG_SCHED_DEBUG
+ pr_err(" the %s domain not a subset of the %s domain\n",
+ child->name, sd->name);
+#endif
+ /* Fixup, ensure @sd has at least @child cpus. */
+ cpumask_or(sched_domain_span(sd),
+ sched_domain_span(sd),
+ sched_domain_span(child));
+ }
+
}
set_domain_attribute(sd, attr);
@@ -6419,14 +6557,14 @@ static int build_sched_domains(const struct cpumask *cpu_map,
}
}
- /* Calculate CPU power for physical packages and nodes */
+ /* Calculate CPU capacity for physical packages and nodes */
for (i = nr_cpumask_bits-1; i >= 0; i--) {
if (!cpumask_test_cpu(i, cpu_map))
continue;
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
claim_allocations(i, sd);
- init_sched_groups_power(i, sd);
+ init_sched_groups_capacity(i, sd);
}
}
@@ -6461,7 +6599,7 @@ static cpumask_var_t fallback_doms;
* cpu core maps. It is supposed to return 1 if the topology changed
* or 0 if it stayed the same.
*/
-int __attribute__((weak)) arch_update_cpu_topology(void)
+int __weak arch_update_cpu_topology(void)
{
return 0;
}
@@ -6858,7 +6996,6 @@ void __init sched_init(void)
rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
- INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
@@ -6870,7 +7007,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
- rq->cpu_power = SCHED_POWER_SCALE;
+ rq->cpu_capacity = SCHED_CAPACITY_SCALE;
rq->post_schedule = 0;
rq->active_balance = 0;
rq->next_balance = jiffies;
@@ -6928,6 +7065,7 @@ void __init sched_init(void)
if (cpu_isolated_map == NULL)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
idle_thread_set_boot_cpu();
+ set_cpu_rq_start_time();
#endif
init_sched_fair_class();
@@ -6947,7 +7085,8 @@ void __might_sleep(const char *file, int line, int preempt_offset)
static unsigned long prev_jiffy; /* ratelimiting */
rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
- if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
+ !is_idle_task(current)) ||
system_state != SYSTEM_RUNNING || oops_in_progress)
return;
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
@@ -6965,6 +7104,13 @@ void __might_sleep(const char *file, int line, int preempt_offset)
debug_show_held_locks(current);
if (irqs_disabled())
print_irqtrace_events(current);
+#ifdef CONFIG_DEBUG_PREEMPT
+ if (!preempt_count_equals(preempt_offset)) {
+ pr_err("Preemption disabled at:");
+ print_ip_sym(current->preempt_disable_ip);
+ pr_cont("\n");
+ }
+#endif
dump_stack();
}
EXPORT_SYMBOL(__might_sleep);
@@ -6986,7 +7132,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
__setscheduler(rq, p, &attr);
if (on_rq) {
enqueue_task(rq, p, 0);
- resched_task(rq->curr);
+ resched_curr(rq);
}
check_class_changed(rq, p, prev_class, old_prio);
@@ -7018,7 +7164,7 @@ void normalize_rt_tasks(void)
* Renice negative nice level userspace
* tasks back to 0:
*/
- if (TASK_NICE(p) < 0 && p->mm)
+ if (task_nice(p) < 0 && p->mm)
set_user_nice(p, 0);
continue;
}
@@ -7186,7 +7332,7 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);
- tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id,
+ tg = container_of(task_css_check(tsk, cpu_cgrp_id,
lockdep_is_held(&tsk->sighand->siglock)),
struct task_group, css);
tg = autogroup_task_group(tsk, tg);
@@ -7587,7 +7733,7 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
- struct task_group *parent = css_tg(css_parent(css));
+ struct task_group *parent = css_tg(css->parent);
if (parent)
sched_online_group(tg, parent);
@@ -7613,7 +7759,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
{
struct task_struct *task;
- cgroup_taskset_for_each(task, css, tset) {
+ cgroup_taskset_for_each(task, tset) {
#ifdef CONFIG_RT_GROUP_SCHED
if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
@@ -7631,7 +7777,7 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
{
struct task_struct *task;
- cgroup_taskset_for_each(task, css, tset)
+ cgroup_taskset_for_each(task, tset)
sched_move_task(task);
}
@@ -7697,6 +7843,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
if (period > max_cfs_quota_period)
return -EINVAL;
+ /*
+ * Prevent race between setting of cfs_rq->runtime_enabled and
+ * unthrottle_offline_cfs_rqs().
+ */
+ get_online_cpus();
mutex_lock(&cfs_constraints_mutex);
ret = __cfs_schedulable(tg, period, quota);
if (ret)
@@ -7718,12 +7869,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
/* restart the period timer (if active) to handle new period expiry */
if (runtime_enabled && cfs_b->timer_active) {
/* force a reprogram */
- cfs_b->timer_active = 0;
- __start_cfs_bandwidth(cfs_b);
+ __start_cfs_bandwidth(cfs_b, true);
}
raw_spin_unlock_irq(&cfs_b->lock);
- for_each_possible_cpu(i) {
+ for_each_online_cpu(i) {
struct cfs_rq *cfs_rq = tg->cfs_rq[i];
struct rq *rq = cfs_rq->rq;
@@ -7739,6 +7889,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
cfs_bandwidth_usage_dec();
out_unlock:
mutex_unlock(&cfs_constraints_mutex);
+ put_online_cpus();
return ret;
}
@@ -7970,8 +8121,7 @@ static struct cftype cpu_files[] = {
{ } /* terminate */
};
-struct cgroup_subsys cpu_cgroup_subsys = {
- .name = "cpu",
+struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
.css_free = cpu_cgroup_css_free,
.css_online = cpu_cgroup_css_online,
@@ -7979,8 +8129,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
- .subsys_id = cpu_cgroup_subsys_id,
- .base_cftypes = cpu_files,
+ .legacy_cftypes = cpu_files,
.early_init = 1,
};
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 622e0818f905..dd7cbb55bbf2 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -41,12 +41,12 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
/* return cpu accounting group to which this task belongs */
static inline struct cpuacct *task_ca(struct task_struct *tsk)
{
- return css_ca(task_css(tsk, cpuacct_subsys_id));
+ return css_ca(task_css(tsk, cpuacct_cgrp_id));
}
static inline struct cpuacct *parent_ca(struct cpuacct *ca)
{
- return css_ca(css_parent(&ca->css));
+ return css_ca(ca->css.parent);
}
static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
@@ -275,11 +275,9 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
rcu_read_unlock();
}
-struct cgroup_subsys cpuacct_subsys = {
- .name = "cpuacct",
+struct cgroup_subsys cpuacct_cgrp_subsys = {
.css_alloc = cpuacct_css_alloc,
.css_free = cpuacct_css_free,
- .subsys_id = cpuacct_subsys_id,
- .base_cftypes = files,
+ .legacy_cftypes = files,
.early_init = 1,
};
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5b9bb42b2d47..bd95963dae80 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -13,6 +13,7 @@
#include <linux/gfp.h>
#include <linux/kernel.h>
+#include <linux/slab.h>
#include "cpudeadline.h"
static inline int parent(int i)
@@ -39,8 +40,10 @@ static void cpudl_exchange(struct cpudl *cp, int a, int b)
{
int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
- swap(cp->elements[a], cp->elements[b]);
- swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]);
+ swap(cp->elements[a].cpu, cp->elements[b].cpu);
+ swap(cp->elements[a].dl , cp->elements[b].dl );
+
+ swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx);
}
static void cpudl_heapify(struct cpudl *cp, int idx)
@@ -140,7 +143,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
WARN_ON(!cpu_present(cpu));
raw_spin_lock_irqsave(&cp->lock, flags);
- old_idx = cp->cpu_to_idx[cpu];
+ old_idx = cp->elements[cpu].idx;
if (!is_valid) {
/* remove item */
if (old_idx == IDX_INVALID) {
@@ -155,8 +158,8 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
cp->elements[old_idx].cpu = new_cpu;
cp->size--;
- cp->cpu_to_idx[new_cpu] = old_idx;
- cp->cpu_to_idx[cpu] = IDX_INVALID;
+ cp->elements[new_cpu].idx = old_idx;
+ cp->elements[cpu].idx = IDX_INVALID;
while (old_idx > 0 && dl_time_before(
cp->elements[parent(old_idx)].dl,
cp->elements[old_idx].dl)) {
@@ -173,7 +176,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
cp->size++;
cp->elements[cp->size - 1].dl = 0;
cp->elements[cp->size - 1].cpu = cpu;
- cp->cpu_to_idx[cpu] = cp->size - 1;
+ cp->elements[cpu].idx = cp->size - 1;
cpudl_change_key(cp, cp->size - 1, dl);
cpumask_clear_cpu(cpu, cp->free_cpus);
} else {
@@ -195,10 +198,21 @@ int cpudl_init(struct cpudl *cp)
memset(cp, 0, sizeof(*cp));
raw_spin_lock_init(&cp->lock);
cp->size = 0;
- for (i = 0; i < NR_CPUS; i++)
- cp->cpu_to_idx[i] = IDX_INVALID;
- if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL))
+
+ cp->elements = kcalloc(nr_cpu_ids,
+ sizeof(struct cpudl_item),
+ GFP_KERNEL);
+ if (!cp->elements)
+ return -ENOMEM;
+
+ if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
+ kfree(cp->elements);
return -ENOMEM;
+ }
+
+ for_each_possible_cpu(i)
+ cp->elements[i].idx = IDX_INVALID;
+
cpumask_setall(cp->free_cpus);
return 0;
@@ -210,7 +224,6 @@ int cpudl_init(struct cpudl *cp)
*/
void cpudl_cleanup(struct cpudl *cp)
{
- /*
- * nothing to do for the moment
- */
+ free_cpumask_var(cp->free_cpus);
+ kfree(cp->elements);
}
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index a202789a412c..538c9796ad4a 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -5,17 +5,17 @@
#define IDX_INVALID -1
-struct array_item {
+struct cpudl_item {
u64 dl;
int cpu;
+ int idx;
};
struct cpudl {
raw_spinlock_t lock;
int size;
- int cpu_to_idx[NR_CPUS];
- struct array_item elements[NR_CPUS];
cpumask_var_t free_cpus;
+ struct cpudl_item *elements;
};
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 8b836b376d91..981fcd7dc394 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -30,6 +30,7 @@
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>
+#include <linux/slab.h>
#include "cpupri.h"
/* Convert between a 140 based task->prio, and our 102 based cpupri */
@@ -70,8 +71,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
int idx = 0;
int task_pri = convert_prio(p->prio);
- if (task_pri >= MAX_RT_PRIO)
- return 0;
+ BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
for (idx = 0; idx < task_pri; idx++) {
struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
@@ -165,7 +165,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
* do a write memory barrier, and then update the count, to
* make sure the vector is visible when count is set.
*/
- smp_mb__before_atomic_inc();
+ smp_mb__before_atomic();
atomic_inc(&(vec)->count);
do_mb = 1;
}
@@ -185,14 +185,14 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
* the new priority vec.
*/
if (do_mb)
- smp_mb__after_atomic_inc();
+ smp_mb__after_atomic();
/*
* When removing from the vector, we decrement the counter first
* do a memory barrier and then clear the mask.
*/
atomic_dec(&(vec)->count);
- smp_mb__after_atomic_inc();
+ smp_mb__after_atomic();
cpumask_clear_cpu(cpu, vec->mask);
}
@@ -219,8 +219,13 @@ int cpupri_init(struct cpupri *cp)
goto cleanup;
}
+ cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
+ if (!cp->cpu_to_pri)
+ goto cleanup;
+
for_each_possible_cpu(i)
cp->cpu_to_pri[i] = CPUPRI_INVALID;
+
return 0;
cleanup:
@@ -237,6 +242,7 @@ void cpupri_cleanup(struct cpupri *cp)
{
int i;
+ kfree(cp->cpu_to_pri);
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
free_cpumask_var(cp->pri_to_cpu[i].mask);
}
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index f6d756173491..6b033347fdfd 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -17,7 +17,7 @@ struct cpupri_vec {
struct cpupri {
struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
- int cpu_to_pri[NR_CPUS];
+ int *cpu_to_pri;
};
#ifdef CONFIG_SMP
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 99947919e30b..72fdf06ef865 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -142,7 +142,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
p->utimescaled += cputime_scaled;
account_group_user_time(p, cputime);
- index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
+ index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
/* Add user time to cpustat. */
task_group_account_field(p, index, (__force u64) cputime);
@@ -169,7 +169,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
p->gtime += cputime;
/* Add guest time to cpustat. */
- if (TASK_NICE(p) > 0) {
+ if (task_nice(p) > 0) {
cpustat[CPUTIME_NICE] += (__force u64) cputime;
cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
} else {
@@ -258,16 +258,22 @@ static __always_inline bool steal_account_process_tick(void)
{
#ifdef CONFIG_PARAVIRT
if (static_key_false(&paravirt_steal_enabled)) {
- u64 steal, st = 0;
+ u64 steal;
+ cputime_t steal_ct;
steal = paravirt_steal_clock(smp_processor_id());
steal -= this_rq()->prev_steal_time;
- st = steal_ticks(steal);
- this_rq()->prev_steal_time += st * TICK_NSEC;
+ /*
+ * cputime_t may be less precise than nsecs (eg: if it's
+ * based on jiffies). Lets cast the result to cputime
+ * granularity and account the rest on the next rounds.
+ */
+ steal_ct = nsecs_to_cputime(steal);
+ this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
- account_steal_time(st);
- return st;
+ account_steal_time(steal_ct);
+ return steal_ct;
}
#endif
return false;
@@ -326,50 +332,50 @@ out:
* softirq as those do not count in task exec_runtime any more.
*/
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq)
+ struct rq *rq, int ticks)
{
- cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
+ u64 cputime = (__force u64) cputime_one_jiffy;
u64 *cpustat = kcpustat_this_cpu->cpustat;
if (steal_account_process_tick())
return;
+ cputime *= ticks;
+ scaled *= ticks;
+
if (irqtime_account_hi_update()) {
- cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
+ cpustat[CPUTIME_IRQ] += cputime;
} else if (irqtime_account_si_update()) {
- cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
+ cpustat[CPUTIME_SOFTIRQ] += cputime;
} else if (this_cpu_ksoftirqd() == p) {
/*
* ksoftirqd time do not get accounted in cpu_softirq_time.
* So, we have to handle it separately here.
* Also, p->stime needs to be updated for ksoftirqd.
*/
- __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- CPUTIME_SOFTIRQ);
+ __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
} else if (user_tick) {
- account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ account_user_time(p, cputime, scaled);
} else if (p == rq->idle) {
- account_idle_time(cputime_one_jiffy);
+ account_idle_time(cputime);
} else if (p->flags & PF_VCPU) { /* System time or guest time */
- account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ account_guest_time(p, cputime, scaled);
} else {
- __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- CPUTIME_SYSTEM);
+ __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM);
}
}
static void irqtime_account_idle_ticks(int ticks)
{
- int i;
struct rq *rq = this_rq();
- for (i = 0; i < ticks; i++)
- irqtime_account_process_tick(current, 0, rq);
+ irqtime_account_process_tick(current, 0, rq, ticks);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
static inline void irqtime_account_idle_ticks(int ticks) {}
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq) {}
+ struct rq *rq, int nr_ticks) {}
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
/*
@@ -458,7 +464,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
return;
if (sched_clock_irqtime) {
- irqtime_account_process_tick(p, user_tick, rq);
+ irqtime_account_process_tick(p, user_tick, rq, 1);
return;
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6e79b3faa4cd..255ce138b652 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -57,8 +57,6 @@ void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
dl_b->dl_runtime = runtime;
}
-extern unsigned long to_ratio(u64 period, u64 runtime);
-
void init_dl_bw(struct dl_bw *dl_b)
{
raw_spin_lock_init(&dl_b->lock);
@@ -210,6 +208,16 @@ static inline int has_pushable_dl_tasks(struct rq *rq)
static int push_dl_task(struct rq *rq);
+static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
+{
+ return dl_task(prev);
+}
+
+static inline void set_post_schedule(struct rq *rq)
+{
+ rq->post_schedule = has_pushable_dl_tasks(rq);
+}
+
#else
static inline
@@ -232,6 +240,19 @@ void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
}
+static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
+{
+ return false;
+}
+
+static inline int pull_dl_task(struct rq *rq)
+{
+ return 0;
+}
+
+static inline void set_post_schedule(struct rq *rq)
+{
+}
#endif /* CONFIG_SMP */
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
@@ -285,7 +306,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
* the overrunning entity can't interfere with other entity in the system and
* can't make them miss their deadlines. Reasons why this kind of overruns
* could happen are, typically, a entity voluntarily trying to overcome its
- * runtime, or it just underestimated it during sched_setscheduler_ex().
+ * runtime, or it just underestimated it during sched_setattr().
*/
static void replenish_dl_entity(struct sched_dl_entity *dl_se,
struct sched_dl_entity *pi_se)
@@ -325,12 +346,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
* entity.
*/
if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
- static bool lag_once = false;
-
- if (!lag_once) {
- lag_once = true;
- printk_sched("sched: DL replenish lagged to much\n");
- }
+ printk_deferred_once("sched: DL replenish lagged to much\n");
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
dl_se->runtime = pi_se->dl_runtime;
}
@@ -490,14 +506,22 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
struct sched_dl_entity,
dl_timer);
struct task_struct *p = dl_task_of(dl_se);
- struct rq *rq = task_rq(p);
+ struct rq *rq;
+again:
+ rq = task_rq(p);
raw_spin_lock(&rq->lock);
+ if (rq != task_rq(p)) {
+ /* Task was moved, retrying. */
+ raw_spin_unlock(&rq->lock);
+ goto again;
+ }
+
/*
* We need to take care of a possible races here. In fact, the
* task might have changed its scheduling policy to something
* different from SCHED_DEADLINE or changed its reservation
- * parameters (through sched_setscheduler()).
+ * parameters (through sched_setattr()).
*/
if (!dl_task(p) || dl_se->dl_new)
goto unlock;
@@ -505,12 +529,13 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
sched_clock_tick();
update_rq_clock(rq);
dl_se->dl_throttled = 0;
+ dl_se->dl_yielded = 0;
if (p->on_rq) {
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
if (task_has_dl_policy(rq->curr))
check_preempt_curr_dl(rq, p, 0);
else
- resched_task(rq->curr);
+ resched_curr(rq);
#ifdef CONFIG_SMP
/*
* Queueing this task back might have overloaded rq,
@@ -586,8 +611,8 @@ static void update_curr_dl(struct rq *rq)
* approach need further study.
*/
delta_exec = rq_clock_task(rq) - curr->se.exec_start;
- if (unlikely((s64)delta_exec < 0))
- delta_exec = 0;
+ if (unlikely((s64)delta_exec <= 0))
+ return;
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
@@ -609,7 +634,7 @@ static void update_curr_dl(struct rq *rq)
enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
if (!is_leftmost(curr, &rq->dl))
- resched_task(curr);
+ resched_curr(rq);
}
/*
@@ -717,7 +742,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
WARN_ON(!dl_prio(prio));
dl_rq->dl_nr_running++;
- inc_nr_running(rq_of_dl_rq(dl_rq));
+ add_nr_running(rq_of_dl_rq(dl_rq), 1);
inc_dl_deadline(dl_rq, deadline);
inc_dl_migration(dl_se, dl_rq);
@@ -731,7 +756,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
WARN_ON(!dl_prio(prio));
WARN_ON(!dl_rq->dl_nr_running);
dl_rq->dl_nr_running--;
- dec_nr_running(rq_of_dl_rq(dl_rq));
+ sub_nr_running(rq_of_dl_rq(dl_rq), 1);
dec_dl_deadline(dl_rq, dl_se->deadline);
dec_dl_migration(dl_se, dl_rq);
@@ -870,10 +895,10 @@ static void yield_task_dl(struct rq *rq)
* We make the task go to sleep until its current deadline by
* forcing its runtime to zero. This way, update_curr_dl() stops
* it and the bandwidth timer will wake it up and will give it
- * new scheduling parameters (thanks to dl_new=1).
+ * new scheduling parameters (thanks to dl_yielded=1).
*/
if (p->dl.runtime > 0) {
- rq->curr->dl.dl_new = 1;
+ rq->curr->dl.dl_yielded = 1;
p->dl.runtime = 0;
}
update_curr_dl(rq);
@@ -939,9 +964,11 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
return;
- resched_task(rq->curr);
+ resched_curr(rq);
}
+static int pull_dl_task(struct rq *this_rq);
+
#endif /* CONFIG_SMP */
/*
@@ -952,7 +979,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
int flags)
{
if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
- resched_task(rq->curr);
+ resched_curr(rq);
return;
}
@@ -988,7 +1015,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
return rb_entry(left, struct sched_dl_entity, rb_node);
}
-struct task_struct *pick_next_task_dl(struct rq *rq)
+struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
{
struct sched_dl_entity *dl_se;
struct task_struct *p;
@@ -996,9 +1023,29 @@ struct task_struct *pick_next_task_dl(struct rq *rq)
dl_rq = &rq->dl;
+ if (need_pull_dl_task(rq, prev)) {
+ pull_dl_task(rq);
+ /*
+ * pull_rt_task() can drop (and re-acquire) rq->lock; this
+ * means a stop task can slip in, in which case we need to
+ * re-start task selection.
+ */
+ if (rq->stop && rq->stop->on_rq)
+ return RETRY_TASK;
+ }
+
+ /*
+ * When prev is DL, we may throttle it in put_prev_task().
+ * So, we update time before we check for dl_nr_running.
+ */
+ if (prev->sched_class == &dl_sched_class)
+ update_curr_dl(rq);
+
if (unlikely(!dl_rq->dl_nr_running))
return NULL;
+ put_prev_task(rq, prev);
+
dl_se = pick_next_dl_entity(rq, dl_rq);
BUG_ON(!dl_se);
@@ -1013,9 +1060,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq)
start_hrtick_dl(rq, p);
#endif
-#ifdef CONFIG_SMP
- rq->post_schedule = has_pushable_dl_tasks(rq);
-#endif /* CONFIG_SMP */
+ set_post_schedule(rq);
return p;
}
@@ -1288,7 +1333,7 @@ retry:
if (dl_task(rq->curr) &&
dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
rq->curr->nr_cpus_allowed > 1) {
- resched_task(rq->curr);
+ resched_curr(rq);
return 0;
}
@@ -1328,7 +1373,7 @@ retry:
set_task_cpu(next_task, later_rq->cpu);
activate_task(later_rq, next_task, 0);
- resched_task(later_rq->curr);
+ resched_curr(later_rq);
double_unlock_balance(rq, later_rq);
@@ -1424,13 +1469,6 @@ skip:
return ret;
}
-static void pre_schedule_dl(struct rq *rq, struct task_struct *prev)
-{
- /* Try to pull other tasks here */
- if (dl_task(prev))
- pull_dl_task(rq);
-}
-
static void post_schedule_dl(struct rq *rq)
{
push_dl_tasks(rq);
@@ -1558,7 +1596,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
if (unlikely(p->dl.dl_throttled))
return;
- if (p->on_rq || rq->curr != p) {
+ if (p->on_rq && rq->curr != p) {
#ifdef CONFIG_SMP
if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
/* Only reschedule if pushing failed */
@@ -1594,14 +1632,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
*/
if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
rq->curr == p)
- resched_task(p);
+ resched_curr(rq);
#else
/*
* Again, we don't know if p has a earlier
* or later deadline, so let's blindly set a
* (maybe not needed) rescheduling point.
*/
- resched_task(p);
+ resched_curr(rq);
#endif /* CONFIG_SMP */
} else
switched_to_dl(rq, p);
@@ -1623,7 +1661,6 @@ const struct sched_class dl_sched_class = {
.set_cpus_allowed = set_cpus_allowed_dl,
.rq_online = rq_online_dl,
.rq_offline = rq_offline_dl,
- .pre_schedule = pre_schedule_dl,
.post_schedule = post_schedule_dl,
.task_woken = task_woken_dl,
#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index dd52e7ffb10e..627b3c34b821 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -111,8 +111,7 @@ static char *task_group_path(struct task_group *tg)
if (autogroup_path(tg, group_path, PATH_MAX))
return group_path;
- cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
- return group_path;
+ return cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
}
#endif
@@ -321,6 +320,7 @@ do { \
P(sched_goidle);
#ifdef CONFIG_SMP
P64(avg_idle);
+ P64(max_idle_balance_cost);
#endif
P(ttwu_count);
@@ -533,15 +533,15 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
unsigned long nr_faults = -1;
int cpu_current, home_node;
- if (p->numa_faults)
- nr_faults = p->numa_faults[2*node + i];
+ if (p->numa_faults_memory)
+ nr_faults = p->numa_faults_memory[2*node + i];
cpu_current = !i ? (task_node(p) == node) :
(pol && node_isset(node, pol->v.nodes));
home_node = (p->numa_preferred_nid == node);
- SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n",
+ SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n",
i, node, cpu_current, home_node, nr_faults);
}
}
@@ -608,7 +608,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
avg_atom = p->se.sum_exec_runtime;
if (nr_switches)
- do_div(avg_atom, nr_switches);
+ avg_atom = div64_ul(avg_atom, nr_switches);
else
avg_atom = -1LL;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9b4c4f320130..bfa3c86d0d68 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -322,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group ? */
-static inline int
+static inline struct cfs_rq *
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
if (se->cfs_rq == pse->cfs_rq)
- return 1;
+ return se->cfs_rq;
- return 0;
+ return NULL;
}
static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -336,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
return se->parent;
}
-/* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
- int depth = 0;
-
- for_each_sched_entity(se)
- depth++;
-
- return depth;
-}
-
static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
@@ -360,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
*/
/* First walk up until both entities are at same depth */
- se_depth = depth_se(*se);
- pse_depth = depth_se(*pse);
+ se_depth = (*se)->depth;
+ pse_depth = (*pse)->depth;
while (se_depth > pse_depth) {
se_depth--;
@@ -426,12 +415,6 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
-static inline int
-is_same_group(struct sched_entity *se, struct sched_entity *pse)
-{
- return 1;
-}
-
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
return NULL;
@@ -819,14 +802,6 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
-/*
- * After skipping a page migration on a shared page, skip N more numa page
- * migrations unconditionally. This reduces the number of NUMA migrations
- * in shared memory workloads, and has the effect of pulling tasks towards
- * where their memory lives, over pulling the memory towards the task.
- */
-unsigned int sysctl_numa_balancing_migrate_deferred = 16;
-
static unsigned int task_nr_scan_windows(struct task_struct *p)
{
unsigned long rss = 0;
@@ -893,10 +868,26 @@ struct numa_group {
struct list_head task_list;
struct rcu_head rcu;
+ nodemask_t active_nodes;
unsigned long total_faults;
+ /*
+ * Faults_cpu is used to decide whether memory should move
+ * towards the CPU. As a consequence, these stats are weighted
+ * more by CPU use than by memory faults.
+ */
+ unsigned long *faults_cpu;
unsigned long faults[0];
};
+/* Shared or private faults. */
+#define NR_NUMA_HINT_FAULT_TYPES 2
+
+/* Memory and CPU locality */
+#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
+
+/* Averaged statistics, and temporary buffers. */
+#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
+
pid_t task_numa_group_id(struct task_struct *p)
{
return p->numa_group ? p->numa_group->gid : 0;
@@ -904,16 +895,16 @@ pid_t task_numa_group_id(struct task_struct *p)
static inline int task_faults_idx(int nid, int priv)
{
- return 2 * nid + priv;
+ return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
}
static inline unsigned long task_faults(struct task_struct *p, int nid)
{
- if (!p->numa_faults)
+ if (!p->numa_faults_memory)
return 0;
- return p->numa_faults[task_faults_idx(nid, 0)] +
- p->numa_faults[task_faults_idx(nid, 1)];
+ return p->numa_faults_memory[task_faults_idx(nid, 0)] +
+ p->numa_faults_memory[task_faults_idx(nid, 1)];
}
static inline unsigned long group_faults(struct task_struct *p, int nid)
@@ -925,6 +916,12 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
p->numa_group->faults[task_faults_idx(nid, 1)];
}
+static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
+{
+ return group->faults_cpu[task_faults_idx(nid, 0)] +
+ group->faults_cpu[task_faults_idx(nid, 1)];
+}
+
/*
* These return the fraction of accesses done by a particular task, or
* task group, on a particular numa node. The group weight is given a
@@ -935,7 +932,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
{
unsigned long total_faults;
- if (!p->numa_faults)
+ if (!p->numa_faults_memory)
return 0;
total_faults = p->total_numa_faults;
@@ -954,10 +951,73 @@ static inline unsigned long group_weight(struct task_struct *p, int nid)
return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
}
+bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
+ int src_nid, int dst_cpu)
+{
+ struct numa_group *ng = p->numa_group;
+ int dst_nid = cpu_to_node(dst_cpu);
+ int last_cpupid, this_cpupid;
+
+ this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
+
+ /*
+ * Multi-stage node selection is used in conjunction with a periodic
+ * migration fault to build a temporal task<->page relation. By using
+ * a two-stage filter we remove short/unlikely relations.
+ *
+ * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
+ * a task's usage of a particular page (n_p) per total usage of this
+ * page (n_t) (in a given time-span) to a probability.
+ *
+ * Our periodic faults will sample this probability and getting the
+ * same result twice in a row, given these samples are fully
+ * independent, is then given by P(n)^2, provided our sample period
+ * is sufficiently short compared to the usage pattern.
+ *
+ * This quadric squishes small probabilities, making it less likely we
+ * act on an unlikely task<->page relation.
+ */
+ last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+ if (!cpupid_pid_unset(last_cpupid) &&
+ cpupid_to_nid(last_cpupid) != dst_nid)
+ return false;
+
+ /* Always allow migrate on private faults */
+ if (cpupid_match_pid(p, last_cpupid))
+ return true;
+
+ /* A shared fault, but p->numa_group has not been set up yet. */
+ if (!ng)
+ return true;
+
+ /*
+ * Do not migrate if the destination is not a node that
+ * is actively used by this numa group.
+ */
+ if (!node_isset(dst_nid, ng->active_nodes))
+ return false;
+
+ /*
+ * Source is a node that is not actively used by this
+ * numa group, while the destination is. Migrate.
+ */
+ if (!node_isset(src_nid, ng->active_nodes))
+ return true;
+
+ /*
+ * Both source and destination are nodes in active
+ * use by this numa group. Maximize memory bandwidth
+ * by migrating from more heavily used groups, to less
+ * heavily used ones, spreading the load around.
+ * Use a 1/4 hysteresis to avoid spurious page movement.
+ */
+ return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+}
+
static unsigned long weighted_cpuload(const int cpu);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
-static unsigned long power_of(int cpu);
+static unsigned long capacity_of(int cpu);
static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
/* Cached statistics for all CPUs within a node */
@@ -966,11 +1026,11 @@ struct numa_stats {
unsigned long load;
/* Total compute capacity of CPUs on a node */
- unsigned long power;
+ unsigned long compute_capacity;
/* Approximate capacity in terms of runnable tasks on a node */
- unsigned long capacity;
- int has_capacity;
+ unsigned long task_capacity;
+ int has_free_capacity;
};
/*
@@ -986,7 +1046,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
ns->nr_running += rq->nr_running;
ns->load += weighted_cpuload(cpu);
- ns->power += power_of(cpu);
+ ns->compute_capacity += capacity_of(cpu);
cpus++;
}
@@ -996,15 +1056,15 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
* the @ns structure is NULL'ed and task_numa_compare() will
* not find this node attractive.
*
- * We'll either bail at !has_capacity, or we'll detect a huge imbalance
- * and bail there.
+ * We'll either bail at !has_free_capacity, or we'll detect a huge
+ * imbalance and bail there.
*/
if (!cpus)
return;
- ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
- ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
- ns->has_capacity = (ns->nr_running < ns->capacity);
+ ns->task_capacity =
+ DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
+ ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
}
struct task_numa_env {
@@ -1035,6 +1095,50 @@ static void task_numa_assign(struct task_numa_env *env,
env->best_cpu = env->dst_cpu;
}
+static bool load_too_imbalanced(long src_load, long dst_load,
+ struct task_numa_env *env)
+{
+ long imb, old_imb;
+ long orig_src_load, orig_dst_load;
+ long src_capacity, dst_capacity;
+
+ /*
+ * The load is corrected for the CPU capacity available on each node.
+ *
+ * src_load dst_load
+ * ------------ vs ---------
+ * src_capacity dst_capacity
+ */
+ src_capacity = env->src_stats.compute_capacity;
+ dst_capacity = env->dst_stats.compute_capacity;
+
+ /* We care about the slope of the imbalance, not the direction. */
+ if (dst_load < src_load)
+ swap(dst_load, src_load);
+
+ /* Is the difference below the threshold? */
+ imb = dst_load * src_capacity * 100 -
+ src_load * dst_capacity * env->imbalance_pct;
+ if (imb <= 0)
+ return false;
+
+ /*
+ * The imbalance is above the allowed threshold.
+ * Compare it with the old imbalance.
+ */
+ orig_src_load = env->src_stats.load;
+ orig_dst_load = env->dst_stats.load;
+
+ if (orig_dst_load < orig_src_load)
+ swap(orig_dst_load, orig_src_load);
+
+ old_imb = orig_dst_load * src_capacity * 100 -
+ orig_src_load * dst_capacity * env->imbalance_pct;
+
+ /* Would this change make things worse? */
+ return (imb > old_imb);
+}
+
/*
* This checks if the overall compute and NUMA accesses of the system would
* be improved if the source tasks was migrated to the target dst_cpu taking
@@ -1047,9 +1151,10 @@ static void task_numa_compare(struct task_numa_env *env,
struct rq *src_rq = cpu_rq(env->src_cpu);
struct rq *dst_rq = cpu_rq(env->dst_cpu);
struct task_struct *cur;
- long dst_load, src_load;
+ long src_load, dst_load;
long load;
- long imp = (groupimp > 0) ? groupimp : taskimp;
+ long imp = env->p->numa_group ? groupimp : taskimp;
+ long moveimp = imp;
rcu_read_lock();
cur = ACCESS_ONCE(dst_rq->curr);
@@ -1087,11 +1192,6 @@ static void task_numa_compare(struct task_numa_env *env,
* itself (not part of a group), use the task weight
* instead.
*/
- if (env->p->numa_group)
- imp = groupimp;
- else
- imp = taskimp;
-
if (cur->numa_group)
imp += group_weight(cur, env->src_nid) -
group_weight(cur, env->dst_nid);
@@ -1101,33 +1201,47 @@ static void task_numa_compare(struct task_numa_env *env,
}
}
- if (imp < env->best_imp)
+ if (imp <= env->best_imp && moveimp <= env->best_imp)
goto unlock;
if (!cur) {
/* Is there capacity at our destination? */
- if (env->src_stats.has_capacity &&
- !env->dst_stats.has_capacity)
+ if (env->src_stats.has_free_capacity &&
+ !env->dst_stats.has_free_capacity)
goto unlock;
goto balance;
}
/* Balance doesn't matter much if we're running a task per cpu */
- if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
+ if (imp > env->best_imp && src_rq->nr_running == 1 &&
+ dst_rq->nr_running == 1)
goto assign;
/*
* In the overloaded case, try and keep the load balanced.
*/
balance:
- dst_load = env->dst_stats.load;
- src_load = env->src_stats.load;
-
- /* XXX missing power terms */
load = task_h_load(env->p);
- dst_load += load;
- src_load -= load;
+ dst_load = env->dst_stats.load + load;
+ src_load = env->src_stats.load - load;
+
+ if (moveimp > imp && moveimp > env->best_imp) {
+ /*
+ * If the improvement from just moving env->p direction is
+ * better than swapping tasks around, check if a move is
+ * possible. Store a slightly smaller score than moveimp,
+ * so an actually idle CPU will win.
+ */
+ if (!load_too_imbalanced(src_load, dst_load, env)) {
+ imp = moveimp - 1;
+ cur = NULL;
+ goto assign;
+ }
+ }
+
+ if (imp <= env->best_imp)
+ goto unlock;
if (cur) {
load = task_h_load(cur);
@@ -1135,11 +1249,7 @@ balance:
src_load += load;
}
- /* make src_load the smaller */
- if (dst_load < src_load)
- swap(dst_load, src_load);
-
- if (src_load * env->imbalance_pct < dst_load * 100)
+ if (load_too_imbalanced(src_load, dst_load, env))
goto unlock;
assign:
@@ -1215,9 +1325,8 @@ static int task_numa_migrate(struct task_struct *p)
groupimp = group_weight(p, env.dst_nid) - groupweight;
update_numa_stats(&env.dst_stats, env.dst_nid);
- /* If the preferred nid has capacity, try to use it. */
- if (env.dst_stats.has_capacity)
- task_numa_find_cpu(&env, taskimp, groupimp);
+ /* Try to find a spot on the preferred nid. */
+ task_numa_find_cpu(&env, taskimp, groupimp);
/* No space available on the preferred nid. Look elsewhere. */
if (env.best_cpu == -1) {
@@ -1237,12 +1346,28 @@ static int task_numa_migrate(struct task_struct *p)
}
}
+ /*
+ * If the task is part of a workload that spans multiple NUMA nodes,
+ * and is migrating into one of the workload's active nodes, remember
+ * this node as the task's preferred numa node, so the workload can
+ * settle down.
+ * A task that migrated to a second choice node will be better off
+ * trying for a better one later. Do not set the preferred node here.
+ */
+ if (p->numa_group) {
+ if (env.best_cpu == -1)
+ nid = env.src_nid;
+ else
+ nid = env.dst_nid;
+
+ if (node_isset(nid, p->numa_group->active_nodes))
+ sched_setnuma(p, env.dst_nid);
+ }
+
/* No better CPU than the current one was found. */
if (env.best_cpu == -1)
return -EAGAIN;
- sched_setnuma(p, env.dst_nid);
-
/*
* Reset the scan period if the task is being rescheduled on an
* alternative node to recheck if the tasks is now properly placed.
@@ -1266,12 +1391,15 @@ static int task_numa_migrate(struct task_struct *p)
/* Attempt to migrate a task to a CPU on the preferred node. */
static void numa_migrate_preferred(struct task_struct *p)
{
+ unsigned long interval = HZ;
+
/* This task has no NUMA fault statistics yet */
- if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
+ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
return;
/* Periodically retry migrating the task to the preferred node */
- p->numa_migrate_retry = jiffies + HZ;
+ interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
+ p->numa_migrate_retry = jiffies + interval;
/* Success if task is already running on preferred CPU */
if (task_node(p) == p->numa_preferred_nid)
@@ -1282,14 +1410,46 @@ static void numa_migrate_preferred(struct task_struct *p)
}
/*
+ * Find the nodes on which the workload is actively running. We do this by
+ * tracking the nodes from which NUMA hinting faults are triggered. This can
+ * be different from the set of nodes where the workload's memory is currently
+ * located.
+ *
+ * The bitmask is used to make smarter decisions on when to do NUMA page
+ * migrations, To prevent flip-flopping, and excessive page migrations, nodes
+ * are added when they cause over 6/16 of the maximum number of faults, but
+ * only removed when they drop below 3/16.
+ */
+static void update_numa_active_node_mask(struct numa_group *numa_group)
+{
+ unsigned long faults, max_faults = 0;
+ int nid;
+
+ for_each_online_node(nid) {
+ faults = group_faults_cpu(numa_group, nid);
+ if (faults > max_faults)
+ max_faults = faults;
+ }
+
+ for_each_online_node(nid) {
+ faults = group_faults_cpu(numa_group, nid);
+ if (!node_isset(nid, numa_group->active_nodes)) {
+ if (faults > max_faults * 6 / 16)
+ node_set(nid, numa_group->active_nodes);
+ } else if (faults < max_faults * 3 / 16)
+ node_clear(nid, numa_group->active_nodes);
+ }
+}
+
+/*
* When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
* increments. The more local the fault statistics are, the higher the scan
- * period will be for the next scan window. If local/remote ratio is below
- * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
- * scan period will decrease
+ * period will be for the next scan window. If local/(local+remote) ratio is
+ * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
+ * the scan period will decrease. Aim for 70% local accesses.
*/
#define NUMA_PERIOD_SLOTS 10
-#define NUMA_PERIOD_THRESHOLD 3
+#define NUMA_PERIOD_THRESHOLD 7
/*
* Increase the scan period (slow down scanning) if the majority of
@@ -1355,11 +1515,41 @@ static void update_task_scan_period(struct task_struct *p,
memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}
+/*
+ * Get the fraction of time the task has been running since the last
+ * NUMA placement cycle. The scheduler keeps similar statistics, but
+ * decays those on a 32ms period, which is orders of magnitude off
+ * from the dozens-of-seconds NUMA balancing period. Use the scheduler
+ * stats only if the task is so new there are no NUMA statistics yet.
+ */
+static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
+{
+ u64 runtime, delta, now;
+ /* Use the start of this time slice to avoid calculations. */
+ now = p->se.exec_start;
+ runtime = p->se.sum_exec_runtime;
+
+ if (p->last_task_numa_placement) {
+ delta = runtime - p->last_sum_exec_runtime;
+ *period = now - p->last_task_numa_placement;
+ } else {
+ delta = p->se.avg.runnable_avg_sum;
+ *period = p->se.avg.runnable_avg_period;
+ }
+
+ p->last_sum_exec_runtime = runtime;
+ p->last_task_numa_placement = now;
+
+ return delta;
+}
+
static void task_numa_placement(struct task_struct *p)
{
int seq, nid, max_nid = -1, max_group_nid = -1;
unsigned long max_faults = 0, max_group_faults = 0;
unsigned long fault_types[2] = { 0, 0 };
+ unsigned long total_faults;
+ u64 runtime, period;
spinlock_t *group_lock = NULL;
seq = ACCESS_ONCE(p->mm->numa_scan_seq);
@@ -1368,10 +1558,14 @@ static void task_numa_placement(struct task_struct *p)
p->numa_scan_seq = seq;
p->numa_scan_period_max = task_scan_max(p);
+ total_faults = p->numa_faults_locality[0] +
+ p->numa_faults_locality[1];
+ runtime = numa_get_avg_runtime(p, &period);
+
/* If the task is part of a group prevent parallel updates to group stats */
if (p->numa_group) {
group_lock = &p->numa_group->lock;
- spin_lock(group_lock);
+ spin_lock_irq(group_lock);
}
/* Find the node with the highest number of faults */
@@ -1379,24 +1573,37 @@ static void task_numa_placement(struct task_struct *p)
unsigned long faults = 0, group_faults = 0;
int priv, i;
- for (priv = 0; priv < 2; priv++) {
- long diff;
+ for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
+ long diff, f_diff, f_weight;
i = task_faults_idx(nid, priv);
- diff = -p->numa_faults[i];
/* Decay existing window, copy faults since last scan */
- p->numa_faults[i] >>= 1;
- p->numa_faults[i] += p->numa_faults_buffer[i];
- fault_types[priv] += p->numa_faults_buffer[i];
- p->numa_faults_buffer[i] = 0;
+ diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
+ fault_types[priv] += p->numa_faults_buffer_memory[i];
+ p->numa_faults_buffer_memory[i] = 0;
- faults += p->numa_faults[i];
- diff += p->numa_faults[i];
+ /*
+ * Normalize the faults_from, so all tasks in a group
+ * count according to CPU use, instead of by the raw
+ * number of faults. Tasks with little runtime have
+ * little over-all impact on throughput, and thus their
+ * faults are less important.
+ */
+ f_weight = div64_u64(runtime << 16, period + 1);
+ f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
+ (total_faults + 1);
+ f_diff = f_weight - p->numa_faults_cpu[i] / 2;
+ p->numa_faults_buffer_cpu[i] = 0;
+
+ p->numa_faults_memory[i] += diff;
+ p->numa_faults_cpu[i] += f_diff;
+ faults += p->numa_faults_memory[i];
p->total_numa_faults += diff;
if (p->numa_group) {
/* safe because we can only change our own group */
p->numa_group->faults[i] += diff;
+ p->numa_group->faults_cpu[i] += f_diff;
p->numa_group->total_faults += diff;
group_faults += p->numa_group->faults[i];
}
@@ -1416,30 +1623,18 @@ static void task_numa_placement(struct task_struct *p)
update_task_scan_period(p, fault_types[0], fault_types[1]);
if (p->numa_group) {
- /*
- * If the preferred task and group nids are different,
- * iterate over the nodes again to find the best place.
- */
- if (max_nid != max_group_nid) {
- unsigned long weight, max_weight = 0;
-
- for_each_online_node(nid) {
- weight = task_weight(p, nid) + group_weight(p, nid);
- if (weight > max_weight) {
- max_weight = weight;
- max_nid = nid;
- }
- }
- }
-
- spin_unlock(group_lock);
+ update_numa_active_node_mask(p->numa_group);
+ spin_unlock_irq(group_lock);
+ max_nid = max_group_nid;
}
- /* Preferred node as the node with the most faults */
- if (max_faults && max_nid != p->numa_preferred_nid) {
- /* Update the preferred nid and migrate task if possible */
- sched_setnuma(p, max_nid);
- numa_migrate_preferred(p);
+ if (max_faults) {
+ /* Set the new preferred node */
+ if (max_nid != p->numa_preferred_nid)
+ sched_setnuma(p, max_nid);
+
+ if (task_node(p) != p->numa_preferred_nid)
+ numa_migrate_preferred(p);
}
}
@@ -1465,7 +1660,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (unlikely(!p->numa_group)) {
unsigned int size = sizeof(struct numa_group) +
- 2*nr_node_ids*sizeof(unsigned long);
+ 4*nr_node_ids*sizeof(unsigned long);
grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
if (!grp)
@@ -1475,9 +1670,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
spin_lock_init(&grp->lock);
INIT_LIST_HEAD(&grp->task_list);
grp->gid = p->pid;
+ /* Second half of the array tracks nids where faults happen */
+ grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
+ nr_node_ids;
- for (i = 0; i < 2*nr_node_ids; i++)
- grp->faults[i] = p->numa_faults[i];
+ node_set(task_node(current), grp->active_nodes);
+
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+ grp->faults[i] = p->numa_faults_memory[i];
grp->total_faults = p->total_numa_faults;
@@ -1532,11 +1732,12 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (!join)
return;
- double_lock(&my_grp->lock, &grp->lock);
+ BUG_ON(irqs_disabled());
+ double_lock_irq(&my_grp->lock, &grp->lock);
- for (i = 0; i < 2*nr_node_ids; i++) {
- my_grp->faults[i] -= p->numa_faults[i];
- grp->faults[i] += p->numa_faults[i];
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
+ my_grp->faults[i] -= p->numa_faults_memory[i];
+ grp->faults[i] += p->numa_faults_memory[i];
}
my_grp->total_faults -= p->total_numa_faults;
grp->total_faults += p->total_numa_faults;
@@ -1546,7 +1747,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
grp->nr_tasks++;
spin_unlock(&my_grp->lock);
- spin_unlock(&grp->lock);
+ spin_unlock_irq(&grp->lock);
rcu_assign_pointer(p->numa_group, grp);
@@ -1561,34 +1762,39 @@ no_join:
void task_numa_free(struct task_struct *p)
{
struct numa_group *grp = p->numa_group;
+ void *numa_faults = p->numa_faults_memory;
+ unsigned long flags;
int i;
- void *numa_faults = p->numa_faults;
if (grp) {
- spin_lock(&grp->lock);
- for (i = 0; i < 2*nr_node_ids; i++)
- grp->faults[i] -= p->numa_faults[i];
+ spin_lock_irqsave(&grp->lock, flags);
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+ grp->faults[i] -= p->numa_faults_memory[i];
grp->total_faults -= p->total_numa_faults;
list_del(&p->numa_entry);
grp->nr_tasks--;
- spin_unlock(&grp->lock);
+ spin_unlock_irqrestore(&grp->lock, flags);
rcu_assign_pointer(p->numa_group, NULL);
put_numa_group(grp);
}
- p->numa_faults = NULL;
- p->numa_faults_buffer = NULL;
+ p->numa_faults_memory = NULL;
+ p->numa_faults_buffer_memory = NULL;
+ p->numa_faults_cpu= NULL;
+ p->numa_faults_buffer_cpu = NULL;
kfree(numa_faults);
}
/*
* Got a PROT_NONE fault for a page on @node.
*/
-void task_numa_fault(int last_cpupid, int node, int pages, int flags)
+void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
{
struct task_struct *p = current;
bool migrated = flags & TNF_MIGRATED;
+ int cpu_node = task_node(current);
+ int local = !!(flags & TNF_FAULT_LOCAL);
int priv;
if (!numabalancing_enabled)
@@ -1603,16 +1809,24 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
return;
/* Allocate buffer to track faults on a per-node basis */
- if (unlikely(!p->numa_faults)) {
- int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
+ if (unlikely(!p->numa_faults_memory)) {
+ int size = sizeof(*p->numa_faults_memory) *
+ NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
- /* numa_faults and numa_faults_buffer share the allocation */
- p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
- if (!p->numa_faults)
+ p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
+ if (!p->numa_faults_memory)
return;
- BUG_ON(p->numa_faults_buffer);
- p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
+ BUG_ON(p->numa_faults_buffer_memory);
+ /*
+ * The averaged statistics, shared & private, memory & cpu,
+ * occupy the first half of the array. The second half of the
+ * array is for current counters, which are averaged into the
+ * first set by task_numa_placement.
+ */
+ p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
+ p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
+ p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
p->total_numa_faults = 0;
memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}
@@ -1629,6 +1843,17 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
task_numa_group(p, last_cpupid, flags, &priv);
}
+ /*
+ * If a workload spans multiple NUMA nodes, a shared fault that
+ * occurs wholly within the set of nodes that the workload is
+ * actively using should be counted as local. This allows the
+ * scan rate to slow down when a workload has settled down.
+ */
+ if (!priv && !local && p->numa_group &&
+ node_isset(cpu_node, p->numa_group->active_nodes) &&
+ node_isset(mem_node, p->numa_group->active_nodes))
+ local = 1;
+
task_numa_placement(p);
/*
@@ -1641,8 +1866,9 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
if (migrated)
p->numa_pages_migrated += pages;
- p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
- p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
+ p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
+ p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
+ p->numa_faults_locality[local] += pages;
}
static void reset_ptenuma_scan(struct task_struct *p)
@@ -2219,13 +2445,20 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
se->avg.load_avg_contrib >>= NICE_0_SHIFT;
}
}
-#else
+
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
+{
+ __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
+ __update_tg_runnable_avg(&rq->avg, &rq->cfs);
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
int force_update) {}
static inline void __update_tg_runnable_avg(struct sched_avg *sa,
struct cfs_rq *cfs_rq) {}
static inline void __update_group_entity_contrib(struct sched_entity *se) {}
-#endif
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
static inline void __update_task_entity_contrib(struct sched_entity *se)
{
@@ -2323,12 +2556,6 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
}
-static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
-{
- __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
- __update_tg_runnable_avg(&rq->avg, &rq->cfs);
-}
-
/* Add the load generated by se into cfs_rq's child load-average */
static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
struct sched_entity *se,
@@ -2416,7 +2643,10 @@ void idle_exit_fair(struct rq *this_rq)
update_rq_runnable_avg(this_rq, 0);
}
-#else
+static int idle_balance(struct rq *this_rq);
+
+#else /* CONFIG_SMP */
+
static inline void update_entity_load_avg(struct sched_entity *se,
int update_cfs_rq) {}
static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
@@ -2428,7 +2658,13 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
int sleep) {}
static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
int force_update) {}
-#endif
+
+static inline int idle_balance(struct rq *rq)
+{
+ return 0;
+}
+
+#endif /* CONFIG_SMP */
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -2578,10 +2814,10 @@ static void __clear_buddies_last(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->last == se)
- cfs_rq->last = NULL;
- else
+ if (cfs_rq->last != se)
break;
+
+ cfs_rq->last = NULL;
}
}
@@ -2589,10 +2825,10 @@ static void __clear_buddies_next(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->next == se)
- cfs_rq->next = NULL;
- else
+ if (cfs_rq->next != se)
break;
+
+ cfs_rq->next = NULL;
}
}
@@ -2600,10 +2836,10 @@ static void __clear_buddies_skip(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->skip == se)
- cfs_rq->skip = NULL;
- else
+ if (cfs_rq->skip != se)
break;
+
+ cfs_rq->skip = NULL;
}
}
@@ -2679,7 +2915,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
ideal_runtime = sched_slice(cfs_rq, curr);
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
if (delta_exec > ideal_runtime) {
- resched_task(rq_of(cfs_rq)->curr);
+ resched_curr(rq_of(cfs_rq));
/*
* The current task ran long enough, ensure it doesn't get
* re-elected due to buddy favours.
@@ -2703,7 +2939,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
return;
if (delta > ideal_runtime)
- resched_task(rq_of(cfs_rq)->curr);
+ resched_curr(rq_of(cfs_rq));
}
static void
@@ -2746,17 +2982,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
* 3) pick the "last" process, for cache locality
* 4) do not run the "skip" process, if something else is available
*/
-static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *
+pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
- struct sched_entity *se = __pick_first_entity(cfs_rq);
- struct sched_entity *left = se;
+ struct sched_entity *left = __pick_first_entity(cfs_rq);
+ struct sched_entity *se;
+
+ /*
+ * If curr is set we have to see if its left of the leftmost entity
+ * still in the tree, provided there was anything in the tree at all.
+ */
+ if (!left || (curr && entity_before(curr, left)))
+ left = curr;
+
+ se = left; /* ideally we run the leftmost entity */
/*
* Avoid running the skip buddy, if running something else can
* be done without getting too unfair.
*/
if (cfs_rq->skip == se) {
- struct sched_entity *second = __pick_next_entity(se);
+ struct sched_entity *second;
+
+ if (se == curr) {
+ second = __pick_first_entity(cfs_rq);
+ } else {
+ second = __pick_next_entity(se);
+ if (!second || (curr && entity_before(curr, second)))
+ second = curr;
+ }
+
if (second && wakeup_preempt_entity(second, left) < 1)
se = second;
}
@@ -2778,7 +3033,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
return se;
}
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
@@ -2824,7 +3079,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
* validating it and just reschedule.
*/
if (queued) {
- resched_task(rq_of(cfs_rq)->curr);
+ resched_curr(rq_of(cfs_rq));
return;
}
/*
@@ -2942,7 +3197,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
*/
if (!cfs_b->timer_active) {
__refill_cfs_bandwidth_runtime(cfs_b);
- __start_cfs_bandwidth(cfs_b);
+ __start_cfs_bandwidth(cfs_b, false);
}
if (cfs_b->runtime > 0) {
@@ -2987,10 +3242,12 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
* has not truly expired.
*
* Fortunately we can check determine whether this the case by checking
- * whether the global deadline has advanced.
+ * whether the global deadline has advanced. It is valid to compare
+ * cfs_b->runtime_expires without any locks since we only care about
+ * exact equality, so a partial write will still work.
*/
- if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
+ if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
/* extend local deadline, drift is bounded above by 2 ticks */
cfs_rq->runtime_expires += TICK_NSEC;
} else {
@@ -3013,7 +3270,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
* hierarchy can be throttled
*/
if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
- resched_task(rq_of(cfs_rq)->curr);
+ resched_curr(rq_of(cfs_rq));
}
static __always_inline
@@ -3114,14 +3371,18 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
}
if (!se)
- rq->nr_running -= task_delta;
+ sub_nr_running(rq, task_delta);
cfs_rq->throttled = 1;
cfs_rq->throttled_clock = rq_clock(rq);
raw_spin_lock(&cfs_b->lock);
- list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+ /*
+ * Add to the _head_ of the list, so that an already-started
+ * distribute_cfs_runtime will not see us
+ */
+ list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
if (!cfs_b->timer_active)
- __start_cfs_bandwidth(cfs_b);
+ __start_cfs_bandwidth(cfs_b, false);
raw_spin_unlock(&cfs_b->lock);
}
@@ -3165,18 +3426,19 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
}
if (!se)
- rq->nr_running += task_delta;
+ add_nr_running(rq, task_delta);
/* determine whether we need to wake up potentially idle cpu */
if (rq->curr == rq->idle && rq->cfs.nr_running)
- resched_task(rq->curr);
+ resched_curr(rq);
}
static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
u64 remaining, u64 expires)
{
struct cfs_rq *cfs_rq;
- u64 runtime = remaining;
+ u64 runtime;
+ u64 starting_runtime = remaining;
rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -3207,7 +3469,7 @@ next:
}
rcu_read_unlock();
- return remaining;
+ return starting_runtime - remaining;
}
/*
@@ -3219,21 +3481,21 @@ next:
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
{
u64 runtime, runtime_expires;
- int idle = 1, throttled;
+ int throttled;
- raw_spin_lock(&cfs_b->lock);
/* no need to continue the timer with no bandwidth constraint */
if (cfs_b->quota == RUNTIME_INF)
- goto out_unlock;
+ goto out_deactivate;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
- /* idle depends on !throttled (for the case of a large deficit) */
- idle = cfs_b->idle && !throttled;
cfs_b->nr_periods += overrun;
- /* if we're going inactive then everything else can be deferred */
- if (idle)
- goto out_unlock;
+ /*
+ * idle depends on !throttled (for the case of a large deficit), and if
+ * we're going inactive then everything else can be deferred
+ */
+ if (cfs_b->idle && !throttled)
+ goto out_deactivate;
/*
* if we have relooped after returning idle once, we need to update our
@@ -3247,28 +3509,23 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
if (!throttled) {
/* mark as potentially idle for the upcoming period */
cfs_b->idle = 1;
- goto out_unlock;
+ return 0;
}
/* account preceding periods in which throttling occurred */
cfs_b->nr_throttled += overrun;
- /*
- * There are throttled entities so we must first use the new bandwidth
- * to unthrottle them before making it generally available. This
- * ensures that all existing debts will be paid before a new cfs_rq is
- * allowed to run.
- */
- runtime = cfs_b->runtime;
runtime_expires = cfs_b->runtime_expires;
- cfs_b->runtime = 0;
/*
- * This check is repeated as we are holding onto the new bandwidth
- * while we unthrottle. This can potentially race with an unthrottled
- * group trying to acquire new bandwidth from the global pool.
+ * This check is repeated as we are holding onto the new bandwidth while
+ * we unthrottle. This can potentially race with an unthrottled group
+ * trying to acquire new bandwidth from the global pool. This can result
+ * in us over-using our runtime if it is all used during this loop, but
+ * only by limited amounts in that extreme case.
*/
- while (throttled && runtime > 0) {
+ while (throttled && cfs_b->runtime > 0) {
+ runtime = cfs_b->runtime;
raw_spin_unlock(&cfs_b->lock);
/* we can't nest cfs_b->lock while distributing bandwidth */
runtime = distribute_cfs_runtime(cfs_b, runtime,
@@ -3276,10 +3533,10 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
raw_spin_lock(&cfs_b->lock);
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
+
+ cfs_b->runtime -= min(runtime, cfs_b->runtime);
}
- /* return (any) remaining runtime */
- cfs_b->runtime = runtime;
/*
* While we are ensured activity in the period following an
* unthrottle, this also covers the case in which the new bandwidth is
@@ -3287,12 +3544,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
* timer to remain active while there are any throttled entities.)
*/
cfs_b->idle = 0;
-out_unlock:
- if (idle)
- cfs_b->timer_active = 0;
- raw_spin_unlock(&cfs_b->lock);
- return idle;
+ return 0;
+
+out_deactivate:
+ cfs_b->timer_active = 0;
+ return 1;
}
/* a cfs_rq won't donate quota below this amount */
@@ -3390,10 +3647,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
return;
}
- if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
+ if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
runtime = cfs_b->runtime;
- cfs_b->runtime = 0;
- }
+
expires = cfs_b->runtime_expires;
raw_spin_unlock(&cfs_b->lock);
@@ -3404,7 +3660,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
raw_spin_lock(&cfs_b->lock);
if (expires == cfs_b->runtime_expires)
- cfs_b->runtime = runtime;
+ cfs_b->runtime -= min(runtime, cfs_b->runtime);
raw_spin_unlock(&cfs_b->lock);
}
@@ -3433,22 +3689,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
}
/* conditionally throttle active cfs_rq's from put_prev_entity() */
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
if (!cfs_bandwidth_used())
- return;
+ return false;
if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
- return;
+ return false;
/*
* it's possible for a throttled entity to be forced into a running
* state (e.g. set_curr_task), in this case we're finished.
*/
if (cfs_rq_throttled(cfs_rq))
- return;
+ return true;
throttle_cfs_rq(cfs_rq);
+ return true;
}
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -3468,6 +3725,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
int overrun;
int idle = 0;
+ raw_spin_lock(&cfs_b->lock);
for (;;) {
now = hrtimer_cb_get_time(timer);
overrun = hrtimer_forward(timer, now, cfs_b->period);
@@ -3477,6 +3735,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
idle = do_sched_cfs_period_timer(cfs_b, overrun);
}
+ raw_spin_unlock(&cfs_b->lock);
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
@@ -3502,7 +3761,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
}
/* requires cfs_b->lock, may release to reprogram timer */
-void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
{
/*
* The timer may be active because we're trying to set a new bandwidth
@@ -3517,7 +3776,7 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
cpu_relax();
raw_spin_lock(&cfs_b->lock);
/* if someone else restarted the timer then we're done */
- if (cfs_b->timer_active)
+ if (!force && cfs_b->timer_active)
return;
}
@@ -3531,13 +3790,24 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
hrtimer_cancel(&cfs_b->slack_timer);
}
-static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
+static void __maybe_unused update_runtime_enabled(struct rq *rq)
{
struct cfs_rq *cfs_rq;
for_each_leaf_cfs_rq(rq, cfs_rq) {
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+ struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+
+ raw_spin_lock(&cfs_b->lock);
+ cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
+ raw_spin_unlock(&cfs_b->lock);
+ }
+}
+
+static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
+{
+ struct cfs_rq *cfs_rq;
+ for_each_leaf_cfs_rq(rq, cfs_rq) {
if (!cfs_rq->runtime_enabled)
continue;
@@ -3545,7 +3815,13 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
* clock_task is not advancing so we just need to make sure
* there's some valid quota amount
*/
- cfs_rq->runtime_remaining = cfs_b->quota;
+ cfs_rq->runtime_remaining = 1;
+ /*
+ * Offline rq is schedulable till cpu is completely disabled
+ * in take_cpu_down(), so we prevent new cfs throttling here.
+ */
+ cfs_rq->runtime_enabled = 0;
+
if (cfs_rq_throttled(cfs_rq))
unthrottle_cfs_rq(cfs_rq);
}
@@ -3558,7 +3834,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
}
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -3589,6 +3865,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
return NULL;
}
static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+static inline void update_runtime_enabled(struct rq *rq) {}
static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
#endif /* CONFIG_CFS_BANDWIDTH */
@@ -3612,7 +3889,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
if (delta < 0) {
if (rq->curr == p)
- resched_task(p);
+ resched_curr(rq);
return;
}
@@ -3696,7 +3973,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se) {
update_rq_runnable_avg(rq, rq->nr_running);
- inc_nr_running(rq);
+ add_nr_running(rq, 1);
}
hrtick_update(rq);
}
@@ -3756,7 +4033,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
}
if (!se) {
- dec_nr_running(rq);
+ sub_nr_running(rq, 1);
update_rq_runnable_avg(rq, 1);
}
hrtick_update(rq);
@@ -3802,9 +4079,9 @@ static unsigned long target_load(int cpu, int type)
return max(rq->cpu_load[type-1], total);
}
-static unsigned long power_of(int cpu)
+static unsigned long capacity_of(int cpu)
{
- return cpu_rq(cpu)->cpu_power;
+ return cpu_rq(cpu)->cpu_capacity;
}
static unsigned long cpu_avg_load_per_task(int cpu)
@@ -3826,8 +4103,8 @@ static void record_wakee(struct task_struct *p)
* about the boundary, really active task won't care
* about the loss.
*/
- if (jiffies > current->wakee_flip_decay_ts + HZ) {
- current->wakee_flips = 0;
+ if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
+ current->wakee_flips >>= 1;
current->wakee_flip_decay_ts = jiffies;
}
@@ -4047,12 +4324,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
s64 this_eff_load, prev_eff_load;
this_eff_load = 100;
- this_eff_load *= power_of(prev_cpu);
+ this_eff_load *= capacity_of(prev_cpu);
this_eff_load *= this_load +
effective_load(tg, this_cpu, weight, weight);
prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
- prev_eff_load *= power_of(this_cpu);
+ prev_eff_load *= capacity_of(this_cpu);
prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
balanced = this_eff_load <= prev_eff_load;
@@ -4128,8 +4405,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
avg_load += load;
}
- /* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
+ /* Adjust by relative CPU capacity of the group */
+ avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
if (local_group) {
this_load = avg_load;
@@ -4213,13 +4490,14 @@ done:
}
/*
- * sched_balance_self: balance the current task (running on cpu) in domains
- * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
- * SD_BALANCE_EXEC.
+ * select_task_rq_fair: Select target runqueue for the waking task in domains
+ * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
+ * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
*
- * Balance, ie. select the least loaded group.
+ * Balances load by selecting the idlest cpu in the idlest group, or under
+ * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
*
- * Returns the target CPU number, or the same CPU if no balancing is needed.
+ * Returns the target cpu number.
*
* preempt must be disabled.
*/
@@ -4260,10 +4538,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
sd = tmp;
}
- if (affine_sd) {
- if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
- prev_cpu = cpu;
+ if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+ prev_cpu = cpu;
+ if (sd_flag & SD_BALANCE_WAKE) {
new_cpu = select_idle_sibling(p, prev_cpu);
goto unlock;
}
@@ -4331,6 +4609,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
atomic_long_add(se->avg.load_avg_contrib,
&cfs_rq->removed_load);
}
+
+ /* We have migrated, no longer consider this task hot */
+ se->exec_start = 0;
}
#endif /* CONFIG_SMP */
@@ -4477,7 +4758,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;
preempt:
- resched_task(curr);
+ resched_curr(rq);
/*
* Only set the backward buddy when the current task is still
* on the rq. This can happen when a wakeup gets interleaved
@@ -4494,26 +4775,124 @@ preempt:
set_last_buddy(se);
}
-static struct task_struct *pick_next_task_fair(struct rq *rq)
+static struct task_struct *
+pick_next_task_fair(struct rq *rq, struct task_struct *prev)
{
- struct task_struct *p;
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
+ struct task_struct *p;
+ int new_tasks;
+again:
+#ifdef CONFIG_FAIR_GROUP_SCHED
if (!cfs_rq->nr_running)
- return NULL;
+ goto idle;
+
+ if (prev->sched_class != &fair_sched_class)
+ goto simple;
+
+ /*
+ * Because of the set_next_buddy() in dequeue_task_fair() it is rather
+ * likely that a next task is from the same cgroup as the current.
+ *
+ * Therefore attempt to avoid putting and setting the entire cgroup
+ * hierarchy, only change the part that actually changes.
+ */
do {
- se = pick_next_entity(cfs_rq);
+ struct sched_entity *curr = cfs_rq->curr;
+
+ /*
+ * Since we got here without doing put_prev_entity() we also
+ * have to consider cfs_rq->curr. If it is still a runnable
+ * entity, update_curr() will update its vruntime, otherwise
+ * forget we've ever seen it.
+ */
+ if (curr && curr->on_rq)
+ update_curr(cfs_rq);
+ else
+ curr = NULL;
+
+ /*
+ * This call to check_cfs_rq_runtime() will do the throttle and
+ * dequeue its entity in the parent(s). Therefore the 'simple'
+ * nr_running test will indeed be correct.
+ */
+ if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+ goto simple;
+
+ se = pick_next_entity(cfs_rq, curr);
+ cfs_rq = group_cfs_rq(se);
+ } while (cfs_rq);
+
+ p = task_of(se);
+
+ /*
+ * Since we haven't yet done put_prev_entity and if the selected task
+ * is a different task than we started out with, try and touch the
+ * least amount of cfs_rqs.
+ */
+ if (prev != p) {
+ struct sched_entity *pse = &prev->se;
+
+ while (!(cfs_rq = is_same_group(se, pse))) {
+ int se_depth = se->depth;
+ int pse_depth = pse->depth;
+
+ if (se_depth <= pse_depth) {
+ put_prev_entity(cfs_rq_of(pse), pse);
+ pse = parent_entity(pse);
+ }
+ if (se_depth >= pse_depth) {
+ set_next_entity(cfs_rq_of(se), se);
+ se = parent_entity(se);
+ }
+ }
+
+ put_prev_entity(cfs_rq, pse);
+ set_next_entity(cfs_rq, se);
+ }
+
+ if (hrtick_enabled(rq))
+ hrtick_start_fair(rq, p);
+
+ return p;
+simple:
+ cfs_rq = &rq->cfs;
+#endif
+
+ if (!cfs_rq->nr_running)
+ goto idle;
+
+ put_prev_task(rq, prev);
+
+ do {
+ se = pick_next_entity(cfs_rq, NULL);
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
p = task_of(se);
+
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
return p;
+
+idle:
+ new_tasks = idle_balance(rq);
+ /*
+ * Because idle_balance() releases (and re-acquires) rq->lock, it is
+ * possible for any higher priority task to appear. In that case we
+ * must re-start the pick_next_entity() loop.
+ */
+ if (new_tasks < 0)
+ return RETRY_TASK;
+
+ if (new_tasks > 0)
+ goto again;
+
+ return NULL;
}
/*
@@ -4607,14 +4986,14 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
*
* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
*
- * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
+ * C_i is the compute capacity of cpu i, typically it is the
* fraction of 'recent' time available for SCHED_OTHER task execution. But it
* can also include other factors [XXX].
*
* To achieve this balance we define a measure of imbalance which follows
* directly from (1):
*
- * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
+ * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
*
* We them move tasks around to minimize the imbalance. In the continuous
* function space it is obvious this converges, in the discrete case we get
@@ -4750,8 +5129,7 @@ static void move_task(struct task_struct *p, struct lb_env *env)
/*
* Is this task likely cache-hot:
*/
-static int
-task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
+static int task_hot(struct task_struct *p, struct lb_env *env)
{
s64 delta;
@@ -4764,7 +5142,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
/*
* Buddy candidates are cache hot:
*/
- if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
+ if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
(&p->se == cfs_rq_of(&p->se)->next ||
&p->se == cfs_rq_of(&p->se)->last))
return 1;
@@ -4774,7 +5152,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
if (sysctl_sched_migration_cost == 0)
return 0;
- delta = now - p->se.exec_start;
+ delta = rq_clock_task(env->src_rq) - p->se.exec_start;
return delta < (s64)sysctl_sched_migration_cost;
}
@@ -4783,9 +5161,10 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
/* Returns true if the destination node has incurred more faults */
static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
{
+ struct numa_group *numa_group = rcu_dereference(p->numa_group);
int src_nid, dst_nid;
- if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
+ if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
!(env->sd->flags & SD_NUMA)) {
return false;
}
@@ -4796,27 +5175,35 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
if (src_nid == dst_nid)
return false;
- /* Always encourage migration to the preferred node. */
- if (dst_nid == p->numa_preferred_nid)
- return true;
+ if (numa_group) {
+ /* Task is already in the group's interleave set. */
+ if (node_isset(src_nid, numa_group->active_nodes))
+ return false;
- /* If both task and group weight improve, this move is a winner. */
- if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
- group_weight(p, dst_nid) > group_weight(p, src_nid))
+ /* Task is moving into the group's interleave set. */
+ if (node_isset(dst_nid, numa_group->active_nodes))
+ return true;
+
+ return group_faults(p, dst_nid) > group_faults(p, src_nid);
+ }
+
+ /* Encourage migration to the preferred node. */
+ if (dst_nid == p->numa_preferred_nid)
return true;
- return false;
+ return task_faults(p, dst_nid) > task_faults(p, src_nid);
}
static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
+ struct numa_group *numa_group = rcu_dereference(p->numa_group);
int src_nid, dst_nid;
if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
return false;
- if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
+ if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
return false;
src_nid = cpu_to_node(env->src_cpu);
@@ -4825,16 +5212,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
if (src_nid == dst_nid)
return false;
+ if (numa_group) {
+ /* Task is moving within/into the group's interleave set. */
+ if (node_isset(dst_nid, numa_group->active_nodes))
+ return false;
+
+ /* Task is moving out of the group's interleave set. */
+ if (node_isset(src_nid, numa_group->active_nodes))
+ return true;
+
+ return group_faults(p, dst_nid) < group_faults(p, src_nid);
+ }
+
/* Migrating away from the preferred node is always bad. */
if (src_nid == p->numa_preferred_nid)
return true;
- /* If either task or group weight get worse, don't do it. */
- if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
- group_weight(p, dst_nid) < group_weight(p, src_nid))
- return true;
-
- return false;
+ return task_faults(p, dst_nid) < task_faults(p, src_nid);
}
#else
@@ -4912,7 +5306,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* 2) task is cache cold, or
* 3) too many balance attempts have failed.
*/
- tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
+ tsk_cache_hot = task_hot(p, env);
if (!tsk_cache_hot)
tsk_cache_hot = migrate_degrades_locality(p, env);
@@ -5173,13 +5567,13 @@ struct sg_lb_stats {
unsigned long group_load; /* Total load over the CPUs of the group */
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long load_per_task;
- unsigned long group_power;
+ unsigned long group_capacity;
unsigned int sum_nr_running; /* Nr tasks running in the group */
- unsigned int group_capacity;
+ unsigned int group_capacity_factor;
unsigned int idle_cpus;
unsigned int group_weight;
int group_imb; /* Is there an imbalance in the group ? */
- int group_has_capacity; /* Is there extra capacity in the group? */
+ int group_has_free_capacity;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -5194,7 +5588,7 @@ struct sd_lb_stats {
struct sched_group *busiest; /* Busiest group in this sd */
struct sched_group *local; /* Local group in this sd */
unsigned long total_load; /* Total load of all groups in sd */
- unsigned long total_pwr; /* Total power of all groups in sd */
+ unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
@@ -5213,7 +5607,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
.busiest = NULL,
.local = NULL,
.total_load = 0UL,
- .total_pwr = 0UL,
+ .total_capacity = 0UL,
.busiest_stat = {
.avg_load = 0UL,
},
@@ -5248,17 +5642,17 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
return load_idx;
}
-static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
+static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu)
{
- return SCHED_POWER_SCALE;
+ return SCHED_CAPACITY_SCALE;
}
-unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
+unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
{
- return default_scale_freq_power(sd, cpu);
+ return default_scale_capacity(sd, cpu);
}
-static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
+static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu)
{
unsigned long weight = sd->span_weight;
unsigned long smt_gain = sd->smt_gain;
@@ -5268,15 +5662,16 @@ static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
return smt_gain;
}
-unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
+unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu)
{
- return default_scale_smt_power(sd, cpu);
+ return default_scale_smt_capacity(sd, cpu);
}
-static unsigned long scale_rt_power(int cpu)
+static unsigned long scale_rt_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
u64 total, available, age_stamp, avg;
+ s64 delta;
/*
* Since we're reading these variables without serialization make sure
@@ -5285,74 +5680,78 @@ static unsigned long scale_rt_power(int cpu)
age_stamp = ACCESS_ONCE(rq->age_stamp);
avg = ACCESS_ONCE(rq->rt_avg);
- total = sched_avg_period() + (rq_clock(rq) - age_stamp);
+ delta = rq_clock(rq) - age_stamp;
+ if (unlikely(delta < 0))
+ delta = 0;
+
+ total = sched_avg_period() + delta;
if (unlikely(total < avg)) {
- /* Ensures that power won't end up being negative */
+ /* Ensures that capacity won't end up being negative */
available = 0;
} else {
available = total - avg;
}
- if (unlikely((s64)total < SCHED_POWER_SCALE))
- total = SCHED_POWER_SCALE;
+ if (unlikely((s64)total < SCHED_CAPACITY_SCALE))
+ total = SCHED_CAPACITY_SCALE;
- total >>= SCHED_POWER_SHIFT;
+ total >>= SCHED_CAPACITY_SHIFT;
return div_u64(available, total);
}
-static void update_cpu_power(struct sched_domain *sd, int cpu)
+static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
unsigned long weight = sd->span_weight;
- unsigned long power = SCHED_POWER_SCALE;
+ unsigned long capacity = SCHED_CAPACITY_SCALE;
struct sched_group *sdg = sd->groups;
- if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
- if (sched_feat(ARCH_POWER))
- power *= arch_scale_smt_power(sd, cpu);
+ if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) {
+ if (sched_feat(ARCH_CAPACITY))
+ capacity *= arch_scale_smt_capacity(sd, cpu);
else
- power *= default_scale_smt_power(sd, cpu);
+ capacity *= default_scale_smt_capacity(sd, cpu);
- power >>= SCHED_POWER_SHIFT;
+ capacity >>= SCHED_CAPACITY_SHIFT;
}
- sdg->sgp->power_orig = power;
+ sdg->sgc->capacity_orig = capacity;
- if (sched_feat(ARCH_POWER))
- power *= arch_scale_freq_power(sd, cpu);
+ if (sched_feat(ARCH_CAPACITY))
+ capacity *= arch_scale_freq_capacity(sd, cpu);
else
- power *= default_scale_freq_power(sd, cpu);
+ capacity *= default_scale_capacity(sd, cpu);
- power >>= SCHED_POWER_SHIFT;
+ capacity >>= SCHED_CAPACITY_SHIFT;
- power *= scale_rt_power(cpu);
- power >>= SCHED_POWER_SHIFT;
+ capacity *= scale_rt_capacity(cpu);
+ capacity >>= SCHED_CAPACITY_SHIFT;
- if (!power)
- power = 1;
+ if (!capacity)
+ capacity = 1;
- cpu_rq(cpu)->cpu_power = power;
- sdg->sgp->power = power;
+ cpu_rq(cpu)->cpu_capacity = capacity;
+ sdg->sgc->capacity = capacity;
}
-void update_group_power(struct sched_domain *sd, int cpu)
+void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long power, power_orig;
+ unsigned long capacity, capacity_orig;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
interval = clamp(interval, 1UL, max_load_balance_interval);
- sdg->sgp->next_update = jiffies + interval;
+ sdg->sgc->next_update = jiffies + interval;
if (!child) {
- update_cpu_power(sd, cpu);
+ update_cpu_capacity(sd, cpu);
return;
}
- power_orig = power = 0;
+ capacity_orig = capacity = 0;
if (child->flags & SD_OVERLAP) {
/*
@@ -5361,31 +5760,31 @@ void update_group_power(struct sched_domain *sd, int cpu)
*/
for_each_cpu(cpu, sched_group_cpus(sdg)) {
- struct sched_group_power *sgp;
+ struct sched_group_capacity *sgc;
struct rq *rq = cpu_rq(cpu);
/*
- * build_sched_domains() -> init_sched_groups_power()
+ * build_sched_domains() -> init_sched_groups_capacity()
* gets here before we've attached the domains to the
* runqueues.
*
- * Use power_of(), which is set irrespective of domains
- * in update_cpu_power().
+ * Use capacity_of(), which is set irrespective of domains
+ * in update_cpu_capacity().
*
- * This avoids power/power_orig from being 0 and
+ * This avoids capacity/capacity_orig from being 0 and
* causing divide-by-zero issues on boot.
*
- * Runtime updates will correct power_orig.
+ * Runtime updates will correct capacity_orig.
*/
if (unlikely(!rq->sd)) {
- power_orig += power_of(cpu);
- power += power_of(cpu);
+ capacity_orig += capacity_of(cpu);
+ capacity += capacity_of(cpu);
continue;
}
- sgp = rq->sd->groups->sgp;
- power_orig += sgp->power_orig;
- power += sgp->power;
+ sgc = rq->sd->groups->sgc;
+ capacity_orig += sgc->capacity_orig;
+ capacity += sgc->capacity;
}
} else {
/*
@@ -5395,14 +5794,14 @@ void update_group_power(struct sched_domain *sd, int cpu)
group = child->groups;
do {
- power_orig += group->sgp->power_orig;
- power += group->sgp->power;
+ capacity_orig += group->sgc->capacity_orig;
+ capacity += group->sgc->capacity;
group = group->next;
} while (group != child->groups);
}
- sdg->sgp->power_orig = power_orig;
- sdg->sgp->power = power;
+ sdg->sgc->capacity_orig = capacity_orig;
+ sdg->sgc->capacity = capacity;
}
/*
@@ -5416,15 +5815,15 @@ static inline int
fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
{
/*
- * Only siblings can have significantly less than SCHED_POWER_SCALE
+ * Only siblings can have significantly less than SCHED_CAPACITY_SCALE
*/
- if (!(sd->flags & SD_SHARE_CPUPOWER))
+ if (!(sd->flags & SD_SHARE_CPUCAPACITY))
return 0;
/*
- * If ~90% of the cpu_power is still there, we're good.
+ * If ~90% of the cpu_capacity is still there, we're good.
*/
- if (group->sgp->power * 32 > group->sgp->power_orig * 29)
+ if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29)
return 1;
return 0;
@@ -5461,34 +5860,35 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
static inline int sg_imbalanced(struct sched_group *group)
{
- return group->sgp->imbalance;
+ return group->sgc->imbalance;
}
/*
- * Compute the group capacity.
+ * Compute the group capacity factor.
*
- * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
+ * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by
* first dividing out the smt factor and computing the actual number of cores
- * and limit power unit capacity with that.
+ * and limit unit capacity with that.
*/
-static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
+static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group)
{
- unsigned int capacity, smt, cpus;
- unsigned int power, power_orig;
+ unsigned int capacity_factor, smt, cpus;
+ unsigned int capacity, capacity_orig;
- power = group->sgp->power;
- power_orig = group->sgp->power_orig;
+ capacity = group->sgc->capacity;
+ capacity_orig = group->sgc->capacity_orig;
cpus = group->group_weight;
- /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
- smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
- capacity = cpus / smt; /* cores */
+ /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */
+ smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);
+ capacity_factor = cpus / smt; /* cores */
- capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
- if (!capacity)
- capacity = fix_small_capacity(env->sd, group);
+ capacity_factor = min_t(unsigned,
+ capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE));
+ if (!capacity_factor)
+ capacity_factor = fix_small_capacity(env->sd, group);
- return capacity;
+ return capacity_factor;
}
/**
@@ -5498,10 +5898,12 @@ static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
* @load_idx: Load index of sched_domain of this_cpu for load calc.
* @local_group: Does group contain this_cpu.
* @sgs: variable to hold the statistics for this group.
+ * @overload: Indicate more than one runnable task for any CPU.
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group, int load_idx,
- int local_group, struct sg_lb_stats *sgs)
+ int local_group, struct sg_lb_stats *sgs,
+ bool *overload)
{
unsigned long load;
int i;
@@ -5519,6 +5921,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_load += load;
sgs->sum_nr_running += rq->nr_running;
+
+ if (rq->nr_running > 1)
+ *overload = true;
+
#ifdef CONFIG_NUMA_BALANCING
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -5528,9 +5934,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->idle_cpus++;
}
- /* Adjust by relative CPU power of the group */
- sgs->group_power = group->sgp->power;
- sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
+ /* Adjust by relative CPU capacity of the group */
+ sgs->group_capacity = group->sgc->capacity;
+ sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
if (sgs->sum_nr_running)
sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
@@ -5538,10 +5944,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_weight = group->group_weight;
sgs->group_imb = sg_imbalanced(group);
- sgs->group_capacity = sg_capacity(env, group);
+ sgs->group_capacity_factor = sg_capacity_factor(env, group);
- if (sgs->group_capacity > sgs->sum_nr_running)
- sgs->group_has_capacity = 1;
+ if (sgs->group_capacity_factor > sgs->sum_nr_running)
+ sgs->group_has_free_capacity = 1;
}
/**
@@ -5565,7 +5971,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (sgs->avg_load <= sds->busiest_stat.avg_load)
return false;
- if (sgs->sum_nr_running > sgs->group_capacity)
+ if (sgs->sum_nr_running > sgs->group_capacity_factor)
return true;
if (sgs->group_imb)
@@ -5629,6 +6035,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats tmp_sgs;
int load_idx, prefer_sibling = 0;
+ bool overload = false;
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
@@ -5645,28 +6052,29 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
sgs = &sds->local_stat;
if (env->idle != CPU_NEWLY_IDLE ||
- time_after_eq(jiffies, sg->sgp->next_update))
- update_group_power(env->sd, env->dst_cpu);
+ time_after_eq(jiffies, sg->sgc->next_update))
+ update_group_capacity(env->sd, env->dst_cpu);
}
- update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
+ update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
+ &overload);
if (local_group)
goto next_group;
/*
* In case the child domain prefers tasks go to siblings
- * first, lower the sg capacity to one so that we'll try
+ * first, lower the sg capacity factor to one so that we'll try
* and move all the excess tasks away. We lower the capacity
* of a group only if the local group has the capacity to fit
- * these excess tasks, i.e. nr_running < group_capacity. The
+ * these excess tasks, i.e. nr_running < group_capacity_factor. The
* extra check prevents the case where you always pull from the
* heaviest group when it is already under-utilized (possible
* with a large weight task outweighs the tasks on the system).
*/
if (prefer_sibling && sds->local &&
- sds->local_stat.group_has_capacity)
- sgs->group_capacity = min(sgs->group_capacity, 1U);
+ sds->local_stat.group_has_free_capacity)
+ sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
@@ -5676,13 +6084,20 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
next_group:
/* Now, start updating sd_lb_stats */
sds->total_load += sgs->group_load;
- sds->total_pwr += sgs->group_power;
+ sds->total_capacity += sgs->group_capacity;
sg = sg->next;
} while (sg != env->sd->groups);
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
+
+ if (!env->sd->parent) {
+ /* update overload indicator if we are at root domain */
+ if (env->dst_rq->rd->overload != overload)
+ env->dst_rq->rd->overload = overload;
+ }
+
}
/**
@@ -5723,8 +6138,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
return 0;
env->imbalance = DIV_ROUND_CLOSEST(
- sds->busiest_stat.avg_load * sds->busiest_stat.group_power,
- SCHED_POWER_SCALE);
+ sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
+ SCHED_CAPACITY_SCALE);
return 1;
}
@@ -5739,7 +6154,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
static inline
void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
- unsigned long tmp, pwr_now = 0, pwr_move = 0;
+ unsigned long tmp, capa_now = 0, capa_move = 0;
unsigned int imbn = 2;
unsigned long scaled_busy_load_per_task;
struct sg_lb_stats *local, *busiest;
@@ -5753,8 +6168,8 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
imbn = 1;
scaled_busy_load_per_task =
- (busiest->load_per_task * SCHED_POWER_SCALE) /
- busiest->group_power;
+ (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
+ busiest->group_capacity;
if (busiest->avg_load + scaled_busy_load_per_task >=
local->avg_load + (scaled_busy_load_per_task * imbn)) {
@@ -5764,40 +6179,38 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
/*
* OK, we don't have enough imbalance to justify moving tasks,
- * however we may be able to increase total CPU power used by
+ * however we may be able to increase total CPU capacity used by
* moving them.
*/
- pwr_now += busiest->group_power *
+ capa_now += busiest->group_capacity *
min(busiest->load_per_task, busiest->avg_load);
- pwr_now += local->group_power *
+ capa_now += local->group_capacity *
min(local->load_per_task, local->avg_load);
- pwr_now /= SCHED_POWER_SCALE;
+ capa_now /= SCHED_CAPACITY_SCALE;
/* Amount of load we'd subtract */
- tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
- busiest->group_power;
- if (busiest->avg_load > tmp) {
- pwr_move += busiest->group_power *
+ if (busiest->avg_load > scaled_busy_load_per_task) {
+ capa_move += busiest->group_capacity *
min(busiest->load_per_task,
- busiest->avg_load - tmp);
+ busiest->avg_load - scaled_busy_load_per_task);
}
/* Amount of load we'd add */
- if (busiest->avg_load * busiest->group_power <
- busiest->load_per_task * SCHED_POWER_SCALE) {
- tmp = (busiest->avg_load * busiest->group_power) /
- local->group_power;
+ if (busiest->avg_load * busiest->group_capacity <
+ busiest->load_per_task * SCHED_CAPACITY_SCALE) {
+ tmp = (busiest->avg_load * busiest->group_capacity) /
+ local->group_capacity;
} else {
- tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
- local->group_power;
+ tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
+ local->group_capacity;
}
- pwr_move += local->group_power *
+ capa_move += local->group_capacity *
min(local->load_per_task, local->avg_load + tmp);
- pwr_move /= SCHED_POWER_SCALE;
+ capa_move /= SCHED_CAPACITY_SCALE;
/* Move if we gain throughput */
- if (pwr_move > pwr_now)
+ if (capa_move > capa_now)
env->imbalance = busiest->load_per_task;
}
@@ -5827,7 +6240,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/*
* In the presence of smp nice balancing, certain scenarios can have
* max load less than avg load(as we skip the groups at or below
- * its cpu_power, while calculating max_load..)
+ * its cpu_capacity, while calculating max_load..)
*/
if (busiest->avg_load <= sds->avg_load ||
local->avg_load >= sds->avg_load) {
@@ -5842,10 +6255,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* have to drop below capacity to reach cpu-load equilibrium.
*/
load_above_capacity =
- (busiest->sum_nr_running - busiest->group_capacity);
+ (busiest->sum_nr_running - busiest->group_capacity_factor);
- load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
- load_above_capacity /= busiest->group_power;
+ load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE);
+ load_above_capacity /= busiest->group_capacity;
}
/*
@@ -5860,9 +6273,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/* How much load to actually move to equalise the imbalance */
env->imbalance = min(
- max_pull * busiest->group_power,
- (sds->avg_load - local->avg_load) * local->group_power
- ) / SCHED_POWER_SCALE;
+ max_pull * busiest->group_capacity,
+ (sds->avg_load - local->avg_load) * local->group_capacity
+ ) / SCHED_CAPACITY_SCALE;
/*
* if *imbalance is less than the average load per runnable task
@@ -5916,7 +6329,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (!sds.busiest || busiest->sum_nr_running == 0)
goto out_balanced;
- sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
+ sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
+ / sds.total_capacity;
/*
* If the busiest group is imbalanced the below checks don't
@@ -5927,8 +6341,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto force_balance;
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
- if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&
- !busiest->group_has_capacity)
+ if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity &&
+ !busiest->group_has_free_capacity)
goto force_balance;
/*
@@ -5982,11 +6396,11 @@ static struct rq *find_busiest_queue(struct lb_env *env,
struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
- unsigned long busiest_load = 0, busiest_power = 1;
+ unsigned long busiest_load = 0, busiest_capacity = 1;
int i;
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
- unsigned long power, capacity, wl;
+ unsigned long capacity, capacity_factor, wl;
enum fbq_type rt;
rq = cpu_rq(i);
@@ -6014,34 +6428,34 @@ static struct rq *find_busiest_queue(struct lb_env *env,
if (rt > env->fbq_type)
continue;
- power = power_of(i);
- capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
- if (!capacity)
- capacity = fix_small_capacity(env->sd, group);
+ capacity = capacity_of(i);
+ capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);
+ if (!capacity_factor)
+ capacity_factor = fix_small_capacity(env->sd, group);
wl = weighted_cpuload(i);
/*
* When comparing with imbalance, use weighted_cpuload()
- * which is not scaled with the cpu power.
+ * which is not scaled with the cpu capacity.
*/
- if (capacity && rq->nr_running == 1 && wl > env->imbalance)
+ if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance)
continue;
/*
* For the load comparisons with the other cpu's, consider
- * the weighted_cpuload() scaled with the cpu power, so that
- * the load can be moved away from the cpu that is potentially
- * running at a lower capacity.
+ * the weighted_cpuload() scaled with the cpu capacity, so
+ * that the load can be moved away from the cpu that is
+ * potentially running at a lower capacity.
*
- * Thus we're looking for max(wl_i / power_i), crosswise
+ * Thus we're looking for max(wl_i / capacity_i), crosswise
* multiplication to rid ourselves of the division works out
- * to: wl_i * power_j > wl_j * power_i; where j is our
- * previous maximum.
+ * to: wl_i * capacity_j > wl_j * capacity_i; where j is
+ * our previous maximum.
*/
- if (wl * busiest_power > busiest_load * power) {
+ if (wl * busiest_capacity > busiest_load * capacity) {
busiest_load = wl;
- busiest_power = power;
+ busiest_capacity = capacity;
busiest = rq;
}
}
@@ -6249,7 +6663,7 @@ more_balance:
* We failed to reach balance because of affinity.
*/
if (sd_parent) {
- int *group_imbalance = &sd_parent->groups->sgp->imbalance;
+ int *group_imbalance = &sd_parent->groups->sgc->imbalance;
if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
*group_imbalance = 1;
@@ -6355,21 +6769,63 @@ out:
return ld_moved;
}
+static inline unsigned long
+get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
+{
+ unsigned long interval = sd->balance_interval;
+
+ if (cpu_busy)
+ interval *= sd->busy_factor;
+
+ /* scale ms to jiffies */
+ interval = msecs_to_jiffies(interval);
+ interval = clamp(interval, 1UL, max_load_balance_interval);
+
+ return interval;
+}
+
+static inline void
+update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
+{
+ unsigned long interval, next;
+
+ interval = get_sd_balance_interval(sd, cpu_busy);
+ next = sd->last_balance + interval;
+
+ if (time_after(*next_balance, next))
+ *next_balance = next;
+}
+
/*
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*/
-void idle_balance(int this_cpu, struct rq *this_rq)
+static int idle_balance(struct rq *this_rq)
{
+ unsigned long next_balance = jiffies + HZ;
+ int this_cpu = this_rq->cpu;
struct sched_domain *sd;
int pulled_task = 0;
- unsigned long next_balance = jiffies + HZ;
u64 curr_cost = 0;
+ idle_enter_fair(this_rq);
+
+ /*
+ * We must set idle_stamp _before_ calling idle_balance(), such that we
+ * measure the duration of idle_balance() as idle time.
+ */
this_rq->idle_stamp = rq_clock(this_rq);
- if (this_rq->avg_idle < sysctl_sched_migration_cost)
- return;
+ if (this_rq->avg_idle < sysctl_sched_migration_cost ||
+ !this_rq->rd->overload) {
+ rcu_read_lock();
+ sd = rcu_dereference_check_sched_domain(this_rq->sd);
+ if (sd)
+ update_next_balance(sd, 0, &next_balance);
+ rcu_read_unlock();
+
+ goto out;
+ }
/*
* Drop the rq->lock, but keep IRQ/preempt disabled.
@@ -6379,20 +6835,20 @@ void idle_balance(int this_cpu, struct rq *this_rq)
update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
- unsigned long interval;
int continue_balancing = 1;
u64 t0, domain_cost;
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
- if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
+ if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
+ update_next_balance(sd, 0, &next_balance);
break;
+ }
if (sd->flags & SD_BALANCE_NEWIDLE) {
t0 = sched_clock_cpu(this_cpu);
- /* If we've pulled tasks over stop searching: */
pulled_task = load_balance(this_cpu, this_rq,
sd, CPU_NEWLY_IDLE,
&continue_balancing);
@@ -6404,28 +6860,45 @@ void idle_balance(int this_cpu, struct rq *this_rq)
curr_cost += domain_cost;
}
- interval = msecs_to_jiffies(sd->balance_interval);
- if (time_after(next_balance, sd->last_balance + interval))
- next_balance = sd->last_balance + interval;
- if (pulled_task) {
- this_rq->idle_stamp = 0;
+ update_next_balance(sd, 0, &next_balance);
+
+ /*
+ * Stop searching for tasks to pull if there are
+ * now runnable tasks on this rq.
+ */
+ if (pulled_task || this_rq->nr_running > 0)
break;
- }
}
rcu_read_unlock();
raw_spin_lock(&this_rq->lock);
- if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
- /*
- * We are going idle. next_balance may be set based on
- * a busy processor. So reset next_balance.
- */
+ if (curr_cost > this_rq->max_idle_balance_cost)
+ this_rq->max_idle_balance_cost = curr_cost;
+
+ /*
+ * While browsing the domains, we released the rq lock, a task could
+ * have been enqueued in the meantime. Since we're not going idle,
+ * pretend we pulled a task.
+ */
+ if (this_rq->cfs.h_nr_running && !pulled_task)
+ pulled_task = 1;
+
+out:
+ /* Move the next balance forward */
+ if (time_after(this_rq->next_balance, next_balance))
this_rq->next_balance = next_balance;
+
+ /* Is there a task of a high priority class? */
+ if (this_rq->nr_running != this_rq->cfs.h_nr_running)
+ pulled_task = -1;
+
+ if (pulled_task) {
+ idle_exit_fair(this_rq);
+ this_rq->idle_stamp = 0;
}
- if (curr_cost > this_rq->max_idle_balance_cost)
- this_rq->max_idle_balance_cost = curr_cost;
+ return pulled_task;
}
/*
@@ -6496,6 +6969,11 @@ out_unlock:
return 0;
}
+static inline int on_null_domain(struct rq *rq)
+{
+ return unlikely(!rcu_dereference_sched(rq->sd));
+}
+
#ifdef CONFIG_NO_HZ_COMMON
/*
* idle load balancing details
@@ -6550,8 +7028,13 @@ static void nohz_balancer_kick(void)
static inline void nohz_balance_exit_idle(int cpu)
{
if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
- cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
- atomic_dec(&nohz.nr_cpus);
+ /*
+ * Completely isolated CPUs don't ever set, so we must test.
+ */
+ if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
+ cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
+ atomic_dec(&nohz.nr_cpus);
+ }
clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
}
@@ -6568,7 +7051,7 @@ static inline void set_cpu_sd_state_busy(void)
goto unlock;
sd->nohz_idle = 0;
- atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+ atomic_inc(&sd->groups->sgc->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -6585,7 +7068,7 @@ void set_cpu_sd_state_idle(void)
goto unlock;
sd->nohz_idle = 1;
- atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+ atomic_dec(&sd->groups->sgc->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -6605,6 +7088,12 @@ void nohz_balance_enter_idle(int cpu)
if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
return;
+ /*
+ * If we're a completely isolated CPU, we don't play.
+ */
+ if (on_null_domain(cpu_rq(cpu)))
+ return;
+
cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
atomic_inc(&nohz.nr_cpus);
set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
@@ -6682,16 +7171,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
break;
}
- interval = sd->balance_interval;
- if (idle != CPU_IDLE)
- interval *= sd->busy_factor;
-
- /* scale ms to jiffies */
- interval = msecs_to_jiffies(interval);
- interval = clamp(interval, 1UL, max_load_balance_interval);
+ interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
need_serialize = sd->flags & SD_SERIALIZE;
-
if (need_serialize) {
if (!spin_trylock(&balancing))
goto out;
@@ -6707,6 +7189,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
}
sd->last_balance = jiffies;
+ interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
}
if (need_serialize)
spin_unlock(&balancing);
@@ -6764,12 +7247,17 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
rq = cpu_rq(balance_cpu);
- raw_spin_lock_irq(&rq->lock);
- update_rq_clock(rq);
- update_idle_cpu_load(rq);
- raw_spin_unlock_irq(&rq->lock);
-
- rebalance_domains(rq, CPU_IDLE);
+ /*
+ * If time for next balance is due,
+ * do the balance.
+ */
+ if (time_after_eq(jiffies, rq->next_balance)) {
+ raw_spin_lock_irq(&rq->lock);
+ update_rq_clock(rq);
+ update_idle_cpu_load(rq);
+ raw_spin_unlock_irq(&rq->lock);
+ rebalance_domains(rq, CPU_IDLE);
+ }
if (time_after(this_rq->next_balance, rq->next_balance))
this_rq->next_balance = rq->next_balance;
@@ -6784,7 +7272,7 @@ end:
* of an idle cpu is the system.
* - This rq has more than one task.
* - At any scheduler domain level, this cpu's scheduler group has multiple
- * busy cpu's exceeding the group's power.
+ * busy cpu's exceeding the group's capacity.
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
* domain span are idle.
*/
@@ -6792,7 +7280,7 @@ static inline int nohz_kick_needed(struct rq *rq)
{
unsigned long now = jiffies;
struct sched_domain *sd;
- struct sched_group_power *sgp;
+ struct sched_group_capacity *sgc;
int nr_busy, cpu = rq->cpu;
if (unlikely(rq->idle_balance))
@@ -6822,8 +7310,8 @@ static inline int nohz_kick_needed(struct rq *rq)
sd = rcu_dereference(per_cpu(sd_busy, cpu));
if (sd) {
- sgp = sd->groups->sgp;
- nr_busy = atomic_read(&sgp->nr_busy_cpus);
+ sgc = sd->groups->sgc;
+ nr_busy = atomic_read(&sgc->nr_busy_cpus);
if (nr_busy > 1)
goto need_kick_unlock;
@@ -6867,11 +7355,6 @@ static void run_rebalance_domains(struct softirq_action *h)
nohz_idle_balance(this_rq, idle);
}
-static inline int on_null_domain(struct rq *rq)
-{
- return !rcu_dereference_sched(rq->sd);
-}
-
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
*/
@@ -6892,6 +7375,8 @@ void trigger_load_balance(struct rq *rq)
static void rq_online_fair(struct rq *rq)
{
update_sysctl();
+
+ update_runtime_enabled(rq);
}
static void rq_offline_fair(struct rq *rq)
@@ -6965,7 +7450,7 @@ static void task_fork_fair(struct task_struct *p)
* 'current' within the tree based on its new key value.
*/
swap(curr->vruntime, se->vruntime);
- resched_task(rq->curr);
+ resched_curr(rq);
}
se->vruntime -= cfs_rq->min_vruntime;
@@ -6990,7 +7475,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
*/
if (rq->curr == p) {
if (p->prio > oldprio)
- resched_task(rq->curr);
+ resched_curr(rq);
} else
check_preempt_curr(rq, p, 0);
}
@@ -7036,7 +7521,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
*/
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
- if (!p->se.on_rq)
+ struct sched_entity *se = &p->se;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * Since the real-depth could have been changed (only FAIR
+ * class maintain depth value), reset depth properly.
+ */
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
+#endif
+ if (!se->on_rq)
return;
/*
@@ -7045,7 +7538,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
* if we can still preempt the current task.
*/
if (rq->curr == p)
- resched_task(rq->curr);
+ resched_curr(rq);
else
check_preempt_curr(rq, p, 0);
}
@@ -7084,7 +7577,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
#ifdef CONFIG_FAIR_GROUP_SCHED
static void task_move_group_fair(struct task_struct *p, int on_rq)
{
+ struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq;
+
/*
* If the task was not on the rq at the time of this cgroup movement
* it must have been asleep, sleeping tasks keep their ->vruntime
@@ -7110,23 +7605,24 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
* To prevent boost or penalty in the new cfs_rq caused by delta
* min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
*/
- if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
+ if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
on_rq = 1;
if (!on_rq)
- p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+ se->vruntime -= cfs_rq_of(se)->min_vruntime;
set_task_rq(p, task_cpu(p));
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
if (!on_rq) {
- cfs_rq = cfs_rq_of(&p->se);
- p->se.vruntime += cfs_rq->min_vruntime;
+ cfs_rq = cfs_rq_of(se);
+ se->vruntime += cfs_rq->min_vruntime;
#ifdef CONFIG_SMP
/*
* migrate_task_rq_fair() will have removed our previous
* contribution, but we must synchronize for ongoing future
* decay.
*/
- p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
- cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
+ se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+ cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
#endif
}
}
@@ -7222,10 +7718,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
if (!se)
return;
- if (!parent)
+ if (!parent) {
se->cfs_rq = &rq->cfs;
- else
+ se->depth = 0;
+ } else {
se->cfs_rq = parent->my_q;
+ se->depth = parent->depth + 1;
+ }
se->my_q = cfs_rq;
/* guarantee group entities always have weight */
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 5716929a2e3a..90284d117fe6 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -37,18 +37,18 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
SCHED_FEAT(WAKEUP_PREEMPTION, true)
/*
- * Use arch dependent cpu power functions
+ * Use arch dependent cpu capacity functions
*/
-SCHED_FEAT(ARCH_POWER, true)
+SCHED_FEAT(ARCH_CAPACITY, true)
SCHED_FEAT(HRTICK, false)
SCHED_FEAT(DOUBLE_TICK, false)
SCHED_FEAT(LB_BIAS, true)
/*
- * Decrement CPU power based on time not spent running tasks
+ * Decrement CPU capacity based on time not spent running tasks
*/
-SCHED_FEAT(NONTASK_POWER, true)
+SCHED_FEAT(NONTASK_CAPACITY, true)
/*
* Queue remote wakeups on the target CPU and process them
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
new file mode 100644
index 000000000000..11e7bc434f43
--- /dev/null
+++ b/kernel/sched/idle.c
@@ -0,0 +1,269 @@
+/*
+ * Generic entry point for the idle threads
+ */
+#include <linux/sched.h>
+#include <linux/cpu.h>
+#include <linux/cpuidle.h>
+#include <linux/tick.h>
+#include <linux/mm.h>
+#include <linux/stackprotector.h>
+
+#include <asm/tlb.h>
+
+#include <trace/events/power.h>
+
+#include "sched.h"
+
+static int __read_mostly cpu_idle_force_poll;
+
+void cpu_idle_poll_ctrl(bool enable)
+{
+ if (enable) {
+ cpu_idle_force_poll++;
+ } else {
+ cpu_idle_force_poll--;
+ WARN_ON_ONCE(cpu_idle_force_poll < 0);
+ }
+}
+
+#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
+static int __init cpu_idle_poll_setup(char *__unused)
+{
+ cpu_idle_force_poll = 1;
+ return 1;
+}
+__setup("nohlt", cpu_idle_poll_setup);
+
+static int __init cpu_idle_nopoll_setup(char *__unused)
+{
+ cpu_idle_force_poll = 0;
+ return 1;
+}
+__setup("hlt", cpu_idle_nopoll_setup);
+#endif
+
+static inline int cpu_idle_poll(void)
+{
+ rcu_idle_enter();
+ trace_cpu_idle_rcuidle(0, smp_processor_id());
+ local_irq_enable();
+ while (!tif_need_resched())
+ cpu_relax();
+ trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+ rcu_idle_exit();
+ return 1;
+}
+
+/* Weak implementations for optional arch specific functions */
+void __weak arch_cpu_idle_prepare(void) { }
+void __weak arch_cpu_idle_enter(void) { }
+void __weak arch_cpu_idle_exit(void) { }
+void __weak arch_cpu_idle_dead(void) { }
+void __weak arch_cpu_idle(void)
+{
+ cpu_idle_force_poll = 1;
+ local_irq_enable();
+}
+
+/**
+ * cpuidle_idle_call - the main idle function
+ *
+ * NOTE: no locks or semaphores should be used here
+ *
+ * On archs that support TIF_POLLING_NRFLAG, is called with polling
+ * set, and it returns with polling set. If it ever stops polling, it
+ * must clear the polling bit.
+ */
+static void cpuidle_idle_call(void)
+{
+ struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
+ struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+ int next_state, entered_state;
+ unsigned int broadcast;
+
+ /*
+ * Check if the idle task must be rescheduled. If it is the
+ * case, exit the function after re-enabling the local irq.
+ */
+ if (need_resched()) {
+ local_irq_enable();
+ return;
+ }
+
+ /*
+ * During the idle period, stop measuring the disabled irqs
+ * critical sections latencies
+ */
+ stop_critical_timings();
+
+ /*
+ * Tell the RCU framework we are entering an idle section,
+ * so no more rcu read side critical sections and one more
+ * step to the grace period
+ */
+ rcu_idle_enter();
+
+ /*
+ * Ask the cpuidle framework to choose a convenient idle state.
+ * Fall back to the default arch idle method on errors.
+ */
+ next_state = cpuidle_select(drv, dev);
+ if (next_state < 0) {
+use_default:
+ /*
+ * We can't use the cpuidle framework, let's use the default
+ * idle routine.
+ */
+ if (current_clr_polling_and_test())
+ local_irq_enable();
+ else
+ arch_cpu_idle();
+
+ goto exit_idle;
+ }
+
+
+ /*
+ * The idle task must be scheduled, it is pointless to
+ * go to idle, just update no idle residency and get
+ * out of this function
+ */
+ if (current_clr_polling_and_test()) {
+ dev->last_residency = 0;
+ entered_state = next_state;
+ local_irq_enable();
+ goto exit_idle;
+ }
+
+ broadcast = drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP;
+
+ /*
+ * Tell the time framework to switch to a broadcast timer
+ * because our local timer will be shutdown. If a local timer
+ * is used from another cpu as a broadcast timer, this call may
+ * fail if it is not available
+ */
+ if (broadcast &&
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
+ goto use_default;
+
+ /*
+ * Enter the idle state previously returned by the governor decision.
+ * This function will block until an interrupt occurs and will take
+ * care of re-enabling the local interrupts
+ */
+ entered_state = cpuidle_enter(drv, dev, next_state);
+
+ if (broadcast)
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
+
+ /*
+ * Give the governor an opportunity to reflect on the outcome
+ */
+ cpuidle_reflect(dev, entered_state);
+
+exit_idle:
+ __current_set_polling();
+
+ /*
+ * It is up to the idle functions to reenable local interrupts
+ */
+ if (WARN_ON_ONCE(irqs_disabled()))
+ local_irq_enable();
+
+ rcu_idle_exit();
+ start_critical_timings();
+}
+
+/*
+ * Generic idle loop implementation
+ *
+ * Called with polling cleared.
+ */
+static void cpu_idle_loop(void)
+{
+ while (1) {
+ /*
+ * If the arch has a polling bit, we maintain an invariant:
+ *
+ * Our polling bit is clear if we're not scheduled (i.e. if
+ * rq->curr != rq->idle). This means that, if rq->idle has
+ * the polling bit set, then setting need_resched is
+ * guaranteed to cause the cpu to reschedule.
+ */
+
+ __current_set_polling();
+ tick_nohz_idle_enter();
+
+ while (!need_resched()) {
+ check_pgt_cache();
+ rmb();
+
+ if (cpu_is_offline(smp_processor_id()))
+ arch_cpu_idle_dead();
+
+ local_irq_disable();
+ arch_cpu_idle_enter();
+
+ /*
+ * In poll mode we reenable interrupts and spin.
+ *
+ * Also if we detected in the wakeup from idle
+ * path that the tick broadcast device expired
+ * for us, we don't want to go deep idle as we
+ * know that the IPI is going to arrive right
+ * away
+ */
+ if (cpu_idle_force_poll || tick_check_broadcast_expired())
+ cpu_idle_poll();
+ else
+ cpuidle_idle_call();
+
+ arch_cpu_idle_exit();
+ }
+
+ /*
+ * Since we fell out of the loop above, we know
+ * TIF_NEED_RESCHED must be set, propagate it into
+ * PREEMPT_NEED_RESCHED.
+ *
+ * This is required because for polling idle loops we will
+ * not have had an IPI to fold the state for us.
+ */
+ preempt_set_need_resched();
+ tick_nohz_idle_exit();
+ __current_clr_polling();
+
+ /*
+ * We promise to call sched_ttwu_pending and reschedule
+ * if need_resched is set while polling is set. That
+ * means that clearing polling needs to be visible
+ * before doing these things.
+ */
+ smp_mb__after_atomic();
+
+ sched_ttwu_pending();
+ schedule_preempt_disabled();
+ }
+}
+
+void cpu_startup_entry(enum cpuhp_state state)
+{
+ /*
+ * This #ifdef needs to die, but it's too late in the cycle to
+ * make this generic (arm and sh have never invoked the canary
+ * init for the non boot cpus!). Will be fixed in 3.11
+ */
+#ifdef CONFIG_X86
+ /*
+ * If we're the non-boot CPU, nothing set the stack canary up
+ * for us. The boot CPU already has it initialized but no harm
+ * in doing it again. This is a good place for updating it, as
+ * we wont ever return from this function (so the invalid
+ * canaries already on the stack wont ever trigger).
+ */
+ boot_init_stack_canary();
+#endif
+ arch_cpu_idle_prepare();
+ cpu_idle_loop();
+}
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 516c3d9ceea1..67ad4e7f506a 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,33 +13,22 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
{
return task_cpu(p); /* IDLE tasks as never migrated */
}
-
-static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
-{
- idle_exit_fair(rq);
- rq_last_tick_reset(rq);
-}
-
-static void post_schedule_idle(struct rq *rq)
-{
- idle_enter_fair(rq);
-}
#endif /* CONFIG_SMP */
+
/*
* Idle tasks are unconditionally rescheduled:
*/
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
{
- resched_task(rq->idle);
+ resched_curr(rq);
}
-static struct task_struct *pick_next_task_idle(struct rq *rq)
+static struct task_struct *
+pick_next_task_idle(struct rq *rq, struct task_struct *prev)
{
+ put_prev_task(rq, prev);
+
schedstat_inc(rq, sched_goidle);
-#ifdef CONFIG_SMP
- /* Trigger the post schedule to do an idle_enter for CFS */
- rq->post_schedule = 1;
-#endif
return rq->idle;
}
@@ -58,6 +47,8 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
+ idle_exit_fair(rq);
+ rq_last_tick_reset(rq);
}
static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
@@ -101,8 +92,6 @@ const struct sched_class idle_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
- .pre_schedule = pre_schedule_idle,
- .post_schedule = post_schedule_idle,
#endif
.set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c
index 16f5a30f9c88..8ecd552fe4f2 100644
--- a/kernel/sched/proc.c
+++ b/kernel/sched/proc.c
@@ -8,13 +8,6 @@
#include "sched.h"
-unsigned long this_cpu_load(void)
-{
- struct rq *this = this_rq();
- return this->cpu_load[0];
-}
-
-
/*
* Global load-average calculations
*
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 1999021042c7..5f6edca4fafd 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -79,6 +79,8 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
#endif
+ /* We start is dequeued state, because no RT tasks are queued */
+ rt_rq->rt_queued = 0;
rt_rq->rt_time = 0;
rt_rq->rt_throttled = 0;
@@ -112,6 +114,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
return rt_se->rt_rq;
}
+static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *rt_rq = rt_se->rt_rq;
+
+ return rt_rq->rq;
+}
+
void free_rt_sched_group(struct task_group *tg)
{
int i;
@@ -211,10 +220,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
return container_of(rt_rq, struct rq, rt);
}
-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
struct task_struct *p = rt_task_of(rt_se);
- struct rq *rq = task_rq(p);
+
+ return task_rq(p);
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+ struct rq *rq = rq_of_rt_se(rt_se);
return &rq->rt;
}
@@ -229,6 +244,14 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
#ifdef CONFIG_SMP
+static int pull_rt_task(struct rq *this_rq);
+
+static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
+{
+ /* Try to pull RT tasks here if we lower this rq's prio */
+ return rq->rt.highest_prio.curr > prev->prio;
+}
+
static inline int rt_overloaded(struct rq *rq)
{
return atomic_read(&rq->rd->rto_count);
@@ -315,6 +338,15 @@ static inline int has_pushable_tasks(struct rq *rq)
return !plist_head_empty(&rq->rt.pushable_tasks);
}
+static inline void set_post_schedule(struct rq *rq)
+{
+ /*
+ * We detect this state here so that we can avoid taking the RQ
+ * lock again later if there is no need to push
+ */
+ rq->post_schedule = has_pushable_tasks(rq);
+}
+
static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
@@ -359,8 +391,24 @@ void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}
+static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
+{
+ return false;
+}
+
+static inline int pull_rt_task(struct rq *this_rq)
+{
+ return 0;
+}
+
+static inline void set_post_schedule(struct rq *rq)
+{
+}
#endif /* CONFIG_SMP */
+static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
+static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
+
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
return !list_empty(&rt_se->run_list);
@@ -415,17 +463,21 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
+ struct rq *rq = rq_of_rt_rq(rt_rq);
struct sched_rt_entity *rt_se;
- int cpu = cpu_of(rq_of_rt_rq(rt_rq));
+ int cpu = cpu_of(rq);
rt_se = rt_rq->tg->rt_se[cpu];
if (rt_rq->rt_nr_running) {
- if (rt_se && !on_rt_rq(rt_se))
+ if (!rt_se)
+ enqueue_top_rt_rq(rt_rq);
+ else if (!on_rt_rq(rt_se))
enqueue_rt_entity(rt_se, false);
+
if (rt_rq->highest_prio.curr < curr->prio)
- resched_task(curr);
+ resched_curr(rq);
}
}
@@ -436,7 +488,9 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
rt_se = rt_rq->tg->rt_se[cpu];
- if (rt_se && on_rt_rq(rt_se))
+ if (!rt_se)
+ dequeue_top_rt_rq(rt_rq);
+ else if (on_rt_rq(rt_se))
dequeue_rt_entity(rt_se);
}
@@ -507,12 +561,18 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
- if (rt_rq->rt_nr_running)
- resched_task(rq_of_rt_rq(rt_rq)->curr);
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ if (!rt_rq->rt_nr_running)
+ return;
+
+ enqueue_top_rt_rq(rt_rq);
+ resched_curr(rq);
}
static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
+ dequeue_top_rt_rq(rt_rq);
}
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
@@ -681,6 +741,9 @@ balanced:
rt_rq->rt_throttled = 0;
raw_spin_unlock(&rt_rq->rt_runtime_lock);
raw_spin_unlock(&rt_b->rt_runtime_lock);
+
+ /* Make rt_rq available for pick_next_task() */
+ sched_rt_rq_enqueue(rt_rq);
}
}
@@ -831,14 +894,8 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
* but accrue some time due to boosting.
*/
if (likely(rt_b->rt_runtime)) {
- static bool once = false;
-
rt_rq->rt_throttled = 1;
-
- if (!once) {
- once = true;
- printk_sched("sched: RT throttling activated\n");
- }
+ printk_deferred_once("sched: RT throttling activated\n");
} else {
/*
* In case we did anyway, make it go away,
@@ -865,7 +922,6 @@ static void update_curr_rt(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct sched_rt_entity *rt_se = &curr->rt;
- struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
u64 delta_exec;
if (curr->sched_class != &rt_sched_class)
@@ -890,18 +946,50 @@ static void update_curr_rt(struct rq *rq)
return;
for_each_sched_rt_entity(rt_se) {
- rt_rq = rt_rq_of_se(rt_se);
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
raw_spin_lock(&rt_rq->rt_runtime_lock);
rt_rq->rt_time += delta_exec;
if (sched_rt_runtime_exceeded(rt_rq))
- resched_task(curr);
+ resched_curr(rq);
raw_spin_unlock(&rt_rq->rt_runtime_lock);
}
}
}
+static void
+dequeue_top_rt_rq(struct rt_rq *rt_rq)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ BUG_ON(&rq->rt != rt_rq);
+
+ if (!rt_rq->rt_queued)
+ return;
+
+ BUG_ON(!rq->nr_running);
+
+ sub_nr_running(rq, rt_rq->rt_nr_running);
+ rt_rq->rt_queued = 0;
+}
+
+static void
+enqueue_top_rt_rq(struct rt_rq *rt_rq)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ BUG_ON(&rq->rt != rt_rq);
+
+ if (rt_rq->rt_queued)
+ return;
+ if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
+ return;
+
+ add_nr_running(rq, rt_rq->rt_nr_running);
+ rt_rq->rt_queued = 1;
+}
+
#if defined CONFIG_SMP
static void
@@ -1025,12 +1113,23 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
#endif /* CONFIG_RT_GROUP_SCHED */
static inline
+unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *group_rq = group_rt_rq(rt_se);
+
+ if (group_rq)
+ return group_rq->rt_nr_running;
+ else
+ return 1;
+}
+
+static inline
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
int prio = rt_se_prio(rt_se);
WARN_ON(!rt_prio(prio));
- rt_rq->rt_nr_running++;
+ rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
inc_rt_prio(rt_rq, prio);
inc_rt_migration(rt_se, rt_rq);
@@ -1042,7 +1141,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
WARN_ON(!rt_prio(rt_se_prio(rt_se)));
WARN_ON(!rt_rq->rt_nr_running);
- rt_rq->rt_nr_running--;
+ rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
dec_rt_prio(rt_rq, rt_se_prio(rt_se));
dec_rt_migration(rt_se, rt_rq);
@@ -1099,6 +1198,8 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
back = rt_se;
}
+ dequeue_top_rt_rq(rt_rq_of_se(back));
+
for (rt_se = back; rt_se; rt_se = rt_se->back) {
if (on_rt_rq(rt_se))
__dequeue_rt_entity(rt_se);
@@ -1107,13 +1208,18 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
{
+ struct rq *rq = rq_of_rt_se(rt_se);
+
dequeue_rt_stack(rt_se);
for_each_sched_rt_entity(rt_se)
__enqueue_rt_entity(rt_se, head);
+ enqueue_top_rt_rq(&rq->rt);
}
static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
+ struct rq *rq = rq_of_rt_se(rt_se);
+
dequeue_rt_stack(rt_se);
for_each_sched_rt_entity(rt_se) {
@@ -1122,6 +1228,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
if (rt_rq && rt_rq->rt_nr_running)
__enqueue_rt_entity(rt_se, false);
}
+ enqueue_top_rt_rq(&rq->rt);
}
/*
@@ -1139,8 +1246,6 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
-
- inc_nr_running(rq);
}
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -1151,8 +1256,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
dequeue_rt_entity(rt_se);
dequeue_pushable_task(rq, p);
-
- dec_nr_running(rq);
}
/*
@@ -1264,7 +1367,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
* to try and push current away:
*/
requeue_task_rt(rq, p, 1);
- resched_task(rq->curr);
+ resched_curr(rq);
}
#endif /* CONFIG_SMP */
@@ -1275,7 +1378,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
{
if (p->prio < rq->curr->prio) {
- resched_task(rq->curr);
+ resched_curr(rq);
return;
}
@@ -1318,15 +1421,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
struct sched_rt_entity *rt_se;
struct task_struct *p;
- struct rt_rq *rt_rq;
-
- rt_rq = &rq->rt;
-
- if (!rt_rq->rt_nr_running)
- return NULL;
-
- if (rt_rq_throttled(rt_rq))
- return NULL;
+ struct rt_rq *rt_rq = &rq->rt;
do {
rt_se = pick_next_rt_entity(rq, rt_rq);
@@ -1340,21 +1435,43 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
return p;
}
-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct task_struct *
+pick_next_task_rt(struct rq *rq, struct task_struct *prev)
{
- struct task_struct *p = _pick_next_task_rt(rq);
+ struct task_struct *p;
+ struct rt_rq *rt_rq = &rq->rt;
+
+ if (need_pull_rt_task(rq, prev)) {
+ pull_rt_task(rq);
+ /*
+ * pull_rt_task() can drop (and re-acquire) rq->lock; this
+ * means a dl or stop task can slip in, in which case we need
+ * to re-start task selection.
+ */
+ if (unlikely((rq->stop && rq->stop->on_rq) ||
+ rq->dl.dl_nr_running))
+ return RETRY_TASK;
+ }
+
+ /*
+ * We may dequeue prev's rt_rq in put_prev_task().
+ * So, we update time before rt_nr_running check.
+ */
+ if (prev->sched_class == &rt_sched_class)
+ update_curr_rt(rq);
+
+ if (!rt_rq->rt_queued)
+ return NULL;
+
+ put_prev_task(rq, prev);
+
+ p = _pick_next_task_rt(rq);
/* The running task is never eligible for pushing */
if (p)
dequeue_pushable_task(rq, p);
-#ifdef CONFIG_SMP
- /*
- * We detect this state here so that we can avoid taking the RQ
- * lock again later if there is no need to push
- */
- rq->post_schedule = has_pushable_tasks(rq);
-#endif
+ set_post_schedule(rq);
return p;
}
@@ -1577,7 +1694,7 @@ retry:
* just reschedule current.
*/
if (unlikely(next_task->prio < rq->curr->prio)) {
- resched_task(rq->curr);
+ resched_curr(rq);
return 0;
}
@@ -1624,7 +1741,7 @@ retry:
activate_task(lowest_rq, next_task, 0);
ret = 1;
- resched_task(lowest_rq->curr);
+ resched_curr(lowest_rq);
double_unlock_balance(rq, lowest_rq);
@@ -1724,13 +1841,6 @@ skip:
return ret;
}
-static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
-{
- /* Try to pull RT tasks here if we lower this rq's prio */
- if (rq->rt.highest_prio.curr > prev->prio)
- pull_rt_task(rq);
-}
-
static void post_schedule_rt(struct rq *rq)
{
push_rt_tasks(rq);
@@ -1830,10 +1940,10 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
return;
if (pull_rt_task(rq))
- resched_task(rq->curr);
+ resched_curr(rq);
}
-void init_sched_rt_class(void)
+void __init init_sched_rt_class(void)
{
unsigned int i;
@@ -1862,13 +1972,13 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
*/
if (p->on_rq && rq->curr != p) {
#ifdef CONFIG_SMP
- if (rq->rt.overloaded && push_rt_task(rq) &&
+ if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
/* Don't resched if we changed runqueues */
- rq != task_rq(p))
+ push_rt_task(rq) && rq != task_rq(p))
check_resched = 0;
#endif /* CONFIG_SMP */
if (check_resched && p->prio < rq->curr->prio)
- resched_task(rq->curr);
+ resched_curr(rq);
}
}
@@ -1897,11 +2007,11 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
* Only reschedule if p is still on the same runqueue.
*/
if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
- resched_task(p);
+ resched_curr(rq);
#else
/* For UP simply resched on drop of prio */
if (oldprio < p->prio)
- resched_task(p);
+ resched_curr(rq);
#endif /* CONFIG_SMP */
} else {
/*
@@ -1910,7 +2020,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
* then reschedule.
*/
if (p->prio < rq->curr->prio)
- resched_task(rq->curr);
+ resched_curr(rq);
}
}
@@ -2007,7 +2117,6 @@ const struct sched_class rt_sched_class = {
.set_cpus_allowed = set_cpus_allowed_rt,
.rq_online = rq_online_rt,
.rq_offline = rq_offline_rt,
- .pre_schedule = pre_schedule_rt,
.post_schedule = post_schedule_rt,
.task_woken = task_woken_rt,
.switched_from = switched_from_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f964add50f38..579712f4e9d5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -24,24 +24,6 @@ extern long calc_load_fold_active(struct rq *this_rq);
extern void update_cpu_load_active(struct rq *this_rq);
/*
- * Convert user-nice values [ -20 ... 0 ... 19 ]
- * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
- * and back.
- */
-#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
-#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
-#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
-
-/*
- * 'User priority' is the nice value converted to something we
- * can work with better when scaling various scheduler parameters,
- * it's a [ 0 ... 39 ] range.
- */
-#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
-#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
-#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
-
-/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
@@ -296,7 +278,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
-extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
+extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);
extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
extern void free_rt_sched_group(struct task_group *tg);
@@ -427,6 +409,8 @@ struct rt_rq {
int overloaded;
struct plist_head pushable_tasks;
#endif
+ int rt_queued;
+
int rt_throttled;
u64 rt_time;
u64 rt_runtime;
@@ -493,6 +477,9 @@ struct root_domain {
cpumask_var_t span;
cpumask_var_t online;
+ /* Indicate more than one runnable task for any CPU */
+ bool overload;
+
/*
* The bit corresponding to a CPU gets set here if such CPU has more
* than one runnable -deadline task (as it is below for RT tasks).
@@ -558,11 +545,9 @@ struct rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-#ifdef CONFIG_RT_GROUP_SCHED
- struct list_head leaf_rt_rq_list;
-#endif
+ struct sched_avg avg;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
/*
* This is part of a global counter where only the total sum
@@ -585,7 +570,7 @@ struct rq {
struct root_domain *rd;
struct sched_domain *sd;
- unsigned long cpu_power;
+ unsigned long cpu_capacity;
unsigned char idle_balance;
/* For active balancing */
@@ -651,8 +636,6 @@ struct rq {
#ifdef CONFIG_SMP
struct llist_head wake_list;
#endif
-
- struct sched_avg avg;
};
static inline int cpu_of(struct rq *rq)
@@ -690,6 +673,8 @@ extern int migrate_swap(struct task_struct *, struct task_struct *);
#ifdef CONFIG_SMP
+extern void sched_ttwu_pending(void);
+
#define rcu_dereference_check_sched_domain(p) \
rcu_dereference_check((p), \
lockdep_is_held(&sched_domains_mutex))
@@ -748,15 +733,15 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa);
DECLARE_PER_CPU(struct sched_domain *, sd_busy);
DECLARE_PER_CPU(struct sched_domain *, sd_asym);
-struct sched_group_power {
+struct sched_group_capacity {
atomic_t ref;
/*
- * CPU power of this group, SCHED_LOAD_SCALE being max power for a
- * single CPU.
+ * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
+ * for a single CPU.
*/
- unsigned int power, power_orig;
+ unsigned int capacity, capacity_orig;
unsigned long next_update;
- int imbalance; /* XXX unrelated to power but shared group state */
+ int imbalance; /* XXX unrelated to capacity but shared group state */
/*
* Number of busy cpus in this group.
*/
@@ -770,7 +755,7 @@ struct sched_group {
atomic_t ref;
unsigned int group_weight;
- struct sched_group_power *sgp;
+ struct sched_group_capacity *sgc;
/*
* The CPUs this group covers.
@@ -793,7 +778,7 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
*/
static inline struct cpumask *sched_group_mask(struct sched_group *sg)
{
- return to_cpumask(sg->sgp->cpumask);
+ return to_cpumask(sg->sgc->cpumask);
}
/**
@@ -807,6 +792,10 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
extern int group_balance_cpu(struct sched_group *sg);
+#else
+
+static inline void sched_ttwu_pending(void) { }
+
#endif /* CONFIG_SMP */
#include "stats.h"
@@ -898,20 +887,10 @@ enum {
#undef SCHED_FEAT
#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
-static __always_inline bool static_branch__true(struct static_key *key)
-{
- return static_key_true(key); /* Not out of line branch. */
-}
-
-static __always_inline bool static_branch__false(struct static_key *key)
-{
- return static_key_false(key); /* Out of line branch. */
-}
-
#define SCHED_FEAT(name, enabled) \
static __always_inline bool static_branch_##name(struct static_key *key) \
{ \
- return static_branch__##enabled(key); \
+ return static_key_##enabled(key); \
}
#include "features.h"
@@ -1112,6 +1091,8 @@ static const u32 prio_to_wmult[40] = {
#define DEQUEUE_SLEEP 1
+#define RETRY_TASK ((void *)-1UL)
+
struct sched_class {
const struct sched_class *next;
@@ -1122,14 +1103,22 @@ struct sched_class {
void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
- struct task_struct * (*pick_next_task) (struct rq *rq);
+ /*
+ * It is the responsibility of the pick_next_task() method that will
+ * return the next task to call put_prev_task() on the @prev task or
+ * something equivalent.
+ *
+ * May return RETRY_TASK when it finds a higher prio class has runnable
+ * tasks.
+ */
+ struct task_struct * (*pick_next_task) (struct rq *rq,
+ struct task_struct *prev);
void (*put_prev_task) (struct rq *rq, struct task_struct *p);
#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
- void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
void (*post_schedule) (struct rq *this_rq);
void (*task_waking) (struct task_struct *task);
void (*task_woken) (struct rq *this_rq, struct task_struct *task);
@@ -1159,6 +1148,11 @@ struct sched_class {
#endif
};
+static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
+{
+ prev->sched_class->put_prev_task(rq, prev);
+}
+
#define sched_class_highest (&stop_sched_class)
#define for_each_class(class) \
for (class = sched_class_highest; class; class = class->next)
@@ -1172,19 +1166,17 @@ extern const struct sched_class idle_sched_class;
#ifdef CONFIG_SMP
-extern void update_group_power(struct sched_domain *sd, int cpu);
+extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq);
-extern void idle_balance(int this_cpu, struct rq *this_rq);
extern void idle_enter_fair(struct rq *this_rq);
extern void idle_exit_fair(struct rq *this_rq);
-#else /* CONFIG_SMP */
+#else
-static inline void idle_balance(int cpu, struct rq *rq)
-{
-}
+static inline void idle_enter_fair(struct rq *rq) { }
+static inline void idle_exit_fair(struct rq *rq) { }
#endif
@@ -1197,7 +1189,7 @@ extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
extern void init_sched_dl_class(void);
-extern void resched_task(struct task_struct *p);
+extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);
extern struct rt_bandwidth def_rt_bandwidth;
@@ -1213,34 +1205,37 @@ extern void update_idle_cpu_load(struct rq *this_rq);
extern void init_task_runnable_average(struct task_struct *p);
-#ifdef CONFIG_PARAVIRT
-static inline u64 steal_ticks(u64 steal)
+static inline void add_nr_running(struct rq *rq, unsigned count)
{
- if (unlikely(steal > NSEC_PER_SEC))
- return div_u64(steal, TICK_NSEC);
+ unsigned prev_nr = rq->nr_running;
- return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
-}
-#endif
+ rq->nr_running = prev_nr + count;
-static inline void inc_nr_running(struct rq *rq)
-{
- rq->nr_running++;
+ if (prev_nr < 2 && rq->nr_running >= 2) {
+#ifdef CONFIG_SMP
+ if (!rq->rd->overload)
+ rq->rd->overload = true;
+#endif
#ifdef CONFIG_NO_HZ_FULL
- if (rq->nr_running == 2) {
if (tick_nohz_full_cpu(rq->cpu)) {
- /* Order rq->nr_running write against the IPI */
- smp_wmb();
- smp_send_reschedule(rq->cpu);
+ /*
+ * Tick is needed if more than one task runs on a CPU.
+ * Send the target an IPI to kick it out of nohz mode.
+ *
+ * We assume that IPI implies full memory barrier and the
+ * new value of rq->nr_running is visible on reception
+ * from the target.
+ */
+ tick_nohz_full_kick_cpu(rq->cpu);
}
- }
#endif
+ }
}
-static inline void dec_nr_running(struct rq *rq)
+static inline void sub_nr_running(struct rq *rq, unsigned count)
{
- rq->nr_running--;
+ rq->nr_running -= count;
}
static inline void rq_last_tick_reset(struct rq *rq)
@@ -1392,6 +1387,15 @@ static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
}
+static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2)
+{
+ if (l1 > l2)
+ swap(l1, l2);
+
+ spin_lock_irq(l1);
+ spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+}
+
static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
{
if (l1 > l2)
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index da98af347e8b..a476bea17fbc 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -142,4 +142,4 @@ static int __init proc_schedstat_init(void)
proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
return 0;
}
-module_init(proc_schedstat_init);
+subsys_initcall(proc_schedstat_init);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index fdb6bb0b3356..bfe0edadbfbb 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,28 +23,31 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
/* we're never preempted */
}
-static struct task_struct *pick_next_task_stop(struct rq *rq)
+static struct task_struct *
+pick_next_task_stop(struct rq *rq, struct task_struct *prev)
{
struct task_struct *stop = rq->stop;
- if (stop && stop->on_rq) {
- stop->se.exec_start = rq_clock_task(rq);
- return stop;
- }
+ if (!stop || !stop->on_rq)
+ return NULL;
- return NULL;
+ put_prev_task(rq, prev);
+
+ stop->se.exec_start = rq_clock_task(rq);
+
+ return stop;
}
static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
- inc_nr_running(rq);
+ add_nr_running(rq, 1);
}
static void
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
- dec_nr_running(rq);
+ sub_nr_running(rq, 1);
}
static void yield_task_stop(struct rq *rq)
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 7d50f794e248..15cab1a4f84e 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -319,14 +319,14 @@ EXPORT_SYMBOL(wake_bit_function);
*/
int __sched
__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
- int (*action)(void *), unsigned mode)
+ wait_bit_action_f *action, unsigned mode)
{
int ret = 0;
do {
prepare_to_wait(wq, &q->wait, mode);
if (test_bit(q->key.bit_nr, q->key.flags))
- ret = (*action)(q->key.flags);
+ ret = (*action)(&q->key);
} while (test_bit(q->key.bit_nr, q->key.flags) && !ret);
finish_wait(wq, &q->wait);
return ret;
@@ -334,7 +334,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
EXPORT_SYMBOL(__wait_on_bit);
int __sched out_of_line_wait_on_bit(void *word, int bit,
- int (*action)(void *), unsigned mode)
+ wait_bit_action_f *action, unsigned mode)
{
wait_queue_head_t *wq = bit_waitqueue(word, bit);
DEFINE_WAIT_BIT(wait, word, bit);
@@ -345,7 +345,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit);
int __sched
__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
- int (*action)(void *), unsigned mode)
+ wait_bit_action_f *action, unsigned mode)
{
do {
int ret;
@@ -353,7 +353,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
prepare_to_wait_exclusive(wq, &q->wait, mode);
if (!test_bit(q->key.bit_nr, q->key.flags))
continue;
- ret = action(q->key.flags);
+ ret = action(&q->key);
if (!ret)
continue;
abort_exclusive_wait(wq, &q->wait, mode, &q->key);
@@ -365,7 +365,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
EXPORT_SYMBOL(__wait_on_bit_lock);
int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
- int (*action)(void *), unsigned mode)
+ wait_bit_action_f *action, unsigned mode)
{
wait_queue_head_t *wq = bit_waitqueue(word, bit);
DEFINE_WAIT_BIT(wait, word, bit);
@@ -394,7 +394,7 @@ EXPORT_SYMBOL(__wake_up_bit);
*
* In order for this to function properly, as it uses waitqueue_active()
* internally, some kind of memory barrier must be done prior to calling
- * this. Typically, this will be smp_mb__after_clear_bit(), but in some
+ * this. Typically, this will be smp_mb__after_atomic(), but in some
* cases where bitflags are manipulated non-atomically under a lock, one
* may need to use a less regular barrier, such fs/inode.c's smp_mb(),
* because spin_unlock() does not guarantee a memory barrier.
@@ -502,3 +502,21 @@ void wake_up_atomic_t(atomic_t *p)
__wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
}
EXPORT_SYMBOL(wake_up_atomic_t);
+
+__sched int bit_wait(struct wait_bit_key *word)
+{
+ if (signal_pending_state(current->state, current))
+ return 1;
+ schedule();
+ return 0;
+}
+EXPORT_SYMBOL(bit_wait);
+
+__sched int bit_wait_io(struct wait_bit_key *word)
+{
+ if (signal_pending_state(current->state, current))
+ return 1;
+ io_schedule();
+ return 0;
+}
+EXPORT_SYMBOL(bit_wait_io);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index b7a10048a32c..44eb005c6695 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -18,15 +18,17 @@
#include <linux/compat.h>
#include <linux/sched.h>
#include <linux/seccomp.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
/* #define SECCOMP_DEBUG 1 */
#ifdef CONFIG_SECCOMP_FILTER
#include <asm/syscall.h>
#include <linux/filter.h>
+#include <linux/pid.h>
#include <linux/ptrace.h>
#include <linux/security.h>
-#include <linux/slab.h>
#include <linux/tracehook.h>
#include <linux/uaccess.h>
@@ -39,7 +41,7 @@
* is only needed for handling filters shared across tasks.
* @prev: points to a previously installed, or inherited, filter
* @len: the number of instructions in the program
- * @insns: the BPF program instructions to evaluate
+ * @insnsi: the BPF program instructions to evaluate
*
* seccomp_filter objects are organized in a tree linked via the @prev
* pointer. For any task, it appears to be a singly-linked list starting
@@ -54,61 +56,32 @@
struct seccomp_filter {
atomic_t usage;
struct seccomp_filter *prev;
- unsigned short len; /* Instruction count */
- struct sock_filter insns[];
+ struct bpf_prog *prog;
};
/* Limit any path through the tree to 256KB worth of instructions. */
#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
-/**
- * get_u32 - returns a u32 offset into data
- * @data: a unsigned 64 bit value
- * @index: 0 or 1 to return the first or second 32-bits
- *
- * This inline exists to hide the length of unsigned long. If a 32-bit
- * unsigned long is passed in, it will be extended and the top 32-bits will be
- * 0. If it is a 64-bit unsigned long, then whatever data is resident will be
- * properly returned.
- *
+/*
* Endianness is explicitly ignored and left for BPF program authors to manage
* as per the specific architecture.
*/
-static inline u32 get_u32(u64 data, int index)
+static void populate_seccomp_data(struct seccomp_data *sd)
{
- return ((u32 *)&data)[index];
-}
-
-/* Helper for bpf_load below. */
-#define BPF_DATA(_name) offsetof(struct seccomp_data, _name)
-/**
- * bpf_load: checks and returns a pointer to the requested offset
- * @off: offset into struct seccomp_data to load from
- *
- * Returns the requested 32-bits of data.
- * seccomp_check_filter() should assure that @off is 32-bit aligned
- * and not out of bounds. Failure to do so is a BUG.
- */
-u32 seccomp_bpf_load(int off)
-{
- struct pt_regs *regs = task_pt_regs(current);
- if (off == BPF_DATA(nr))
- return syscall_get_nr(current, regs);
- if (off == BPF_DATA(arch))
- return syscall_get_arch(current, regs);
- if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) {
- unsigned long value;
- int arg = (off - BPF_DATA(args[0])) / sizeof(u64);
- int index = !!(off % sizeof(u64));
- syscall_get_arguments(current, regs, arg, 1, &value);
- return get_u32(value, index);
- }
- if (off == BPF_DATA(instruction_pointer))
- return get_u32(KSTK_EIP(current), 0);
- if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
- return get_u32(KSTK_EIP(current), 1);
- /* seccomp_check_filter should make this impossible. */
- BUG();
+ struct task_struct *task = current;
+ struct pt_regs *regs = task_pt_regs(task);
+ unsigned long args[6];
+
+ sd->nr = syscall_get_nr(task, regs);
+ sd->arch = syscall_get_arch();
+ syscall_get_arguments(task, regs, 0, 6, args);
+ sd->args[0] = args[0];
+ sd->args[1] = args[1];
+ sd->args[2] = args[2];
+ sd->args[3] = args[3];
+ sd->args[4] = args[4];
+ sd->args[5] = args[5];
+ sd->instruction_pointer = KSTK_EIP(task);
}
/**
@@ -116,7 +89,7 @@ u32 seccomp_bpf_load(int off)
* @filter: filter to verify
* @flen: length of filter
*
- * Takes a previously checked filter (by sk_chk_filter) and
+ * Takes a previously checked filter (by bpf_check_classic) and
* redirects all filter code that loads struct sk_buff data
* and related data through seccomp_bpf_load. It also
* enforces length and alignment checking of those loads.
@@ -132,59 +105,59 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
u32 k = ftest->k;
switch (code) {
- case BPF_S_LD_W_ABS:
- ftest->code = BPF_S_ANC_SECCOMP_LD_W;
+ case BPF_LD | BPF_W | BPF_ABS:
+ ftest->code = BPF_LDX | BPF_W | BPF_ABS;
/* 32-bit aligned and not out of bounds. */
if (k >= sizeof(struct seccomp_data) || k & 3)
return -EINVAL;
continue;
- case BPF_S_LD_W_LEN:
- ftest->code = BPF_S_LD_IMM;
+ case BPF_LD | BPF_W | BPF_LEN:
+ ftest->code = BPF_LD | BPF_IMM;
ftest->k = sizeof(struct seccomp_data);
continue;
- case BPF_S_LDX_W_LEN:
- ftest->code = BPF_S_LDX_IMM;
+ case BPF_LDX | BPF_W | BPF_LEN:
+ ftest->code = BPF_LDX | BPF_IMM;
ftest->k = sizeof(struct seccomp_data);
continue;
/* Explicitly include allowed calls. */
- case BPF_S_RET_K:
- case BPF_S_RET_A:
- case BPF_S_ALU_ADD_K:
- case BPF_S_ALU_ADD_X:
- case BPF_S_ALU_SUB_K:
- case BPF_S_ALU_SUB_X:
- case BPF_S_ALU_MUL_K:
- case BPF_S_ALU_MUL_X:
- case BPF_S_ALU_DIV_X:
- case BPF_S_ALU_AND_K:
- case BPF_S_ALU_AND_X:
- case BPF_S_ALU_OR_K:
- case BPF_S_ALU_OR_X:
- case BPF_S_ALU_XOR_K:
- case BPF_S_ALU_XOR_X:
- case BPF_S_ALU_LSH_K:
- case BPF_S_ALU_LSH_X:
- case BPF_S_ALU_RSH_K:
- case BPF_S_ALU_RSH_X:
- case BPF_S_ALU_NEG:
- case BPF_S_LD_IMM:
- case BPF_S_LDX_IMM:
- case BPF_S_MISC_TAX:
- case BPF_S_MISC_TXA:
- case BPF_S_ALU_DIV_K:
- case BPF_S_LD_MEM:
- case BPF_S_LDX_MEM:
- case BPF_S_ST:
- case BPF_S_STX:
- case BPF_S_JMP_JA:
- case BPF_S_JMP_JEQ_K:
- case BPF_S_JMP_JEQ_X:
- case BPF_S_JMP_JGE_K:
- case BPF_S_JMP_JGE_X:
- case BPF_S_JMP_JGT_K:
- case BPF_S_JMP_JGT_X:
- case BPF_S_JMP_JSET_K:
- case BPF_S_JMP_JSET_X:
+ case BPF_RET | BPF_K:
+ case BPF_RET | BPF_A:
+ case BPF_ALU | BPF_ADD | BPF_K:
+ case BPF_ALU | BPF_ADD | BPF_X:
+ case BPF_ALU | BPF_SUB | BPF_K:
+ case BPF_ALU | BPF_SUB | BPF_X:
+ case BPF_ALU | BPF_MUL | BPF_K:
+ case BPF_ALU | BPF_MUL | BPF_X:
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU | BPF_DIV | BPF_X:
+ case BPF_ALU | BPF_AND | BPF_K:
+ case BPF_ALU | BPF_AND | BPF_X:
+ case BPF_ALU | BPF_OR | BPF_K:
+ case BPF_ALU | BPF_OR | BPF_X:
+ case BPF_ALU | BPF_XOR | BPF_K:
+ case BPF_ALU | BPF_XOR | BPF_X:
+ case BPF_ALU | BPF_LSH | BPF_K:
+ case BPF_ALU | BPF_LSH | BPF_X:
+ case BPF_ALU | BPF_RSH | BPF_K:
+ case BPF_ALU | BPF_RSH | BPF_X:
+ case BPF_ALU | BPF_NEG:
+ case BPF_LD | BPF_IMM:
+ case BPF_LDX | BPF_IMM:
+ case BPF_MISC | BPF_TAX:
+ case BPF_MISC | BPF_TXA:
+ case BPF_LD | BPF_MEM:
+ case BPF_LDX | BPF_MEM:
+ case BPF_ST:
+ case BPF_STX:
+ case BPF_JMP | BPF_JA:
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_X:
continue;
default:
return -EINVAL;
@@ -201,102 +174,264 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
*/
static u32 seccomp_run_filters(int syscall)
{
- struct seccomp_filter *f;
+ struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
+ struct seccomp_data sd;
u32 ret = SECCOMP_RET_ALLOW;
/* Ensure unexpected behavior doesn't result in failing open. */
- if (WARN_ON(current->seccomp.filter == NULL))
+ if (unlikely(WARN_ON(f == NULL)))
return SECCOMP_RET_KILL;
+ /* Make sure cross-thread synced filter points somewhere sane. */
+ smp_read_barrier_depends();
+
+ populate_seccomp_data(&sd);
+
/*
* All filters in the list are evaluated and the lowest BPF return
* value always takes priority (ignoring the DATA).
*/
- for (f = current->seccomp.filter; f; f = f->prev) {
- u32 cur_ret = sk_run_filter(NULL, f->insns);
+ for (; f; f = f->prev) {
+ u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)&sd);
+
if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
ret = cur_ret;
}
return ret;
}
+#endif /* CONFIG_SECCOMP_FILTER */
+
+static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
+{
+ assert_spin_locked(&current->sighand->siglock);
+
+ if (current->seccomp.mode && current->seccomp.mode != seccomp_mode)
+ return false;
+
+ return true;
+}
+
+static inline void seccomp_assign_mode(struct task_struct *task,
+ unsigned long seccomp_mode)
+{
+ assert_spin_locked(&task->sighand->siglock);
+
+ task->seccomp.mode = seccomp_mode;
+ /*
+ * Make sure TIF_SECCOMP cannot be set before the mode (and
+ * filter) is set.
+ */
+ smp_mb__before_atomic();
+ set_tsk_thread_flag(task, TIF_SECCOMP);
+}
+
+#ifdef CONFIG_SECCOMP_FILTER
+/* Returns 1 if the parent is an ancestor of the child. */
+static int is_ancestor(struct seccomp_filter *parent,
+ struct seccomp_filter *child)
+{
+ /* NULL is the root ancestor. */
+ if (parent == NULL)
+ return 1;
+ for (; child; child = child->prev)
+ if (child == parent)
+ return 1;
+ return 0;
+}
+
+/**
+ * seccomp_can_sync_threads: checks if all threads can be synchronized
+ *
+ * Expects sighand and cred_guard_mutex locks to be held.
+ *
+ * Returns 0 on success, -ve on error, or the pid of a thread which was
+ * either not in the correct seccomp mode or it did not have an ancestral
+ * seccomp filter.
+ */
+static inline pid_t seccomp_can_sync_threads(void)
+{
+ struct task_struct *thread, *caller;
+
+ BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
+ assert_spin_locked(&current->sighand->siglock);
+
+ /* Validate all threads being eligible for synchronization. */
+ caller = current;
+ for_each_thread(caller, thread) {
+ pid_t failed;
+
+ /* Skip current, since it is initiating the sync. */
+ if (thread == caller)
+ continue;
+
+ if (thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
+ (thread->seccomp.mode == SECCOMP_MODE_FILTER &&
+ is_ancestor(thread->seccomp.filter,
+ caller->seccomp.filter)))
+ continue;
+
+ /* Return the first thread that cannot be synchronized. */
+ failed = task_pid_vnr(thread);
+ /* If the pid cannot be resolved, then return -ESRCH */
+ if (unlikely(WARN_ON(failed == 0)))
+ failed = -ESRCH;
+ return failed;
+ }
+
+ return 0;
+}
+
+/**
+ * seccomp_sync_threads: sets all threads to use current's filter
+ *
+ * Expects sighand and cred_guard_mutex locks to be held, and for
+ * seccomp_can_sync_threads() to have returned success already
+ * without dropping the locks.
+ *
+ */
+static inline void seccomp_sync_threads(void)
+{
+ struct task_struct *thread, *caller;
+
+ BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
+ assert_spin_locked(&current->sighand->siglock);
+
+ /* Synchronize all threads. */
+ caller = current;
+ for_each_thread(caller, thread) {
+ /* Skip current, since it needs no changes. */
+ if (thread == caller)
+ continue;
+
+ /* Get a task reference for the new leaf node. */
+ get_seccomp_filter(caller);
+ /*
+ * Drop the task reference to the shared ancestor since
+ * current's path will hold a reference. (This also
+ * allows a put before the assignment.)
+ */
+ put_seccomp_filter(thread);
+ smp_store_release(&thread->seccomp.filter,
+ caller->seccomp.filter);
+ /*
+ * Opt the other thread into seccomp if needed.
+ * As threads are considered to be trust-realm
+ * equivalent (see ptrace_may_access), it is safe to
+ * allow one thread to transition the other.
+ */
+ if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) {
+ /*
+ * Don't let an unprivileged task work around
+ * the no_new_privs restriction by creating
+ * a thread that sets it up, enters seccomp,
+ * then dies.
+ */
+ if (task_no_new_privs(caller))
+ task_set_no_new_privs(thread);
+
+ seccomp_assign_mode(thread, SECCOMP_MODE_FILTER);
+ }
+ }
+}
/**
- * seccomp_attach_filter: Attaches a seccomp filter to current.
+ * seccomp_prepare_filter: Prepares a seccomp filter for use.
* @fprog: BPF program to install
*
- * Returns 0 on success or an errno on failure.
+ * Returns filter on success or an ERR_PTR on failure.
*/
-static long seccomp_attach_filter(struct sock_fprog *fprog)
+static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
struct seccomp_filter *filter;
- unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
- unsigned long total_insns = fprog->len;
+ unsigned long fp_size;
+ struct sock_filter *fp;
+ int new_len;
long ret;
if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
- return -EINVAL;
-
- for (filter = current->seccomp.filter; filter; filter = filter->prev)
- total_insns += filter->len + 4; /* include a 4 instr penalty */
- if (total_insns > MAX_INSNS_PER_PATH)
- return -ENOMEM;
+ return ERR_PTR(-EINVAL);
+ BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
+ fp_size = fprog->len * sizeof(struct sock_filter);
/*
- * Installing a seccomp filter requires that the task have
+ * Installing a seccomp filter requires that the task has
* CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
* This avoids scenarios where unprivileged tasks can affect the
* behavior of privileged children.
*/
- if (!current->no_new_privs &&
+ if (!task_no_new_privs(current) &&
security_capable_noaudit(current_cred(), current_user_ns(),
CAP_SYS_ADMIN) != 0)
- return -EACCES;
+ return ERR_PTR(-EACCES);
- /* Allocate a new seccomp_filter */
- filter = kzalloc(sizeof(struct seccomp_filter) + fp_size,
- GFP_KERNEL|__GFP_NOWARN);
- if (!filter)
- return -ENOMEM;
- atomic_set(&filter->usage, 1);
- filter->len = fprog->len;
+ fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
+ if (!fp)
+ return ERR_PTR(-ENOMEM);
/* Copy the instructions from fprog. */
ret = -EFAULT;
- if (copy_from_user(filter->insns, fprog->filter, fp_size))
- goto fail;
+ if (copy_from_user(fp, fprog->filter, fp_size))
+ goto free_prog;
/* Check and rewrite the fprog via the skb checker */
- ret = sk_chk_filter(filter->insns, filter->len);
+ ret = bpf_check_classic(fp, fprog->len);
if (ret)
- goto fail;
+ goto free_prog;
/* Check and rewrite the fprog for seccomp use */
- ret = seccomp_check_filter(filter->insns, filter->len);
+ ret = seccomp_check_filter(fp, fprog->len);
if (ret)
- goto fail;
+ goto free_prog;
- /*
- * If there is an existing filter, make it the prev and don't drop its
- * task reference.
- */
- filter->prev = current->seccomp.filter;
- current->seccomp.filter = filter;
- return 0;
-fail:
+ /* Convert 'sock_filter' insns to 'bpf_insn' insns */
+ ret = bpf_convert_filter(fp, fprog->len, NULL, &new_len);
+ if (ret)
+ goto free_prog;
+
+ /* Allocate a new seccomp_filter */
+ ret = -ENOMEM;
+ filter = kzalloc(sizeof(struct seccomp_filter),
+ GFP_KERNEL|__GFP_NOWARN);
+ if (!filter)
+ goto free_prog;
+
+ filter->prog = kzalloc(bpf_prog_size(new_len),
+ GFP_KERNEL|__GFP_NOWARN);
+ if (!filter->prog)
+ goto free_filter;
+
+ ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
+ if (ret)
+ goto free_filter_prog;
+ kfree(fp);
+
+ atomic_set(&filter->usage, 1);
+ filter->prog->len = new_len;
+
+ bpf_prog_select_runtime(filter->prog);
+
+ return filter;
+
+free_filter_prog:
+ kfree(filter->prog);
+free_filter:
kfree(filter);
- return ret;
+free_prog:
+ kfree(fp);
+ return ERR_PTR(ret);
}
/**
- * seccomp_attach_user_filter - attaches a user-supplied sock_fprog
+ * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog
* @user_filter: pointer to the user data containing a sock_fprog.
*
* Returns 0 on success and non-zero otherwise.
*/
-long seccomp_attach_user_filter(char __user *user_filter)
+static struct seccomp_filter *
+seccomp_prepare_user_filter(const char __user *user_filter)
{
struct sock_fprog fprog;
- long ret = -EFAULT;
+ struct seccomp_filter *filter = ERR_PTR(-EFAULT);
#ifdef CONFIG_COMPAT
if (is_compat_task()) {
@@ -309,9 +444,56 @@ long seccomp_attach_user_filter(char __user *user_filter)
#endif
if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
goto out;
- ret = seccomp_attach_filter(&fprog);
+ filter = seccomp_prepare_filter(&fprog);
out:
- return ret;
+ return filter;
+}
+
+/**
+ * seccomp_attach_filter: validate and attach filter
+ * @flags: flags to change filter behavior
+ * @filter: seccomp filter to add to the current process
+ *
+ * Caller must be holding current->sighand->siglock lock.
+ *
+ * Returns 0 on success, -ve on error.
+ */
+static long seccomp_attach_filter(unsigned int flags,
+ struct seccomp_filter *filter)
+{
+ unsigned long total_insns;
+ struct seccomp_filter *walker;
+
+ assert_spin_locked(&current->sighand->siglock);
+
+ /* Validate resulting filter length. */
+ total_insns = filter->prog->len;
+ for (walker = current->seccomp.filter; walker; walker = walker->prev)
+ total_insns += walker->prog->len + 4; /* 4 instr penalty */
+ if (total_insns > MAX_INSNS_PER_PATH)
+ return -ENOMEM;
+
+ /* If thread sync has been requested, check that it is possible. */
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
+ int ret;
+
+ ret = seccomp_can_sync_threads();
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * If there is an existing filter, make it the prev and don't drop its
+ * task reference.
+ */
+ filter->prev = current->seccomp.filter;
+ current->seccomp.filter = filter;
+
+ /* Now that the new filter is in place, synchronize to all threads. */
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC)
+ seccomp_sync_threads();
+
+ return 0;
}
/* get_seccomp_filter - increments the reference count of the filter on @tsk */
@@ -324,6 +506,14 @@ void get_seccomp_filter(struct task_struct *tsk)
atomic_inc(&orig->usage);
}
+static inline void seccomp_filter_free(struct seccomp_filter *filter)
+{
+ if (filter) {
+ bpf_prog_free(filter->prog);
+ kfree(filter);
+ }
+}
+
/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
void put_seccomp_filter(struct task_struct *tsk)
{
@@ -332,7 +522,7 @@ void put_seccomp_filter(struct task_struct *tsk)
while (orig && atomic_dec_and_test(&orig->usage)) {
struct seccomp_filter *freeme = orig;
orig = orig->prev;
- kfree(freeme);
+ seccomp_filter_free(freeme);
}
}
@@ -351,7 +541,7 @@ static void seccomp_send_sigsys(int syscall, int reason)
info.si_code = SYS_SECCOMP;
info.si_call_addr = (void __user *)KSTK_EIP(current);
info.si_errno = reason;
- info.si_arch = syscall_get_arch(current, task_pt_regs(current));
+ info.si_arch = syscall_get_arch();
info.si_syscall = syscall;
force_sig_info(SIGSYS, &info, current);
}
@@ -376,12 +566,17 @@ static int mode1_syscalls_32[] = {
int __secure_computing(int this_syscall)
{
- int mode = current->seccomp.mode;
int exit_sig = 0;
int *syscall;
u32 ret;
- switch (mode) {
+ /*
+ * Make sure that any changes to mode from another thread have
+ * been seen after TIF_SECCOMP was seen.
+ */
+ rmb();
+
+ switch (current->seccomp.mode) {
case SECCOMP_MODE_STRICT:
syscall = mode1_syscalls;
#ifdef CONFIG_COMPAT
@@ -467,47 +662,152 @@ long prctl_get_seccomp(void)
}
/**
- * prctl_set_seccomp: configures current->seccomp.mode
- * @seccomp_mode: requested mode to use
- * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
+ * seccomp_set_mode_strict: internal function for setting strict seccomp
+ *
+ * Once current->seccomp.mode is non-zero, it may not be changed.
+ *
+ * Returns 0 on success or -EINVAL on failure.
+ */
+static long seccomp_set_mode_strict(void)
+{
+ const unsigned long seccomp_mode = SECCOMP_MODE_STRICT;
+ long ret = -EINVAL;
+
+ spin_lock_irq(&current->sighand->siglock);
+
+ if (!seccomp_may_assign_mode(seccomp_mode))
+ goto out;
+
+#ifdef TIF_NOTSC
+ disable_TSC();
+#endif
+ seccomp_assign_mode(current, seccomp_mode);
+ ret = 0;
+
+out:
+ spin_unlock_irq(&current->sighand->siglock);
+
+ return ret;
+}
+
+#ifdef CONFIG_SECCOMP_FILTER
+/**
+ * seccomp_set_mode_filter: internal function for setting seccomp filter
+ * @flags: flags to change filter behavior
+ * @filter: struct sock_fprog containing filter
*
- * This function may be called repeatedly with a @seccomp_mode of
- * SECCOMP_MODE_FILTER to install additional filters. Every filter
- * successfully installed will be evaluated (in reverse order) for each system
- * call the task makes.
+ * This function may be called repeatedly to install additional filters.
+ * Every filter successfully installed will be evaluated (in reverse order)
+ * for each system call the task makes.
*
* Once current->seccomp.mode is non-zero, it may not be changed.
*
* Returns 0 on success or -EINVAL on failure.
*/
-long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
+static long seccomp_set_mode_filter(unsigned int flags,
+ const char __user *filter)
{
+ const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
+ struct seccomp_filter *prepared = NULL;
long ret = -EINVAL;
- if (current->seccomp.mode &&
- current->seccomp.mode != seccomp_mode)
+ /* Validate flags. */
+ if (flags & ~SECCOMP_FILTER_FLAG_MASK)
+ return -EINVAL;
+
+ /* Prepare the new filter before holding any locks. */
+ prepared = seccomp_prepare_user_filter(filter);
+ if (IS_ERR(prepared))
+ return PTR_ERR(prepared);
+
+ /*
+ * Make sure we cannot change seccomp or nnp state via TSYNC
+ * while another thread is in the middle of calling exec.
+ */
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
+ mutex_lock_killable(&current->signal->cred_guard_mutex))
+ goto out_free;
+
+ spin_lock_irq(&current->sighand->siglock);
+
+ if (!seccomp_may_assign_mode(seccomp_mode))
+ goto out;
+
+ ret = seccomp_attach_filter(flags, prepared);
+ if (ret)
goto out;
+ /* Do not free the successfully attached filter. */
+ prepared = NULL;
+
+ seccomp_assign_mode(current, seccomp_mode);
+out:
+ spin_unlock_irq(&current->sighand->siglock);
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC)
+ mutex_unlock(&current->signal->cred_guard_mutex);
+out_free:
+ seccomp_filter_free(prepared);
+ return ret;
+}
+#else
+static inline long seccomp_set_mode_filter(unsigned int flags,
+ const char __user *filter)
+{
+ return -EINVAL;
+}
+#endif
+
+/* Common entry point for both prctl and syscall. */
+static long do_seccomp(unsigned int op, unsigned int flags,
+ const char __user *uargs)
+{
+ switch (op) {
+ case SECCOMP_SET_MODE_STRICT:
+ if (flags != 0 || uargs != NULL)
+ return -EINVAL;
+ return seccomp_set_mode_strict();
+ case SECCOMP_SET_MODE_FILTER:
+ return seccomp_set_mode_filter(flags, uargs);
+ default:
+ return -EINVAL;
+ }
+}
+
+SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
+ const char __user *, uargs)
+{
+ return do_seccomp(op, flags, uargs);
+}
+
+/**
+ * prctl_set_seccomp: configures current->seccomp.mode
+ * @seccomp_mode: requested mode to use
+ * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
+ *
+ * Returns 0 on success or -EINVAL on failure.
+ */
+long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
+{
+ unsigned int op;
+ char __user *uargs;
switch (seccomp_mode) {
case SECCOMP_MODE_STRICT:
- ret = 0;
-#ifdef TIF_NOTSC
- disable_TSC();
-#endif
+ op = SECCOMP_SET_MODE_STRICT;
+ /*
+ * Setting strict mode through prctl always ignored filter,
+ * so make sure it is always NULL here to pass the internal
+ * check in do_seccomp().
+ */
+ uargs = NULL;
break;
-#ifdef CONFIG_SECCOMP_FILTER
case SECCOMP_MODE_FILTER:
- ret = seccomp_attach_user_filter(filter);
- if (ret)
- goto out;
+ op = SECCOMP_SET_MODE_FILTER;
+ uargs = filter;
break;
-#endif
default:
- goto out;
+ return -EINVAL;
}
- current->seccomp.mode = seccomp_mode;
- set_thread_flag(TIF_SECCOMP);
-out:
- return ret;
+ /* prctl interface doesn't have flags, so they are always zero. */
+ return do_seccomp(op, 0, uargs);
}
diff --git a/kernel/signal.c b/kernel/signal.c
index 52f881db1ca0..8f0876f9f6dd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -33,6 +33,8 @@
#include <linux/uprobes.h>
#include <linux/compat.h>
#include <linux/cn_proc.h>
+#include <linux/compiler.h>
+
#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>
@@ -275,6 +277,7 @@ void task_clear_jobctl_trapping(struct task_struct *task)
{
if (unlikely(task->jobctl & JOBCTL_TRAPPING)) {
task->jobctl &= ~JOBCTL_TRAPPING;
+ smp_mb(); /* advised by wake_up_bit() */
wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT);
}
}
@@ -703,11 +706,8 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state)
* Returns 1 if any signals were found.
*
* All callers must be holding the siglock.
- *
- * This version takes a sigset mask and looks at all signals,
- * not just those in the first mask word.
*/
-static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)
+static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s)
{
struct sigqueue *q, *n;
sigset_t m;
@@ -725,29 +725,6 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)
}
return 1;
}
-/*
- * Remove signals in mask from the pending set and queue.
- * Returns 1 if any signals were found.
- *
- * All callers must be holding the siglock.
- */
-static int rm_from_queue(unsigned long mask, struct sigpending *s)
-{
- struct sigqueue *q, *n;
-
- if (!sigtestsetmask(&s->signal, mask))
- return 0;
-
- sigdelsetmask(&s->signal, mask);
- list_for_each_entry_safe(q, n, &s->list, list) {
- if (q->info.si_signo < SIGRTMIN &&
- (mask & sigmask(q->info.si_signo))) {
- list_del_init(&q->list);
- __sigqueue_free(q);
- }
- }
- return 1;
-}
static inline int is_si_special(const struct siginfo *info)
{
@@ -859,6 +836,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
{
struct signal_struct *signal = p->signal;
struct task_struct *t;
+ sigset_t flush;
if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) {
if (signal->flags & SIGNAL_GROUP_COREDUMP)
@@ -870,26 +848,25 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
/*
* This is a stop signal. Remove SIGCONT from all queues.
*/
- rm_from_queue(sigmask(SIGCONT), &signal->shared_pending);
- t = p;
- do {
- rm_from_queue(sigmask(SIGCONT), &t->pending);
- } while_each_thread(p, t);
+ siginitset(&flush, sigmask(SIGCONT));
+ flush_sigqueue_mask(&flush, &signal->shared_pending);
+ for_each_thread(p, t)
+ flush_sigqueue_mask(&flush, &t->pending);
} else if (sig == SIGCONT) {
unsigned int why;
/*
* Remove all stop signals from all queues, wake all threads.
*/
- rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
- t = p;
- do {
+ siginitset(&flush, SIG_KERNEL_STOP_MASK);
+ flush_sigqueue_mask(&flush, &signal->shared_pending);
+ for_each_thread(p, t) {
+ flush_sigqueue_mask(&flush, &t->pending);
task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
- rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
if (likely(!(t->ptrace & PT_SEIZED)))
wake_up_state(t, __TASK_STOPPED);
else
ptrace_trap_notify(t);
- } while_each_thread(p, t);
+ }
/*
* Notify the parent with CLD_CONTINUED if we were stopped.
@@ -1286,6 +1263,10 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
struct sighand_struct *sighand;
for (;;) {
+ /*
+ * Disable interrupts early to avoid deadlocks.
+ * See rcu_read_unlock() comment header for details.
+ */
local_irq_save(*flags);
rcu_read_lock();
sighand = rcu_dereference(tsk->sighand);
@@ -2189,8 +2170,7 @@ static int ptrace_signal(int signr, siginfo_t *info)
return signr;
}
-int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
- struct pt_regs *regs, void *cookie)
+int get_signal(struct ksignal *ksig)
{
struct sighand_struct *sighand = current->sighand;
struct signal_struct *signal = current->signal;
@@ -2260,13 +2240,13 @@ relock:
goto relock;
}
- signr = dequeue_signal(current, &current->blocked, info);
+ signr = dequeue_signal(current, &current->blocked, &ksig->info);
if (!signr)
break; /* will return 0 */
if (unlikely(current->ptrace) && signr != SIGKILL) {
- signr = ptrace_signal(signr, info);
+ signr = ptrace_signal(signr, &ksig->info);
if (!signr)
continue;
}
@@ -2274,13 +2254,13 @@ relock:
ka = &sighand->action[signr-1];
/* Trace actually delivered signals. */
- trace_signal_deliver(signr, info, ka);
+ trace_signal_deliver(signr, &ksig->info, ka);
if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
continue;
if (ka->sa.sa_handler != SIG_DFL) {
/* Run the handler. */
- *return_ka = *ka;
+ ksig->ka = *ka;
if (ka->sa.sa_flags & SA_ONESHOT)
ka->sa.sa_handler = SIG_DFL;
@@ -2330,7 +2310,7 @@ relock:
spin_lock_irq(&sighand->siglock);
}
- if (likely(do_signal_stop(info->si_signo))) {
+ if (likely(do_signal_stop(ksig->info.si_signo))) {
/* It released the siglock. */
goto relock;
}
@@ -2351,7 +2331,7 @@ relock:
if (sig_kernel_coredump(signr)) {
if (print_fatal_signals)
- print_fatal_signal(info->si_signo);
+ print_fatal_signal(ksig->info.si_signo);
proc_coredump_connector(current);
/*
* If it was able to dump core, this kills all
@@ -2361,34 +2341,32 @@ relock:
* first and our do_group_exit call below will use
* that value and ignore the one we pass it.
*/
- do_coredump(info);
+ do_coredump(&ksig->info);
}
/*
* Death signals, no core dump.
*/
- do_group_exit(info->si_signo);
+ do_group_exit(ksig->info.si_signo);
/* NOTREACHED */
}
spin_unlock_irq(&sighand->siglock);
- return signr;
+
+ ksig->sig = signr;
+ return ksig->sig > 0;
}
/**
* signal_delivered -
- * @sig: number of signal being delivered
- * @info: siginfo_t of signal being delivered
- * @ka: sigaction setting that chose the handler
- * @regs: user register state
+ * @ksig: kernel signal struct
* @stepping: nonzero if debugger single-step or block-step in use
*
- * This function should be called when a signal has succesfully been
- * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask
+ * This function should be called when a signal has successfully been
+ * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask
* is always blocked, and the signal itself is blocked unless %SA_NODEFER
- * is set in @ka->sa.sa_flags. Tracing is notified.
+ * is set in @ksig->ka.sa.sa_flags. Tracing is notified.
*/
-void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,
- struct pt_regs *regs, int stepping)
+static void signal_delivered(struct ksignal *ksig, int stepping)
{
sigset_t blocked;
@@ -2398,11 +2376,11 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,
simply clear the restore sigmask flag. */
clear_restore_sigmask();
- sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
- if (!(ka->sa.sa_flags & SA_NODEFER))
- sigaddset(&blocked, sig);
+ sigorsets(&blocked, &current->blocked, &ksig->ka.sa.sa_mask);
+ if (!(ksig->ka.sa.sa_flags & SA_NODEFER))
+ sigaddset(&blocked, ksig->sig);
set_current_blocked(&blocked);
- tracehook_signal_handler(sig, info, ka, regs, stepping);
+ tracehook_signal_handler(stepping);
}
void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
@@ -2410,8 +2388,7 @@ void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
if (failed)
force_sigsegv(ksig->sig, current);
else
- signal_delivered(ksig->sig, &ksig->info, &ksig->ka,
- signal_pt_regs(), stepping);
+ signal_delivered(ksig, stepping);
}
/*
@@ -2852,7 +2829,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
spin_lock_irq(&tsk->sighand->siglock);
__set_task_blocked(tsk, &tsk->real_blocked);
- siginitset(&tsk->real_blocked, 0);
+ sigemptyset(&tsk->real_blocked);
sig = dequeue_signal(tsk, &mask, info);
}
spin_unlock_irq(&tsk->sighand->siglock);
@@ -3089,18 +3066,39 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
}
#endif
+/*
+ * For kthreads only, must not be used if cloned with CLONE_SIGHAND
+ */
+void kernel_sigaction(int sig, __sighandler_t action)
+{
+ spin_lock_irq(&current->sighand->siglock);
+ current->sighand->action[sig - 1].sa.sa_handler = action;
+ if (action == SIG_IGN) {
+ sigset_t mask;
+
+ sigemptyset(&mask);
+ sigaddset(&mask, sig);
+
+ flush_sigqueue_mask(&mask, &current->signal->shared_pending);
+ flush_sigqueue_mask(&mask, &current->pending);
+ recalc_sigpending();
+ }
+ spin_unlock_irq(&current->sighand->siglock);
+}
+EXPORT_SYMBOL(kernel_sigaction);
+
int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
{
- struct task_struct *t = current;
+ struct task_struct *p = current, *t;
struct k_sigaction *k;
sigset_t mask;
if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
return -EINVAL;
- k = &t->sighand->action[sig-1];
+ k = &p->sighand->action[sig-1];
- spin_lock_irq(&current->sighand->siglock);
+ spin_lock_irq(&p->sighand->siglock);
if (oact)
*oact = *k;
@@ -3119,21 +3117,20 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
* (for example, SIGCHLD), shall cause the pending signal to
* be discarded, whether or not it is blocked"
*/
- if (sig_handler_ignored(sig_handler(t, sig), sig)) {
+ if (sig_handler_ignored(sig_handler(p, sig), sig)) {
sigemptyset(&mask);
sigaddset(&mask, sig);
- rm_from_queue_full(&mask, &t->signal->shared_pending);
- do {
- rm_from_queue_full(&mask, &t->pending);
- } while_each_thread(current, t);
+ flush_sigqueue_mask(&mask, &p->signal->shared_pending);
+ for_each_thread(p, t)
+ flush_sigqueue_mask(&mask, &t->pending);
}
}
- spin_unlock_irq(&current->sighand->siglock);
+ spin_unlock_irq(&p->sighand->siglock);
return 0;
}
-static int
+static int
do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp)
{
stack_t oss;
@@ -3494,7 +3491,7 @@ COMPAT_SYSCALL_DEFINE3(sigaction, int, sig,
}
#endif
-#ifdef __ARCH_WANT_SYS_SGETMASK
+#ifdef CONFIG_SGETMASK_SYSCALL
/*
* For backwards compatibility. Functionality superseded by sigprocmask.
@@ -3515,7 +3512,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask)
return old;
}
-#endif /* __ARCH_WANT_SGETMASK */
+#endif /* CONFIG_SGETMASK_SYSCALL */
#ifdef __ARCH_WANT_SYS_SIGNAL
/*
@@ -3618,7 +3615,7 @@ SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask)
}
#endif
-__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
+__weak const char *arch_vma_name(struct vm_area_struct *vma)
{
return NULL;
}
diff --git a/kernel/smp.c b/kernel/smp.c
index ffee35bef179..aff8aa14f547 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -3,6 +3,7 @@
*
* (C) Jens Axboe <jens.axboe@oracle.com> 2008
*/
+#include <linux/irq_work.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>
#include <linux/kernel.h>
@@ -29,6 +30,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
+static void flush_smp_call_function_queue(bool warn_cpu_offline);
+
static int
hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
@@ -51,12 +54,27 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
+ /* Fall-through to the CPU_DEAD[_FROZEN] case. */
case CPU_DEAD:
case CPU_DEAD_FROZEN:
free_cpumask_var(cfd->cpumask);
free_percpu(cfd->csd);
break;
+
+ case CPU_DYING:
+ case CPU_DYING_FROZEN:
+ /*
+ * The IPIs for the smp-call-function callbacks queued by other
+ * CPUs might arrive late, either due to hardware latencies or
+ * because this CPU disabled interrupts (inside stop-machine)
+ * before the IPIs were sent. So flush out any pending callbacks
+ * explicitly (without waiting for the IPIs to arrive), to
+ * ensure that the outgoing CPU doesn't go offline with work
+ * still pending.
+ */
+ flush_smp_call_function_queue(false);
+ break;
#endif
};
@@ -117,13 +135,43 @@ static void csd_unlock(struct call_single_data *csd)
csd->flags &= ~CSD_FLAG_LOCK;
}
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
+
/*
* Insert a previously allocated call_single_data element
* for execution on the given CPU. data must already have
* ->func, ->info, and ->flags set.
*/
-static void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
+static int generic_exec_single(int cpu, struct call_single_data *csd,
+ smp_call_func_t func, void *info, int wait)
{
+ struct call_single_data csd_stack = { .flags = 0 };
+ unsigned long flags;
+
+
+ if (cpu == smp_processor_id()) {
+ local_irq_save(flags);
+ func(info);
+ local_irq_restore(flags);
+ return 0;
+ }
+
+
+ if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu))
+ return -ENXIO;
+
+
+ if (!csd) {
+ csd = &csd_stack;
+ if (!wait)
+ csd = &__get_cpu_var(csd_data);
+ }
+
+ csd_lock(csd);
+
+ csd->func = func;
+ csd->info = info;
+
if (wait)
csd->flags |= CSD_FLAG_WAIT;
@@ -143,38 +191,76 @@ static void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
if (wait)
csd_lock_wait(csd);
+
+ return 0;
}
-/*
- * Invoked by arch to handle an IPI for call function single. Must be
- * called from the arch with interrupts disabled.
+/**
+ * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks
+ *
+ * Invoked by arch to handle an IPI for call function single.
+ * Must be called with interrupts disabled.
*/
void generic_smp_call_function_single_interrupt(void)
{
- struct llist_node *entry, *next;
+ flush_smp_call_function_queue(true);
+}
- /*
- * Shouldn't receive this interrupt on a cpu that is not yet online.
- */
- WARN_ON_ONCE(!cpu_online(smp_processor_id()));
+/**
+ * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
+ *
+ * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
+ * offline CPU. Skip this check if set to 'false'.
+ *
+ * Flush any pending smp-call-function callbacks queued on this CPU. This is
+ * invoked by the generic IPI handler, as well as by a CPU about to go offline,
+ * to ensure that all pending IPI callbacks are run before it goes completely
+ * offline.
+ *
+ * Loop through the call_single_queue and run all the queued callbacks.
+ * Must be called with interrupts disabled.
+ */
+static void flush_smp_call_function_queue(bool warn_cpu_offline)
+{
+ struct llist_head *head;
+ struct llist_node *entry;
+ struct call_single_data *csd, *csd_next;
+ static bool warned;
- entry = llist_del_all(&__get_cpu_var(call_single_queue));
+ WARN_ON(!irqs_disabled());
+
+ head = &__get_cpu_var(call_single_queue);
+ entry = llist_del_all(head);
entry = llist_reverse_order(entry);
- while (entry) {
- struct call_single_data *csd;
+ /* There shouldn't be any pending callbacks on an offline CPU. */
+ if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
+ !warned && !llist_empty(head))) {
+ warned = true;
+ WARN(1, "IPI on offline CPU %d\n", smp_processor_id());
- next = entry->next;
+ /*
+ * We don't have to use the _safe() variant here
+ * because we are not invoking the IPI handlers yet.
+ */
+ llist_for_each_entry(csd, entry, llist)
+ pr_warn("IPI callback %pS sent to offline CPU\n",
+ csd->func);
+ }
- csd = llist_entry(entry, struct call_single_data, llist);
+ llist_for_each_entry_safe(csd, csd_next, entry, llist) {
csd->func(csd->info);
csd_unlock(csd);
-
- entry = next;
}
-}
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
+ /*
+ * Handle irq works queued remotely by irq_work_queue_on().
+ * Smp functions above are typically synchronous so they
+ * better run first since some other CPUs may be busy waiting
+ * for them.
+ */
+ irq_work_run();
+}
/*
* smp_call_function_single - Run a function on a specific CPU
@@ -187,12 +273,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
int wait)
{
- struct call_single_data d = {
- .flags = 0,
- };
- unsigned long flags;
int this_cpu;
- int err = 0;
+ int err;
/*
* prevent preemption and reschedule on another processor,
@@ -209,32 +291,41 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
&& !oops_in_progress);
- if (cpu == this_cpu) {
- local_irq_save(flags);
- func(info);
- local_irq_restore(flags);
- } else {
- if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
- struct call_single_data *csd = &d;
+ err = generic_exec_single(cpu, NULL, func, info, wait);
- if (!wait)
- csd = &__get_cpu_var(csd_data);
+ put_cpu();
- csd_lock(csd);
+ return err;
+}
+EXPORT_SYMBOL(smp_call_function_single);
- csd->func = func;
- csd->info = info;
- generic_exec_single(cpu, csd, wait);
- } else {
- err = -ENXIO; /* CPU not online */
- }
- }
+/**
+ * smp_call_function_single_async(): Run an asynchronous function on a
+ * specific CPU.
+ * @cpu: The CPU to run on.
+ * @csd: Pre-allocated and setup data structure
+ *
+ * Like smp_call_function_single(), but the call is asynchonous and
+ * can thus be done from contexts with disabled interrupts.
+ *
+ * The caller passes his own pre-allocated data structure
+ * (ie: embedded in an object) and is responsible for synchronizing it
+ * such that the IPIs performed on the @csd are strictly serialized.
+ *
+ * NOTE: Be careful, there is unfortunately no current debugging facility to
+ * validate the correctness of this serialization.
+ */
+int smp_call_function_single_async(int cpu, struct call_single_data *csd)
+{
+ int err = 0;
- put_cpu();
+ preempt_disable();
+ err = generic_exec_single(cpu, csd, csd->func, csd->info, 0);
+ preempt_enable();
return err;
}
-EXPORT_SYMBOL(smp_call_function_single);
+EXPORT_SYMBOL_GPL(smp_call_function_single_async);
/*
* smp_call_function_any - Run a function on any of the given cpus
@@ -280,44 +371,6 @@ call:
EXPORT_SYMBOL_GPL(smp_call_function_any);
/**
- * __smp_call_function_single(): Run a function on a specific CPU
- * @cpu: The CPU to run on.
- * @data: Pre-allocated and setup data structure
- * @wait: If true, wait until function has completed on specified CPU.
- *
- * Like smp_call_function_single(), but allow caller to pass in a
- * pre-allocated data structure. Useful for embedding @data inside
- * other structures, for instance.
- */
-void __smp_call_function_single(int cpu, struct call_single_data *csd,
- int wait)
-{
- unsigned int this_cpu;
- unsigned long flags;
-
- this_cpu = get_cpu();
- /*
- * Can deadlock when called with interrupts disabled.
- * We allow cpu's that are not yet online though, as no one else can
- * send smp call function interrupt to this cpu and as such deadlocks
- * can't happen.
- */
- WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled()
- && !oops_in_progress);
-
- if (cpu == this_cpu) {
- local_irq_save(flags);
- csd->func(csd->info);
- local_irq_restore(flags);
- } else {
- csd_lock(csd);
- generic_exec_single(cpu, csd, wait);
- }
- put_cpu();
-}
-EXPORT_SYMBOL_GPL(__smp_call_function_single);
-
-/**
* smp_call_function_many(): Run a function on a set of other CPUs.
* @mask: The set of cpus to run on (only runs on online subset).
* @func: The function to run. This must be fast and non-blocking.
@@ -617,7 +670,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
if (cond_func(cpu, info)) {
ret = smp_call_function_single(cpu, func,
info, wait);
- WARN_ON_ONCE(!ret);
+ WARN_ON_ONCE(ret);
}
preempt_enable();
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 490fcbb1dc5b..5918d227730f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -25,6 +25,7 @@
#include <linux/smp.h>
#include <linux/smpboot.h>
#include <linux/tick.h>
+#include <linux/irq.h>
#define CREATE_TRACE_POINTS
#include <trace/events/irq.h>
@@ -222,7 +223,7 @@ static inline bool lockdep_softirq_start(void) { return false; }
static inline void lockdep_softirq_end(bool in_hardirq) { }
#endif
-asmlinkage void __do_softirq(void)
+asmlinkage __visible void __do_softirq(void)
{
unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
unsigned long old_flags = current->flags;
@@ -231,7 +232,6 @@ asmlinkage void __do_softirq(void)
bool in_hardirq;
__u32 pending;
int softirq_bit;
- int cpu;
/*
* Mask out PF_MEMALLOC s current task context is borrowed for the
@@ -246,7 +246,6 @@ asmlinkage void __do_softirq(void)
__local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
in_hardirq = lockdep_softirq_start();
- cpu = smp_processor_id();
restart:
/* Reset the pending bitmask before enabling irqs */
set_softirq_pending(0);
@@ -275,11 +274,11 @@ restart:
prev_count, preempt_count());
preempt_count_set(prev_count);
}
- rcu_bh_qs(cpu);
h++;
pending >>= softirq_bit;
}
+ rcu_bh_qs(smp_processor_id());
local_irq_disable();
pending = local_softirq_pending();
@@ -298,7 +297,7 @@ restart:
tsk_restore_flags(current, old_flags, PF_MEMALLOC);
}
-asmlinkage void do_softirq(void)
+asmlinkage __visible void do_softirq(void)
{
__u32 pending;
unsigned long flags;
@@ -778,3 +777,8 @@ int __init __weak arch_early_irq_init(void)
{
return 0;
}
+
+unsigned int __weak arch_dynirq_lower_bound(unsigned int from)
+{
+ return from;
+}
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 01fbae5b97b7..695f0c6cd169 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -307,6 +307,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
* @cpu: cpu to stop
* @fn: function to execute
* @arg: argument to @fn
+ * @work_buf: pointer to cpu_stop_work structure
*
* Similar to stop_one_cpu() but doesn't wait for completion. The
* caller is responsible for ensuring @work_buf is currently unused
diff --git a/kernel/sys.c b/kernel/sys.c
index c0a58be780a4..ce8129192a26 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -174,10 +174,10 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
/* normalize: avoid signed division (rounding problems) */
error = -ESRCH;
- if (niceval < -20)
- niceval = -20;
- if (niceval > 19)
- niceval = 19;
+ if (niceval < MIN_NICE)
+ niceval = MIN_NICE;
+ if (niceval > MAX_NICE)
+ niceval = MAX_NICE;
rcu_read_lock();
read_lock(&tasklist_lock);
@@ -250,7 +250,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
else
p = current;
if (p) {
- niceval = 20 - task_nice(p);
+ niceval = nice_to_rlimit(task_nice(p));
if (niceval > retval)
retval = niceval;
}
@@ -261,7 +261,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
else
pgrp = task_pgrp(current);
do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
- niceval = 20 - task_nice(p);
+ niceval = nice_to_rlimit(task_nice(p));
if (niceval > retval)
retval = niceval;
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
@@ -277,7 +277,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
do_each_thread(g, p) {
if (uid_eq(task_uid(p), uid)) {
- niceval = 20 - task_nice(p);
+ niceval = nice_to_rlimit(task_nice(p));
if (niceval > retval)
retval = niceval;
}
@@ -1990,12 +1990,27 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
if (arg2 != 1 || arg3 || arg4 || arg5)
return -EINVAL;
- current->no_new_privs = 1;
+ task_set_no_new_privs(current);
break;
case PR_GET_NO_NEW_PRIVS:
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
- return current->no_new_privs ? 1 : 0;
+ return task_no_new_privs(current) ? 1 : 0;
+ case PR_GET_THP_DISABLE:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = !!(me->mm->def_flags & VM_NOHUGEPAGE);
+ break;
+ case PR_SET_THP_DISABLE:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ down_write(&me->mm->mmap_sem);
+ if (arg2)
+ me->mm->def_flags |= VM_NOHUGEPAGE;
+ else
+ me->mm->def_flags &= ~VM_NOHUGEPAGE;
+ up_write(&me->mm->mmap_sem);
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7078052284fd..d4709d481053 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -25,6 +25,7 @@ cond_syscall(sys_swapon);
cond_syscall(sys_swapoff);
cond_syscall(sys_kexec_load);
cond_syscall(compat_sys_kexec_load);
+cond_syscall(sys_kexec_file_load);
cond_syscall(sys_init_module);
cond_syscall(sys_finit_module);
cond_syscall(sys_delete_module);
@@ -135,6 +136,8 @@ cond_syscall(sys_setresgid16);
cond_syscall(sys_setresuid16);
cond_syscall(sys_setreuid16);
cond_syscall(sys_setuid16);
+cond_syscall(sys_sgetmask);
+cond_syscall(sys_ssetmask);
cond_syscall(sys_vm86old);
cond_syscall(sys_vm86);
cond_syscall(sys_ipc);
@@ -146,11 +149,16 @@ cond_syscall(sys_io_destroy);
cond_syscall(sys_io_submit);
cond_syscall(sys_io_cancel);
cond_syscall(sys_io_getevents);
+cond_syscall(sys_sysfs);
cond_syscall(sys_syslog);
cond_syscall(sys_process_vm_readv);
cond_syscall(sys_process_vm_writev);
cond_syscall(compat_sys_process_vm_readv);
cond_syscall(compat_sys_process_vm_writev);
+cond_syscall(sys_uselib);
+cond_syscall(sys_fadvise64);
+cond_syscall(sys_fadvise64_64);
+cond_syscall(sys_madvise);
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
@@ -193,6 +201,7 @@ cond_syscall(compat_sys_timerfd_settime);
cond_syscall(compat_sys_timerfd_gettime);
cond_syscall(sys_eventfd);
cond_syscall(sys_eventfd2);
+cond_syscall(sys_memfd_create);
/* performance counters: */
cond_syscall(sys_perf_event_open);
@@ -209,3 +218,6 @@ cond_syscall(compat_sys_open_by_handle_at);
/* compare kernel pointers */
cond_syscall(sys_kcmp);
+
+/* operate on Secure Computing state */
+cond_syscall(sys_seccomp);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 49e13e1f8fe6..75875a741b5e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -112,9 +112,6 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max;
#ifndef CONFIG_MMU
extern int sysctl_nr_trim_pages;
#endif
-#ifdef CONFIG_BLOCK
-extern int blk_iopoll_enabled;
-#endif
/* Constants used for minimum and maximum */
#ifdef CONFIG_LOCKUP_DETECTOR
@@ -126,7 +123,7 @@ static int __maybe_unused neg_one = -1;
static int zero;
static int __maybe_unused one = 1;
static int __maybe_unused two = 2;
-static int __maybe_unused three = 3;
+static int __maybe_unused four = 4;
static unsigned long one_ul = 1;
static int one_hundred = 100;
#ifdef CONFIG_PRINTK
@@ -139,21 +136,21 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535;
static int minolduid;
-static int min_percpu_pagelist_fract = 8;
static int ngroups_max = NGROUPS_MAX;
static const int cap_last_cap = CAP_LAST_CAP;
+/*this is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs */
+#ifdef CONFIG_DETECT_HUNG_TASK
+static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
+#endif
+
#ifdef CONFIG_INOTIFY_USER
#include <linux/inotify.h>
#endif
#ifdef CONFIG_SPARC
#endif
-#ifdef CONFIG_SPARC64
-extern int sysctl_tsb_ratio;
-#endif
-
#ifdef __hppa__
extern int pwrsw_enabled;
#endif
@@ -171,6 +168,13 @@ extern int no_unaligned_warning;
#endif
#ifdef CONFIG_PROC_SYSCTL
+
+#define SYSCTL_WRITES_LEGACY -1
+#define SYSCTL_WRITES_WARN 0
+#define SYSCTL_WRITES_STRICT 1
+
+static int sysctl_writes_strict = SYSCTL_WRITES_WARN;
+
static int proc_do_cad_pid(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
static int proc_taint(struct ctl_table *table, int write,
@@ -193,7 +197,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
/* Note: sysrq code uses it's own private copy */
static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE;
-static int sysrq_sysctl_handler(ctl_table *table, int write,
+static int sysrq_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
@@ -386,13 +390,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
{
- .procname = "numa_balancing_migrate_deferred",
- .data = &sysctl_numa_balancing_migrate_deferred,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
.procname = "numa_balancing",
.data = NULL, /* filled in by handler */
.maxlen = sizeof(unsigned int),
@@ -500,6 +497,15 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_taint,
},
+ {
+ .procname = "sysctl_writes_strict",
+ .data = &sysctl_writes_strict,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &neg_one,
+ .extra2 = &one,
+ },
#endif
#ifdef CONFIG_LATENCYTOP
{
@@ -648,7 +654,7 @@ static struct ctl_table kern_table[] = {
.extra2 = &one,
},
#endif
-
+#ifdef CONFIG_UEVENT_HELPER
{
.procname = "hotplug",
.data = &uevent_helper,
@@ -656,7 +662,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dostring,
},
-
+#endif
#ifdef CONFIG_CHR_DEV_SG
{
.procname = "sg-big-buff",
@@ -854,6 +860,17 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one,
},
+#ifdef CONFIG_SMP
+ {
+ .procname = "softlockup_all_cpu_backtrace",
+ .data = &sysctl_softlockup_all_cpu_backtrace,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif /* CONFIG_SMP */
{
.procname = "nmi_watchdog",
.data = &watchdog_user_enabled,
@@ -995,6 +1012,7 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_dohung_task_timeout_secs,
+ .extra2 = &hung_task_timeout_max,
},
{
.procname = "hung_task_warnings",
@@ -1094,15 +1112,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#ifdef CONFIG_BLOCK
- {
- .procname = "blk_iopoll",
- .data = &blk_iopoll_enabled,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
{ }
};
@@ -1231,8 +1240,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = hugetlb_sysctl_handler,
- .extra1 = (void *)&hugetlb_zero,
- .extra2 = (void *)&hugetlb_infinity,
+ .extra1 = &zero,
},
#ifdef CONFIG_NUMA
{
@@ -1241,8 +1249,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = &hugetlb_mempolicy_sysctl_handler,
- .extra1 = (void *)&hugetlb_zero,
- .extra2 = (void *)&hugetlb_infinity,
+ .extra1 = &zero,
},
#endif
{
@@ -1265,8 +1272,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = hugetlb_overcommit_handler,
- .extra1 = (void *)&hugetlb_zero,
- .extra2 = (void *)&hugetlb_infinity,
+ .extra1 = &zero,
},
#endif
{
@@ -1283,7 +1289,7 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = drop_caches_sysctl_handler,
.extra1 = &one,
- .extra2 = &three,
+ .extra2 = &four,
},
#ifdef CONFIG_COMPACTION
{
@@ -1318,7 +1324,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(percpu_pagelist_fraction),
.mode = 0644,
.proc_handler = percpu_pagelist_fraction_sysctl_handler,
- .extra1 = &min_percpu_pagelist_fract,
+ .extra1 = &zero,
},
#ifdef CONFIG_MMU
{
@@ -1431,8 +1437,13 @@ static struct ctl_table vm_table[] = {
(defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
{
.procname = "vdso_enabled",
+#ifdef CONFIG_X86_32
+ .data = &vdso32_enabled,
+ .maxlen = sizeof(vdso32_enabled),
+#else
.data = &vdso_enabled,
.maxlen = sizeof(vdso_enabled),
+#endif
.mode = 0644,
.proc_handler = proc_dointvec,
.extra1 = &zero,
@@ -1711,8 +1722,8 @@ int __init sysctl_init(void)
#ifdef CONFIG_PROC_SYSCTL
-static int _proc_do_string(void* data, int maxlen, int write,
- void __user *buffer,
+static int _proc_do_string(char *data, int maxlen, int write,
+ char __user *buffer,
size_t *lenp, loff_t *ppos)
{
size_t len;
@@ -1725,21 +1736,30 @@ static int _proc_do_string(void* data, int maxlen, int write,
}
if (write) {
- len = 0;
+ if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) {
+ /* Only continue writes not past the end of buffer. */
+ len = strlen(data);
+ if (len > maxlen - 1)
+ len = maxlen - 1;
+
+ if (*ppos > len)
+ return 0;
+ len = *ppos;
+ } else {
+ /* Start writing from beginning of buffer. */
+ len = 0;
+ }
+
+ *ppos += *lenp;
p = buffer;
- while (len < *lenp) {
+ while ((p - buffer) < *lenp && len < maxlen - 1) {
if (get_user(c, p++))
return -EFAULT;
if (c == 0 || c == '\n')
break;
- len++;
+ data[len++] = c;
}
- if (len >= maxlen)
- len = maxlen-1;
- if(copy_from_user(data, buffer, len))
- return -EFAULT;
- ((char *) data)[len] = 0;
- *ppos += *lenp;
+ data[len] = 0;
} else {
len = strlen(data);
if (len > maxlen)
@@ -1756,10 +1776,10 @@ static int _proc_do_string(void* data, int maxlen, int write,
if (len > *lenp)
len = *lenp;
if (len)
- if(copy_to_user(buffer, data, len))
+ if (copy_to_user(buffer, data, len))
return -EFAULT;
if (len < *lenp) {
- if(put_user('\n', ((char __user *) buffer) + len))
+ if (put_user('\n', buffer + len))
return -EFAULT;
len++;
}
@@ -1769,6 +1789,14 @@ static int _proc_do_string(void* data, int maxlen, int write,
return 0;
}
+static void warn_sysctl_write(struct ctl_table *table)
+{
+ pr_warn_once("%s wrote to %s when file position was not 0!\n"
+ "This will not be supported in the future. To silence this\n"
+ "warning, set kernel.sysctl_writes_strict = -1\n",
+ current->comm, table->procname);
+}
+
/**
* proc_dostring - read a string sysctl
* @table: the sysctl table
@@ -1789,8 +1817,11 @@ static int _proc_do_string(void* data, int maxlen, int write,
int proc_dostring(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- return _proc_do_string(table->data, table->maxlen, write,
- buffer, lenp, ppos);
+ if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN)
+ warn_sysctl_write(table);
+
+ return _proc_do_string((char *)(table->data), table->maxlen, write,
+ (char __user *)buffer, lenp, ppos);
}
static size_t proc_skip_spaces(char **buf)
@@ -1964,6 +1995,18 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
conv = do_proc_dointvec_conv;
if (write) {
+ if (*ppos) {
+ switch (sysctl_writes_strict) {
+ case SYSCTL_WRITES_STRICT:
+ goto out;
+ case SYSCTL_WRITES_WARN:
+ warn_sysctl_write(table);
+ break;
+ default:
+ break;
+ }
+ }
+
if (left > PAGE_SIZE - 1)
left = PAGE_SIZE - 1;
page = __get_free_page(GFP_TEMPORARY);
@@ -2021,6 +2064,7 @@ free:
return err ? : -EINVAL;
}
*lenp -= left;
+out:
*ppos += *lenp;
return err;
}
@@ -2213,6 +2257,18 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
left = *lenp;
if (write) {
+ if (*ppos) {
+ switch (sysctl_writes_strict) {
+ case SYSCTL_WRITES_STRICT:
+ goto out;
+ case SYSCTL_WRITES_WARN:
+ warn_sysctl_write(table);
+ break;
+ default:
+ break;
+ }
+ }
+
if (left > PAGE_SIZE - 1)
left = PAGE_SIZE - 1;
page = __get_free_page(GFP_TEMPORARY);
@@ -2268,6 +2324,7 @@ free:
return err ? : -EINVAL;
}
*lenp -= left;
+out:
*ppos += *lenp;
return err;
}
@@ -2514,11 +2571,11 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
bool first = 1;
size_t left = *lenp;
unsigned long bitmap_len = table->maxlen;
- unsigned long *bitmap = (unsigned long *) table->data;
+ unsigned long *bitmap = *(unsigned long **) table->data;
unsigned long *tmp_bitmap = NULL;
char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
- if (!bitmap_len || !left || (*ppos && !write)) {
+ if (!bitmap || !bitmap_len || !left || (*ppos && !write)) {
*lenp = 0;
return 0;
}
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index d457005acedf..9a4f750a2963 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -521,6 +521,7 @@ static const struct bin_table bin_net_ipv6_conf_var_table[] = {
{ CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" },
{ CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" },
{ CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
+ { CTL_INT, NET_IPV6_ACCEPT_RA_FROM_LOCAL, "accept_ra_from_local" },
{}
};
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
index 52ebc70263f4..875f64e8935b 100644
--- a/kernel/system_keyring.c
+++ b/kernel/system_keyring.c
@@ -89,6 +89,7 @@ static __init int load_system_certificate_list(void)
pr_err("Problem loading in-kernel X.509 certificate (%ld)\n",
PTR_ERR(key));
} else {
+ set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags);
pr_notice("Loaded X.509 cert '%s'\n",
key_ref_to_ptr(key)->description);
key_ref_put(key);
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 12d6ebbfdd83..0dbab6d1acb4 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -14,6 +14,8 @@
* the GNU General Public License for more details.
*/
+#define pr_fmt(fmt) "Kprobe smoke test: " fmt
+
#include <linux/kernel.h>
#include <linux/kprobes.h>
#include <linux/random.h>
@@ -41,8 +43,7 @@ static void kp_post_handler(struct kprobe *p, struct pt_regs *regs,
{
if (preh_val != (rand1 / div_factor)) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "incorrect value in post_handler\n");
+ pr_err("incorrect value in post_handler\n");
}
posth_val = preh_val + div_factor;
}
@@ -59,8 +60,7 @@ static int test_kprobe(void)
ret = register_kprobe(&kp);
if (ret < 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "register_kprobe returned %d\n", ret);
+ pr_err("register_kprobe returned %d\n", ret);
return ret;
}
@@ -68,14 +68,12 @@ static int test_kprobe(void)
unregister_kprobe(&kp);
if (preh_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kprobe pre_handler not called\n");
+ pr_err("kprobe pre_handler not called\n");
handler_errors++;
}
if (posth_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kprobe post_handler not called\n");
+ pr_err("kprobe post_handler not called\n");
handler_errors++;
}
@@ -98,8 +96,7 @@ static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs,
{
if (preh_val != (rand1 / div_factor) + 1) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "incorrect value in post_handler2\n");
+ pr_err("incorrect value in post_handler2\n");
}
posth_val = preh_val + div_factor;
}
@@ -120,8 +117,7 @@ static int test_kprobes(void)
kp.flags = 0;
ret = register_kprobes(kps, 2);
if (ret < 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "register_kprobes returned %d\n", ret);
+ pr_err("register_kprobes returned %d\n", ret);
return ret;
}
@@ -130,14 +126,12 @@ static int test_kprobes(void)
ret = target(rand1);
if (preh_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kprobe pre_handler not called\n");
+ pr_err("kprobe pre_handler not called\n");
handler_errors++;
}
if (posth_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kprobe post_handler not called\n");
+ pr_err("kprobe post_handler not called\n");
handler_errors++;
}
@@ -146,14 +140,12 @@ static int test_kprobes(void)
ret = target2(rand1);
if (preh_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kprobe pre_handler2 not called\n");
+ pr_err("kprobe pre_handler2 not called\n");
handler_errors++;
}
if (posth_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kprobe post_handler2 not called\n");
+ pr_err("kprobe post_handler2 not called\n");
handler_errors++;
}
@@ -166,8 +158,7 @@ static u32 j_kprobe_target(u32 value)
{
if (value != rand1) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "incorrect value in jprobe handler\n");
+ pr_err("incorrect value in jprobe handler\n");
}
jph_val = rand1;
@@ -186,16 +177,14 @@ static int test_jprobe(void)
ret = register_jprobe(&jp);
if (ret < 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "register_jprobe returned %d\n", ret);
+ pr_err("register_jprobe returned %d\n", ret);
return ret;
}
ret = target(rand1);
unregister_jprobe(&jp);
if (jph_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "jprobe handler not called\n");
+ pr_err("jprobe handler not called\n");
handler_errors++;
}
@@ -217,24 +206,21 @@ static int test_jprobes(void)
jp.kp.flags = 0;
ret = register_jprobes(jps, 2);
if (ret < 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "register_jprobes returned %d\n", ret);
+ pr_err("register_jprobes returned %d\n", ret);
return ret;
}
jph_val = 0;
ret = target(rand1);
if (jph_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "jprobe handler not called\n");
+ pr_err("jprobe handler not called\n");
handler_errors++;
}
jph_val = 0;
ret = target2(rand1);
if (jph_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "jprobe handler2 not called\n");
+ pr_err("jprobe handler2 not called\n");
handler_errors++;
}
unregister_jprobes(jps, 2);
@@ -256,13 +242,11 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
if (ret != (rand1 / div_factor)) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "incorrect value in kretprobe handler\n");
+ pr_err("incorrect value in kretprobe handler\n");
}
if (krph_val == 0) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "call to kretprobe entry handler failed\n");
+ pr_err("call to kretprobe entry handler failed\n");
}
krph_val = rand1;
@@ -281,16 +265,14 @@ static int test_kretprobe(void)
ret = register_kretprobe(&rp);
if (ret < 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "register_kretprobe returned %d\n", ret);
+ pr_err("register_kretprobe returned %d\n", ret);
return ret;
}
ret = target(rand1);
unregister_kretprobe(&rp);
if (krph_val != rand1) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kretprobe handler not called\n");
+ pr_err("kretprobe handler not called\n");
handler_errors++;
}
@@ -303,13 +285,11 @@ static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs)
if (ret != (rand1 / div_factor) + 1) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "incorrect value in kretprobe handler2\n");
+ pr_err("incorrect value in kretprobe handler2\n");
}
if (krph_val == 0) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "call to kretprobe entry handler failed\n");
+ pr_err("call to kretprobe entry handler failed\n");
}
krph_val = rand1;
@@ -332,24 +312,21 @@ static int test_kretprobes(void)
rp.kp.flags = 0;
ret = register_kretprobes(rps, 2);
if (ret < 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "register_kretprobe returned %d\n", ret);
+ pr_err("register_kretprobe returned %d\n", ret);
return ret;
}
krph_val = 0;
ret = target(rand1);
if (krph_val != rand1) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kretprobe handler not called\n");
+ pr_err("kretprobe handler not called\n");
handler_errors++;
}
krph_val = 0;
ret = target2(rand1);
if (krph_val != rand1) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kretprobe handler2 not called\n");
+ pr_err("kretprobe handler2 not called\n");
handler_errors++;
}
unregister_kretprobes(rps, 2);
@@ -368,7 +345,7 @@ int init_test_probes(void)
rand1 = prandom_u32();
} while (rand1 <= div_factor);
- printk(KERN_INFO "Kprobe smoke test started\n");
+ pr_info("started\n");
num_tests++;
ret = test_kprobe();
if (ret < 0)
@@ -402,13 +379,11 @@ int init_test_probes(void)
#endif /* CONFIG_KRETPROBES */
if (errors)
- printk(KERN_ERR "BUG: Kprobe smoke test: %d out of "
- "%d tests failed\n", errors, num_tests);
+ pr_err("BUG: %d out of %d tests failed\n", errors, num_tests);
else if (handler_errors)
- printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) "
- "running handlers\n", handler_errors);
+ pr_err("BUG: %d error(s) running handlers\n", handler_errors);
else
- printk(KERN_INFO "Kprobe smoke test passed successfully\n");
+ pr_info("passed successfully\n");
return 0;
}
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 3ce6e8c5f3fc..d626dc98e8df 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -12,6 +12,11 @@ config CLOCKSOURCE_WATCHDOG
config ARCH_CLOCKSOURCE_DATA
bool
+# Clocksources require validation of the clocksource against the last
+# cycle update - x86/TSC misfeature
+config CLOCKSOURCE_VALIDATE_LAST_CYCLE
+ bool
+
# Timekeeping vsyscall support
config GENERIC_TIME_VSYSCALL
bool
@@ -20,10 +25,6 @@ config GENERIC_TIME_VSYSCALL
config GENERIC_TIME_VSYSCALL_OLD
bool
-# ktime_t scalar 64bit nsec representation
-config KTIME_SCALAR
- bool
-
# Old style timekeeping
config ARCH_USES_GETTIMEOFFSET
bool
@@ -124,7 +125,7 @@ config NO_HZ_FULL
endchoice
config NO_HZ_FULL_ALL
- bool "Full dynticks system on all CPUs by default"
+ bool "Full dynticks system on all CPUs by default (except CPU 0)"
depends on NO_HZ_FULL
help
If the user doesn't pass the nohz_full boot option to
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 9250130646f5..7347426fa68d 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,11 +1,33 @@
+obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o
obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
obj-y += timeconv.o posix-clock.o alarmtimer.o
obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
-obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o
+ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y)
+ obj-y += tick-broadcast.o
+ obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o
+endif
obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o
obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
obj-$(CONFIG_TIMER_STATS) += timer_stats.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
+obj-$(CONFIG_TEST_UDELAY) += udelay_test.o
+
+$(obj)/time.o: $(obj)/timeconst.h
+
+quiet_cmd_hzfile = HZFILE $@
+ cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
+
+targets += hz.bc
+$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
+ $(call if_changed,hzfile)
+
+quiet_cmd_bc = BC $@
+ cmd_bc = bc -q $(filter-out FORCE,$^) > $@
+
+targets += timeconst.h
+$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
+ $(call if_changed,bc)
+
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 88c9c65a430d..a7077d3ae52f 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -71,7 +71,7 @@ struct rtc_device *alarmtimer_get_rtcdev(void)
return ret;
}
-
+EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev);
static int alarmtimer_rtc_add_device(struct device *dev,
struct class_interface *class_intf)
@@ -464,18 +464,26 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
ktime_t now)
{
+ unsigned long flags;
struct k_itimer *ptr = container_of(alarm, struct k_itimer,
it.alarm.alarmtimer);
- if (posix_timer_event(ptr, 0) != 0)
- ptr->it_overrun++;
+ enum alarmtimer_restart result = ALARMTIMER_NORESTART;
+
+ spin_lock_irqsave(&ptr->it_lock, flags);
+ if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) {
+ if (posix_timer_event(ptr, 0) != 0)
+ ptr->it_overrun++;
+ }
/* Re-add periodic timers */
if (ptr->it.alarm.interval.tv64) {
ptr->it_overrun += alarm_forward(alarm, now,
ptr->it.alarm.interval);
- return ALARMTIMER_RESTART;
+ result = ALARMTIMER_RESTART;
}
- return ALARMTIMER_NORESTART;
+ spin_unlock_irqrestore(&ptr->it_lock, flags);
+
+ return result;
}
/**
@@ -541,18 +549,22 @@ static int alarm_timer_create(struct k_itimer *new_timer)
* @new_timer: k_itimer pointer
* @cur_setting: itimerspec data to fill
*
- * Copies the itimerspec data out from the k_itimer
+ * Copies out the current itimerspec data
*/
static void alarm_timer_get(struct k_itimer *timr,
struct itimerspec *cur_setting)
{
- memset(cur_setting, 0, sizeof(struct itimerspec));
+ ktime_t relative_expiry_time =
+ alarm_expires_remaining(&(timr->it.alarm.alarmtimer));
+
+ if (ktime_to_ns(relative_expiry_time) > 0) {
+ cur_setting->it_value = ktime_to_timespec(relative_expiry_time);
+ } else {
+ cur_setting->it_value.tv_sec = 0;
+ cur_setting->it_value.tv_nsec = 0;
+ }
- cur_setting->it_interval =
- ktime_to_timespec(timr->it.alarm.interval);
- cur_setting->it_value =
- ktime_to_timespec(timr->it.alarm.alarmtimer.node.expires);
- return;
+ cur_setting->it_interval = ktime_to_timespec(timr->it.alarm.interval);
}
/**
@@ -585,9 +597,14 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
struct itimerspec *new_setting,
struct itimerspec *old_setting)
{
+ ktime_t exp;
+
if (!rtcdev)
return -ENOTSUPP;
+ if (flags & ~TIMER_ABSTIME)
+ return -EINVAL;
+
if (old_setting)
alarm_timer_get(timr, old_setting);
@@ -597,8 +614,16 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
/* start the timer */
timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
- alarm_start(&timr->it.alarm.alarmtimer,
- timespec_to_ktime(new_setting->it_value));
+ exp = timespec_to_ktime(new_setting->it_value);
+ /* Convert (if necessary) to absolute time */
+ if (flags != TIMER_ABSTIME) {
+ ktime_t now;
+
+ now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime();
+ exp = ktime_add(now, exp);
+ }
+
+ alarm_start(&timr->it.alarm.alarmtimer, exp);
return 0;
}
@@ -730,6 +755,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
if (!alarmtimer_get_rtcdev())
return -ENOTSUPP;
+ if (flags & ~TIMER_ABSTIME)
+ return -EINVAL;
+
if (!capable(CAP_WAKE_ALARM))
return -EPERM;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 086ad6043bcb..9c94c19f1305 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -146,7 +146,8 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev)
{
/* Nothing to do if we already reached the limit */
if (dev->min_delta_ns >= MIN_DELTA_LIMIT) {
- printk(KERN_WARNING "CE: Reprogramming failure. Giving up\n");
+ printk_deferred(KERN_WARNING
+ "CE: Reprogramming failure. Giving up\n");
dev->next_event.tv64 = KTIME_MAX;
return -ETIME;
}
@@ -159,9 +160,10 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev)
if (dev->min_delta_ns > MIN_DELTA_LIMIT)
dev->min_delta_ns = MIN_DELTA_LIMIT;
- printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
- dev->name ? dev->name : "?",
- (unsigned long long) dev->min_delta_ns);
+ printk_deferred(KERN_WARNING
+ "CE: %s increased min_delta_ns to %llu nsec\n",
+ dev->name ? dev->name : "?",
+ (unsigned long long) dev->min_delta_ns);
return 0;
}
@@ -439,6 +441,19 @@ void clockevents_config_and_register(struct clock_event_device *dev,
}
EXPORT_SYMBOL_GPL(clockevents_config_and_register);
+int __clockevents_update_freq(struct clock_event_device *dev, u32 freq)
+{
+ clockevents_config(dev, freq);
+
+ if (dev->mode == CLOCK_EVT_MODE_ONESHOT)
+ return clockevents_program_event(dev, dev->next_event, false);
+
+ if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
+ dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev);
+
+ return 0;
+}
+
/**
* clockevents_update_freq - Update frequency and reprogram a clock event device.
* @dev: device to modify
@@ -446,17 +461,22 @@ EXPORT_SYMBOL_GPL(clockevents_config_and_register);
*
* Reconfigure and reprogram a clock event device in oneshot
* mode. Must be called on the cpu for which the device delivers per
- * cpu timer events with interrupts disabled! Returns 0 on success,
- * -ETIME when the event is in the past.
+ * cpu timer events. If called for the broadcast device the core takes
+ * care of serialization.
+ *
+ * Returns 0 on success, -ETIME when the event is in the past.
*/
int clockevents_update_freq(struct clock_event_device *dev, u32 freq)
{
- clockevents_config(dev, freq);
-
- if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
- return 0;
+ unsigned long flags;
+ int ret;
- return clockevents_program_event(dev, dev->next_event, false);
+ local_irq_save(flags);
+ ret = tick_broadcast_update_freq(dev, freq);
+ if (ret == -ENODEV)
+ ret = __clockevents_update_freq(dev, freq);
+ local_irq_restore(flags);
+ return ret;
}
/*
@@ -524,12 +544,13 @@ void clockevents_resume(void)
#ifdef CONFIG_GENERIC_CLOCKEVENTS
/**
* clockevents_notify - notification about relevant events
+ * Returns 0 on success, any other value on error
*/
-void clockevents_notify(unsigned long reason, void *arg)
+int clockevents_notify(unsigned long reason, void *arg)
{
struct clock_event_device *dev, *tmp;
unsigned long flags;
- int cpu;
+ int cpu, ret = 0;
raw_spin_lock_irqsave(&clockevents_lock, flags);
@@ -542,7 +563,7 @@ void clockevents_notify(unsigned long reason, void *arg)
case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
- tick_broadcast_oneshot_control(reason);
+ ret = tick_broadcast_oneshot_control(reason);
break;
case CLOCK_EVT_NOTIFY_CPU_DYING:
@@ -585,6 +606,7 @@ void clockevents_notify(unsigned long reason, void *arg)
break;
}
raw_spin_unlock_irqrestore(&clockevents_lock, flags);
+ return ret;
}
EXPORT_SYMBOL_GPL(clockevents_notify);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index ba3e502c955a..2e949cc9c9f1 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -32,6 +32,7 @@
#include <linux/kthread.h>
#include "tick-internal.h"
+#include "timekeeping_internal.h"
void timecounter_init(struct timecounter *tc,
const struct cyclecounter *cc,
@@ -249,7 +250,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
static void clocksource_watchdog(unsigned long data)
{
struct clocksource *cs;
- cycle_t csnow, wdnow;
+ cycle_t csnow, wdnow, delta;
int64_t wd_nsec, cs_nsec;
int next_cpu, reset_pending;
@@ -282,11 +283,12 @@ static void clocksource_watchdog(unsigned long data)
continue;
}
- wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask,
- watchdog->mult, watchdog->shift);
+ delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask);
+ wd_nsec = clocksource_cyc2ns(delta, watchdog->mult,
+ watchdog->shift);
- cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) &
- cs->mask, cs->mult, cs->shift);
+ delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
+ cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
cs->cs_last = csnow;
cs->wd_last = wdnow;
diff --git a/kernel/hrtimer.c b/kernel/time/hrtimer.c
index 09094361dce5..1c2fe7de2842 100644
--- a/kernel/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -54,6 +54,8 @@
#include <trace/events/timer.h>
+#include "timekeeping.h"
+
/*
* The timer bases:
*
@@ -114,21 +116,18 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
*/
static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
{
- ktime_t xtim, mono, boot;
- struct timespec xts, tom, slp;
- s32 tai_offset;
+ ktime_t xtim, mono, boot, tai;
+ ktime_t off_real, off_boot, off_tai;
- get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);
- tai_offset = timekeeping_get_tai_offset();
+ mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai);
+ boot = ktime_add(mono, off_boot);
+ xtim = ktime_add(mono, off_real);
+ tai = ktime_add(xtim, off_tai);
- xtim = timespec_to_ktime(xts);
- mono = ktime_add(xtim, timespec_to_ktime(tom));
- boot = ktime_add(mono, timespec_to_ktime(slp));
base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
- base->clock_base[HRTIMER_BASE_TAI].softirq_time =
- ktime_add(xtim, ktime_set(tai_offset, 0));
+ base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai;
}
/*
@@ -168,19 +167,6 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
}
}
-
-/*
- * Get the preferred target CPU for NOHZ
- */
-static int hrtimer_get_target(int this_cpu, int pinned)
-{
-#ifdef CONFIG_NO_HZ_COMMON
- if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
- return get_nohz_timer_target();
-#endif
- return this_cpu;
-}
-
/*
* With HIGHRES=y we do not migrate the timer when it is expiring
* before the next event on the target cpu because we cannot reprogram
@@ -214,7 +200,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
struct hrtimer_clock_base *new_base;
struct hrtimer_cpu_base *new_cpu_base;
int this_cpu = smp_processor_id();
- int cpu = hrtimer_get_target(this_cpu, pinned);
+ int cpu = get_nohz_timer_target(pinned);
int basenum = base->index;
again:
@@ -247,6 +233,11 @@ again:
goto again;
}
timer->base = new_base;
+ } else {
+ if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
+ cpu = this_cpu;
+ goto again;
+ }
}
return new_base;
}
@@ -272,60 +263,6 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
* too large for inlining:
*/
#if BITS_PER_LONG < 64
-# ifndef CONFIG_KTIME_SCALAR
-/**
- * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
- * @kt: addend
- * @nsec: the scalar nsec value to add
- *
- * Returns the sum of kt and nsec in ktime_t format
- */
-ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
-{
- ktime_t tmp;
-
- if (likely(nsec < NSEC_PER_SEC)) {
- tmp.tv64 = nsec;
- } else {
- unsigned long rem = do_div(nsec, NSEC_PER_SEC);
-
- /* Make sure nsec fits into long */
- if (unlikely(nsec > KTIME_SEC_MAX))
- return (ktime_t){ .tv64 = KTIME_MAX };
-
- tmp = ktime_set((long)nsec, rem);
- }
-
- return ktime_add(kt, tmp);
-}
-
-EXPORT_SYMBOL_GPL(ktime_add_ns);
-
-/**
- * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable
- * @kt: minuend
- * @nsec: the scalar nsec value to subtract
- *
- * Returns the subtraction of @nsec from @kt in ktime_t format
- */
-ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec)
-{
- ktime_t tmp;
-
- if (likely(nsec < NSEC_PER_SEC)) {
- tmp.tv64 = nsec;
- } else {
- unsigned long rem = do_div(nsec, NSEC_PER_SEC);
-
- tmp = ktime_set((long)nsec, rem);
- }
-
- return ktime_sub(kt, tmp);
-}
-
-EXPORT_SYMBOL_GPL(ktime_sub_ns);
-# endif /* !CONFIG_KTIME_SCALAR */
-
/*
* Divide a ktime value by a nanosecond value
*/
@@ -345,6 +282,7 @@ u64 ktime_divns(const ktime_t kt, s64 div)
return dclc;
}
+EXPORT_SYMBOL_GPL(ktime_divns);
#endif /* BITS_PER_LONG >= 64 */
/*
@@ -582,6 +520,23 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
cpu_base->expires_next.tv64 = expires_next.tv64;
+ /*
+ * If a hang was detected in the last timer interrupt then we
+ * leave the hang delay active in the hardware. We want the
+ * system to make progress. That also prevents the following
+ * scenario:
+ * T1 expires 50ms from now
+ * T2 expires 5s from now
+ *
+ * T1 is removed, so this code is called and would reprogram
+ * the hardware to 5s from now. Any hrtimer_start after that
+ * will not reprogram the hardware due to hang_detected being
+ * set. So we'd effectivly block all timers until the T2 event
+ * fires.
+ */
+ if (cpu_base->hang_detected)
+ return;
+
if (cpu_base->expires_next.tv64 != KTIME_MAX)
tick_program_event(cpu_base->expires_next, 1);
}
@@ -593,6 +548,11 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
* timers, we have to check, whether it expires earlier than the timer for
* which the clock event device was armed.
*
+ * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming
+ * and no expiry check happens. The timer gets enqueued into the rbtree. The
+ * reprogramming and expiry check is done in the hrtimer_interrupt or in the
+ * softirq.
+ *
* Called with interrupts disabled and base->cpu_base.lock held
*/
static int hrtimer_reprogram(struct hrtimer *timer,
@@ -653,25 +613,13 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
base->hres_active = 0;
}
-/*
- * When High resolution timers are active, try to reprogram. Note, that in case
- * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
- * check happens. The timer gets enqueued into the rbtree. The reprogramming
- * and expiry check is done in the hrtimer_interrupt or in the softirq.
- */
-static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
-{
- return base->cpu_base->hres_active && hrtimer_reprogram(timer, base);
-}
-
static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
{
ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
- return ktime_get_update_offsets(offs_real, offs_boot, offs_tai);
+ return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai);
}
/*
@@ -746,8 +694,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }
static inline int hrtimer_switch_to_hres(void) { return 0; }
static inline void
hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
-static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
+static inline int hrtimer_reprogram(struct hrtimer *timer,
+ struct hrtimer_clock_base *base)
{
return 0;
}
@@ -981,11 +929,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
/* Remove an active timer from the queue: */
ret = remove_hrtimer(timer, base);
- /* Switch the timer base, if necessary: */
- new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
-
if (mode & HRTIMER_MODE_REL) {
- tim = ktime_add_safe(tim, new_base->get_time());
+ tim = ktime_add_safe(tim, base->get_time());
/*
* CONFIG_TIME_LOW_RES is a temporary way for architectures
* to signal that they simply return xtime in
@@ -1000,18 +945,32 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
hrtimer_set_expires_range_ns(timer, tim, delta_ns);
+ /* Switch the timer base, if necessary: */
+ new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
+
timer_stats_hrtimer_set_start_info(timer);
leftmost = enqueue_hrtimer(timer, new_base);
- /*
- * Only allow reprogramming if the new base is on this CPU.
- * (it might still be on another CPU if the timer was pending)
- *
- * XXX send_remote_softirq() ?
- */
- if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)
- && hrtimer_enqueue_reprogram(timer, new_base)) {
+ if (!leftmost) {
+ unlock_hrtimer_base(timer, &flags);
+ return ret;
+ }
+
+ if (!hrtimer_is_hres_active(timer)) {
+ /*
+ * Kick to reschedule the next tick to handle the new timer
+ * on dynticks target.
+ */
+ wake_up_nohz_cpu(new_base->cpu_base->cpu);
+ } else if (new_base->cpu_base == &__get_cpu_var(hrtimer_bases) &&
+ hrtimer_reprogram(timer, new_base)) {
+ /*
+ * Only allow reprogramming if the new base is on this CPU.
+ * (it might still be on another CPU if the timer was pending)
+ *
+ * XXX send_remote_softirq() ?
+ */
if (wakeup) {
/*
* We need to drop cpu_base->lock to avoid a
@@ -1030,6 +989,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
return ret;
}
+EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns);
/**
* hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
@@ -1670,6 +1630,7 @@ static void init_hrtimers_cpu(int cpu)
timerqueue_init_head(&cpu_base->clock_base[i].active);
}
+ cpu_base->cpu = cpu;
hrtimer_init_hres(cpu_base);
}
diff --git a/kernel/itimer.c b/kernel/time/itimer.c
index 8d262b467573..8d262b467573 100644
--- a/kernel/itimer.c
+++ b/kernel/time/itimer.c
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index af8d1d4f3d55..87a346fd6d61 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -165,21 +165,21 @@ static inline void pps_set_freq(s64 freq)
static inline int is_error_status(int status)
{
- return (time_status & (STA_UNSYNC|STA_CLOCKERR))
+ return (status & (STA_UNSYNC|STA_CLOCKERR))
/* PPS signal lost when either PPS time or
* PPS frequency synchronization requested
*/
- || ((time_status & (STA_PPSFREQ|STA_PPSTIME))
- && !(time_status & STA_PPSSIGNAL))
+ || ((status & (STA_PPSFREQ|STA_PPSTIME))
+ && !(status & STA_PPSSIGNAL))
/* PPS jitter exceeded when
* PPS time synchronization requested */
- || ((time_status & (STA_PPSTIME|STA_PPSJITTER))
+ || ((status & (STA_PPSTIME|STA_PPSJITTER))
== (STA_PPSTIME|STA_PPSJITTER))
/* PPS wander exceeded or calibration error when
* PPS frequency synchronization requested
*/
- || ((time_status & STA_PPSFREQ)
- && (time_status & (STA_PPSWANDER|STA_PPSERROR)));
+ || ((status & STA_PPSFREQ)
+ && (status & (STA_PPSWANDER|STA_PPSERROR)));
}
static inline void pps_fill_timex(struct timex *txc)
@@ -466,7 +466,8 @@ static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
static void sync_cmos_clock(struct work_struct *work)
{
- struct timespec now, next;
+ struct timespec64 now;
+ struct timespec next;
int fail = 1;
/*
@@ -485,9 +486,9 @@ static void sync_cmos_clock(struct work_struct *work)
return;
}
- getnstimeofday(&now);
+ getnstimeofday64(&now);
if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
- struct timespec adjust = now;
+ struct timespec adjust = timespec64_to_timespec(now);
fail = -ENODEV;
if (persistent_clock_is_local)
@@ -514,12 +515,13 @@ static void sync_cmos_clock(struct work_struct *work)
next.tv_sec++;
next.tv_nsec -= NSEC_PER_SEC;
}
- schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next));
+ queue_delayed_work(system_power_efficient_wq,
+ &sync_cmos_work, timespec_to_jiffies(&next));
}
void ntp_notify_cmos_timer(void)
{
- schedule_delayed_work(&sync_cmos_work, 0);
+ queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
}
#else
@@ -530,7 +532,7 @@ void ntp_notify_cmos_timer(void) { }
/*
* Propagate a new txc->status value into the NTP state:
*/
-static inline void process_adj_status(struct timex *txc, struct timespec *ts)
+static inline void process_adj_status(struct timex *txc, struct timespec64 *ts)
{
if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
time_state = TIME_OK;
@@ -553,7 +555,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
static inline void process_adjtimex_modes(struct timex *txc,
- struct timespec *ts,
+ struct timespec64 *ts,
s32 *time_tai)
{
if (txc->modes & ADJ_STATUS)
@@ -639,7 +641,7 @@ int ntp_validate_timex(struct timex *txc)
* adjtimex mainly allows reading (and writing, if superuser) of
* kernel time-keeping variables. used by xntpd.
*/
-int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai)
+int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai)
{
int result;
@@ -683,7 +685,7 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai)
/* fill PPS status fields */
pps_fill_timex(txc);
- txc->time.tv_sec = ts->tv_sec;
+ txc->time.tv_sec = (time_t)ts->tv_sec;
txc->time.tv_usec = ts->tv_nsec;
if (!(time_status & STA_NANO))
txc->time.tv_usec /= NSEC_PER_USEC;
@@ -785,8 +787,9 @@ static long hardpps_update_freq(struct pps_normtime freq_norm)
time_status |= STA_PPSERROR;
pps_errcnt++;
pps_dec_freq_interval();
- pr_err("hardpps: PPSERROR: interval too long - %ld s\n",
- freq_norm.sec);
+ printk_deferred(KERN_ERR
+ "hardpps: PPSERROR: interval too long - %ld s\n",
+ freq_norm.sec);
return 0;
}
@@ -799,7 +802,8 @@ static long hardpps_update_freq(struct pps_normtime freq_norm)
delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);
pps_freq = ftemp;
if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) {
- pr_warning("hardpps: PPSWANDER: change=%ld\n", delta);
+ printk_deferred(KERN_WARNING
+ "hardpps: PPSWANDER: change=%ld\n", delta);
time_status |= STA_PPSWANDER;
pps_stbcnt++;
pps_dec_freq_interval();
@@ -843,8 +847,9 @@ static void hardpps_update_phase(long error)
* the time offset is updated.
*/
if (jitter > (pps_jitter << PPS_POPCORN)) {
- pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
- jitter, (pps_jitter << PPS_POPCORN));
+ printk_deferred(KERN_WARNING
+ "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
+ jitter, (pps_jitter << PPS_POPCORN));
time_status |= STA_PPSJITTER;
pps_jitcnt++;
} else if (time_status & STA_PPSTIME) {
@@ -901,7 +906,7 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
time_status |= STA_PPSJITTER;
/* restart the frequency calibration interval */
pps_fbase = *raw_ts;
- pr_err("hardpps: PPSJITTER: bad pulse\n");
+ printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n");
return;
}
@@ -922,7 +927,10 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
static int __init ntp_tick_adj_setup(char *str)
{
- ntp_tick_adj = simple_strtol(str, NULL, 0);
+ int rc = kstrtol(str, 0, (long *)&ntp_tick_adj);
+
+ if (rc)
+ return rc;
ntp_tick_adj <<= NTP_SCALE_SHIFT;
return 1;
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index 1950cb4ca2a4..bbd102ad9df7 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -7,6 +7,6 @@ extern void ntp_clear(void);
extern u64 ntp_tick_length(void);
extern int second_overflow(unsigned long secs);
extern int ntp_validate_timex(struct timex *);
-extern int __do_adjtimex(struct timex *, struct timespec *, s32 *);
+extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *);
extern void __hardpps(const struct timespec *, const struct timespec *);
#endif /* _LINUX_NTP_INTERNAL_H */
diff --git a/kernel/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 3b8946416a5f..3b8946416a5f 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
diff --git a/kernel/posix-timers.c b/kernel/time/posix-timers.c
index 424c2d4265c9..42b463ad90f2 100644
--- a/kernel/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -49,6 +49,8 @@
#include <linux/export.h>
#include <linux/hashtable.h>
+#include "timekeeping.h"
+
/*
* Management arrays for POSIX timers. Timers are now kept in static hash table
* with 512 entries.
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 4d23dc4d8139..01d2d15aa662 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -49,13 +49,6 @@ static u64 notrace jiffy_sched_clock_read(void)
return (u64)(jiffies - INITIAL_JIFFIES);
}
-static u32 __read_mostly (*read_sched_clock_32)(void);
-
-static u64 notrace read_sched_clock_32_wrapper(void)
-{
- return read_sched_clock_32();
-}
-
static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
@@ -176,12 +169,6 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
pr_debug("Registered %pF as sched_clock source\n", read);
}
-void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
-{
- read_sched_clock_32 = read;
- sched_clock_register(read_sched_clock_32_wrapper, bits, rate);
-}
-
void __init sched_clock_postinit(void)
{
/*
@@ -204,7 +191,8 @@ void __init sched_clock_postinit(void)
static int sched_clock_suspend(void)
{
- sched_clock_poll(&sched_clock_timer);
+ update_sched_clock();
+ hrtimer_cancel(&sched_clock_timer);
cd.suspended = true;
return 0;
}
@@ -212,6 +200,7 @@ static int sched_clock_suspend(void)
static void sched_clock_resume(void)
{
cd.epoch_cyc = read_sched_clock();
+ hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
cd.suspended = false;
}
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
new file mode 100644
index 000000000000..eb682d5c697c
--- /dev/null
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -0,0 +1,106 @@
+/*
+ * linux/kernel/time/tick-broadcast-hrtimer.c
+ * This file emulates a local clock event device
+ * via a pseudo clock device.
+ */
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/profile.h>
+#include <linux/clockchips.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+
+#include "tick-internal.h"
+
+static struct hrtimer bctimer;
+
+static void bc_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *bc)
+{
+ switch (mode) {
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ /*
+ * Note, we cannot cancel the timer here as we might
+ * run into the following live lock scenario:
+ *
+ * cpu 0 cpu1
+ * lock(broadcast_lock);
+ * hrtimer_interrupt()
+ * bc_handler()
+ * tick_handle_oneshot_broadcast();
+ * lock(broadcast_lock);
+ * hrtimer_cancel()
+ * wait_for_callback()
+ */
+ hrtimer_try_to_cancel(&bctimer);
+ break;
+ default:
+ break;
+ }
+}
+
+/*
+ * This is called from the guts of the broadcast code when the cpu
+ * which is about to enter idle has the earliest broadcast timer event.
+ */
+static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
+{
+ /*
+ * We try to cancel the timer first. If the callback is on
+ * flight on some other cpu then we let it handle it. If we
+ * were able to cancel the timer nothing can rearm it as we
+ * own broadcast_lock.
+ *
+ * However we can also be called from the event handler of
+ * ce_broadcast_hrtimer itself when it expires. We cannot
+ * restart the timer because we are in the callback, but we
+ * can set the expiry time and let the callback return
+ * HRTIMER_RESTART.
+ */
+ if (hrtimer_try_to_cancel(&bctimer) >= 0) {
+ hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED);
+ /* Bind the "device" to the cpu */
+ bc->bound_on = smp_processor_id();
+ } else if (bc->bound_on == smp_processor_id()) {
+ hrtimer_set_expires(&bctimer, expires);
+ }
+ return 0;
+}
+
+static struct clock_event_device ce_broadcast_hrtimer = {
+ .set_mode = bc_set_mode,
+ .set_next_ktime = bc_set_next,
+ .features = CLOCK_EVT_FEAT_ONESHOT |
+ CLOCK_EVT_FEAT_KTIME |
+ CLOCK_EVT_FEAT_HRTIMER,
+ .rating = 0,
+ .bound_on = -1,
+ .min_delta_ns = 1,
+ .max_delta_ns = KTIME_MAX,
+ .min_delta_ticks = 1,
+ .max_delta_ticks = ULONG_MAX,
+ .mult = 1,
+ .shift = 0,
+ .cpumask = cpu_all_mask,
+};
+
+static enum hrtimer_restart bc_handler(struct hrtimer *t)
+{
+ ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer);
+
+ if (ce_broadcast_hrtimer.next_event.tv64 == KTIME_MAX)
+ return HRTIMER_NORESTART;
+
+ return HRTIMER_RESTART;
+}
+
+void tick_setup_hrtimer_broadcast(void)
+{
+ hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ bctimer.function = bc_handler;
+ clockevents_register_device(&ce_broadcast_hrtimer);
+}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 98977a57ac72..64c5990fd500 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -120,6 +120,19 @@ int tick_is_broadcast_device(struct clock_event_device *dev)
return (dev && tick_broadcast_device.evtdev == dev);
}
+int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq)
+{
+ int ret = -ENODEV;
+
+ if (tick_is_broadcast_device(dev)) {
+ raw_spin_lock(&tick_broadcast_lock);
+ ret = __clockevents_update_freq(dev, freq);
+ raw_spin_unlock(&tick_broadcast_lock);
+ }
+ return ret;
+}
+
+
static void err_broadcast(const struct cpumask *mask)
{
pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n");
@@ -272,12 +285,8 @@ static void tick_do_broadcast(struct cpumask *mask)
*/
static void tick_do_periodic_broadcast(void)
{
- raw_spin_lock(&tick_broadcast_lock);
-
cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask);
tick_do_broadcast(tmpmask);
-
- raw_spin_unlock(&tick_broadcast_lock);
}
/*
@@ -287,13 +296,15 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
{
ktime_t next;
+ raw_spin_lock(&tick_broadcast_lock);
+
tick_do_periodic_broadcast();
/*
* The device is in periodic mode. No reprogramming necessary:
*/
if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
- return;
+ goto unlock;
/*
* Setup the next period for devices, which do not have
@@ -306,9 +317,11 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
next = ktime_add(next, tick_period);
if (!clockevents_program_event(dev, next, false))
- return;
+ goto unlock;
tick_do_periodic_broadcast();
}
+unlock:
+ raw_spin_unlock(&tick_broadcast_lock);
}
/*
@@ -630,24 +643,61 @@ again:
raw_spin_unlock(&tick_broadcast_lock);
}
+static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu)
+{
+ if (!(bc->features & CLOCK_EVT_FEAT_HRTIMER))
+ return 0;
+ if (bc->next_event.tv64 == KTIME_MAX)
+ return 0;
+ return bc->bound_on == cpu ? -EBUSY : 0;
+}
+
+static void broadcast_shutdown_local(struct clock_event_device *bc,
+ struct clock_event_device *dev)
+{
+ /*
+ * For hrtimer based broadcasting we cannot shutdown the cpu
+ * local device if our own event is the first one to expire or
+ * if we own the broadcast timer.
+ */
+ if (bc->features & CLOCK_EVT_FEAT_HRTIMER) {
+ if (broadcast_needs_cpu(bc, smp_processor_id()))
+ return;
+ if (dev->next_event.tv64 < bc->next_event.tv64)
+ return;
+ }
+ clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+}
+
+static void broadcast_move_bc(int deadcpu)
+{
+ struct clock_event_device *bc = tick_broadcast_device.evtdev;
+
+ if (!bc || !broadcast_needs_cpu(bc, deadcpu))
+ return;
+ /* This moves the broadcast assignment to this cpu */
+ clockevents_program_event(bc, bc->next_event, 1);
+}
+
/*
* Powerstate information: The system enters/leaves a state, where
* affected devices might stop
+ * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups.
*/
-void tick_broadcast_oneshot_control(unsigned long reason)
+int tick_broadcast_oneshot_control(unsigned long reason)
{
struct clock_event_device *bc, *dev;
struct tick_device *td;
unsigned long flags;
ktime_t now;
- int cpu;
+ int cpu, ret = 0;
/*
* Periodic mode does not care about the enter/exit of power
* states
*/
if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
- return;
+ return 0;
/*
* We are called with preemtion disabled from the depth of the
@@ -658,7 +708,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
dev = td->evtdev;
if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
- return;
+ return 0;
bc = tick_broadcast_device.evtdev;
@@ -666,7 +716,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
- clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+ broadcast_shutdown_local(bc, dev);
/*
* We only reprogram the broadcast timer if we
* did not mark ourself in the force mask and
@@ -679,6 +729,16 @@ void tick_broadcast_oneshot_control(unsigned long reason)
dev->next_event.tv64 < bc->next_event.tv64)
tick_broadcast_set_event(bc, cpu, dev->next_event, 1);
}
+ /*
+ * If the current CPU owns the hrtimer broadcast
+ * mechanism, it cannot go deep idle and we remove the
+ * CPU from the broadcast mask. We don't have to go
+ * through the EXIT path as the local timer is not
+ * shutdown.
+ */
+ ret = broadcast_needs_cpu(bc, cpu);
+ if (ret)
+ cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
} else {
if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
@@ -746,6 +806,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
}
out:
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+ return ret;
}
/*
@@ -852,6 +913,8 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
+ broadcast_move_bc(cpu);
+
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 20b2fe37d105..0a0608edeb26 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -98,18 +98,19 @@ static void tick_periodic(int cpu)
void tick_handle_periodic(struct clock_event_device *dev)
{
int cpu = smp_processor_id();
- ktime_t next;
+ ktime_t next = dev->next_event;
tick_periodic(cpu);
if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
return;
- /*
- * Setup the next period for devices, which do not have
- * periodic mode:
- */
- next = ktime_add(dev->next_event, tick_period);
for (;;) {
+ /*
+ * Setup the next period for devices, which do not have
+ * periodic mode:
+ */
+ next = ktime_add(next, tick_period);
+
if (!clockevents_program_event(dev, next, false))
return;
/*
@@ -118,12 +119,11 @@ void tick_handle_periodic(struct clock_event_device *dev)
* to be sure we're using a real hardware clocksource.
* Otherwise we could get trapped in an infinite
* loop, as the tick_periodic() increments jiffies,
- * when then will increment time, posibly causing
+ * which then will increment time, possibly causing
* the loop to trigger again and again.
*/
if (timekeeping_valid_for_hres())
tick_periodic(cpu);
- next = ktime_add(next, tick_period);
}
}
@@ -276,7 +276,7 @@ static bool tick_check_preferred(struct clock_event_device *curdev,
bool tick_check_replacement(struct clock_event_device *curdev,
struct clock_event_device *newdev)
{
- if (tick_check_percpu(curdev, newdev, smp_processor_id()))
+ if (!tick_check_percpu(curdev, newdev, smp_processor_id()))
return false;
return tick_check_preferred(curdev, newdev);
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 8329669b51ec..c19c1d84b6f3 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -4,6 +4,8 @@
#include <linux/hrtimer.h>
#include <linux/tick.h>
+#include "timekeeping.h"
+
extern seqlock_t jiffies_lock;
#define CS_NAME_LEN 32
@@ -46,7 +48,7 @@ extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
extern void tick_resume_oneshot(void);
# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
-extern void tick_broadcast_oneshot_control(unsigned long reason);
+extern int tick_broadcast_oneshot_control(unsigned long reason);
extern void tick_broadcast_switch_to_oneshot(void);
extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
@@ -58,7 +60,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
{
BUG();
}
-static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
+static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; }
static inline void tick_broadcast_switch_to_oneshot(void) { }
static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
static inline int tick_broadcast_oneshot_active(void) { return 0; }
@@ -87,7 +89,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
{
BUG();
}
-static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
+static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; }
static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
{
@@ -111,6 +113,7 @@ extern int tick_resume_broadcast(void);
extern void tick_broadcast_init(void);
extern void
tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
+int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq);
#else /* !BROADCAST */
@@ -133,6 +136,8 @@ static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
static inline void tick_suspend_broadcast(void) { }
static inline int tick_resume_broadcast(void) { return 0; }
static inline void tick_broadcast_init(void) { }
+static inline int tick_broadcast_update_freq(struct clock_event_device *dev,
+ u32 freq) { return -ENODEV; }
/*
* Set the periodic handler in non broadcast mode
@@ -152,6 +157,8 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
}
+int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
+
#endif
extern void do_timer(unsigned long ticks);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 9f8af69c67ec..f654a8a298fa 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -84,6 +84,9 @@ static void tick_do_update_jiffies64(ktime_t now)
/* Keep the tick_next_period variable up to date */
tick_next_period = ktime_add(last_jiffies_update, tick_period);
+ } else {
+ write_sequnlock(&jiffies_lock);
+ return;
}
write_sequnlock(&jiffies_lock);
update_wall_time();
@@ -151,6 +154,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
+cpumask_var_t housekeeping_mask;
bool tick_nohz_full_running;
static bool can_stop_full_tick(void)
@@ -221,13 +225,29 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
};
/*
- * Kick the current CPU if it's full dynticks in order to force it to
+ * Kick this CPU if it's full dynticks in order to force it to
* re-evaluate its dependency on the tick and restart it if necessary.
+ * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
+ * is NMI safe.
*/
void tick_nohz_full_kick(void)
{
- if (tick_nohz_full_cpu(smp_processor_id()))
- irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
+ if (!tick_nohz_full_cpu(smp_processor_id()))
+ return;
+
+ irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
+}
+
+/*
+ * Kick the CPU if it's full dynticks in order to force it to
+ * re-evaluate its dependency on the tick and restart it if necessary.
+ */
+void tick_nohz_full_kick_cpu(int cpu)
+{
+ if (!tick_nohz_full_cpu(cpu))
+ return;
+
+ irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
}
static void nohz_full_kick_ipi(void *info)
@@ -278,6 +298,7 @@ static int __init tick_nohz_full_setup(char *str)
int cpu;
alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
+ alloc_bootmem_cpumask_var(&housekeeping_mask);
if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
return 1;
@@ -288,6 +309,8 @@ static int __init tick_nohz_full_setup(char *str)
pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
cpumask_clear_cpu(cpu, tick_nohz_full_mask);
}
+ cpumask_andnot(housekeeping_mask,
+ cpu_possible_mask, tick_nohz_full_mask);
tick_nohz_full_running = true;
return 1;
@@ -329,9 +352,15 @@ static int tick_nohz_init_all(void)
pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
return err;
}
+ if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
+ pr_err("NO_HZ: Can't allocate not-full dynticks cpumask\n");
+ return err;
+ }
err = 0;
cpumask_setall(tick_nohz_full_mask);
cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask);
+ cpumask_clear(housekeeping_mask);
+ cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
tick_nohz_full_running = true;
#endif
return err;
@@ -967,7 +996,7 @@ static void tick_nohz_switch_to_nohz(void)
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
ktime_t next;
- if (!tick_nohz_active)
+ if (!tick_nohz_enabled)
return;
local_irq_disable();
diff --git a/kernel/time.c b/kernel/time/time.c
index 7c7964c33ae7..a9ae20fb0b11 100644
--- a/kernel/time.c
+++ b/kernel/time/time.c
@@ -42,6 +42,7 @@
#include <asm/unistd.h>
#include "timeconst.h"
+#include "timekeeping.h"
/*
* The timezone where the local system is located. Used as a default by some
@@ -420,6 +421,68 @@ struct timeval ns_to_timeval(const s64 nsec)
}
EXPORT_SYMBOL(ns_to_timeval);
+#if BITS_PER_LONG == 32
+/**
+ * set_normalized_timespec - set timespec sec and nsec parts and normalize
+ *
+ * @ts: pointer to timespec variable to be set
+ * @sec: seconds to set
+ * @nsec: nanoseconds to set
+ *
+ * Set seconds and nanoseconds field of a timespec variable and
+ * normalize to the timespec storage format
+ *
+ * Note: The tv_nsec part is always in the range of
+ * 0 <= tv_nsec < NSEC_PER_SEC
+ * For negative values only the tv_sec field is negative !
+ */
+void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec)
+{
+ while (nsec >= NSEC_PER_SEC) {
+ /*
+ * The following asm() prevents the compiler from
+ * optimising this loop into a modulo operation. See
+ * also __iter_div_u64_rem() in include/linux/time.h
+ */
+ asm("" : "+rm"(nsec));
+ nsec -= NSEC_PER_SEC;
+ ++sec;
+ }
+ while (nsec < 0) {
+ asm("" : "+rm"(nsec));
+ nsec += NSEC_PER_SEC;
+ --sec;
+ }
+ ts->tv_sec = sec;
+ ts->tv_nsec = nsec;
+}
+EXPORT_SYMBOL(set_normalized_timespec64);
+
+/**
+ * ns_to_timespec64 - Convert nanoseconds to timespec64
+ * @nsec: the nanoseconds value to be converted
+ *
+ * Returns the timespec64 representation of the nsec parameter.
+ */
+struct timespec64 ns_to_timespec64(const s64 nsec)
+{
+ struct timespec64 ts;
+ s32 rem;
+
+ if (!nsec)
+ return (struct timespec64) {0, 0};
+
+ ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
+ if (unlikely(rem < 0)) {
+ ts.tv_sec--;
+ rem += NSEC_PER_SEC;
+ }
+ ts.tv_nsec = rem;
+
+ return ts;
+}
+EXPORT_SYMBOL(ns_to_timespec64);
+#endif
/*
* When we convert to jiffies then we interpret incoming values
* the following way:
@@ -496,17 +559,20 @@ EXPORT_SYMBOL(usecs_to_jiffies);
* that a remainder subtract here would not do the right thing as the
* resolution values don't fall on second boundries. I.e. the line:
* nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding.
+ * Note that due to the small error in the multiplier here, this
+ * rounding is incorrect for sufficiently large values of tv_nsec, but
+ * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're
+ * OK.
*
* Rather, we just shift the bits off the right.
*
* The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
* value to a scaled second value.
*/
-unsigned long
-timespec_to_jiffies(const struct timespec *value)
+static unsigned long
+__timespec_to_jiffies(unsigned long sec, long nsec)
{
- unsigned long sec = value->tv_sec;
- long nsec = value->tv_nsec + TICK_NSEC - 1;
+ nsec = nsec + TICK_NSEC - 1;
if (sec >= MAX_SEC_IN_JIFFIES){
sec = MAX_SEC_IN_JIFFIES;
@@ -517,6 +583,13 @@ timespec_to_jiffies(const struct timespec *value)
(NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
}
+
+unsigned long
+timespec_to_jiffies(const struct timespec *value)
+{
+ return __timespec_to_jiffies(value->tv_sec, value->tv_nsec);
+}
+
EXPORT_SYMBOL(timespec_to_jiffies);
void
@@ -533,31 +606,27 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
}
EXPORT_SYMBOL(jiffies_to_timespec);
-/* Same for "timeval"
+/*
+ * We could use a similar algorithm to timespec_to_jiffies (with a
+ * different multiplier for usec instead of nsec). But this has a
+ * problem with rounding: we can't exactly add TICK_NSEC - 1 to the
+ * usec value, since it's not necessarily integral.
*
- * Well, almost. The problem here is that the real system resolution is
- * in nanoseconds and the value being converted is in micro seconds.
- * Also for some machines (those that use HZ = 1024, in-particular),
- * there is a LARGE error in the tick size in microseconds.
-
- * The solution we use is to do the rounding AFTER we convert the
- * microsecond part. Thus the USEC_ROUND, the bits to be shifted off.
- * Instruction wise, this should cost only an additional add with carry
- * instruction above the way it was done above.
+ * We could instead round in the intermediate scaled representation
+ * (i.e. in units of 1/2^(large scale) jiffies) but that's also
+ * perilous: the scaling introduces a small positive error, which
+ * combined with a division-rounding-upward (i.e. adding 2^(scale) - 1
+ * units to the intermediate before shifting) leads to accidental
+ * overflow and overestimates.
+ *
+ * At the cost of one additional multiplication by a constant, just
+ * use the timespec implementation.
*/
unsigned long
timeval_to_jiffies(const struct timeval *value)
{
- unsigned long sec = value->tv_sec;
- long usec = value->tv_usec;
-
- if (sec >= MAX_SEC_IN_JIFFIES){
- sec = MAX_SEC_IN_JIFFIES;
- usec = 0;
- }
- return (((u64)sec * SEC_CONVERSION) +
- (((u64)usec * USEC_CONVERSION + USEC_ROUND) >>
- (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
+ return __timespec_to_jiffies(value->tv_sec,
+ value->tv_usec * NSEC_PER_USEC);
}
EXPORT_SYMBOL(timeval_to_jiffies);
@@ -694,6 +763,7 @@ unsigned long nsecs_to_jiffies(u64 n)
{
return (unsigned long)nsecs_to_jiffies64(n);
}
+EXPORT_SYMBOL_GPL(nsecs_to_jiffies);
/*
* Add two timespec values and do a safety check for overflow.
diff --git a/kernel/timeconst.bc b/kernel/time/timeconst.bc
index 511bdf2cafda..511bdf2cafda 100644
--- a/kernel/timeconst.bc
+++ b/kernel/time/timeconst.bc
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 0aa4ce81bc16..ec1791fae965 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -22,6 +22,7 @@
#include <linux/tick.h>
#include <linux/stop_machine.h>
#include <linux/pvclock_gtod.h>
+#include <linux/compiler.h>
#include "tick-internal.h"
#include "ntp_internal.h"
@@ -31,11 +32,34 @@
#define TK_MIRROR (1 << 1)
#define TK_CLOCK_WAS_SET (1 << 2)
-static struct timekeeper timekeeper;
+/*
+ * The most important data for readout fits into a single 64 byte
+ * cache line.
+ */
+static struct {
+ seqcount_t seq;
+ struct timekeeper timekeeper;
+} tk_core ____cacheline_aligned;
+
static DEFINE_RAW_SPINLOCK(timekeeper_lock);
-static seqcount_t timekeeper_seq;
static struct timekeeper shadow_timekeeper;
+/**
+ * struct tk_fast - NMI safe timekeeper
+ * @seq: Sequence counter for protecting updates. The lowest bit
+ * is the index for the tk_read_base array
+ * @base: tk_read_base array. Access is indexed by the lowest bit of
+ * @seq.
+ *
+ * See @update_fast_timekeeper() below.
+ */
+struct tk_fast {
+ seqcount_t seq;
+ struct tk_read_base base[2];
+};
+
+static struct tk_fast tk_fast_mono ____cacheline_aligned;
+
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
@@ -44,49 +68,54 @@ bool __read_mostly persistent_clock_exist = false;
static inline void tk_normalize_xtime(struct timekeeper *tk)
{
- while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) {
- tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift;
+ while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) {
+ tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift;
tk->xtime_sec++;
}
}
-static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts)
+static inline struct timespec64 tk_xtime(struct timekeeper *tk)
+{
+ struct timespec64 ts;
+
+ ts.tv_sec = tk->xtime_sec;
+ ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift);
+ return ts;
+}
+
+static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
{
tk->xtime_sec = ts->tv_sec;
- tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift;
+ tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift;
}
-static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts)
+static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
{
tk->xtime_sec += ts->tv_sec;
- tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift;
+ tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift;
tk_normalize_xtime(tk);
}
-static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm)
+static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
{
- struct timespec tmp;
+ struct timespec64 tmp;
/*
* Verify consistency of: offset_real = -wall_to_monotonic
* before modifying anything
*/
- set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec,
+ set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec,
-tk->wall_to_monotonic.tv_nsec);
- WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64);
+ WARN_ON_ONCE(tk->offs_real.tv64 != timespec64_to_ktime(tmp).tv64);
tk->wall_to_monotonic = wtm;
- set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
- tk->offs_real = timespec_to_ktime(tmp);
+ set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
+ tk->offs_real = timespec64_to_ktime(tmp);
tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0));
}
-static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
+static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
{
- /* Verify consistency before modifying */
- WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64);
-
- tk->total_sleep_time = t;
- tk->offs_boot = timespec_to_ktime(t);
+ tk->offs_boot = ktime_add(tk->offs_boot, delta);
}
/**
@@ -106,9 +135,11 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
u64 tmp, ntpinterval;
struct clocksource *old_clock;
- old_clock = tk->clock;
- tk->clock = clock;
- tk->cycle_last = clock->cycle_last = clock->read(clock);
+ old_clock = tk->tkr.clock;
+ tk->tkr.clock = clock;
+ tk->tkr.read = clock->read;
+ tk->tkr.mask = clock->mask;
+ tk->tkr.cycle_last = tk->tkr.read(clock);
/* Do the ns -> cycle conversion first, using original mult */
tmp = NTP_INTERVAL_LENGTH;
@@ -132,78 +163,213 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
if (old_clock) {
int shift_change = clock->shift - old_clock->shift;
if (shift_change < 0)
- tk->xtime_nsec >>= -shift_change;
+ tk->tkr.xtime_nsec >>= -shift_change;
else
- tk->xtime_nsec <<= shift_change;
+ tk->tkr.xtime_nsec <<= shift_change;
}
- tk->shift = clock->shift;
+ tk->tkr.shift = clock->shift;
tk->ntp_error = 0;
tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
+ tk->ntp_tick = ntpinterval << tk->ntp_error_shift;
/*
* The timekeeper keeps its own mult values for the currently
* active clocksource. These value will be adjusted via NTP
* to counteract clock drifting.
*/
- tk->mult = clock->mult;
+ tk->tkr.mult = clock->mult;
+ tk->ntp_err_mult = 0;
}
/* Timekeeper helper functions. */
#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
-u32 (*arch_gettimeoffset)(void);
-
-u32 get_arch_timeoffset(void)
-{
- if (likely(arch_gettimeoffset))
- return arch_gettimeoffset();
- return 0;
-}
+static u32 default_arch_gettimeoffset(void) { return 0; }
+u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset;
#else
-static inline u32 get_arch_timeoffset(void) { return 0; }
+static inline u32 arch_gettimeoffset(void) { return 0; }
#endif
-static inline s64 timekeeping_get_ns(struct timekeeper *tk)
+static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
{
- cycle_t cycle_now, cycle_delta;
- struct clocksource *clock;
+ cycle_t cycle_now, delta;
s64 nsec;
/* read clocksource: */
- clock = tk->clock;
- cycle_now = clock->read(clock);
+ cycle_now = tkr->read(tkr->clock);
/* calculate the delta since the last update_wall_time: */
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+ delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
- nsec = cycle_delta * tk->mult + tk->xtime_nsec;
- nsec >>= tk->shift;
+ nsec = delta * tkr->mult + tkr->xtime_nsec;
+ nsec >>= tkr->shift;
/* If arch requires, add in get_arch_timeoffset() */
- return nsec + get_arch_timeoffset();
+ return nsec + arch_gettimeoffset();
}
static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
{
- cycle_t cycle_now, cycle_delta;
- struct clocksource *clock;
+ struct clocksource *clock = tk->tkr.clock;
+ cycle_t cycle_now, delta;
s64 nsec;
/* read clocksource: */
- clock = tk->clock;
- cycle_now = clock->read(clock);
+ cycle_now = tk->tkr.read(clock);
/* calculate the delta since the last update_wall_time: */
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+ delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask);
/* convert delta to nanoseconds. */
- nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
+ nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift);
/* If arch requires, add in get_arch_timeoffset() */
- return nsec + get_arch_timeoffset();
+ return nsec + arch_gettimeoffset();
+}
+
+/**
+ * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
+ * @tk: The timekeeper from which we take the update
+ * @tkf: The fast timekeeper to update
+ * @tbase: The time base for the fast timekeeper (mono/raw)
+ *
+ * We want to use this from any context including NMI and tracing /
+ * instrumenting the timekeeping code itself.
+ *
+ * So we handle this differently than the other timekeeping accessor
+ * functions which retry when the sequence count has changed. The
+ * update side does:
+ *
+ * smp_wmb(); <- Ensure that the last base[1] update is visible
+ * tkf->seq++;
+ * smp_wmb(); <- Ensure that the seqcount update is visible
+ * update(tkf->base[0], tk);
+ * smp_wmb(); <- Ensure that the base[0] update is visible
+ * tkf->seq++;
+ * smp_wmb(); <- Ensure that the seqcount update is visible
+ * update(tkf->base[1], tk);
+ *
+ * The reader side does:
+ *
+ * do {
+ * seq = tkf->seq;
+ * smp_rmb();
+ * idx = seq & 0x01;
+ * now = now(tkf->base[idx]);
+ * smp_rmb();
+ * } while (seq != tkf->seq)
+ *
+ * As long as we update base[0] readers are forced off to
+ * base[1]. Once base[0] is updated readers are redirected to base[0]
+ * and the base[1] update takes place.
+ *
+ * So if a NMI hits the update of base[0] then it will use base[1]
+ * which is still consistent. In the worst case this can result is a
+ * slightly wrong timestamp (a few nanoseconds). See
+ * @ktime_get_mono_fast_ns.
+ */
+static void update_fast_timekeeper(struct timekeeper *tk)
+{
+ struct tk_read_base *base = tk_fast_mono.base;
+
+ /* Force readers off to base[1] */
+ raw_write_seqcount_latch(&tk_fast_mono.seq);
+
+ /* Update base[0] */
+ memcpy(base, &tk->tkr, sizeof(*base));
+
+ /* Force readers back to base[0] */
+ raw_write_seqcount_latch(&tk_fast_mono.seq);
+
+ /* Update base[1] */
+ memcpy(base + 1, base, sizeof(*base));
}
+/**
+ * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
+ *
+ * This timestamp is not guaranteed to be monotonic across an update.
+ * The timestamp is calculated by:
+ *
+ * now = base_mono + clock_delta * slope
+ *
+ * So if the update lowers the slope, readers who are forced to the
+ * not yet updated second array are still using the old steeper slope.
+ *
+ * tmono
+ * ^
+ * | o n
+ * | o n
+ * | u
+ * | o
+ * |o
+ * |12345678---> reader order
+ *
+ * o = old slope
+ * u = update
+ * n = new slope
+ *
+ * So reader 6 will observe time going backwards versus reader 5.
+ *
+ * While other CPUs are likely to be able observe that, the only way
+ * for a CPU local observation is when an NMI hits in the middle of
+ * the update. Timestamps taken from that NMI context might be ahead
+ * of the following timestamps. Callers need to be aware of that and
+ * deal with it.
+ */
+u64 notrace ktime_get_mono_fast_ns(void)
+{
+ struct tk_read_base *tkr;
+ unsigned int seq;
+ u64 now;
+
+ do {
+ seq = raw_read_seqcount(&tk_fast_mono.seq);
+ tkr = tk_fast_mono.base + (seq & 0x01);
+ now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr);
+
+ } while (read_seqcount_retry(&tk_fast_mono.seq, seq));
+ return now;
+}
+EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
+
+#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
+
+static inline void update_vsyscall(struct timekeeper *tk)
+{
+ struct timespec xt, wm;
+
+ xt = timespec64_to_timespec(tk_xtime(tk));
+ wm = timespec64_to_timespec(tk->wall_to_monotonic);
+ update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult,
+ tk->tkr.cycle_last);
+}
+
+static inline void old_vsyscall_fixup(struct timekeeper *tk)
+{
+ s64 remainder;
+
+ /*
+ * Store only full nanoseconds into xtime_nsec after rounding
+ * it up and add the remainder to the error difference.
+ * XXX - This is necessary to avoid small 1ns inconsistnecies caused
+ * by truncating the remainder in vsyscalls. However, it causes
+ * additional work to be done in timekeeping_adjust(). Once
+ * the vsyscall implementations are converted to use xtime_nsec
+ * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
+ * users are removed, this can be killed.
+ */
+ remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1);
+ tk->tkr.xtime_nsec -= remainder;
+ tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift;
+ tk->ntp_error += remainder << tk->ntp_error_shift;
+ tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift;
+}
+#else
+#define old_vsyscall_fixup(tk)
+#endif
+
static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
@@ -216,7 +382,7 @@ static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
*/
int pvclock_gtod_register_notifier(struct notifier_block *nb)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
unsigned long flags;
int ret;
@@ -246,6 +412,29 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
+/*
+ * Update the ktime_t based scalar nsec members of the timekeeper
+ */
+static inline void tk_update_ktime_data(struct timekeeper *tk)
+{
+ s64 nsec;
+
+ /*
+ * The xtime based monotonic readout is:
+ * nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now();
+ * The ktime based monotonic readout is:
+ * nsec = base_mono + now();
+ * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec
+ */
+ nsec = (s64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
+ nsec *= NSEC_PER_SEC;
+ nsec += tk->wall_to_monotonic.tv_nsec;
+ tk->tkr.base_mono = ns_to_ktime(nsec);
+
+ /* Update the monotonic raw base */
+ tk->base_raw = timespec64_to_ktime(tk->raw_time);
+}
+
/* must hold timekeeper_lock */
static void timekeeping_update(struct timekeeper *tk, unsigned int action)
{
@@ -253,11 +442,17 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
tk->ntp_error = 0;
ntp_clear();
}
+
+ tk_update_ktime_data(tk);
+
update_vsyscall(tk);
update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
if (action & TK_MIRROR)
- memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
+ memcpy(&shadow_timekeeper, &tk_core.timekeeper,
+ sizeof(tk_core.timekeeper));
+
+ update_fast_timekeeper(tk);
}
/**
@@ -269,49 +464,48 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
*/
static void timekeeping_forward_now(struct timekeeper *tk)
{
- cycle_t cycle_now, cycle_delta;
- struct clocksource *clock;
+ struct clocksource *clock = tk->tkr.clock;
+ cycle_t cycle_now, delta;
s64 nsec;
- clock = tk->clock;
- cycle_now = clock->read(clock);
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
- tk->cycle_last = clock->cycle_last = cycle_now;
+ cycle_now = tk->tkr.read(clock);
+ delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask);
+ tk->tkr.cycle_last = cycle_now;
- tk->xtime_nsec += cycle_delta * tk->mult;
+ tk->tkr.xtime_nsec += delta * tk->tkr.mult;
/* If arch requires, add in get_arch_timeoffset() */
- tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift;
+ tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift;
tk_normalize_xtime(tk);
- nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
- timespec_add_ns(&tk->raw_time, nsec);
+ nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift);
+ timespec64_add_ns(&tk->raw_time, nsec);
}
/**
- * __getnstimeofday - Returns the time of day in a timespec.
+ * __getnstimeofday64 - Returns the time of day in a timespec64.
* @ts: pointer to the timespec to be set
*
* Updates the time of day in the timespec.
* Returns 0 on success, or -ve when suspended (timespec will be undefined).
*/
-int __getnstimeofday(struct timespec *ts)
+int __getnstimeofday64(struct timespec64 *ts)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
unsigned long seq;
s64 nsecs = 0;
do {
- seq = read_seqcount_begin(&timekeeper_seq);
+ seq = read_seqcount_begin(&tk_core.seq);
ts->tv_sec = tk->xtime_sec;
- nsecs = timekeeping_get_ns(tk);
+ nsecs = timekeeping_get_ns(&tk->tkr);
- } while (read_seqcount_retry(&timekeeper_seq, seq));
+ } while (read_seqcount_retry(&tk_core.seq, seq));
ts->tv_nsec = 0;
- timespec_add_ns(ts, nsecs);
+ timespec64_add_ns(ts, nsecs);
/*
* Do not bail out early, in case there were callers still using
@@ -321,116 +515,138 @@ int __getnstimeofday(struct timespec *ts)
return -EAGAIN;
return 0;
}
-EXPORT_SYMBOL(__getnstimeofday);
+EXPORT_SYMBOL(__getnstimeofday64);
/**
- * getnstimeofday - Returns the time of day in a timespec.
+ * getnstimeofday64 - Returns the time of day in a timespec64.
* @ts: pointer to the timespec to be set
*
* Returns the time of day in a timespec (WARN if suspended).
*/
-void getnstimeofday(struct timespec *ts)
+void getnstimeofday64(struct timespec64 *ts)
{
- WARN_ON(__getnstimeofday(ts));
+ WARN_ON(__getnstimeofday64(ts));
}
-EXPORT_SYMBOL(getnstimeofday);
+EXPORT_SYMBOL(getnstimeofday64);
ktime_t ktime_get(void)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
- s64 secs, nsecs;
+ ktime_t base;
+ s64 nsecs;
WARN_ON(timekeeping_suspended);
do {
- seq = read_seqcount_begin(&timekeeper_seq);
- secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
- nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec;
+ seq = read_seqcount_begin(&tk_core.seq);
+ base = tk->tkr.base_mono;
+ nsecs = timekeeping_get_ns(&tk->tkr);
- } while (read_seqcount_retry(&timekeeper_seq, seq));
- /*
- * Use ktime_set/ktime_add_ns to create a proper ktime on
- * 32-bit architectures without CONFIG_KTIME_SCALAR.
- */
- return ktime_add_ns(ktime_set(secs, 0), nsecs);
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return ktime_add_ns(base, nsecs);
}
EXPORT_SYMBOL_GPL(ktime_get);
-/**
- * ktime_get_ts - get the monotonic clock in timespec format
- * @ts: pointer to timespec variable
- *
- * The function calculates the monotonic clock from the realtime
- * clock and the wall_to_monotonic offset and stores the result
- * in normalized timespec format in the variable pointed to by @ts.
- */
-void ktime_get_ts(struct timespec *ts)
+static ktime_t *offsets[TK_OFFS_MAX] = {
+ [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real,
+ [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot,
+ [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai,
+};
+
+ktime_t ktime_get_with_offset(enum tk_offsets offs)
{
- struct timekeeper *tk = &timekeeper;
- struct timespec tomono;
- s64 nsec;
+ struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
+ ktime_t base, *offset = offsets[offs];
+ s64 nsecs;
WARN_ON(timekeeping_suspended);
do {
- seq = read_seqcount_begin(&timekeeper_seq);
- ts->tv_sec = tk->xtime_sec;
- nsec = timekeeping_get_ns(tk);
- tomono = tk->wall_to_monotonic;
+ seq = read_seqcount_begin(&tk_core.seq);
+ base = ktime_add(tk->tkr.base_mono, *offset);
+ nsecs = timekeeping_get_ns(&tk->tkr);
- } while (read_seqcount_retry(&timekeeper_seq, seq));
+ } while (read_seqcount_retry(&tk_core.seq, seq));
- ts->tv_sec += tomono.tv_sec;
- ts->tv_nsec = 0;
- timespec_add_ns(ts, nsec + tomono.tv_nsec);
-}
-EXPORT_SYMBOL_GPL(ktime_get_ts);
+ return ktime_add_ns(base, nsecs);
+}
+EXPORT_SYMBOL_GPL(ktime_get_with_offset);
/**
- * timekeeping_clocktai - Returns the TAI time of day in a timespec
- * @ts: pointer to the timespec to be set
- *
- * Returns the time of day in a timespec.
+ * ktime_mono_to_any() - convert mononotic time to any other time
+ * @tmono: time to convert.
+ * @offs: which offset to use
*/
-void timekeeping_clocktai(struct timespec *ts)
+ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
{
- struct timekeeper *tk = &timekeeper;
+ ktime_t *offset = offsets[offs];
unsigned long seq;
- u64 nsecs;
-
- WARN_ON(timekeeping_suspended);
+ ktime_t tconv;
do {
- seq = read_seqcount_begin(&timekeeper_seq);
+ seq = read_seqcount_begin(&tk_core.seq);
+ tconv = ktime_add(tmono, *offset);
+ } while (read_seqcount_retry(&tk_core.seq, seq));
- ts->tv_sec = tk->xtime_sec + tk->tai_offset;
- nsecs = timekeeping_get_ns(tk);
+ return tconv;
+}
+EXPORT_SYMBOL_GPL(ktime_mono_to_any);
- } while (read_seqcount_retry(&timekeeper_seq, seq));
+/**
+ * ktime_get_raw - Returns the raw monotonic time in ktime_t format
+ */
+ktime_t ktime_get_raw(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned int seq;
+ ktime_t base;
+ s64 nsecs;
- ts->tv_nsec = 0;
- timespec_add_ns(ts, nsecs);
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ base = tk->base_raw;
+ nsecs = timekeeping_get_ns_raw(tk);
-}
-EXPORT_SYMBOL(timekeeping_clocktai);
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+ return ktime_add_ns(base, nsecs);
+}
+EXPORT_SYMBOL_GPL(ktime_get_raw);
/**
- * ktime_get_clocktai - Returns the TAI time of day in a ktime
+ * ktime_get_ts64 - get the monotonic clock in timespec64 format
+ * @ts: pointer to timespec variable
*
- * Returns the time of day in a ktime.
+ * The function calculates the monotonic clock from the realtime
+ * clock and the wall_to_monotonic offset and stores the result
+ * in normalized timespec format in the variable pointed to by @ts.
*/
-ktime_t ktime_get_clocktai(void)
+void ktime_get_ts64(struct timespec64 *ts)
{
- struct timespec ts;
+ struct timekeeper *tk = &tk_core.timekeeper;
+ struct timespec64 tomono;
+ s64 nsec;
+ unsigned int seq;
- timekeeping_clocktai(&ts);
- return timespec_to_ktime(ts);
+ WARN_ON(timekeeping_suspended);
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ ts->tv_sec = tk->xtime_sec;
+ nsec = timekeeping_get_ns(&tk->tkr);
+ tomono = tk->wall_to_monotonic;
+
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ ts->tv_sec += tomono.tv_sec;
+ ts->tv_nsec = 0;
+ timespec64_add_ns(ts, nsec + tomono.tv_nsec);
}
-EXPORT_SYMBOL(ktime_get_clocktai);
+EXPORT_SYMBOL_GPL(ktime_get_ts64);
#ifdef CONFIG_NTP_PPS
@@ -445,23 +661,23 @@ EXPORT_SYMBOL(ktime_get_clocktai);
*/
void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
unsigned long seq;
s64 nsecs_raw, nsecs_real;
WARN_ON_ONCE(timekeeping_suspended);
do {
- seq = read_seqcount_begin(&timekeeper_seq);
+ seq = read_seqcount_begin(&tk_core.seq);
- *ts_raw = tk->raw_time;
+ *ts_raw = timespec64_to_timespec(tk->raw_time);
ts_real->tv_sec = tk->xtime_sec;
ts_real->tv_nsec = 0;
nsecs_raw = timekeeping_get_ns_raw(tk);
- nsecs_real = timekeeping_get_ns(tk);
+ nsecs_real = timekeeping_get_ns(&tk->tkr);
- } while (read_seqcount_retry(&timekeeper_seq, seq));
+ } while (read_seqcount_retry(&tk_core.seq, seq));
timespec_add_ns(ts_raw, nsecs_raw);
timespec_add_ns(ts_real, nsecs_real);
@@ -478,9 +694,9 @@ EXPORT_SYMBOL(getnstime_raw_and_real);
*/
void do_gettimeofday(struct timeval *tv)
{
- struct timespec now;
+ struct timespec64 now;
- getnstimeofday(&now);
+ getnstimeofday64(&now);
tv->tv_sec = now.tv_sec;
tv->tv_usec = now.tv_nsec/1000;
}
@@ -494,15 +710,15 @@ EXPORT_SYMBOL(do_gettimeofday);
*/
int do_settimeofday(const struct timespec *tv)
{
- struct timekeeper *tk = &timekeeper;
- struct timespec ts_delta, xt;
+ struct timekeeper *tk = &tk_core.timekeeper;
+ struct timespec64 ts_delta, xt, tmp;
unsigned long flags;
if (!timespec_valid_strict(tv))
return -EINVAL;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&timekeeper_seq);
+ write_seqcount_begin(&tk_core.seq);
timekeeping_forward_now(tk);
@@ -510,13 +726,14 @@ int do_settimeofday(const struct timespec *tv)
ts_delta.tv_sec = tv->tv_sec - xt.tv_sec;
ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec;
- tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta));
+ tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta));
- tk_set_xtime(tk, tv);
+ tmp = timespec_to_timespec64(*tv);
+ tk_set_xtime(tk, &tmp);
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
- write_seqcount_end(&timekeeper_seq);
+ write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
/* signal hrtimers about time change */
@@ -534,33 +751,35 @@ EXPORT_SYMBOL(do_settimeofday);
*/
int timekeeping_inject_offset(struct timespec *ts)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
unsigned long flags;
- struct timespec tmp;
+ struct timespec64 ts64, tmp;
int ret = 0;
if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
return -EINVAL;
+ ts64 = timespec_to_timespec64(*ts);
+
raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&timekeeper_seq);
+ write_seqcount_begin(&tk_core.seq);
timekeeping_forward_now(tk);
/* Make sure the proposed value is valid */
- tmp = timespec_add(tk_xtime(tk), *ts);
- if (!timespec_valid_strict(&tmp)) {
+ tmp = timespec64_add(tk_xtime(tk), ts64);
+ if (!timespec64_valid_strict(&tmp)) {
ret = -EINVAL;
goto error;
}
- tk_xtime_add(tk, ts);
- tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts));
+ tk_xtime_add(tk, &ts64);
+ tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts64));
error: /* even if we error out, we forwarded the time, so call update */
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
- write_seqcount_end(&timekeeper_seq);
+ write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
/* signal hrtimers about time change */
@@ -577,14 +796,14 @@ EXPORT_SYMBOL(timekeeping_inject_offset);
*/
s32 timekeeping_get_tai_offset(void)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
s32 ret;
do {
- seq = read_seqcount_begin(&timekeeper_seq);
+ seq = read_seqcount_begin(&tk_core.seq);
ret = tk->tai_offset;
- } while (read_seqcount_retry(&timekeeper_seq, seq));
+ } while (read_seqcount_retry(&tk_core.seq, seq));
return ret;
}
@@ -605,14 +824,14 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
*/
void timekeeping_set_tai_offset(s32 tai_offset)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
unsigned long flags;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&timekeeper_seq);
+ write_seqcount_begin(&tk_core.seq);
__timekeeping_set_tai_offset(tk, tai_offset);
timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
- write_seqcount_end(&timekeeper_seq);
+ write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
clock_was_set();
}
@@ -624,14 +843,14 @@ void timekeeping_set_tai_offset(s32 tai_offset)
*/
static int change_clocksource(void *data)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
struct clocksource *new, *old;
unsigned long flags;
new = (struct clocksource *) data;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&timekeeper_seq);
+ write_seqcount_begin(&tk_core.seq);
timekeeping_forward_now(tk);
/*
@@ -640,7 +859,7 @@ static int change_clocksource(void *data)
*/
if (try_module_get(new->owner)) {
if (!new->enable || new->enable(new) == 0) {
- old = tk->clock;
+ old = tk->tkr.clock;
tk_setup_internals(tk, new);
if (old->disable)
old->disable(old);
@@ -651,7 +870,7 @@ static int change_clocksource(void *data)
}
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
- write_seqcount_end(&timekeeper_seq);
+ write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
return 0;
@@ -666,29 +885,14 @@ static int change_clocksource(void *data)
*/
int timekeeping_notify(struct clocksource *clock)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
- if (tk->clock == clock)
+ if (tk->tkr.clock == clock)
return 0;
stop_machine(change_clocksource, clock, NULL);
tick_clock_notify();
- return tk->clock == clock ? 0 : -1;
-}
-
-/**
- * ktime_get_real - get the real (wall-) time in ktime_t format
- *
- * returns the time in ktime_t format
- */
-ktime_t ktime_get_real(void)
-{
- struct timespec now;
-
- getnstimeofday(&now);
-
- return timespec_to_ktime(now);
+ return tk->tkr.clock == clock ? 0 : -1;
}
-EXPORT_SYMBOL_GPL(ktime_get_real);
/**
* getrawmonotonic - Returns the raw monotonic time in a timespec
@@ -698,18 +902,20 @@ EXPORT_SYMBOL_GPL(ktime_get_real);
*/
void getrawmonotonic(struct timespec *ts)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
+ struct timespec64 ts64;
unsigned long seq;
s64 nsecs;
do {
- seq = read_seqcount_begin(&timekeeper_seq);
+ seq = read_seqcount_begin(&tk_core.seq);
nsecs = timekeeping_get_ns_raw(tk);
- *ts = tk->raw_time;
+ ts64 = tk->raw_time;
- } while (read_seqcount_retry(&timekeeper_seq, seq));
+ } while (read_seqcount_retry(&tk_core.seq, seq));
- timespec_add_ns(ts, nsecs);
+ timespec64_add_ns(&ts64, nsecs);
+ *ts = timespec64_to_timespec(ts64);
}
EXPORT_SYMBOL(getrawmonotonic);
@@ -718,16 +924,16 @@ EXPORT_SYMBOL(getrawmonotonic);
*/
int timekeeping_valid_for_hres(void)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
unsigned long seq;
int ret;
do {
- seq = read_seqcount_begin(&timekeeper_seq);
+ seq = read_seqcount_begin(&tk_core.seq);
- ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
+ ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
- } while (read_seqcount_retry(&timekeeper_seq, seq));
+ } while (read_seqcount_retry(&tk_core.seq, seq));
return ret;
}
@@ -737,16 +943,16 @@ int timekeeping_valid_for_hres(void)
*/
u64 timekeeping_max_deferment(void)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
unsigned long seq;
u64 ret;
do {
- seq = read_seqcount_begin(&timekeeper_seq);
+ seq = read_seqcount_begin(&tk_core.seq);
- ret = tk->clock->max_idle_ns;
+ ret = tk->tkr.clock->max_idle_ns;
- } while (read_seqcount_retry(&timekeeper_seq, seq));
+ } while (read_seqcount_retry(&tk_core.seq, seq));
return ret;
}
@@ -760,7 +966,7 @@ u64 timekeeping_max_deferment(void)
*
* XXX - Do be sure to remove it once all arches implement it.
*/
-void __attribute__((weak)) read_persistent_clock(struct timespec *ts)
+void __weak read_persistent_clock(struct timespec *ts)
{
ts->tv_sec = 0;
ts->tv_nsec = 0;
@@ -775,7 +981,7 @@ void __attribute__((weak)) read_persistent_clock(struct timespec *ts)
*
* XXX - Do be sure to remove it once all arches implement it.
*/
-void __attribute__((weak)) read_boot_clock(struct timespec *ts)
+void __weak read_boot_clock(struct timespec *ts)
{
ts->tv_sec = 0;
ts->tv_nsec = 0;
@@ -786,14 +992,15 @@ void __attribute__((weak)) read_boot_clock(struct timespec *ts)
*/
void __init timekeeping_init(void)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
struct clocksource *clock;
unsigned long flags;
- struct timespec now, boot, tmp;
-
- read_persistent_clock(&now);
+ struct timespec64 now, boot, tmp;
+ struct timespec ts;
- if (!timespec_valid_strict(&now)) {
+ read_persistent_clock(&ts);
+ now = timespec_to_timespec64(ts);
+ if (!timespec64_valid_strict(&now)) {
pr_warn("WARNING: Persistent clock returned invalid value!\n"
" Check your CMOS/BIOS settings.\n");
now.tv_sec = 0;
@@ -801,8 +1008,9 @@ void __init timekeeping_init(void)
} else if (now.tv_sec || now.tv_nsec)
persistent_clock_exist = true;
- read_boot_clock(&boot);
- if (!timespec_valid_strict(&boot)) {
+ read_boot_clock(&ts);
+ boot = timespec_to_timespec64(ts);
+ if (!timespec64_valid_strict(&boot)) {
pr_warn("WARNING: Boot clock returned invalid value!\n"
" Check your CMOS/BIOS settings.\n");
boot.tv_sec = 0;
@@ -810,7 +1018,7 @@ void __init timekeeping_init(void)
}
raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&timekeeper_seq);
+ write_seqcount_begin(&tk_core.seq);
ntp_init();
clock = clocksource_default_clock();
@@ -821,24 +1029,21 @@ void __init timekeeping_init(void)
tk_set_xtime(tk, &now);
tk->raw_time.tv_sec = 0;
tk->raw_time.tv_nsec = 0;
+ tk->base_raw.tv64 = 0;
if (boot.tv_sec == 0 && boot.tv_nsec == 0)
boot = tk_xtime(tk);
- set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec);
+ set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec);
tk_set_wall_to_mono(tk, tmp);
- tmp.tv_sec = 0;
- tmp.tv_nsec = 0;
- tk_set_sleep_time(tk, tmp);
-
- memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
+ timekeeping_update(tk, TK_MIRROR);
- write_seqcount_end(&timekeeper_seq);
+ write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}
/* time in seconds when suspend began */
-static struct timespec timekeeping_suspend_time;
+static struct timespec64 timekeeping_suspend_time;
/**
* __timekeeping_inject_sleeptime - Internal function to add sleep interval
@@ -848,16 +1053,17 @@ static struct timespec timekeeping_suspend_time;
* adds the sleep offset to the timekeeping variables.
*/
static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
- struct timespec *delta)
+ struct timespec64 *delta)
{
- if (!timespec_valid_strict(delta)) {
- printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid "
- "sleep delta value!\n");
+ if (!timespec64_valid_strict(delta)) {
+ printk_deferred(KERN_WARNING
+ "__timekeeping_inject_sleeptime: Invalid "
+ "sleep delta value!\n");
return;
}
tk_xtime_add(tk, delta);
- tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta));
- tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta));
+ tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta));
+ tk_update_sleep_time(tk, timespec64_to_ktime(*delta));
tk_debug_account_sleep_time(delta);
}
@@ -873,7 +1079,8 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
*/
void timekeeping_inject_sleeptime(struct timespec *delta)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
+ struct timespec64 tmp;
unsigned long flags;
/*
@@ -884,15 +1091,16 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
return;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&timekeeper_seq);
+ write_seqcount_begin(&tk_core.seq);
timekeeping_forward_now(tk);
- __timekeeping_inject_sleeptime(tk, delta);
+ tmp = timespec_to_timespec64(*delta);
+ __timekeeping_inject_sleeptime(tk, &tmp);
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
- write_seqcount_end(&timekeeper_seq);
+ write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
/* signal hrtimers about time change */
@@ -908,20 +1116,22 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
*/
static void timekeeping_resume(void)
{
- struct timekeeper *tk = &timekeeper;
- struct clocksource *clock = tk->clock;
+ struct timekeeper *tk = &tk_core.timekeeper;
+ struct clocksource *clock = tk->tkr.clock;
unsigned long flags;
- struct timespec ts_new, ts_delta;
+ struct timespec64 ts_new, ts_delta;
+ struct timespec tmp;
cycle_t cycle_now, cycle_delta;
bool suspendtime_found = false;
- read_persistent_clock(&ts_new);
+ read_persistent_clock(&tmp);
+ ts_new = timespec_to_timespec64(tmp);
clockevents_resume();
clocksource_resume();
raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&timekeeper_seq);
+ write_seqcount_begin(&tk_core.seq);
/*
* After system resumes, we need to calculate the suspended time and
@@ -935,15 +1145,16 @@ static void timekeeping_resume(void)
* The less preferred source will only be tried if there is no better
* usable source. The rtc part is handled separately in rtc core code.
*/
- cycle_now = clock->read(clock);
+ cycle_now = tk->tkr.read(clock);
if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
- cycle_now > clock->cycle_last) {
+ cycle_now > tk->tkr.cycle_last) {
u64 num, max = ULLONG_MAX;
u32 mult = clock->mult;
u32 shift = clock->shift;
s64 nsec = 0;
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+ cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last,
+ tk->tkr.mask);
/*
* "cycle_delta * mutl" may cause 64 bits overflow, if the
@@ -958,10 +1169,10 @@ static void timekeeping_resume(void)
}
nsec += ((u64) cycle_delta * mult) >> shift;
- ts_delta = ns_to_timespec(nsec);
+ ts_delta = ns_to_timespec64(nsec);
suspendtime_found = true;
- } else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) {
- ts_delta = timespec_sub(ts_new, timekeeping_suspend_time);
+ } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
+ ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time);
suspendtime_found = true;
}
@@ -969,11 +1180,11 @@ static void timekeeping_resume(void)
__timekeeping_inject_sleeptime(tk, &ts_delta);
/* Re-base the last cycle value */
- tk->cycle_last = clock->cycle_last = cycle_now;
+ tk->tkr.cycle_last = cycle_now;
tk->ntp_error = 0;
timekeeping_suspended = 0;
timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
- write_seqcount_end(&timekeeper_seq);
+ write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
touch_softlockup_watchdog();
@@ -986,12 +1197,14 @@ static void timekeeping_resume(void)
static int timekeeping_suspend(void)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
unsigned long flags;
- struct timespec delta, delta_delta;
- static struct timespec old_delta;
+ struct timespec64 delta, delta_delta;
+ static struct timespec64 old_delta;
+ struct timespec tmp;
- read_persistent_clock(&timekeeping_suspend_time);
+ read_persistent_clock(&tmp);
+ timekeeping_suspend_time = timespec_to_timespec64(tmp);
/*
* On some systems the persistent_clock can not be detected at
@@ -1002,7 +1215,7 @@ static int timekeeping_suspend(void)
persistent_clock_exist = true;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&timekeeper_seq);
+ write_seqcount_begin(&tk_core.seq);
timekeeping_forward_now(tk);
timekeeping_suspended = 1;
@@ -1012,8 +1225,8 @@ static int timekeeping_suspend(void)
* try to compensate so the difference in system time
* and persistent_clock time stays close to constant.
*/
- delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time);
- delta_delta = timespec_sub(delta, old_delta);
+ delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time);
+ delta_delta = timespec64_sub(delta, old_delta);
if (abs(delta_delta.tv_sec) >= 2) {
/*
* if delta_delta is too large, assume time correction
@@ -1023,11 +1236,11 @@ static int timekeeping_suspend(void)
} else {
/* Otherwise try to adjust old_system to compensate */
timekeeping_suspend_time =
- timespec_add(timekeeping_suspend_time, delta_delta);
+ timespec64_add(timekeeping_suspend_time, delta_delta);
}
timekeeping_update(tk, TK_MIRROR);
- write_seqcount_end(&timekeeper_seq);
+ write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
@@ -1048,125 +1261,34 @@ static int __init timekeeping_init_ops(void)
register_syscore_ops(&timekeeping_syscore_ops);
return 0;
}
-
device_initcall(timekeeping_init_ops);
/*
- * If the error is already larger, we look ahead even further
- * to compensate for late or lost adjustments.
+ * Apply a multiplier adjustment to the timekeeper
*/
-static __always_inline int timekeeping_bigadjust(struct timekeeper *tk,
- s64 error, s64 *interval,
- s64 *offset)
+static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
+ s64 offset,
+ bool negative,
+ int adj_scale)
{
- s64 tick_error, i;
- u32 look_ahead, adj;
- s32 error2, mult;
-
- /*
- * Use the current error value to determine how much to look ahead.
- * The larger the error the slower we adjust for it to avoid problems
- * with losing too many ticks, otherwise we would overadjust and
- * produce an even larger error. The smaller the adjustment the
- * faster we try to adjust for it, as lost ticks can do less harm
- * here. This is tuned so that an error of about 1 msec is adjusted
- * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
- */
- error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
- error2 = abs(error2);
- for (look_ahead = 0; error2 > 0; look_ahead++)
- error2 >>= 2;
+ s64 interval = tk->cycle_interval;
+ s32 mult_adj = 1;
- /*
- * Now calculate the error in (1 << look_ahead) ticks, but first
- * remove the single look ahead already included in the error.
- */
- tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1);
- tick_error -= tk->xtime_interval >> 1;
- error = ((error - tick_error) >> look_ahead) + tick_error;
-
- /* Finally calculate the adjustment shift value. */
- i = *interval;
- mult = 1;
- if (error < 0) {
- error = -error;
- *interval = -*interval;
- *offset = -*offset;
- mult = -1;
+ if (negative) {
+ mult_adj = -mult_adj;
+ interval = -interval;
+ offset = -offset;
}
- for (adj = 0; error > i; adj++)
- error >>= 1;
+ mult_adj <<= adj_scale;
+ interval <<= adj_scale;
+ offset <<= adj_scale;
- *interval <<= adj;
- *offset <<= adj;
- return mult << adj;
-}
-
-/*
- * Adjust the multiplier to reduce the error value,
- * this is optimized for the most common adjustments of -1,0,1,
- * for other values we can do a bit more work.
- */
-static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
-{
- s64 error, interval = tk->cycle_interval;
- int adj;
-
- /*
- * The point of this is to check if the error is greater than half
- * an interval.
- *
- * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
- *
- * Note we subtract one in the shift, so that error is really error*2.
- * This "saves" dividing(shifting) interval twice, but keeps the
- * (error > interval) comparison as still measuring if error is
- * larger than half an interval.
- *
- * Note: It does not "save" on aggravation when reading the code.
- */
- error = tk->ntp_error >> (tk->ntp_error_shift - 1);
- if (error > interval) {
- /*
- * We now divide error by 4(via shift), which checks if
- * the error is greater than twice the interval.
- * If it is greater, we need a bigadjust, if its smaller,
- * we can adjust by 1.
- */
- error >>= 2;
- if (likely(error <= interval))
- adj = 1;
- else
- adj = timekeeping_bigadjust(tk, error, &interval, &offset);
- } else {
- if (error < -interval) {
- /* See comment above, this is just switched for the negative */
- error >>= 2;
- if (likely(error >= -interval)) {
- adj = -1;
- interval = -interval;
- offset = -offset;
- } else {
- adj = timekeeping_bigadjust(tk, error, &interval, &offset);
- }
- } else {
- goto out_adjust;
- }
- }
-
- if (unlikely(tk->clock->maxadj &&
- (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) {
- printk_once(KERN_WARNING
- "Adjusting %s more than 11%% (%ld vs %ld)\n",
- tk->clock->name, (long)tk->mult + adj,
- (long)tk->clock->mult + tk->clock->maxadj);
- }
/*
* So the following can be confusing.
*
- * To keep things simple, lets assume adj == 1 for now.
+ * To keep things simple, lets assume mult_adj == 1 for now.
*
- * When adj != 1, remember that the interval and offset values
+ * When mult_adj != 1, remember that the interval and offset values
* have been appropriately scaled so the math is the same.
*
* The basic idea here is that we're increasing the multiplier
@@ -1210,12 +1332,78 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
*
* XXX - TODO: Doc ntp_error calculation.
*/
- tk->mult += adj;
+ tk->tkr.mult += mult_adj;
tk->xtime_interval += interval;
- tk->xtime_nsec -= offset;
+ tk->tkr.xtime_nsec -= offset;
tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
+}
+
+/*
+ * Calculate the multiplier adjustment needed to match the frequency
+ * specified by NTP
+ */
+static __always_inline void timekeeping_freqadjust(struct timekeeper *tk,
+ s64 offset)
+{
+ s64 interval = tk->cycle_interval;
+ s64 xinterval = tk->xtime_interval;
+ s64 tick_error;
+ bool negative;
+ u32 adj;
+
+ /* Remove any current error adj from freq calculation */
+ if (tk->ntp_err_mult)
+ xinterval -= tk->cycle_interval;
+
+ tk->ntp_tick = ntp_tick_length();
+
+ /* Calculate current error per tick */
+ tick_error = ntp_tick_length() >> tk->ntp_error_shift;
+ tick_error -= (xinterval + tk->xtime_remainder);
+
+ /* Don't worry about correcting it if its small */
+ if (likely((tick_error >= 0) && (tick_error <= interval)))
+ return;
+
+ /* preserve the direction of correction */
+ negative = (tick_error < 0);
+
+ /* Sort out the magnitude of the correction */
+ tick_error = abs(tick_error);
+ for (adj = 0; tick_error > interval; adj++)
+ tick_error >>= 1;
+
+ /* scale the corrections */
+ timekeeping_apply_adjustment(tk, offset, negative, adj);
+}
+
+/*
+ * Adjust the timekeeper's multiplier to the correct frequency
+ * and also to reduce the accumulated error value.
+ */
+static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
+{
+ /* Correct for the current frequency error */
+ timekeeping_freqadjust(tk, offset);
+
+ /* Next make a small adjustment to fix any cumulative error */
+ if (!tk->ntp_err_mult && (tk->ntp_error > 0)) {
+ tk->ntp_err_mult = 1;
+ timekeeping_apply_adjustment(tk, offset, 0, 0);
+ } else if (tk->ntp_err_mult && (tk->ntp_error <= 0)) {
+ /* Undo any existing error adjustment */
+ timekeeping_apply_adjustment(tk, offset, 1, 0);
+ tk->ntp_err_mult = 0;
+ }
+
+ if (unlikely(tk->tkr.clock->maxadj &&
+ (tk->tkr.mult > tk->tkr.clock->mult + tk->tkr.clock->maxadj))) {
+ printk_once(KERN_WARNING
+ "Adjusting %s more than 11%% (%ld vs %ld)\n",
+ tk->tkr.clock->name, (long)tk->tkr.mult,
+ (long)tk->tkr.clock->mult + tk->tkr.clock->maxadj);
+ }
-out_adjust:
/*
* It may be possible that when we entered this function, xtime_nsec
* was very small. Further, if we're slightly speeding the clocksource
@@ -1230,12 +1418,11 @@ out_adjust:
* We'll correct this error next time through this function, when
* xtime_nsec is not as small.
*/
- if (unlikely((s64)tk->xtime_nsec < 0)) {
- s64 neg = -(s64)tk->xtime_nsec;
- tk->xtime_nsec = 0;
+ if (unlikely((s64)tk->tkr.xtime_nsec < 0)) {
+ s64 neg = -(s64)tk->tkr.xtime_nsec;
+ tk->tkr.xtime_nsec = 0;
tk->ntp_error += neg << tk->ntp_error_shift;
}
-
}
/**
@@ -1248,26 +1435,26 @@ out_adjust:
*/
static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
{
- u64 nsecps = (u64)NSEC_PER_SEC << tk->shift;
+ u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift;
unsigned int clock_set = 0;
- while (tk->xtime_nsec >= nsecps) {
+ while (tk->tkr.xtime_nsec >= nsecps) {
int leap;
- tk->xtime_nsec -= nsecps;
+ tk->tkr.xtime_nsec -= nsecps;
tk->xtime_sec++;
/* Figure out if its a leap sec and apply if needed */
leap = second_overflow(tk->xtime_sec);
if (unlikely(leap)) {
- struct timespec ts;
+ struct timespec64 ts;
tk->xtime_sec += leap;
ts.tv_sec = leap;
ts.tv_nsec = 0;
tk_set_wall_to_mono(tk,
- timespec_sub(tk->wall_to_monotonic, ts));
+ timespec64_sub(tk->wall_to_monotonic, ts));
__timekeeping_set_tai_offset(tk, tk->tai_offset - leap);
@@ -1299,9 +1486,9 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
/* Accumulate one shifted interval */
offset -= interval;
- tk->cycle_last += interval;
+ tk->tkr.cycle_last += interval;
- tk->xtime_nsec += tk->xtime_interval << shift;
+ tk->tkr.xtime_nsec += tk->xtime_interval << shift;
*clock_set |= accumulate_nsecs_to_secs(tk);
/* Accumulate raw time */
@@ -1315,48 +1502,20 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
tk->raw_time.tv_nsec = raw_nsecs;
/* Accumulate error between NTP and clock interval */
- tk->ntp_error += ntp_tick_length() << shift;
+ tk->ntp_error += tk->ntp_tick << shift;
tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) <<
(tk->ntp_error_shift + shift);
return offset;
}
-#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
-static inline void old_vsyscall_fixup(struct timekeeper *tk)
-{
- s64 remainder;
-
- /*
- * Store only full nanoseconds into xtime_nsec after rounding
- * it up and add the remainder to the error difference.
- * XXX - This is necessary to avoid small 1ns inconsistnecies caused
- * by truncating the remainder in vsyscalls. However, it causes
- * additional work to be done in timekeeping_adjust(). Once
- * the vsyscall implementations are converted to use xtime_nsec
- * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
- * users are removed, this can be killed.
- */
- remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1);
- tk->xtime_nsec -= remainder;
- tk->xtime_nsec += 1ULL << tk->shift;
- tk->ntp_error += remainder << tk->ntp_error_shift;
- tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift;
-}
-#else
-#define old_vsyscall_fixup(tk)
-#endif
-
-
-
/**
* update_wall_time - Uses the current clocksource to increment the wall time
*
*/
void update_wall_time(void)
{
- struct clocksource *clock;
- struct timekeeper *real_tk = &timekeeper;
+ struct timekeeper *real_tk = &tk_core.timekeeper;
struct timekeeper *tk = &shadow_timekeeper;
cycle_t offset;
int shift = 0, maxshift;
@@ -1369,12 +1528,11 @@ void update_wall_time(void)
if (unlikely(timekeeping_suspended))
goto out;
- clock = real_tk->clock;
-
#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
offset = real_tk->cycle_interval;
#else
- offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
+ offset = clocksource_delta(tk->tkr.read(tk->tkr.clock),
+ tk->tkr.cycle_last, tk->tkr.mask);
#endif
/* Check if there's really nothing to do */
@@ -1416,9 +1574,7 @@ void update_wall_time(void)
*/
clock_set |= accumulate_nsecs_to_secs(tk);
- write_seqcount_begin(&timekeeper_seq);
- /* Update clock->cycle_last with the new value */
- clock->cycle_last = tk->cycle_last;
+ write_seqcount_begin(&tk_core.seq);
/*
* Update the real timekeeper.
*
@@ -1426,16 +1582,17 @@ void update_wall_time(void)
* requires changes to all other timekeeper usage sites as
* well, i.e. move the timekeeper pointer getter into the
* spinlocked/seqcount protected sections. And we trade this
- * memcpy under the timekeeper_seq against one before we start
+ * memcpy under the tk_core.seq against one before we start
* updating.
*/
memcpy(real_tk, tk, sizeof(*tk));
timekeeping_update(real_tk, clock_set);
- write_seqcount_end(&timekeeper_seq);
+ write_seqcount_end(&tk_core.seq);
out:
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
if (clock_set)
- clock_was_set();
+ /* Have to call _delayed version, since in irq context*/
+ clock_was_set_delayed();
}
/**
@@ -1451,83 +1608,16 @@ out:
*/
void getboottime(struct timespec *ts)
{
- struct timekeeper *tk = &timekeeper;
- struct timespec boottime = {
- .tv_sec = tk->wall_to_monotonic.tv_sec +
- tk->total_sleep_time.tv_sec,
- .tv_nsec = tk->wall_to_monotonic.tv_nsec +
- tk->total_sleep_time.tv_nsec
- };
-
- set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
-}
-EXPORT_SYMBOL_GPL(getboottime);
+ struct timekeeper *tk = &tk_core.timekeeper;
+ ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot);
-/**
- * get_monotonic_boottime - Returns monotonic time since boot
- * @ts: pointer to the timespec to be set
- *
- * Returns the monotonic time since boot in a timespec.
- *
- * This is similar to CLOCK_MONTONIC/ktime_get_ts, but also
- * includes the time spent in suspend.
- */
-void get_monotonic_boottime(struct timespec *ts)
-{
- struct timekeeper *tk = &timekeeper;
- struct timespec tomono, sleep;
- s64 nsec;
- unsigned int seq;
-
- WARN_ON(timekeeping_suspended);
-
- do {
- seq = read_seqcount_begin(&timekeeper_seq);
- ts->tv_sec = tk->xtime_sec;
- nsec = timekeeping_get_ns(tk);
- tomono = tk->wall_to_monotonic;
- sleep = tk->total_sleep_time;
-
- } while (read_seqcount_retry(&timekeeper_seq, seq));
-
- ts->tv_sec += tomono.tv_sec + sleep.tv_sec;
- ts->tv_nsec = 0;
- timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec);
+ *ts = ktime_to_timespec(t);
}
-EXPORT_SYMBOL_GPL(get_monotonic_boottime);
-
-/**
- * ktime_get_boottime - Returns monotonic time since boot in a ktime
- *
- * Returns the monotonic time since boot in a ktime
- *
- * This is similar to CLOCK_MONTONIC/ktime_get, but also
- * includes the time spent in suspend.
- */
-ktime_t ktime_get_boottime(void)
-{
- struct timespec ts;
-
- get_monotonic_boottime(&ts);
- return timespec_to_ktime(ts);
-}
-EXPORT_SYMBOL_GPL(ktime_get_boottime);
-
-/**
- * monotonic_to_bootbased - Convert the monotonic time to boot based.
- * @ts: pointer to the timespec to be converted
- */
-void monotonic_to_bootbased(struct timespec *ts)
-{
- struct timekeeper *tk = &timekeeper;
-
- *ts = timespec_add(*ts, tk->total_sleep_time);
-}
-EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
+EXPORT_SYMBOL_GPL(getboottime);
unsigned long get_seconds(void)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
return tk->xtime_sec;
}
@@ -1535,43 +1625,44 @@ EXPORT_SYMBOL(get_seconds);
struct timespec __current_kernel_time(void)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
- return tk_xtime(tk);
+ return timespec64_to_timespec(tk_xtime(tk));
}
struct timespec current_kernel_time(void)
{
- struct timekeeper *tk = &timekeeper;
- struct timespec now;
+ struct timekeeper *tk = &tk_core.timekeeper;
+ struct timespec64 now;
unsigned long seq;
do {
- seq = read_seqcount_begin(&timekeeper_seq);
+ seq = read_seqcount_begin(&tk_core.seq);
now = tk_xtime(tk);
- } while (read_seqcount_retry(&timekeeper_seq, seq));
+ } while (read_seqcount_retry(&tk_core.seq, seq));
- return now;
+ return timespec64_to_timespec(now);
}
EXPORT_SYMBOL(current_kernel_time);
struct timespec get_monotonic_coarse(void)
{
- struct timekeeper *tk = &timekeeper;
- struct timespec now, mono;
+ struct timekeeper *tk = &tk_core.timekeeper;
+ struct timespec64 now, mono;
unsigned long seq;
do {
- seq = read_seqcount_begin(&timekeeper_seq);
+ seq = read_seqcount_begin(&tk_core.seq);
now = tk_xtime(tk);
mono = tk->wall_to_monotonic;
- } while (read_seqcount_retry(&timekeeper_seq, seq));
+ } while (read_seqcount_retry(&tk_core.seq, seq));
- set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
+ set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec,
now.tv_nsec + mono.tv_nsec);
- return now;
+
+ return timespec64_to_timespec(now);
}
/*
@@ -1584,29 +1675,38 @@ void do_timer(unsigned long ticks)
}
/**
- * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic,
- * and sleep offsets.
- * @xtim: pointer to timespec to be set with xtime
- * @wtom: pointer to timespec to be set with wall_to_monotonic
- * @sleep: pointer to timespec to be set with time in suspend
+ * ktime_get_update_offsets_tick - hrtimer helper
+ * @offs_real: pointer to storage for monotonic -> realtime offset
+ * @offs_boot: pointer to storage for monotonic -> boottime offset
+ * @offs_tai: pointer to storage for monotonic -> clock tai offset
+ *
+ * Returns monotonic time at last tick and various offsets
*/
-void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
- struct timespec *wtom, struct timespec *sleep)
+ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot,
+ ktime_t *offs_tai)
{
- struct timekeeper *tk = &timekeeper;
- unsigned long seq;
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned int seq;
+ ktime_t base;
+ u64 nsecs;
do {
- seq = read_seqcount_begin(&timekeeper_seq);
- *xtim = tk_xtime(tk);
- *wtom = tk->wall_to_monotonic;
- *sleep = tk->total_sleep_time;
- } while (read_seqcount_retry(&timekeeper_seq, seq));
+ seq = read_seqcount_begin(&tk_core.seq);
+
+ base = tk->tkr.base_mono;
+ nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift;
+
+ *offs_real = tk->offs_real;
+ *offs_boot = tk->offs_boot;
+ *offs_tai = tk->offs_tai;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return ktime_add_ns(base, nsecs);
}
#ifdef CONFIG_HIGH_RES_TIMERS
/**
- * ktime_get_update_offsets - hrtimer helper
+ * ktime_get_update_offsets_now - hrtimer helper
* @offs_real: pointer to storage for monotonic -> realtime offset
* @offs_boot: pointer to storage for monotonic -> boottime offset
* @offs_tai: pointer to storage for monotonic -> clock tai offset
@@ -1614,57 +1714,37 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
* Returns current monotonic time and updates the offsets
* Called from hrtimer_interrupt() or retrigger_next_event()
*/
-ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot,
+ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
ktime_t *offs_tai)
{
- struct timekeeper *tk = &timekeeper;
- ktime_t now;
+ struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
- u64 secs, nsecs;
+ ktime_t base;
+ u64 nsecs;
do {
- seq = read_seqcount_begin(&timekeeper_seq);
+ seq = read_seqcount_begin(&tk_core.seq);
- secs = tk->xtime_sec;
- nsecs = timekeeping_get_ns(tk);
+ base = tk->tkr.base_mono;
+ nsecs = timekeeping_get_ns(&tk->tkr);
*offs_real = tk->offs_real;
*offs_boot = tk->offs_boot;
*offs_tai = tk->offs_tai;
- } while (read_seqcount_retry(&timekeeper_seq, seq));
+ } while (read_seqcount_retry(&tk_core.seq, seq));
- now = ktime_add_ns(ktime_set(secs, 0), nsecs);
- now = ktime_sub(now, *offs_real);
- return now;
+ return ktime_add_ns(base, nsecs);
}
#endif
/**
- * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
- */
-ktime_t ktime_get_monotonic_offset(void)
-{
- struct timekeeper *tk = &timekeeper;
- unsigned long seq;
- struct timespec wtom;
-
- do {
- seq = read_seqcount_begin(&timekeeper_seq);
- wtom = tk->wall_to_monotonic;
- } while (read_seqcount_retry(&timekeeper_seq, seq));
-
- return timespec_to_ktime(wtom);
-}
-EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
-
-/**
* do_adjtimex() - Accessor function to NTP __do_adjtimex function
*/
int do_adjtimex(struct timex *txc)
{
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *tk = &tk_core.timekeeper;
unsigned long flags;
- struct timespec ts;
+ struct timespec64 ts;
s32 orig_tai, tai;
int ret;
@@ -1684,10 +1764,10 @@ int do_adjtimex(struct timex *txc)
return ret;
}
- getnstimeofday(&ts);
+ getnstimeofday64(&ts);
raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&timekeeper_seq);
+ write_seqcount_begin(&tk_core.seq);
orig_tai = tai = tk->tai_offset;
ret = __do_adjtimex(txc, &ts, &tai);
@@ -1696,7 +1776,7 @@ int do_adjtimex(struct timex *txc)
__timekeeping_set_tai_offset(tk, tai);
timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
}
- write_seqcount_end(&timekeeper_seq);
+ write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
if (tai != orig_tai)
@@ -1716,11 +1796,11 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
unsigned long flags;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
- write_seqcount_begin(&timekeeper_seq);
+ write_seqcount_begin(&tk_core.seq);
__hardpps(phase_ts, raw_ts);
- write_seqcount_end(&timekeeper_seq);
+ write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}
EXPORT_SYMBOL(hardpps);
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
new file mode 100644
index 000000000000..adc1fc98bde3
--- /dev/null
+++ b/kernel/time/timekeeping.h
@@ -0,0 +1,20 @@
+#ifndef _KERNEL_TIME_TIMEKEEPING_H
+#define _KERNEL_TIME_TIMEKEEPING_H
+/*
+ * Internal interfaces for kernel/time/
+ */
+extern ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real,
+ ktime_t *offs_boot,
+ ktime_t *offs_tai);
+extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real,
+ ktime_t *offs_boot,
+ ktime_t *offs_tai);
+
+extern int timekeeping_valid_for_hres(void);
+extern u64 timekeeping_max_deferment(void);
+extern int timekeeping_inject_offset(struct timespec *ts);
+extern s32 timekeeping_get_tai_offset(void);
+extern void timekeeping_set_tai_offset(s32 tai_offset);
+extern void timekeeping_clocktai(struct timespec *ts);
+
+#endif
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index 802433a4f5eb..f6bd65236712 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -21,6 +21,8 @@
#include <linux/seq_file.h>
#include <linux/time.h>
+#include "timekeeping_internal.h"
+
static unsigned int sleep_time_bin[32] = {0};
static int tk_debug_show_sleep_time(struct seq_file *s, void *data)
@@ -65,7 +67,7 @@ static int __init tk_debug_sleep_time_init(void)
}
late_initcall(tk_debug_sleep_time_init);
-void tk_debug_account_sleep_time(struct timespec *t)
+void tk_debug_account_sleep_time(struct timespec64 *t)
{
sleep_time_bin[fls(t->tv_sec)]++;
}
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 13323ea08ffa..4ea005a7f9da 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -3,12 +3,27 @@
/*
* timekeeping debug functions
*/
+#include <linux/clocksource.h>
#include <linux/time.h>
#ifdef CONFIG_DEBUG_FS
-extern void tk_debug_account_sleep_time(struct timespec *t);
+extern void tk_debug_account_sleep_time(struct timespec64 *t);
#else
#define tk_debug_account_sleep_time(x)
#endif
+#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE
+static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
+{
+ cycle_t ret = (now - last) & mask;
+
+ return (s64) ret > 0 ? ret : 0;
+}
+#else
+static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
+{
+ return (now - last) & mask;
+}
+#endif
+
#endif /* _TIMEKEEPING_INTERNAL_H */
diff --git a/kernel/timer.c b/kernel/time/timer.c
index accfd241b9e5..aca5dfe2fa3d 100644
--- a/kernel/timer.c
+++ b/kernel/time/timer.c
@@ -52,7 +52,7 @@
#define CREATE_TRACE_POINTS
#include <trace/events/timer.h>
-u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
+__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
EXPORT_SYMBOL(jiffies_64);
@@ -81,6 +81,8 @@ struct tvec_base {
unsigned long timer_jiffies;
unsigned long next_timer;
unsigned long active_timers;
+ unsigned long all_timers;
+ int cpu;
struct tvec_root tv1;
struct tvec tv2;
struct tvec tv3;
@@ -337,6 +339,20 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
}
EXPORT_SYMBOL_GPL(set_timer_slack);
+/*
+ * If the list is empty, catch up ->timer_jiffies to the current time.
+ * The caller must hold the tvec_base lock. Returns true if the list
+ * was empty and therefore ->timer_jiffies was updated.
+ */
+static bool catchup_timer_jiffies(struct tvec_base *base)
+{
+ if (!base->all_timers) {
+ base->timer_jiffies = jiffies;
+ return true;
+ }
+ return false;
+}
+
static void
__internal_add_timer(struct tvec_base *base, struct timer_list *timer)
{
@@ -383,15 +399,33 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
{
+ (void)catchup_timer_jiffies(base);
__internal_add_timer(base, timer);
/*
* Update base->active_timers and base->next_timer
*/
if (!tbase_get_deferrable(timer->base)) {
- if (time_before(timer->expires, base->next_timer))
+ if (!base->active_timers++ ||
+ time_before(timer->expires, base->next_timer))
base->next_timer = timer->expires;
- base->active_timers++;
}
+ base->all_timers++;
+
+ /*
+ * Check whether the other CPU is in dynticks mode and needs
+ * to be triggered to reevaluate the timer wheel.
+ * We are protected against the other CPU fiddling
+ * with the timer by holding the timer base lock. This also
+ * makes sure that a CPU on the way to stop its tick can not
+ * evaluate the timer wheel.
+ *
+ * Spare the IPI for deferrable timers on idle targets though.
+ * The next busy ticks will take care of it. Except full dynticks
+ * require special care against races with idle_cpu(), lets deal
+ * with that later.
+ */
+ if (!tbase_get_deferrable(base) || tick_nohz_full_cpu(base->cpu))
+ wake_up_nohz_cpu(base->cpu);
}
#ifdef CONFIG_TIMER_STATS
@@ -671,6 +705,8 @@ detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
detach_timer(timer, true);
if (!tbase_get_deferrable(timer->base))
base->active_timers--;
+ base->all_timers--;
+ (void)catchup_timer_jiffies(base);
}
static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
@@ -685,6 +721,8 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
if (timer->expires == base->next_timer)
base->next_timer = base->timer_jiffies;
}
+ base->all_timers--;
+ (void)catchup_timer_jiffies(base);
return 1;
}
@@ -739,12 +777,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
debug_activate(timer, expires);
- cpu = smp_processor_id();
-
-#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
- if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
- cpu = get_nohz_timer_target();
-#endif
+ cpu = get_nohz_timer_target(pinned);
new_base = per_cpu(tvec_bases, cpu);
if (base != new_base) {
@@ -822,7 +855,7 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
bit = find_last_bit(&mask, BITS_PER_LONG);
- mask = (1 << bit) - 1;
+ mask = (1UL << bit) - 1;
expires_limit = expires_limit & ~(mask);
@@ -932,15 +965,6 @@ void add_timer_on(struct timer_list *timer, int cpu)
timer_set_base(timer, base);
debug_activate(timer, timer->expires);
internal_add_timer(base, timer);
- /*
- * Check whether the other CPU is in dynticks mode and needs
- * to be triggered to reevaluate the timer wheel.
- * We are protected against the other CPU fiddling
- * with the timer by holding the timer base lock. This also
- * makes sure that a CPU on the way to stop its tick can not
- * evaluate the timer wheel.
- */
- wake_up_nohz_cpu(cpu);
spin_unlock_irqrestore(&base->lock, flags);
}
EXPORT_SYMBOL_GPL(add_timer_on);
@@ -1146,6 +1170,10 @@ static inline void __run_timers(struct tvec_base *base)
struct timer_list *timer;
spin_lock_irq(&base->lock);
+ if (catchup_timer_jiffies(base)) {
+ spin_unlock_irq(&base->lock);
+ return;
+ }
while (time_after_eq(jiffies, base->timer_jiffies)) {
struct list_head work_list;
struct list_head *head = &work_list;
@@ -1160,7 +1188,7 @@ static inline void __run_timers(struct tvec_base *base)
!cascade(base, &base->tv4, INDEX(2)))
cascade(base, &base->tv5, INDEX(3));
++base->timer_jiffies;
- list_replace_init(base->tv1.vec + index, &work_list);
+ list_replace_init(base->tv1.vec + index, head);
while (!list_empty(head)) {
void (*fn)(unsigned long);
unsigned long data;
@@ -1523,9 +1551,8 @@ static int init_timers_cpu(int cpu)
if (!base)
return -ENOMEM;
- /* Make sure that tvec_base is 2 byte aligned */
- if (tbase_get_deferrable(base)) {
- WARN_ON(1);
+ /* Make sure tvec_base has TIMER_FLAG_MASK bits free */
+ if (WARN_ON(base != tbase_get_base(base))) {
kfree(base);
return -ENOMEM;
}
@@ -1542,6 +1569,7 @@ static int init_timers_cpu(int cpu)
}
spin_lock_init(&base->lock);
tvec_base_done[cpu] = 1;
+ base->cpu = cpu;
} else {
base = per_cpu(tvec_bases, cpu);
}
@@ -1559,6 +1587,7 @@ static int init_timers_cpu(int cpu)
base->timer_jiffies = jiffies;
base->next_timer = base->timer_jiffies;
base->active_timers = 0;
+ base->all_timers = 0;
return 0;
}
@@ -1648,9 +1677,9 @@ void __init init_timers(void)
err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
(void *)(long)smp_processor_id());
- init_timer_stats();
-
BUG_ON(err != NOTIFY_OK);
+
+ init_timer_stats();
register_cpu_notifier(&timers_nb);
open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}
diff --git a/kernel/time/udelay_test.c b/kernel/time/udelay_test.c
new file mode 100644
index 000000000000..e622ba365a13
--- /dev/null
+++ b/kernel/time/udelay_test.c
@@ -0,0 +1,168 @@
+/*
+ * udelay() test kernel module
+ *
+ * Test is executed by writing and reading to /sys/kernel/debug/udelay_test
+ * Tests are configured by writing: USECS ITERATIONS
+ * Tests are executed by reading from the same file.
+ * Specifying usecs of 0 or negative values will run multiples tests.
+ *
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/ktime.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+
+#define DEFAULT_ITERATIONS 100
+
+#define DEBUGFS_FILENAME "udelay_test"
+
+static DEFINE_MUTEX(udelay_test_lock);
+static struct dentry *udelay_test_debugfs_file;
+static int udelay_test_usecs;
+static int udelay_test_iterations = DEFAULT_ITERATIONS;
+
+static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters)
+{
+ int min = 0, max = 0, fail_count = 0;
+ uint64_t sum = 0;
+ uint64_t avg;
+ int i;
+ /* Allow udelay to be up to 0.5% fast */
+ int allowed_error_ns = usecs * 5;
+
+ for (i = 0; i < iters; ++i) {
+ struct timespec ts1, ts2;
+ int time_passed;
+
+ ktime_get_ts(&ts1);
+ udelay(usecs);
+ ktime_get_ts(&ts2);
+ time_passed = timespec_to_ns(&ts2) - timespec_to_ns(&ts1);
+
+ if (i == 0 || time_passed < min)
+ min = time_passed;
+ if (i == 0 || time_passed > max)
+ max = time_passed;
+ if ((time_passed + allowed_error_ns) / 1000 < usecs)
+ ++fail_count;
+ WARN_ON(time_passed < 0);
+ sum += time_passed;
+ }
+
+ avg = sum;
+ do_div(avg, iters);
+ seq_printf(s, "%d usecs x %d: exp=%d allowed=%d min=%d avg=%lld max=%d",
+ usecs, iters, usecs * 1000,
+ (usecs * 1000) - allowed_error_ns, min, avg, max);
+ if (fail_count)
+ seq_printf(s, " FAIL=%d", fail_count);
+ seq_puts(s, "\n");
+
+ return 0;
+}
+
+static int udelay_test_show(struct seq_file *s, void *v)
+{
+ int usecs;
+ int iters;
+ int ret = 0;
+
+ mutex_lock(&udelay_test_lock);
+ usecs = udelay_test_usecs;
+ iters = udelay_test_iterations;
+ mutex_unlock(&udelay_test_lock);
+
+ if (usecs > 0 && iters > 0) {
+ return udelay_test_single(s, usecs, iters);
+ } else if (usecs == 0) {
+ struct timespec ts;
+
+ ktime_get_ts(&ts);
+ seq_printf(s, "udelay() test (lpj=%ld kt=%ld.%09ld)\n",
+ loops_per_jiffy, ts.tv_sec, ts.tv_nsec);
+ seq_puts(s, "usage:\n");
+ seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n");
+ seq_puts(s, "cat " DEBUGFS_FILENAME "\n");
+ }
+
+ return ret;
+}
+
+static int udelay_test_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, udelay_test_show, inode->i_private);
+}
+
+static ssize_t udelay_test_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ char lbuf[32];
+ int ret;
+ int usecs;
+ int iters;
+
+ if (count >= sizeof(lbuf))
+ return -EINVAL;
+
+ if (copy_from_user(lbuf, buf, count))
+ return -EFAULT;
+ lbuf[count] = '\0';
+
+ ret = sscanf(lbuf, "%d %d", &usecs, &iters);
+ if (ret < 1)
+ return -EINVAL;
+ else if (ret < 2)
+ iters = DEFAULT_ITERATIONS;
+
+ mutex_lock(&udelay_test_lock);
+ udelay_test_usecs = usecs;
+ udelay_test_iterations = iters;
+ mutex_unlock(&udelay_test_lock);
+
+ return count;
+}
+
+static const struct file_operations udelay_test_debugfs_ops = {
+ .owner = THIS_MODULE,
+ .open = udelay_test_open,
+ .read = seq_read,
+ .write = udelay_test_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init udelay_test_init(void)
+{
+ mutex_lock(&udelay_test_lock);
+ udelay_test_debugfs_file = debugfs_create_file(DEBUGFS_FILENAME,
+ S_IRUSR, NULL, NULL, &udelay_test_debugfs_ops);
+ mutex_unlock(&udelay_test_lock);
+
+ return 0;
+}
+
+module_init(udelay_test_init);
+
+static void __exit udelay_test_exit(void)
+{
+ mutex_lock(&udelay_test_lock);
+ debugfs_remove(udelay_test_debugfs_file);
+ mutex_unlock(&udelay_test_lock);
+}
+
+module_exit(udelay_test_exit);
+
+MODULE_AUTHOR("David Riley <davidriley@chromium.org>");
+MODULE_LICENSE("GPL");
diff --git a/kernel/torture.c b/kernel/torture.c
new file mode 100644
index 000000000000..d600af21f022
--- /dev/null
+++ b/kernel/torture.c
@@ -0,0 +1,733 @@
+/*
+ * Common functions for in-kernel torture tests.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (C) IBM Corporation, 2014
+ *
+ * Author: Paul E. McKenney <paulmck@us.ibm.com>
+ * Based on kernel/rcu/torture.c.
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/freezer.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include <linux/trace_clock.h>
+#include <asm/byteorder.h>
+#include <linux/torture.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
+
+static char *torture_type;
+static bool verbose;
+
+/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
+#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
+#define FULLSTOP_SHUTDOWN 1 /* System shutdown with torture running. */
+#define FULLSTOP_RMMOD 2 /* Normal rmmod of torture. */
+static int fullstop = FULLSTOP_RMMOD;
+static DEFINE_MUTEX(fullstop_mutex);
+static int *torture_runnable;
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Variables for online-offline handling. Only present if CPU hotplug
+ * is enabled, otherwise does nothing.
+ */
+
+static struct task_struct *onoff_task;
+static long onoff_holdoff;
+static long onoff_interval;
+static long n_offline_attempts;
+static long n_offline_successes;
+static unsigned long sum_offline;
+static int min_offline = -1;
+static int max_offline;
+static long n_online_attempts;
+static long n_online_successes;
+static unsigned long sum_online;
+static int min_online = -1;
+static int max_online;
+
+/*
+ * Execute random CPU-hotplug operations at the interval specified
+ * by the onoff_interval.
+ */
+static int
+torture_onoff(void *arg)
+{
+ int cpu;
+ unsigned long delta;
+ int maxcpu = -1;
+ DEFINE_TORTURE_RANDOM(rand);
+ int ret;
+ unsigned long starttime;
+
+ VERBOSE_TOROUT_STRING("torture_onoff task started");
+ for_each_online_cpu(cpu)
+ maxcpu = cpu;
+ WARN_ON(maxcpu < 0);
+ if (onoff_holdoff > 0) {
+ VERBOSE_TOROUT_STRING("torture_onoff begin holdoff");
+ schedule_timeout_interruptible(onoff_holdoff);
+ VERBOSE_TOROUT_STRING("torture_onoff end holdoff");
+ }
+ while (!torture_must_stop()) {
+ cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
+ if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: offlining %d\n",
+ torture_type, cpu);
+ starttime = jiffies;
+ n_offline_attempts++;
+ ret = cpu_down(cpu);
+ if (ret) {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: offline %d failed: errno %d\n",
+ torture_type, cpu, ret);
+ } else {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: offlined %d\n",
+ torture_type, cpu);
+ n_offline_successes++;
+ delta = jiffies - starttime;
+ sum_offline += delta;
+ if (min_offline < 0) {
+ min_offline = delta;
+ max_offline = delta;
+ }
+ if (min_offline > delta)
+ min_offline = delta;
+ if (max_offline < delta)
+ max_offline = delta;
+ }
+ } else if (cpu_is_hotpluggable(cpu)) {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: onlining %d\n",
+ torture_type, cpu);
+ starttime = jiffies;
+ n_online_attempts++;
+ ret = cpu_up(cpu);
+ if (ret) {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: online %d failed: errno %d\n",
+ torture_type, cpu, ret);
+ } else {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: onlined %d\n",
+ torture_type, cpu);
+ n_online_successes++;
+ delta = jiffies - starttime;
+ sum_online += delta;
+ if (min_online < 0) {
+ min_online = delta;
+ max_online = delta;
+ }
+ if (min_online > delta)
+ min_online = delta;
+ if (max_online < delta)
+ max_online = delta;
+ }
+ }
+ schedule_timeout_interruptible(onoff_interval);
+ }
+ torture_kthread_stopping("torture_onoff");
+ return 0;
+}
+
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+
+/*
+ * Initiate online-offline handling.
+ */
+int torture_onoff_init(long ooholdoff, long oointerval)
+{
+ int ret = 0;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ onoff_holdoff = ooholdoff;
+ onoff_interval = oointerval;
+ if (onoff_interval <= 0)
+ return 0;
+ ret = torture_create_kthread(torture_onoff, NULL, onoff_task);
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+ return ret;
+}
+EXPORT_SYMBOL_GPL(torture_onoff_init);
+
+/*
+ * Clean up after online/offline testing.
+ */
+static void torture_onoff_cleanup(void)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+ if (onoff_task == NULL)
+ return;
+ VERBOSE_TOROUT_STRING("Stopping torture_onoff task");
+ kthread_stop(onoff_task);
+ onoff_task = NULL;
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+}
+EXPORT_SYMBOL_GPL(torture_onoff_cleanup);
+
+/*
+ * Print online/offline testing statistics.
+ */
+char *torture_onoff_stats(char *page)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+ page += sprintf(page,
+ "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
+ n_online_successes, n_online_attempts,
+ n_offline_successes, n_offline_attempts,
+ min_online, max_online,
+ min_offline, max_offline,
+ sum_online, sum_offline, HZ);
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+ return page;
+}
+EXPORT_SYMBOL_GPL(torture_onoff_stats);
+
+/*
+ * Were all the online/offline operations successful?
+ */
+bool torture_onoff_failures(void)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+ return n_online_successes != n_online_attempts ||
+ n_offline_successes != n_offline_attempts;
+#else /* #ifdef CONFIG_HOTPLUG_CPU */
+ return false;
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
+}
+EXPORT_SYMBOL_GPL(torture_onoff_failures);
+
+#define TORTURE_RANDOM_MULT 39916801 /* prime */
+#define TORTURE_RANDOM_ADD 479001701 /* prime */
+#define TORTURE_RANDOM_REFRESH 10000
+
+/*
+ * Crude but fast random-number generator. Uses a linear congruential
+ * generator, with occasional help from cpu_clock().
+ */
+unsigned long
+torture_random(struct torture_random_state *trsp)
+{
+ if (--trsp->trs_count < 0) {
+ trsp->trs_state += (unsigned long)local_clock();
+ trsp->trs_count = TORTURE_RANDOM_REFRESH;
+ }
+ trsp->trs_state = trsp->trs_state * TORTURE_RANDOM_MULT +
+ TORTURE_RANDOM_ADD;
+ return swahw32(trsp->trs_state);
+}
+EXPORT_SYMBOL_GPL(torture_random);
+
+/*
+ * Variables for shuffling. The idea is to ensure that each CPU stays
+ * idle for an extended period to test interactions with dyntick idle,
+ * as well as interactions with any per-CPU varibles.
+ */
+struct shuffle_task {
+ struct list_head st_l;
+ struct task_struct *st_t;
+};
+
+static long shuffle_interval; /* In jiffies. */
+static struct task_struct *shuffler_task;
+static cpumask_var_t shuffle_tmp_mask;
+static int shuffle_idle_cpu; /* Force all torture tasks off this CPU */
+static struct list_head shuffle_task_list = LIST_HEAD_INIT(shuffle_task_list);
+static DEFINE_MUTEX(shuffle_task_mutex);
+
+/*
+ * Register a task to be shuffled. If there is no memory, just splat
+ * and don't bother registering.
+ */
+void torture_shuffle_task_register(struct task_struct *tp)
+{
+ struct shuffle_task *stp;
+
+ if (WARN_ON_ONCE(tp == NULL))
+ return;
+ stp = kmalloc(sizeof(*stp), GFP_KERNEL);
+ if (WARN_ON_ONCE(stp == NULL))
+ return;
+ stp->st_t = tp;
+ mutex_lock(&shuffle_task_mutex);
+ list_add(&stp->st_l, &shuffle_task_list);
+ mutex_unlock(&shuffle_task_mutex);
+}
+EXPORT_SYMBOL_GPL(torture_shuffle_task_register);
+
+/*
+ * Unregister all tasks, for example, at the end of the torture run.
+ */
+static void torture_shuffle_task_unregister_all(void)
+{
+ struct shuffle_task *stp;
+ struct shuffle_task *p;
+
+ mutex_lock(&shuffle_task_mutex);
+ list_for_each_entry_safe(stp, p, &shuffle_task_list, st_l) {
+ list_del(&stp->st_l);
+ kfree(stp);
+ }
+ mutex_unlock(&shuffle_task_mutex);
+}
+
+/* Shuffle tasks such that we allow shuffle_idle_cpu to become idle.
+ * A special case is when shuffle_idle_cpu = -1, in which case we allow
+ * the tasks to run on all CPUs.
+ */
+static void torture_shuffle_tasks(void)
+{
+ struct shuffle_task *stp;
+
+ cpumask_setall(shuffle_tmp_mask);
+ get_online_cpus();
+
+ /* No point in shuffling if there is only one online CPU (ex: UP) */
+ if (num_online_cpus() == 1) {
+ put_online_cpus();
+ return;
+ }
+
+ /* Advance to the next CPU. Upon overflow, don't idle any CPUs. */
+ shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask);
+ if (shuffle_idle_cpu >= nr_cpu_ids)
+ shuffle_idle_cpu = -1;
+ else
+ cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask);
+
+ mutex_lock(&shuffle_task_mutex);
+ list_for_each_entry(stp, &shuffle_task_list, st_l)
+ set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask);
+ mutex_unlock(&shuffle_task_mutex);
+
+ put_online_cpus();
+}
+
+/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
+ * system to become idle at a time and cut off its timer ticks. This is meant
+ * to test the support for such tickless idle CPU in RCU.
+ */
+static int torture_shuffle(void *arg)
+{
+ VERBOSE_TOROUT_STRING("torture_shuffle task started");
+ do {
+ schedule_timeout_interruptible(shuffle_interval);
+ torture_shuffle_tasks();
+ torture_shutdown_absorb("torture_shuffle");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("torture_shuffle");
+ return 0;
+}
+
+/*
+ * Start the shuffler, with shuffint in jiffies.
+ */
+int torture_shuffle_init(long shuffint)
+{
+ shuffle_interval = shuffint;
+
+ shuffle_idle_cpu = -1;
+
+ if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
+ VERBOSE_TOROUT_ERRSTRING("Failed to alloc mask");
+ return -ENOMEM;
+ }
+
+ /* Create the shuffler thread */
+ return torture_create_kthread(torture_shuffle, NULL, shuffler_task);
+}
+EXPORT_SYMBOL_GPL(torture_shuffle_init);
+
+/*
+ * Stop the shuffling.
+ */
+static void torture_shuffle_cleanup(void)
+{
+ torture_shuffle_task_unregister_all();
+ if (shuffler_task) {
+ VERBOSE_TOROUT_STRING("Stopping torture_shuffle task");
+ kthread_stop(shuffler_task);
+ free_cpumask_var(shuffle_tmp_mask);
+ }
+ shuffler_task = NULL;
+}
+EXPORT_SYMBOL_GPL(torture_shuffle_cleanup);
+
+/*
+ * Variables for auto-shutdown. This allows "lights out" torture runs
+ * to be fully scripted.
+ */
+static int shutdown_secs; /* desired test duration in seconds. */
+static struct task_struct *shutdown_task;
+static unsigned long shutdown_time; /* jiffies to system shutdown. */
+static void (*torture_shutdown_hook)(void);
+
+/*
+ * Absorb kthreads into a kernel function that won't return, so that
+ * they won't ever access module text or data again.
+ */
+void torture_shutdown_absorb(const char *title)
+{
+ while (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
+ pr_notice("torture thread %s parking due to system shutdown\n",
+ title);
+ schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
+ }
+}
+EXPORT_SYMBOL_GPL(torture_shutdown_absorb);
+
+/*
+ * Cause the torture test to shutdown the system after the test has
+ * run for the time specified by the shutdown_secs parameter.
+ */
+static int torture_shutdown(void *arg)
+{
+ long delta;
+ unsigned long jiffies_snap;
+
+ VERBOSE_TOROUT_STRING("torture_shutdown task started");
+ jiffies_snap = jiffies;
+ while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
+ !torture_must_stop()) {
+ delta = shutdown_time - jiffies_snap;
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_shutdown task: %lu jiffies remaining\n",
+ torture_type, delta);
+ schedule_timeout_interruptible(delta);
+ jiffies_snap = jiffies;
+ }
+ if (torture_must_stop()) {
+ torture_kthread_stopping("torture_shutdown");
+ return 0;
+ }
+
+ /* OK, shut down the system. */
+
+ VERBOSE_TOROUT_STRING("torture_shutdown task shutting down system");
+ shutdown_task = NULL; /* Avoid self-kill deadlock. */
+ if (torture_shutdown_hook)
+ torture_shutdown_hook();
+ else
+ VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping.");
+ kernel_power_off(); /* Shut down the system. */
+ return 0;
+}
+
+/*
+ * Start up the shutdown task.
+ */
+int torture_shutdown_init(int ssecs, void (*cleanup)(void))
+{
+ int ret = 0;
+
+ shutdown_secs = ssecs;
+ torture_shutdown_hook = cleanup;
+ if (shutdown_secs > 0) {
+ shutdown_time = jiffies + shutdown_secs * HZ;
+ ret = torture_create_kthread(torture_shutdown, NULL,
+ shutdown_task);
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(torture_shutdown_init);
+
+/*
+ * Detect and respond to a system shutdown.
+ */
+static int torture_shutdown_notify(struct notifier_block *unused1,
+ unsigned long unused2, void *unused3)
+{
+ mutex_lock(&fullstop_mutex);
+ if (ACCESS_ONCE(fullstop) == FULLSTOP_DONTSTOP) {
+ VERBOSE_TOROUT_STRING("Unscheduled system shutdown detected");
+ ACCESS_ONCE(fullstop) = FULLSTOP_SHUTDOWN;
+ } else {
+ pr_warn("Concurrent rmmod and shutdown illegal!\n");
+ }
+ mutex_unlock(&fullstop_mutex);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block torture_shutdown_nb = {
+ .notifier_call = torture_shutdown_notify,
+};
+
+/*
+ * Shut down the shutdown task. Say what??? Heh! This can happen if
+ * the torture module gets an rmmod before the shutdown time arrives. ;-)
+ */
+static void torture_shutdown_cleanup(void)
+{
+ unregister_reboot_notifier(&torture_shutdown_nb);
+ if (shutdown_task != NULL) {
+ VERBOSE_TOROUT_STRING("Stopping torture_shutdown task");
+ kthread_stop(shutdown_task);
+ }
+ shutdown_task = NULL;
+}
+
+/*
+ * Variables for stuttering, which means to periodically pause and
+ * restart testing in order to catch bugs that appear when load is
+ * suddenly applied to or removed from the system.
+ */
+static struct task_struct *stutter_task;
+static int stutter_pause_test;
+static int stutter;
+
+/*
+ * Block until the stutter interval ends. This must be called periodically
+ * by all running kthreads that need to be subject to stuttering.
+ */
+void stutter_wait(const char *title)
+{
+ while (ACCESS_ONCE(stutter_pause_test) ||
+ (torture_runnable && !ACCESS_ONCE(*torture_runnable))) {
+ if (stutter_pause_test)
+ if (ACCESS_ONCE(stutter_pause_test) == 1)
+ schedule_timeout_interruptible(1);
+ else
+ while (ACCESS_ONCE(stutter_pause_test))
+ cond_resched();
+ else
+ schedule_timeout_interruptible(round_jiffies_relative(HZ));
+ torture_shutdown_absorb(title);
+ }
+}
+EXPORT_SYMBOL_GPL(stutter_wait);
+
+/*
+ * Cause the torture test to "stutter", starting and stopping all
+ * threads periodically.
+ */
+static int torture_stutter(void *arg)
+{
+ VERBOSE_TOROUT_STRING("torture_stutter task started");
+ do {
+ if (!torture_must_stop()) {
+ if (stutter > 1) {
+ schedule_timeout_interruptible(stutter - 1);
+ ACCESS_ONCE(stutter_pause_test) = 2;
+ }
+ schedule_timeout_interruptible(1);
+ ACCESS_ONCE(stutter_pause_test) = 1;
+ }
+ if (!torture_must_stop())
+ schedule_timeout_interruptible(stutter);
+ ACCESS_ONCE(stutter_pause_test) = 0;
+ torture_shutdown_absorb("torture_stutter");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("torture_stutter");
+ return 0;
+}
+
+/*
+ * Initialize and kick off the torture_stutter kthread.
+ */
+int torture_stutter_init(int s)
+{
+ int ret;
+
+ stutter = s;
+ ret = torture_create_kthread(torture_stutter, NULL, stutter_task);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(torture_stutter_init);
+
+/*
+ * Cleanup after the torture_stutter kthread.
+ */
+static void torture_stutter_cleanup(void)
+{
+ if (!stutter_task)
+ return;
+ VERBOSE_TOROUT_STRING("Stopping torture_stutter task");
+ kthread_stop(stutter_task);
+ stutter_task = NULL;
+}
+
+/*
+ * Initialize torture module. Please note that this is -not- invoked via
+ * the usual module_init() mechanism, but rather by an explicit call from
+ * the client torture module. This call must be paired with a later
+ * torture_init_end().
+ *
+ * The runnable parameter points to a flag that controls whether or not
+ * the test is currently runnable. If there is no such flag, pass in NULL.
+ */
+bool torture_init_begin(char *ttype, bool v, int *runnable)
+{
+ mutex_lock(&fullstop_mutex);
+ if (torture_type != NULL) {
+ pr_alert("torture_init_begin: refusing %s init: %s running",
+ ttype, torture_type);
+ mutex_unlock(&fullstop_mutex);
+ return false;
+ }
+ torture_type = ttype;
+ verbose = v;
+ torture_runnable = runnable;
+ fullstop = FULLSTOP_DONTSTOP;
+ return true;
+}
+EXPORT_SYMBOL_GPL(torture_init_begin);
+
+/*
+ * Tell the torture module that initialization is complete.
+ */
+void torture_init_end(void)
+{
+ mutex_unlock(&fullstop_mutex);
+ register_reboot_notifier(&torture_shutdown_nb);
+}
+EXPORT_SYMBOL_GPL(torture_init_end);
+
+/*
+ * Clean up torture module. Please note that this is -not- invoked via
+ * the usual module_exit() mechanism, but rather by an explicit call from
+ * the client torture module. Returns true if a race with system shutdown
+ * is detected, otherwise, all kthreads started by functions in this file
+ * will be shut down.
+ *
+ * This must be called before the caller starts shutting down its own
+ * kthreads.
+ */
+bool torture_cleanup(void)
+{
+ mutex_lock(&fullstop_mutex);
+ if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
+ pr_warn("Concurrent rmmod and shutdown illegal!\n");
+ mutex_unlock(&fullstop_mutex);
+ schedule_timeout_uninterruptible(10);
+ return true;
+ }
+ ACCESS_ONCE(fullstop) = FULLSTOP_RMMOD;
+ mutex_unlock(&fullstop_mutex);
+ torture_shutdown_cleanup();
+ torture_shuffle_cleanup();
+ torture_stutter_cleanup();
+ torture_onoff_cleanup();
+ mutex_lock(&fullstop_mutex);
+ torture_type = NULL;
+ mutex_unlock(&fullstop_mutex);
+ return false;
+}
+EXPORT_SYMBOL_GPL(torture_cleanup);
+
+/*
+ * Is it time for the current torture test to stop?
+ */
+bool torture_must_stop(void)
+{
+ return torture_must_stop_irq() || kthread_should_stop();
+}
+EXPORT_SYMBOL_GPL(torture_must_stop);
+
+/*
+ * Is it time for the current torture test to stop? This is the irq-safe
+ * version, hence no check for kthread_should_stop().
+ */
+bool torture_must_stop_irq(void)
+{
+ return ACCESS_ONCE(fullstop) != FULLSTOP_DONTSTOP;
+}
+EXPORT_SYMBOL_GPL(torture_must_stop_irq);
+
+/*
+ * Each kthread must wait for kthread_should_stop() before returning from
+ * its top-level function, otherwise segfaults ensue. This function
+ * prints a "stopping" message and waits for kthread_should_stop(), and
+ * should be called from all torture kthreads immediately prior to
+ * returning.
+ */
+void torture_kthread_stopping(char *title)
+{
+ char buf[128];
+
+ snprintf(buf, sizeof(buf), "Stopping %s", title);
+ VERBOSE_TOROUT_STRING(buf);
+ while (!kthread_should_stop()) {
+ torture_shutdown_absorb(title);
+ schedule_timeout_uninterruptible(1);
+ }
+}
+EXPORT_SYMBOL_GPL(torture_kthread_stopping);
+
+/*
+ * Create a generic torture kthread that is immediately runnable. If you
+ * need the kthread to be stopped so that you can do something to it before
+ * it starts, you will need to open-code your own.
+ */
+int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m,
+ char *f, struct task_struct **tp)
+{
+ int ret = 0;
+
+ VERBOSE_TOROUT_STRING(m);
+ *tp = kthread_run(fn, arg, "%s", s);
+ if (IS_ERR(*tp)) {
+ ret = PTR_ERR(*tp);
+ VERBOSE_TOROUT_ERRSTRING(f);
+ *tp = NULL;
+ }
+ torture_shuffle_task_register(*tp);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(_torture_create_kthread);
+
+/*
+ * Stop a generic kthread, emitting a message.
+ */
+void _torture_stop_kthread(char *m, struct task_struct **tp)
+{
+ if (*tp == NULL)
+ return;
+ VERBOSE_TOROUT_STRING(m);
+ kthread_stop(*tp);
+ *tp = NULL;
+}
+EXPORT_SYMBOL_GPL(_torture_stop_kthread);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 015f85aaca08..a5da09c899dd 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -29,11 +29,6 @@ config HAVE_FUNCTION_GRAPH_FP_TEST
help
See Documentation/trace/ftrace-design.txt
-config HAVE_FUNCTION_TRACE_MCOUNT_TEST
- bool
- help
- See Documentation/trace/ftrace-design.txt
-
config HAVE_DYNAMIC_FTRACE
bool
help
@@ -424,6 +419,7 @@ config UPROBE_EVENT
bool "Enable uprobes-based dynamic events"
depends on ARCH_SUPPORTS_UPROBES
depends on MMU
+ depends on PERF_EVENTS
select UPROBES
select PROBE_EVENTS
select TRACING
@@ -534,6 +530,36 @@ config MMIOTRACE_TEST
Say N, unless you absolutely know what you are doing.
+config TRACEPOINT_BENCHMARK
+ bool "Add tracepoint that benchmarks tracepoints"
+ help
+ This option creates the tracepoint "benchmark:benchmark_event".
+ When the tracepoint is enabled, it kicks off a kernel thread that
+ goes into an infinite loop (calling cond_sched() to let other tasks
+ run), and calls the tracepoint. Each iteration will record the time
+ it took to write to the tracepoint and the next iteration that
+ data will be passed to the tracepoint itself. That is, the tracepoint
+ will report the time it took to do the previous tracepoint.
+ The string written to the tracepoint is a static string of 128 bytes
+ to keep the time the same. The initial string is simply a write of
+ "START". The second string records the cold cache time of the first
+ write which is not added to the rest of the calculations.
+
+ As it is a tight loop, it benchmarks as hot cache. That's fine because
+ we care most about hot paths that are probably in cache already.
+
+ An example of the output:
+
+ START
+ first=3672 [COLD CACHED]
+ last=632 first=3672 max=632 min=632 avg=316 std=446 std^2=199712
+ last=278 first=3672 max=632 min=278 avg=303 std=316 std^2=100337
+ last=277 first=3672 max=632 min=277 avg=296 std=258 std^2=67064
+ last=273 first=3672 max=632 min=273 avg=292 std=224 std^2=50411
+ last=273 first=3672 max=632 min=273 avg=288 std=200 std^2=40389
+ last=281 first=3672 max=632 min=273 avg=287 std=183 std^2=33666
+
+
config RING_BUFFER_BENCHMARK
tristate "Ring buffer benchmark stress tester"
depends on RING_BUFFER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 1378e84fbe39..67d6369ddf83 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -17,6 +17,7 @@ ifdef CONFIG_TRACING_BRANCHES
KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
endif
+CFLAGS_trace_benchmark.o := -I$(src)
CFLAGS_trace_events_filter.o := -I$(src)
obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o
@@ -27,6 +28,7 @@ obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o
obj-$(CONFIG_TRACING) += trace.o
obj-$(CONFIG_TRACING) += trace_output.o
+obj-$(CONFIG_TRACING) += trace_seq.o
obj-$(CONFIG_TRACING) += trace_stat.o
obj-$(CONFIG_TRACING) += trace_printk.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
@@ -62,4 +64,6 @@ endif
obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o
+obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
+
libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b418cb0d7242..c1bd4ada2a04 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -702,6 +702,7 @@ void blk_trace_shutdown(struct request_queue *q)
* blk_add_trace_rq - Add a trace for a request oriented action
* @q: queue the io is for
* @rq: the source request
+ * @nr_bytes: number of completed bytes
* @what: the action
*
* Description:
@@ -709,7 +710,7 @@ void blk_trace_shutdown(struct request_queue *q)
*
**/
static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
- u32 what)
+ unsigned int nr_bytes, u32 what)
{
struct blk_trace *bt = q->blk_trace;
@@ -718,11 +719,11 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
what |= BLK_TC_ACT(BLK_TC_PC);
- __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags,
+ __blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags,
what, rq->errors, rq->cmd_len, rq->cmd);
} else {
what |= BLK_TC_ACT(BLK_TC_FS);
- __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
+ __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes,
rq->cmd_flags, what, rq->errors, 0, NULL);
}
}
@@ -730,33 +731,34 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
static void blk_add_trace_rq_abort(void *ignore,
struct request_queue *q, struct request *rq)
{
- blk_add_trace_rq(q, rq, BLK_TA_ABORT);
+ blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ABORT);
}
static void blk_add_trace_rq_insert(void *ignore,
struct request_queue *q, struct request *rq)
{
- blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+ blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_INSERT);
}
static void blk_add_trace_rq_issue(void *ignore,
struct request_queue *q, struct request *rq)
{
- blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+ blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ISSUE);
}
static void blk_add_trace_rq_requeue(void *ignore,
struct request_queue *q,
struct request *rq)
{
- blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+ blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_REQUEUE);
}
static void blk_add_trace_rq_complete(void *ignore,
struct request_queue *q,
- struct request *rq)
+ struct request *rq,
+ unsigned int nr_bytes)
{
- blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
+ blk_add_trace_rq(q, rq, nr_bytes, BLK_TA_COMPLETE);
}
/**
@@ -1427,7 +1429,8 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
return print_one_line(iter, true);
}
-static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set)
+static int
+blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
/* don't output context-info for blk_classic output */
if (bit == TRACE_BLK_OPT_CLASSIC) {
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index cd7f76d1eb86..5916a8e59e87 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -62,27 +62,30 @@
#define FTRACE_HASH_DEFAULT_BITS 10
#define FTRACE_HASH_MAX_BITS 12
-#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL)
+#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL)
#ifdef CONFIG_DYNAMIC_FTRACE
-#define INIT_REGEX_LOCK(opsname) \
- .regex_lock = __MUTEX_INITIALIZER(opsname.regex_lock),
+#define INIT_OPS_HASH(opsname) \
+ .func_hash = &opsname.local_hash, \
+ .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
+#define ASSIGN_OPS_HASH(opsname, val) \
+ .func_hash = val, \
+ .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
#else
-#define INIT_REGEX_LOCK(opsname)
+#define INIT_OPS_HASH(opsname)
+#define ASSIGN_OPS_HASH(opsname, val)
#endif
static struct ftrace_ops ftrace_list_end __read_mostly = {
.func = ftrace_stub,
.flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
+ INIT_OPS_HASH(ftrace_list_end)
};
/* ftrace_enabled is a method to turn ftrace on or off */
int ftrace_enabled __read_mostly;
static int last_ftrace_enabled;
-/* Quick disabling of function tracer. */
-int function_trace_stop __read_mostly;
-
/* Current function tracing op */
struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
/* What to set function_trace_op to */
@@ -103,7 +106,6 @@ static int ftrace_disabled __read_mostly;
static DEFINE_MUTEX(ftrace_lock);
-static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
@@ -144,7 +146,8 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops)
{
#ifdef CONFIG_DYNAMIC_FTRACE
if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) {
- mutex_init(&ops->regex_lock);
+ mutex_init(&ops->local_hash.regex_lock);
+ ops->func_hash = &ops->local_hash;
ops->flags |= FTRACE_OPS_FL_INITIALIZED;
}
#endif
@@ -171,23 +174,6 @@ int ftrace_nr_registered_ops(void)
return cnt;
}
-static void
-ftrace_global_list_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *regs)
-{
- int bit;
-
- bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX);
- if (bit < 0)
- return;
-
- do_for_each_ftrace_op(op, ftrace_global_list) {
- op->func(ip, parent_ip, op, regs);
- } while_for_each_ftrace_op(op);
-
- trace_clear_recursion(bit);
-}
-
static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs)
{
@@ -237,49 +223,6 @@ static int control_ops_alloc(struct ftrace_ops *ops)
return 0;
}
-static void control_ops_free(struct ftrace_ops *ops)
-{
- free_percpu(ops->disabled);
-}
-
-static void update_global_ops(void)
-{
- ftrace_func_t func;
-
- /*
- * If there's only one function registered, then call that
- * function directly. Otherwise, we need to iterate over the
- * registered callers.
- */
- if (ftrace_global_list == &ftrace_list_end ||
- ftrace_global_list->next == &ftrace_list_end) {
- func = ftrace_global_list->func;
- /*
- * As we are calling the function directly.
- * If it does not have recursion protection,
- * the function_trace_op needs to be updated
- * accordingly.
- */
- if (ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE)
- global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
- else
- global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE;
- } else {
- func = ftrace_global_list_func;
- /* The list has its own recursion protection. */
- global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
- }
-
-
- /* If we filter on pids, update to use the pid function */
- if (!list_empty(&ftrace_pids)) {
- set_ftrace_pid_function(func);
- func = ftrace_pid_func;
- }
-
- global_ops.func = func;
-}
-
static void ftrace_sync(struct work_struct *work)
{
/*
@@ -307,8 +250,6 @@ static void update_ftrace_function(void)
{
ftrace_func_t func;
- update_global_ops();
-
/*
* If we are at the end of the list and this ops is
* recursion safe and not dynamic and the arch supports passing ops,
@@ -320,10 +261,7 @@ static void update_ftrace_function(void)
(ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) &&
!FTRACE_FORCE_LIST_FUNC)) {
/* Set the ftrace_ops that the arch callback uses */
- if (ftrace_ops_list == &global_ops)
- set_function_trace_op = ftrace_global_list;
- else
- set_function_trace_op = ftrace_ops_list;
+ set_function_trace_op = ftrace_ops_list;
func = ftrace_ops_list->func;
} else {
/* Just use the default ftrace_ops */
@@ -331,12 +269,12 @@ static void update_ftrace_function(void)
func = ftrace_ops_list_func;
}
+ update_function_graph_func();
+
/* If there's no change, then do nothing more here */
if (ftrace_trace_function == func)
return;
- update_function_graph_func();
-
/*
* If we are using the list function, it doesn't care
* about the function_trace_ops.
@@ -379,6 +317,11 @@ static void update_ftrace_function(void)
ftrace_trace_function = func;
}
+int using_ftrace_ops_list_func(void)
+{
+ return ftrace_trace_function == ftrace_ops_list_func;
+}
+
static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
{
ops->next = *list;
@@ -437,16 +380,12 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
static int __register_ftrace_function(struct ftrace_ops *ops)
{
- if (FTRACE_WARN_ON(ops == &global_ops))
+ if (ops->flags & FTRACE_OPS_FL_DELETED)
return -EINVAL;
if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
return -EBUSY;
- /* We don't support both control and global flags set. */
- if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
- return -EINVAL;
-
#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS
/*
* If the ftrace_ops specifies SAVE_REGS, then it only can be used
@@ -464,10 +403,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
if (!core_kernel_data((unsigned long)ops))
ops->flags |= FTRACE_OPS_FL_DYNAMIC;
- if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
- add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops);
- ops->flags |= FTRACE_OPS_FL_ENABLED;
- } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
+ if (ops->flags & FTRACE_OPS_FL_CONTROL) {
if (control_ops_alloc(ops))
return -ENOMEM;
add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
@@ -487,15 +423,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
return -EBUSY;
- if (FTRACE_WARN_ON(ops == &global_ops))
- return -EINVAL;
-
- if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
- ret = remove_ftrace_list_ops(&ftrace_global_list,
- &global_ops, ops);
- if (!ret)
- ops->flags &= ~FTRACE_OPS_FL_ENABLED;
- } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
+ if (ops->flags & FTRACE_OPS_FL_CONTROL) {
ret = remove_ftrace_list_ops(&ftrace_control_list,
&control_ops, ops);
} else
@@ -898,7 +826,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip,
local_irq_save(flags);
- stat = &__get_cpu_var(ftrace_profile_stats);
+ stat = this_cpu_ptr(&ftrace_profile_stats);
if (!stat->hash || !ftrace_profile_enabled)
goto out;
@@ -929,7 +857,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
unsigned long flags;
local_irq_save(flags);
- stat = &__get_cpu_var(ftrace_profile_stats);
+ stat = this_cpu_ptr(&ftrace_profile_stats);
if (!stat->hash || !ftrace_profile_enabled)
goto out;
@@ -978,7 +906,7 @@ static void unregister_ftrace_profiler(void)
static struct ftrace_ops ftrace_profile_ops __read_mostly = {
.func = function_profile_call,
.flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
- INIT_REGEX_LOCK(ftrace_profile_ops)
+ INIT_OPS_HASH(ftrace_profile_ops)
};
static int register_ftrace_profiler(void)
@@ -1118,6 +1046,8 @@ static struct pid * const ftrace_swapper_pid = &init_struct_pid;
#ifdef CONFIG_DYNAMIC_FTRACE
+static struct ftrace_ops *removed_ops;
+
#ifndef CONFIG_FTRACE_MCOUNT_RECORD
# error Dynamic ftrace depends on MCOUNT_RECORD
#endif
@@ -1158,11 +1088,12 @@ static const struct ftrace_hash empty_hash = {
#define EMPTY_HASH ((struct ftrace_hash *)&empty_hash)
static struct ftrace_ops global_ops = {
- .func = ftrace_stub,
- .notrace_hash = EMPTY_HASH,
- .filter_hash = EMPTY_HASH,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
- INIT_REGEX_LOCK(global_ops)
+ .func = ftrace_stub,
+ .local_hash.notrace_hash = EMPTY_HASH,
+ .local_hash.filter_hash = EMPTY_HASH,
+ INIT_OPS_HASH(global_ops)
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE |
+ FTRACE_OPS_FL_INITIALIZED,
};
struct ftrace_page {
@@ -1172,8 +1103,6 @@ struct ftrace_page {
int size;
};
-static struct ftrace_page *ftrace_new_pgs;
-
#define ENTRY_SIZE sizeof(struct dyn_ftrace)
#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE)
@@ -1183,7 +1112,7 @@ static struct ftrace_page *ftrace_new_pgs;
static struct ftrace_page *ftrace_pages_start;
static struct ftrace_page *ftrace_pages;
-static bool ftrace_hash_empty(struct ftrace_hash *hash)
+static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash)
{
return !hash || !hash->count;
}
@@ -1305,8 +1234,8 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
void ftrace_free_filter(struct ftrace_ops *ops)
{
ftrace_ops_init(ops);
- free_ftrace_hash(ops->filter_hash);
- free_ftrace_hash(ops->notrace_hash);
+ free_ftrace_hash(ops->func_hash->filter_hash);
+ free_ftrace_hash(ops->func_hash->notrace_hash);
}
static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
@@ -1367,9 +1296,9 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
}
static void
-ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash);
+ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash);
static void
-ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash);
+ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash);
static int
ftrace_hash_move(struct ftrace_ops *ops, int enable,
@@ -1382,25 +1311,15 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
struct ftrace_hash *new_hash;
int size = src->count;
int bits = 0;
- int ret;
int i;
/*
- * Remove the current set, update the hash and add
- * them back.
- */
- ftrace_hash_rec_disable(ops, enable);
-
- /*
* If the new source is empty, just free dst and assign it
* the empty_hash.
*/
if (!src->count) {
- free_ftrace_hash_rcu(*dst);
- rcu_assign_pointer(*dst, EMPTY_HASH);
- /* still need to update the function records */
- ret = 0;
- goto out;
+ new_hash = EMPTY_HASH;
+ goto update;
}
/*
@@ -1413,10 +1332,9 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
if (bits > FTRACE_HASH_MAX_BITS)
bits = FTRACE_HASH_MAX_BITS;
- ret = -ENOMEM;
new_hash = alloc_ftrace_hash(bits);
if (!new_hash)
- goto out;
+ return -ENOMEM;
size = 1 << src->size_bits;
for (i = 0; i < size; i++) {
@@ -1427,20 +1345,20 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
}
}
+update:
+ /*
+ * Remove the current set, update the hash and add
+ * them back.
+ */
+ ftrace_hash_rec_disable_modify(ops, enable);
+
old_hash = *dst;
rcu_assign_pointer(*dst, new_hash);
free_ftrace_hash_rcu(old_hash);
- ret = 0;
- out:
- /*
- * Enable regardless of ret:
- * On success, we enable the new hash.
- * On failure, we re-enable the original hash.
- */
- ftrace_hash_rec_enable(ops, enable);
+ ftrace_hash_rec_enable_modify(ops, enable);
- return ret;
+ return 0;
}
/*
@@ -1472,8 +1390,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
return 0;
#endif
- filter_hash = rcu_dereference_raw_notrace(ops->filter_hash);
- notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash);
+ filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash);
+ notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash);
if ((ftrace_hash_empty(filter_hash) ||
ftrace_lookup_ip(filter_hash, ip)) &&
@@ -1560,7 +1478,7 @@ unsigned long ftrace_location(unsigned long ip)
* the function tracer. It checks the ftrace internal tables to
* determine if the address belongs or not.
*/
-int ftrace_text_reserved(void *start, void *end)
+int ftrace_text_reserved(const void *start, const void *end)
{
unsigned long ret;
@@ -1570,6 +1488,66 @@ int ftrace_text_reserved(void *start, void *end)
return (int)!!ret;
}
+/* Test if ops registered to this rec needs regs */
+static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *ops;
+ bool keep_regs = false;
+
+ for (ops = ftrace_ops_list;
+ ops != &ftrace_list_end; ops = ops->next) {
+ /* pass rec in as regs to have non-NULL val */
+ if (ftrace_ops_test(ops, rec->ip, rec)) {
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
+ keep_regs = true;
+ break;
+ }
+ }
+ }
+
+ return keep_regs;
+}
+
+static void ftrace_remove_tramp(struct ftrace_ops *ops,
+ struct dyn_ftrace *rec)
+{
+ /* If TRAMP is not set, no ops should have a trampoline for this */
+ if (!(rec->flags & FTRACE_FL_TRAMP))
+ return;
+
+ rec->flags &= ~FTRACE_FL_TRAMP;
+
+ if ((!ftrace_hash_empty(ops->func_hash->filter_hash) &&
+ !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) ||
+ ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip))
+ return;
+ /*
+ * The tramp_hash entry will be removed at time
+ * of update.
+ */
+ ops->nr_trampolines--;
+}
+
+static void ftrace_clear_tramps(struct dyn_ftrace *rec, struct ftrace_ops *ops)
+{
+ struct ftrace_ops *op;
+
+ /* If TRAMP is not set, no ops should have a trampoline for this */
+ if (!(rec->flags & FTRACE_FL_TRAMP))
+ return;
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ /*
+ * This function is called to clear other tramps
+ * not the one that is being updated.
+ */
+ if (op == ops)
+ continue;
+ if (op->nr_trampolines)
+ ftrace_remove_tramp(op, rec);
+ } while_for_each_ftrace_op(op);
+}
+
static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
int filter_hash,
bool inc)
@@ -1597,14 +1575,14 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
* gets inversed.
*/
if (filter_hash) {
- hash = ops->filter_hash;
- other_hash = ops->notrace_hash;
+ hash = ops->func_hash->filter_hash;
+ other_hash = ops->func_hash->notrace_hash;
if (ftrace_hash_empty(hash))
all = 1;
} else {
inc = !inc;
- hash = ops->notrace_hash;
- other_hash = ops->filter_hash;
+ hash = ops->func_hash->notrace_hash;
+ other_hash = ops->func_hash->filter_hash;
/*
* If the notrace hash has no items,
* then there's nothing to do.
@@ -1630,7 +1608,14 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip);
/*
+ * If filter_hash is set, we want to match all functions
+ * that are in the hash but not in the other hash.
*
+ * If filter_hash is not set, then we are decrementing.
+ * That means we match anything that is in the hash
+ * and also in the other_hash. That is, we need to turn
+ * off functions in the other hash because they are disabled
+ * by this hash.
*/
if (filter_hash && in_hash && !in_other_hash)
match = 1;
@@ -1643,8 +1628,27 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
if (inc) {
rec->flags++;
- if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX))
+ if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX))
return;
+
+ /*
+ * If there's only a single callback registered to a
+ * function, and the ops has a trampoline registered
+ * for it, then we can call it directly.
+ */
+ if (ftrace_rec_count(rec) == 1 && ops->trampoline) {
+ rec->flags |= FTRACE_FL_TRAMP;
+ ops->nr_trampolines++;
+ } else {
+ /*
+ * If we are adding another function callback
+ * to this function, and the previous had a
+ * custom trampoline in use, then we need to go
+ * back to the default trampoline.
+ */
+ ftrace_clear_tramps(rec, ops);
+ }
+
/*
* If any ops wants regs saved for this function
* then all ops will get saved regs.
@@ -1652,9 +1656,30 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
rec->flags |= FTRACE_FL_REGS;
} else {
- if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0))
+ if (FTRACE_WARN_ON(ftrace_rec_count(rec) == 0))
return;
rec->flags--;
+
+ if (ops->trampoline && !ftrace_rec_count(rec))
+ ftrace_remove_tramp(ops, rec);
+
+ /*
+ * If the rec had REGS enabled and the ops that is
+ * being removed had REGS set, then see if there is
+ * still any ops for this record that wants regs.
+ * If not, we can stop recording them.
+ */
+ if (ftrace_rec_count(rec) > 0 &&
+ rec->flags & FTRACE_FL_REGS &&
+ ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
+ if (!test_rec_ops_needs_regs(rec))
+ rec->flags &= ~FTRACE_FL_REGS;
+ }
+
+ /*
+ * flags will be cleared in ftrace_check_record()
+ * if rec count is zero.
+ */
}
count++;
/* Shortcut, if we handled all records, we are done. */
@@ -1675,6 +1700,41 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
__ftrace_hash_rec_update(ops, filter_hash, 1);
}
+static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops,
+ int filter_hash, int inc)
+{
+ struct ftrace_ops *op;
+
+ __ftrace_hash_rec_update(ops, filter_hash, inc);
+
+ if (ops->func_hash != &global_ops.local_hash)
+ return;
+
+ /*
+ * If the ops shares the global_ops hash, then we need to update
+ * all ops that are enabled and use this hash.
+ */
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ /* Already done */
+ if (op == ops)
+ continue;
+ if (op->func_hash == &global_ops.local_hash)
+ __ftrace_hash_rec_update(op, filter_hash, inc);
+ } while_for_each_ftrace_op(op);
+}
+
+static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops,
+ int filter_hash)
+{
+ ftrace_hash_rec_update_modify(ops, filter_hash, 0);
+}
+
+static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
+ int filter_hash)
+{
+ ftrace_hash_rec_update_modify(ops, filter_hash, 1);
+}
+
static void print_ip_ins(const char *fmt, unsigned char *p)
{
int i;
@@ -1739,17 +1799,23 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
* If we are disabling calls, then disable all records that
* are enabled.
*/
- if (enable && (rec->flags & ~FTRACE_FL_MASK))
+ if (enable && ftrace_rec_count(rec))
flag = FTRACE_FL_ENABLED;
/*
- * If enabling and the REGS flag does not match the REGS_EN, then
- * do not ignore this record. Set flags to fail the compare against
- * ENABLED.
+ * If enabling and the REGS flag does not match the REGS_EN, or
+ * the TRAMP flag doesn't match the TRAMP_EN, then do not ignore
+ * this record. Set flags to fail the compare against ENABLED.
*/
- if (flag &&
- (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN)))
- flag |= FTRACE_FL_REGS;
+ if (flag) {
+ if (!(rec->flags & FTRACE_FL_REGS) !=
+ !(rec->flags & FTRACE_FL_REGS_EN))
+ flag |= FTRACE_FL_REGS;
+
+ if (!(rec->flags & FTRACE_FL_TRAMP) !=
+ !(rec->flags & FTRACE_FL_TRAMP_EN))
+ flag |= FTRACE_FL_TRAMP;
+ }
/* If the state of this record hasn't changed, then do nothing */
if ((rec->flags & FTRACE_FL_ENABLED) == flag)
@@ -1767,29 +1833,31 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
else
rec->flags &= ~FTRACE_FL_REGS_EN;
}
+ if (flag & FTRACE_FL_TRAMP) {
+ if (rec->flags & FTRACE_FL_TRAMP)
+ rec->flags |= FTRACE_FL_TRAMP_EN;
+ else
+ rec->flags &= ~FTRACE_FL_TRAMP_EN;
+ }
}
/*
* If this record is being updated from a nop, then
* return UPDATE_MAKE_CALL.
- * Otherwise, if the EN flag is set, then return
- * UPDATE_MODIFY_CALL_REGS to tell the caller to convert
- * from the non-save regs, to a save regs function.
* Otherwise,
* return UPDATE_MODIFY_CALL to tell the caller to convert
- * from the save regs, to a non-save regs function.
+ * from the save regs, to a non-save regs function or
+ * vice versa, or from a trampoline call.
*/
if (flag & FTRACE_FL_ENABLED)
return FTRACE_UPDATE_MAKE_CALL;
- else if (rec->flags & FTRACE_FL_REGS_EN)
- return FTRACE_UPDATE_MODIFY_CALL_REGS;
- else
- return FTRACE_UPDATE_MODIFY_CALL;
+
+ return FTRACE_UPDATE_MODIFY_CALL;
}
if (update) {
/* If there's no more users, clear all flags */
- if (!(rec->flags & ~FTRACE_FL_MASK))
+ if (!ftrace_rec_count(rec))
rec->flags = 0;
else
/* Just disable the record (keep REGS state) */
@@ -1826,6 +1894,107 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
return ftrace_check_record(rec, enable, 0);
}
+static struct ftrace_ops *
+ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *op;
+
+ /* Removed ops need to be tested first */
+ if (removed_ops && removed_ops->tramp_hash) {
+ if (ftrace_lookup_ip(removed_ops->tramp_hash, rec->ip))
+ return removed_ops;
+ }
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ if (!op->tramp_hash)
+ continue;
+
+ if (ftrace_lookup_ip(op->tramp_hash, rec->ip))
+ return op;
+
+ } while_for_each_ftrace_op(op);
+
+ return NULL;
+}
+
+static struct ftrace_ops *
+ftrace_find_tramp_ops_new(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *op;
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ /* pass rec in as regs to have non-NULL val */
+ if (ftrace_ops_test(op, rec->ip, rec))
+ return op;
+ } while_for_each_ftrace_op(op);
+
+ return NULL;
+}
+
+/**
+ * ftrace_get_addr_new - Get the call address to set to
+ * @rec: The ftrace record descriptor
+ *
+ * If the record has the FTRACE_FL_REGS set, that means that it
+ * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS
+ * is not not set, then it wants to convert to the normal callback.
+ *
+ * Returns the address of the trampoline to set to
+ */
+unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *ops;
+
+ /* Trampolines take precedence over regs */
+ if (rec->flags & FTRACE_FL_TRAMP) {
+ ops = ftrace_find_tramp_ops_new(rec);
+ if (FTRACE_WARN_ON(!ops || !ops->trampoline)) {
+ pr_warn("Bad trampoline accounting at: %p (%pS) (%lx)\n",
+ (void *)rec->ip, (void *)rec->ip, rec->flags);
+ /* Ftrace is shutting down, return anything */
+ return (unsigned long)FTRACE_ADDR;
+ }
+ return ops->trampoline;
+ }
+
+ if (rec->flags & FTRACE_FL_REGS)
+ return (unsigned long)FTRACE_REGS_ADDR;
+ else
+ return (unsigned long)FTRACE_ADDR;
+}
+
+/**
+ * ftrace_get_addr_curr - Get the call address that is already there
+ * @rec: The ftrace record descriptor
+ *
+ * The FTRACE_FL_REGS_EN is set when the record already points to
+ * a function that saves all the regs. Basically the '_EN' version
+ * represents the current state of the function.
+ *
+ * Returns the address of the trampoline that is currently being called
+ */
+unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *ops;
+
+ /* Trampolines take precedence over regs */
+ if (rec->flags & FTRACE_FL_TRAMP_EN) {
+ ops = ftrace_find_tramp_ops_curr(rec);
+ if (FTRACE_WARN_ON(!ops)) {
+ pr_warning("Bad trampoline accounting at: %p (%pS)\n",
+ (void *)rec->ip, (void *)rec->ip);
+ /* Ftrace is shutting down, return anything */
+ return (unsigned long)FTRACE_ADDR;
+ }
+ return ops->trampoline;
+ }
+
+ if (rec->flags & FTRACE_FL_REGS_EN)
+ return (unsigned long)FTRACE_REGS_ADDR;
+ else
+ return (unsigned long)FTRACE_ADDR;
+}
+
static int
__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
{
@@ -1833,12 +2002,12 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
unsigned long ftrace_addr;
int ret;
- ret = ftrace_update_record(rec, enable);
+ ftrace_addr = ftrace_get_addr_new(rec);
- if (rec->flags & FTRACE_FL_REGS)
- ftrace_addr = (unsigned long)FTRACE_REGS_ADDR;
- else
- ftrace_addr = (unsigned long)FTRACE_ADDR;
+ /* This needs to be done before we call ftrace_update_record */
+ ftrace_old_addr = ftrace_get_addr_curr(rec);
+
+ ret = ftrace_update_record(rec, enable);
switch (ret) {
case FTRACE_UPDATE_IGNORE:
@@ -1848,15 +2017,9 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
return ftrace_make_call(rec, ftrace_addr);
case FTRACE_UPDATE_MAKE_NOP:
- return ftrace_make_nop(NULL, rec, ftrace_addr);
+ return ftrace_make_nop(NULL, rec, ftrace_old_addr);
- case FTRACE_UPDATE_MODIFY_CALL_REGS:
case FTRACE_UPDATE_MODIFY_CALL:
- if (rec->flags & FTRACE_FL_REGS)
- ftrace_old_addr = (unsigned long)FTRACE_ADDR;
- else
- ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR;
-
return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
}
@@ -1994,6 +2157,7 @@ int __weak ftrace_arch_code_modify_post_process(void)
void ftrace_modify_all_code(int command)
{
int update = command & FTRACE_UPDATE_TRACE_FUNC;
+ int err = 0;
/*
* If the ftrace_caller calls a ftrace_ops func directly,
@@ -2005,8 +2169,11 @@ void ftrace_modify_all_code(int command)
* to make sure the ops are having the right functions
* traced.
*/
- if (update)
- ftrace_update_ftrace_func(ftrace_ops_list_func);
+ if (update) {
+ err = ftrace_update_ftrace_func(ftrace_ops_list_func);
+ if (FTRACE_WARN_ON(err))
+ return;
+ }
if (command & FTRACE_UPDATE_CALLS)
ftrace_replace_code(1);
@@ -2019,13 +2186,16 @@ void ftrace_modify_all_code(int command)
/* If irqs are disabled, we are in stop machine */
if (!irqs_disabled())
smp_call_function(ftrace_sync_ipi, NULL, 1);
- ftrace_update_ftrace_func(ftrace_trace_function);
+ err = ftrace_update_ftrace_func(ftrace_trace_function);
+ if (FTRACE_WARN_ON(err))
+ return;
}
if (command & FTRACE_START_FUNC_RET)
- ftrace_enable_ftrace_graph_caller();
+ err = ftrace_enable_ftrace_graph_caller();
else if (command & FTRACE_STOP_FUNC_RET)
- ftrace_disable_ftrace_graph_caller();
+ err = ftrace_disable_ftrace_graph_caller();
+ FTRACE_WARN_ON(err);
}
static int __ftrace_modify_code(void *data)
@@ -2061,6 +2231,92 @@ void __weak arch_ftrace_update_code(int command)
ftrace_run_stop_machine(command);
}
+static int ftrace_save_ops_tramp_hash(struct ftrace_ops *ops)
+{
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
+ int size, bits;
+ int ret;
+
+ size = ops->nr_trampolines;
+ bits = 0;
+ /*
+ * Make the hash size about 1/2 the # found
+ */
+ for (size /= 2; size; size >>= 1)
+ bits++;
+
+ ops->tramp_hash = alloc_ftrace_hash(bits);
+ /*
+ * TODO: a failed allocation is going to screw up
+ * the accounting of what needs to be modified
+ * and not. For now, we kill ftrace if we fail
+ * to allocate here. But there are ways around this,
+ * but that will take a little more work.
+ */
+ if (!ops->tramp_hash)
+ return -ENOMEM;
+
+ do_for_each_ftrace_rec(pg, rec) {
+ if (ftrace_rec_count(rec) == 1 &&
+ ftrace_ops_test(ops, rec->ip, rec)) {
+
+ /*
+ * If another ops adds to a rec, the rec will
+ * lose its trampoline and never get it back
+ * until all ops are off of it.
+ */
+ if (!(rec->flags & FTRACE_FL_TRAMP))
+ continue;
+
+ /* This record had better have a trampoline */
+ if (FTRACE_WARN_ON(!(rec->flags & FTRACE_FL_TRAMP_EN)))
+ return -1;
+
+ ret = add_hash_entry(ops->tramp_hash, rec->ip);
+ if (ret < 0)
+ return ret;
+ }
+ } while_for_each_ftrace_rec();
+
+ /* The number of recs in the hash must match nr_trampolines */
+ if (FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines))
+ pr_warn("count=%ld trampolines=%d\n",
+ ops->tramp_hash->count,
+ ops->nr_trampolines);
+
+ return 0;
+}
+
+static int ftrace_save_tramp_hashes(void)
+{
+ struct ftrace_ops *op;
+ int ret;
+
+ /*
+ * Now that any trampoline is being used, we need to save the
+ * hashes for the ops that have them. This allows the mapping
+ * back from the record to the ops that has the trampoline to
+ * know what code is being replaced. Modifying code must always
+ * verify what it is changing.
+ */
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+
+ /* The tramp_hash is recreated each time. */
+ free_ftrace_hash(op->tramp_hash);
+ op->tramp_hash = NULL;
+
+ if (op->nr_trampolines) {
+ ret = ftrace_save_ops_tramp_hash(op);
+ if (ret)
+ return ret;
+ }
+
+ } while_for_each_ftrace_op(op);
+
+ return 0;
+}
+
static void ftrace_run_update_code(int command)
{
int ret;
@@ -2069,11 +2325,6 @@ static void ftrace_run_update_code(int command)
FTRACE_WARN_ON(ret);
if (ret)
return;
- /*
- * Do not call function tracer while we update the code.
- * We are in stop machine.
- */
- function_trace_stop++;
/*
* By default we use stop_machine() to modify the code.
@@ -2083,15 +2334,20 @@ static void ftrace_run_update_code(int command)
*/
arch_ftrace_update_code(command);
- function_trace_stop--;
-
ret = ftrace_arch_code_modify_post_process();
FTRACE_WARN_ON(ret);
+
+ ret = ftrace_save_tramp_hashes();
+ FTRACE_WARN_ON(ret);
}
static ftrace_func_t saved_ftrace_func;
static int ftrace_start_up;
-static int global_start_up;
+
+static void control_ops_free(struct ftrace_ops *ops)
+{
+ free_percpu(ops->disabled);
+}
static void ftrace_startup_enable(int command)
{
@@ -2108,7 +2364,6 @@ static void ftrace_startup_enable(int command)
static int ftrace_startup(struct ftrace_ops *ops, int command)
{
- bool hash_enable = true;
int ret;
if (unlikely(ftrace_disabled))
@@ -2121,18 +2376,9 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
ftrace_start_up++;
command |= FTRACE_UPDATE_CALLS;
- /* ops marked global share the filter hashes */
- if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
- ops = &global_ops;
- /* Don't update hash if global is already set */
- if (global_start_up)
- hash_enable = false;
- global_start_up++;
- }
-
ops->flags |= FTRACE_OPS_FL_ENABLED;
- if (hash_enable)
- ftrace_hash_rec_enable(ops, 1);
+
+ ftrace_hash_rec_enable(ops, 1);
ftrace_startup_enable(command);
@@ -2141,7 +2387,6 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
static int ftrace_shutdown(struct ftrace_ops *ops, int command)
{
- bool hash_disable = true;
int ret;
if (unlikely(ftrace_disabled))
@@ -2159,22 +2404,9 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
*/
WARN_ON_ONCE(ftrace_start_up < 0);
- if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
- ops = &global_ops;
- global_start_up--;
- WARN_ON_ONCE(global_start_up < 0);
- /* Don't update hash if global still has users */
- if (global_start_up) {
- WARN_ON_ONCE(!ftrace_start_up);
- hash_disable = false;
- }
- }
-
- if (hash_disable)
- ftrace_hash_rec_disable(ops, 1);
+ ftrace_hash_rec_disable(ops, 1);
- if (ops != &global_ops || !global_start_up)
- ops->flags &= ~FTRACE_OPS_FL_ENABLED;
+ ops->flags &= ~FTRACE_OPS_FL_ENABLED;
command |= FTRACE_UPDATE_CALLS;
@@ -2195,8 +2427,16 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
return 0;
}
+ /*
+ * If the ops uses a trampoline, then it needs to be
+ * tested first on update.
+ */
+ removed_ops = ops;
+
ftrace_run_update_code(command);
+ removed_ops = NULL;
+
/*
* Dynamic ops may be freed, we must make sure that all
* callers are done before leaving this function.
@@ -2244,7 +2484,6 @@ static void ftrace_shutdown_sysctl(void)
}
static cycle_t ftrace_update_time;
-static unsigned long ftrace_update_cnt;
unsigned long ftrace_update_tot_cnt;
static inline int ops_traces_mod(struct ftrace_ops *ops)
@@ -2253,8 +2492,8 @@ static inline int ops_traces_mod(struct ftrace_ops *ops)
* Filter_hash being empty will default to trace module.
* But notrace hash requires a test of individual module functions.
*/
- return ftrace_hash_empty(ops->filter_hash) &&
- ftrace_hash_empty(ops->notrace_hash);
+ return ftrace_hash_empty(ops->func_hash->filter_hash) &&
+ ftrace_hash_empty(ops->func_hash->notrace_hash);
}
/*
@@ -2276,12 +2515,12 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
return 0;
/* The function must be in the filter */
- if (!ftrace_hash_empty(ops->filter_hash) &&
- !ftrace_lookup_ip(ops->filter_hash, rec->ip))
+ if (!ftrace_hash_empty(ops->func_hash->filter_hash) &&
+ !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))
return 0;
/* If in notrace hash, we ignore it too */
- if (ftrace_lookup_ip(ops->notrace_hash, rec->ip))
+ if (ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip))
return 0;
return 1;
@@ -2300,11 +2539,12 @@ static int referenced_filters(struct dyn_ftrace *rec)
return cnt;
}
-static int ftrace_update_code(struct module *mod)
+static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
{
struct ftrace_page *pg;
struct dyn_ftrace *p;
cycle_t start, stop;
+ unsigned long update_cnt = 0;
unsigned long ref = 0;
bool test = false;
int i;
@@ -2330,9 +2570,8 @@ static int ftrace_update_code(struct module *mod)
}
start = ftrace_now(raw_smp_processor_id());
- ftrace_update_cnt = 0;
- for (pg = ftrace_new_pgs; pg; pg = pg->next) {
+ for (pg = new_pgs; pg; pg = pg->next) {
for (i = 0; i < pg->index; i++) {
int cnt = ref;
@@ -2353,7 +2592,7 @@ static int ftrace_update_code(struct module *mod)
if (!ftrace_code_disable(mod, p))
break;
- ftrace_update_cnt++;
+ update_cnt++;
/*
* If the tracing is enabled, go ahead and enable the record.
@@ -2372,11 +2611,9 @@ static int ftrace_update_code(struct module *mod)
}
}
- ftrace_new_pgs = NULL;
-
stop = ftrace_now(raw_smp_processor_id());
ftrace_update_time = stop - start;
- ftrace_update_tot_cnt += ftrace_update_cnt;
+ ftrace_update_tot_cnt += update_cnt;
return 0;
}
@@ -2457,7 +2694,8 @@ ftrace_allocate_pages(unsigned long num_to_init)
return start_pg;
free_pages:
- while (start_pg) {
+ pg = start_pg;
+ while (pg) {
order = get_count_order(pg->size / ENTRIES_PER_PAGE);
free_pages((unsigned long)pg->records, order);
start_pg = pg->next;
@@ -2468,22 +2706,6 @@ ftrace_allocate_pages(unsigned long num_to_init)
return NULL;
}
-static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
-{
- int cnt;
-
- if (!num_to_init) {
- pr_info("ftrace: No functions to be traced?\n");
- return -1;
- }
-
- cnt = num_to_init / ENTRIES_PER_PAGE;
- pr_info("ftrace: allocating %ld entries in %d pages\n",
- num_to_init, cnt + 1);
-
- return 0;
-}
-
#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
struct ftrace_iterator {
@@ -2619,10 +2841,10 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
} else {
rec = &iter->pg->records[iter->idx++];
if (((iter->flags & FTRACE_ITER_FILTER) &&
- !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) ||
+ !(ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))) ||
((iter->flags & FTRACE_ITER_NOTRACE) &&
- !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) ||
+ !ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) ||
((iter->flags & FTRACE_ITER_ENABLED) &&
!(rec->flags & FTRACE_FL_ENABLED))) {
@@ -2670,8 +2892,10 @@ static void *t_start(struct seq_file *m, loff_t *pos)
* off, we can short cut and just print out that all
* functions are enabled.
*/
- if (iter->flags & FTRACE_ITER_FILTER &&
- ftrace_hash_empty(ops->filter_hash)) {
+ if ((iter->flags & FTRACE_ITER_FILTER &&
+ ftrace_hash_empty(ops->func_hash->filter_hash)) ||
+ (iter->flags & FTRACE_ITER_NOTRACE &&
+ ftrace_hash_empty(ops->func_hash->notrace_hash))) {
if (*pos > 0)
return t_hash_start(m, pos);
iter->flags |= FTRACE_ITER_PRINTALL;
@@ -2716,7 +2940,10 @@ static int t_show(struct seq_file *m, void *v)
return t_hash_show(m, iter);
if (iter->flags & FTRACE_ITER_PRINTALL) {
- seq_printf(m, "#### all functions enabled ####\n");
+ if (iter->flags & FTRACE_ITER_NOTRACE)
+ seq_printf(m, "#### no functions disabled ####\n");
+ else
+ seq_printf(m, "#### all functions enabled ####\n");
return 0;
}
@@ -2726,10 +2953,22 @@ static int t_show(struct seq_file *m, void *v)
return 0;
seq_printf(m, "%ps", (void *)rec->ip);
- if (iter->flags & FTRACE_ITER_ENABLED)
+ if (iter->flags & FTRACE_ITER_ENABLED) {
seq_printf(m, " (%ld)%s",
- rec->flags & ~FTRACE_FL_MASK,
- rec->flags & FTRACE_FL_REGS ? " R" : "");
+ ftrace_rec_count(rec),
+ rec->flags & FTRACE_FL_REGS ? " R" : " ");
+ if (rec->flags & FTRACE_FL_TRAMP_EN) {
+ struct ftrace_ops *ops;
+
+ ops = ftrace_find_tramp_ops_curr(rec);
+ if (ops && ops->trampoline)
+ seq_printf(m, "\ttramp: %pS",
+ (void *)ops->trampoline);
+ else
+ seq_printf(m, "\ttramp: ERROR!");
+ }
+ }
+
seq_printf(m, "\n");
return 0;
@@ -2777,13 +3016,6 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
return iter ? 0 : -ENOMEM;
}
-static void ftrace_filter_reset(struct ftrace_hash *hash)
-{
- mutex_lock(&ftrace_lock);
- ftrace_hash_clear(hash);
- mutex_unlock(&ftrace_lock);
-}
-
/**
* ftrace_regex_open - initialize function tracer filter files
* @ops: The ftrace_ops that hold the hash filters
@@ -2825,15 +3057,21 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
iter->ops = ops;
iter->flags = flag;
- mutex_lock(&ops->regex_lock);
+ mutex_lock(&ops->func_hash->regex_lock);
if (flag & FTRACE_ITER_NOTRACE)
- hash = ops->notrace_hash;
+ hash = ops->func_hash->notrace_hash;
else
- hash = ops->filter_hash;
+ hash = ops->func_hash->filter_hash;
if (file->f_mode & FMODE_WRITE) {
- iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash);
+ const int size_bits = FTRACE_HASH_DEFAULT_BITS;
+
+ if (file->f_flags & O_TRUNC)
+ iter->hash = alloc_ftrace_hash(size_bits);
+ else
+ iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash);
+
if (!iter->hash) {
trace_parser_put(&iter->parser);
kfree(iter);
@@ -2842,10 +3080,6 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
}
}
- if ((file->f_mode & FMODE_WRITE) &&
- (file->f_flags & O_TRUNC))
- ftrace_filter_reset(iter->hash);
-
if (file->f_mode & FMODE_READ) {
iter->pg = ftrace_pages_start;
@@ -2863,7 +3097,7 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
file->private_data = iter;
out_unlock:
- mutex_unlock(&ops->regex_lock);
+ mutex_unlock(&ops->func_hash->regex_lock);
return ret;
}
@@ -2871,7 +3105,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
static int
ftrace_filter_open(struct inode *inode, struct file *file)
{
- return ftrace_regex_open(&global_ops,
+ struct ftrace_ops *ops = inode->i_private;
+
+ return ftrace_regex_open(ops,
FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH,
inode, file);
}
@@ -2879,7 +3115,9 @@ ftrace_filter_open(struct inode *inode, struct file *file)
static int
ftrace_notrace_open(struct inode *inode, struct file *file)
{
- return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE,
+ struct ftrace_ops *ops = inode->i_private;
+
+ return ftrace_regex_open(ops, FTRACE_ITER_NOTRACE,
inode, file);
}
@@ -3097,7 +3335,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly =
{
.func = function_trace_probe_call,
.flags = FTRACE_OPS_FL_INITIALIZED,
- INIT_REGEX_LOCK(trace_probe_ops)
+ INIT_OPS_HASH(trace_probe_ops)
};
static int ftrace_probe_registered;
@@ -3160,7 +3398,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
void *data)
{
struct ftrace_func_probe *entry;
- struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash;
+ struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
struct ftrace_hash *hash;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
@@ -3177,7 +3415,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
if (WARN_ON(not))
return -EINVAL;
- mutex_lock(&trace_probe_ops.regex_lock);
+ mutex_lock(&trace_probe_ops.func_hash->regex_lock);
hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
if (!hash) {
@@ -3246,7 +3484,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
out_unlock:
mutex_unlock(&ftrace_lock);
out:
- mutex_unlock(&trace_probe_ops.regex_lock);
+ mutex_unlock(&trace_probe_ops.func_hash->regex_lock);
free_ftrace_hash(hash);
return count;
@@ -3264,7 +3502,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
struct ftrace_func_entry *rec_entry;
struct ftrace_func_probe *entry;
struct ftrace_func_probe *p;
- struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash;
+ struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
struct list_head free_list;
struct ftrace_hash *hash;
struct hlist_node *tmp;
@@ -3286,7 +3524,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
return;
}
- mutex_lock(&trace_probe_ops.regex_lock);
+ mutex_lock(&trace_probe_ops.func_hash->regex_lock);
hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
if (!hash)
@@ -3339,7 +3577,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
mutex_unlock(&ftrace_lock);
out_unlock:
- mutex_unlock(&trace_probe_ops.regex_lock);
+ mutex_unlock(&trace_probe_ops.func_hash->regex_lock);
free_ftrace_hash(hash);
}
@@ -3532,28 +3770,26 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
struct ftrace_hash *hash;
int ret;
- /* All global ops uses the global ops filters */
- if (ops->flags & FTRACE_OPS_FL_GLOBAL)
- ops = &global_ops;
-
if (unlikely(ftrace_disabled))
return -ENODEV;
- mutex_lock(&ops->regex_lock);
+ mutex_lock(&ops->func_hash->regex_lock);
if (enable)
- orig_hash = &ops->filter_hash;
+ orig_hash = &ops->func_hash->filter_hash;
else
- orig_hash = &ops->notrace_hash;
+ orig_hash = &ops->func_hash->notrace_hash;
+
+ if (reset)
+ hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
+ else
+ hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
- hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
if (!hash) {
ret = -ENOMEM;
goto out_regex_unlock;
}
- if (reset)
- ftrace_filter_reset(hash);
if (buf && !ftrace_match_records(hash, buf, len)) {
ret = -EINVAL;
goto out_regex_unlock;
@@ -3572,7 +3808,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
mutex_unlock(&ftrace_lock);
out_regex_unlock:
- mutex_unlock(&ops->regex_lock);
+ mutex_unlock(&ops->func_hash->regex_lock);
free_ftrace_hash(hash);
return ret;
@@ -3647,8 +3883,7 @@ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
}
EXPORT_SYMBOL_GPL(ftrace_set_notrace);
/**
- * ftrace_set_filter - set a function to filter on in ftrace
- * @ops - the ops to set the filter with
+ * ftrace_set_global_filter - set a function to filter on with global tracers
* @buf - the string that holds the function filter text.
* @len - the length of the string.
* @reset - non zero to reset all filters before applying this filter.
@@ -3663,8 +3898,7 @@ void ftrace_set_global_filter(unsigned char *buf, int len, int reset)
EXPORT_SYMBOL_GPL(ftrace_set_global_filter);
/**
- * ftrace_set_notrace - set a function to not trace in ftrace
- * @ops - the ops to set the notrace filter with
+ * ftrace_set_global_notrace - set a function to not trace with global tracers
* @buf - the string that holds the function notrace text.
* @len - the length of the string.
* @reset - non zero to reset all filters before applying this filter.
@@ -3707,6 +3941,7 @@ __setup("ftrace_filter=", set_ftrace_filter);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
+static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
static int __init set_graph_function(char *str)
@@ -3716,16 +3951,29 @@ static int __init set_graph_function(char *str)
}
__setup("ftrace_graph_filter=", set_graph_function);
-static void __init set_ftrace_early_graph(char *buf)
+static int __init set_graph_notrace_function(char *str)
+{
+ strlcpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE);
+ return 1;
+}
+__setup("ftrace_graph_notrace=", set_graph_notrace_function);
+
+static void __init set_ftrace_early_graph(char *buf, int enable)
{
int ret;
char *func;
+ unsigned long *table = ftrace_graph_funcs;
+ int *count = &ftrace_graph_count;
+
+ if (!enable) {
+ table = ftrace_graph_notrace_funcs;
+ count = &ftrace_graph_notrace_count;
+ }
while (buf) {
func = strsep(&buf, ",");
/* we allow only one expression at a time */
- ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
- FTRACE_GRAPH_MAX_FUNCS, func);
+ ret = ftrace_set_func(table, count, FTRACE_GRAPH_MAX_FUNCS, func);
if (ret)
printk(KERN_DEBUG "ftrace: function %s not "
"traceable\n", func);
@@ -3754,7 +4002,9 @@ static void __init set_ftrace_early_filters(void)
ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
if (ftrace_graph_buf[0])
- set_ftrace_early_graph(ftrace_graph_buf);
+ set_ftrace_early_graph(ftrace_graph_buf, 1);
+ if (ftrace_graph_notrace_buf[0])
+ set_ftrace_early_graph(ftrace_graph_notrace_buf, 0);
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
}
@@ -3781,15 +4031,15 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
trace_parser_put(parser);
- mutex_lock(&iter->ops->regex_lock);
+ mutex_lock(&iter->ops->func_hash->regex_lock);
if (file->f_mode & FMODE_WRITE) {
filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
if (filter_hash)
- orig_hash = &iter->ops->filter_hash;
+ orig_hash = &iter->ops->func_hash->filter_hash;
else
- orig_hash = &iter->ops->notrace_hash;
+ orig_hash = &iter->ops->func_hash->notrace_hash;
mutex_lock(&ftrace_lock);
ret = ftrace_hash_move(iter->ops, filter_hash,
@@ -3800,7 +4050,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
mutex_unlock(&ftrace_lock);
}
- mutex_unlock(&iter->ops->regex_lock);
+ mutex_unlock(&iter->ops->func_hash->regex_lock);
free_ftrace_hash(iter->hash);
kfree(iter);
@@ -3896,7 +4146,12 @@ static int g_show(struct seq_file *m, void *v)
return 0;
if (ptr == (unsigned long *)1) {
- seq_printf(m, "#### all functions enabled ####\n");
+ struct ftrace_graph_data *fgd = m->private;
+
+ if (fgd->table == ftrace_graph_funcs)
+ seq_printf(m, "#### all functions enabled ####\n");
+ else
+ seq_printf(m, "#### no functions disabled ####\n");
return 0;
}
@@ -4109,6 +4364,36 @@ static const struct file_operations ftrace_graph_notrace_fops = {
};
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+void ftrace_create_filter_files(struct ftrace_ops *ops,
+ struct dentry *parent)
+{
+
+ trace_create_file("set_ftrace_filter", 0644, parent,
+ ops, &ftrace_filter_fops);
+
+ trace_create_file("set_ftrace_notrace", 0644, parent,
+ ops, &ftrace_notrace_fops);
+}
+
+/*
+ * The name "destroy_filter_files" is really a misnomer. Although
+ * in the future, it may actualy delete the files, but this is
+ * really intended to make sure the ops passed in are disabled
+ * and that when this function returns, the caller is free to
+ * free the ops.
+ *
+ * The "destroy" name is only to match the "create" name that this
+ * should be paired with.
+ */
+void ftrace_destroy_filter_files(struct ftrace_ops *ops)
+{
+ mutex_lock(&ftrace_lock);
+ if (ops->flags & FTRACE_OPS_FL_ENABLED)
+ ftrace_shutdown(ops, 0);
+ ops->flags |= FTRACE_OPS_FL_DELETED;
+ mutex_unlock(&ftrace_lock);
+}
+
static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
{
@@ -4118,11 +4403,7 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
trace_create_file("enabled_functions", 0444,
d_tracer, NULL, &ftrace_enabled_fops);
- trace_create_file("set_ftrace_filter", 0644, d_tracer,
- NULL, &ftrace_filter_fops);
-
- trace_create_file("set_ftrace_notrace", 0644, d_tracer,
- NULL, &ftrace_notrace_fops);
+ ftrace_create_filter_files(&global_ops, d_tracer);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
trace_create_file("set_graph_function", 0444, d_tracer,
@@ -4238,9 +4519,6 @@ static int ftrace_process_locs(struct module *mod,
/* Assign the last page to ftrace_pages */
ftrace_pages = pg;
- /* These new locations need to be initialized */
- ftrace_new_pgs = start_pg;
-
/*
* We only need to disable interrupts on start up
* because we are modifying code that an interrupt
@@ -4251,7 +4529,7 @@ static int ftrace_process_locs(struct module *mod,
*/
if (!mod)
local_irq_save(flags);
- ftrace_update_code(mod);
+ ftrace_update_code(mod, start_pg);
if (!mod)
local_irq_restore(flags);
ret = 0;
@@ -4315,16 +4593,11 @@ static void ftrace_init_module(struct module *mod,
ftrace_process_locs(mod, start, end);
}
-static int ftrace_module_notify_enter(struct notifier_block *self,
- unsigned long val, void *data)
+void ftrace_module_init(struct module *mod)
{
- struct module *mod = data;
-
- if (val == MODULE_STATE_COMING)
- ftrace_init_module(mod, mod->ftrace_callsites,
- mod->ftrace_callsites +
- mod->num_ftrace_callsites);
- return 0;
+ ftrace_init_module(mod, mod->ftrace_callsites,
+ mod->ftrace_callsites +
+ mod->num_ftrace_callsites);
}
static int ftrace_module_notify_exit(struct notifier_block *self,
@@ -4338,11 +4611,6 @@ static int ftrace_module_notify_exit(struct notifier_block *self,
return 0;
}
#else
-static int ftrace_module_notify_enter(struct notifier_block *self,
- unsigned long val, void *data)
-{
- return 0;
-}
static int ftrace_module_notify_exit(struct notifier_block *self,
unsigned long val, void *data)
{
@@ -4350,40 +4618,32 @@ static int ftrace_module_notify_exit(struct notifier_block *self,
}
#endif /* CONFIG_MODULES */
-struct notifier_block ftrace_module_enter_nb = {
- .notifier_call = ftrace_module_notify_enter,
- .priority = INT_MAX, /* Run before anything that can use kprobes */
-};
-
struct notifier_block ftrace_module_exit_nb = {
.notifier_call = ftrace_module_notify_exit,
.priority = INT_MIN, /* Run after anything that can remove kprobes */
};
-extern unsigned long __start_mcount_loc[];
-extern unsigned long __stop_mcount_loc[];
-
void __init ftrace_init(void)
{
- unsigned long count, addr, flags;
+ extern unsigned long __start_mcount_loc[];
+ extern unsigned long __stop_mcount_loc[];
+ unsigned long count, flags;
int ret;
- /* Keep the ftrace pointer to the stub */
- addr = (unsigned long)ftrace_stub;
-
local_irq_save(flags);
- ftrace_dyn_arch_init(&addr);
+ ret = ftrace_dyn_arch_init();
local_irq_restore(flags);
-
- /* ftrace_dyn_arch_init places the return code in addr */
- if (addr)
+ if (ret)
goto failed;
count = __stop_mcount_loc - __start_mcount_loc;
-
- ret = ftrace_dyn_table_alloc(count);
- if (ret)
+ if (!count) {
+ pr_info("ftrace: No functions to be traced?\n");
goto failed;
+ }
+
+ pr_info("ftrace: allocating %ld entries in %ld pages\n",
+ count, count / ENTRIES_PER_PAGE + 1);
last_ftrace_enabled = ftrace_enabled = 1;
@@ -4391,10 +4651,6 @@ void __init ftrace_init(void)
__start_mcount_loc,
__stop_mcount_loc);
- ret = register_module_notifier(&ftrace_module_enter_nb);
- if (ret)
- pr_warning("Failed to register trace ftrace module enter notifier\n");
-
ret = register_module_notifier(&ftrace_module_exit_nb);
if (ret)
pr_warning("Failed to register trace ftrace module exit notifier\n");
@@ -4411,7 +4667,6 @@ void __init ftrace_init(void)
static struct ftrace_ops global_ops = {
.func = ftrace_stub,
.flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
- INIT_REGEX_LOCK(global_ops)
};
static int __init ftrace_nodyn_init(void)
@@ -4431,7 +4686,13 @@ static inline void ftrace_startup_enable(int command) { }
(ops)->flags |= FTRACE_OPS_FL_ENABLED; \
___ret; \
})
-# define ftrace_shutdown(ops, command) __unregister_ftrace_function(ops)
+# define ftrace_shutdown(ops, command) \
+ ({ \
+ int ___ret = __unregister_ftrace_function(ops); \
+ if (!___ret) \
+ (ops)->flags &= ~FTRACE_OPS_FL_ENABLED; \
+ ___ret; \
+ })
# define ftrace_startup_sysctl() do { } while (0)
# define ftrace_shutdown_sysctl() do { } while (0)
@@ -4444,6 +4705,34 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
#endif /* CONFIG_DYNAMIC_FTRACE */
+__init void ftrace_init_global_array_ops(struct trace_array *tr)
+{
+ tr->ops = &global_ops;
+ tr->ops->private = tr;
+}
+
+void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func)
+{
+ /* If we filter on pids, update to use the pid function */
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
+ if (WARN_ON(tr->ops->func != ftrace_stub))
+ printk("ftrace ops had %pS for function\n",
+ tr->ops->func);
+ /* Only the top level instance does pid tracing */
+ if (!list_empty(&ftrace_pids)) {
+ set_ftrace_pid_function(func);
+ func = ftrace_pid_func;
+ }
+ }
+ tr->ops->func = func;
+ tr->ops->private = tr;
+}
+
+void ftrace_reset_array_ops(struct trace_array *tr)
+{
+ tr->ops->func = ftrace_stub;
+}
+
static void
ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs)
@@ -4479,7 +4768,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
static struct ftrace_ops control_ops = {
.func = ftrace_ops_control_func,
.flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
- INIT_REGEX_LOCK(control_ops)
+ INIT_OPS_HASH(control_ops)
};
static inline void
@@ -4489,9 +4778,6 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op;
int bit;
- if (function_trace_stop)
- return;
-
bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
if (bit < 0)
return;
@@ -4502,9 +4788,15 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
*/
preempt_disable_notrace();
do_for_each_ftrace_op(op, ftrace_ops_list) {
- if (ftrace_ops_test(op, ip, regs))
+ if (ftrace_ops_test(op, ip, regs)) {
+ if (FTRACE_WARN_ON(!op->func)) {
+ pr_warn("op=%p %pS\n", op, op);
+ goto out;
+ }
op->func(ip, parent_ip, op, regs);
+ }
} while_for_each_ftrace_op(op);
+out:
preempt_enable_notrace();
trace_clear_recursion(bit);
}
@@ -4908,8 +5200,18 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static struct ftrace_ops graph_ops = {
+ .func = ftrace_stub,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE |
+ FTRACE_OPS_FL_INITIALIZED |
+ FTRACE_OPS_FL_STUB,
+#ifdef FTRACE_GRAPH_TRAMP_ADDR
+ .trampoline = FTRACE_GRAPH_TRAMP_ADDR,
+#endif
+ ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
+};
+
static int ftrace_graph_active;
-static struct notifier_block ftrace_suspend_notifier;
int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
{
@@ -5055,13 +5357,6 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
return NOTIFY_DONE;
}
-/* Just a place holder for function graph */
-static struct ftrace_ops fgraph_ops __read_mostly = {
- .func = ftrace_stub,
- .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL |
- FTRACE_OPS_FL_RECURSION_SAFE,
-};
-
static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
{
if (!ftrace_ops_test(&global_ops, trace->func, NULL))
@@ -5078,14 +5373,34 @@ static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
*/
static void update_function_graph_func(void)
{
- if (ftrace_ops_list == &ftrace_list_end ||
- (ftrace_ops_list == &global_ops &&
- global_ops.next == &ftrace_list_end))
- ftrace_graph_entry = __ftrace_graph_entry;
- else
+ struct ftrace_ops *op;
+ bool do_test = false;
+
+ /*
+ * The graph and global ops share the same set of functions
+ * to test. If any other ops is on the list, then
+ * the graph tracing needs to test if its the function
+ * it should call.
+ */
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ if (op != &global_ops && op != &graph_ops &&
+ op != &ftrace_list_end) {
+ do_test = true;
+ /* in double loop, break out with goto */
+ goto out;
+ }
+ } while_for_each_ftrace_op(op);
+ out:
+ if (do_test)
ftrace_graph_entry = ftrace_graph_entry_test;
+ else
+ ftrace_graph_entry = __ftrace_graph_entry;
}
+static struct notifier_block ftrace_suspend_notifier = {
+ .notifier_call = ftrace_suspend_notifier_call,
+};
+
int register_ftrace_graph(trace_func_graph_ret_t retfunc,
trace_func_graph_ent_t entryfunc)
{
@@ -5099,7 +5414,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
goto out;
}
- ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
register_pm_notifier(&ftrace_suspend_notifier);
ftrace_graph_active++;
@@ -5121,7 +5435,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
ftrace_graph_entry = ftrace_graph_entry_test;
update_function_graph_func();
- ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET);
+ ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
out:
mutex_unlock(&ftrace_lock);
@@ -5139,7 +5453,7 @@ void unregister_ftrace_graph(void)
ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
ftrace_graph_entry = ftrace_graph_entry_stub;
__ftrace_graph_entry = ftrace_graph_entry_stub;
- ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET);
+ ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET);
unregister_pm_notifier(&ftrace_suspend_notifier);
unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
@@ -5219,9 +5533,4 @@ void ftrace_graph_exit_task(struct task_struct *t)
kfree(ret_stack);
}
-
-void ftrace_graph_stop(void)
-{
- ftrace_stop();
-}
#endif
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index fc4da2d97f9b..2d75c94ae87d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -543,7 +543,7 @@ static void rb_wake_up_waiters(struct irq_work *work)
* as data is added to any of the @buffer's cpu buffers. Otherwise
* it will wait for data to be added to a specific cpu buffer.
*/
-void ring_buffer_wait(struct ring_buffer *buffer, int cpu)
+int ring_buffer_wait(struct ring_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
DEFINE_WAIT(wait);
@@ -557,6 +557,8 @@ void ring_buffer_wait(struct ring_buffer *buffer, int cpu)
if (cpu == RING_BUFFER_ALL_CPUS)
work = &buffer->irq_work;
else {
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -ENODEV;
cpu_buffer = buffer->buffers[cpu];
work = &cpu_buffer->irq_work;
}
@@ -591,6 +593,7 @@ void ring_buffer_wait(struct ring_buffer *buffer, int cpu)
schedule();
finish_wait(&work->waiters, &wait);
+ return 0;
}
/**
@@ -613,10 +616,6 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
struct ring_buffer_per_cpu *cpu_buffer;
struct rb_irq_work *work;
- if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
- (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
- return POLLIN | POLLRDNORM;
-
if (cpu == RING_BUFFER_ALL_CPUS)
work = &buffer->irq_work;
else {
@@ -627,8 +626,22 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
work = &cpu_buffer->irq_work;
}
- work->waiters_pending = true;
poll_wait(filp, &work->waiters, poll_table);
+ work->waiters_pending = true;
+ /*
+ * There's a tight race between setting the waiters_pending and
+ * checking if the ring buffer is empty. Once the waiters_pending bit
+ * is set, the next event will wake the task up, but we can get stuck
+ * if there's only a single event in.
+ *
+ * FIXME: Ideally, we need a memory barrier on the writer side as well,
+ * but adding a memory barrier to all events will cause too much of a
+ * performance hit in the fast path. We only need a memory barrier when
+ * the buffer goes from empty to having content. But as this race is
+ * extremely small, and it's not a problem if another event comes in, we
+ * will fix it later.
+ */
+ smp_mb();
if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
(cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
@@ -1301,7 +1314,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
* In that off case, we need to allocate for all possible cpus.
*/
#ifdef CONFIG_HOTPLUG_CPU
- get_online_cpus();
+ cpu_notifier_register_begin();
cpumask_copy(buffer->cpumask, cpu_online_mask);
#else
cpumask_copy(buffer->cpumask, cpu_possible_mask);
@@ -1324,10 +1337,10 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
#ifdef CONFIG_HOTPLUG_CPU
buffer->cpu_notify.notifier_call = rb_cpu_notify;
buffer->cpu_notify.priority = 0;
- register_cpu_notifier(&buffer->cpu_notify);
+ __register_cpu_notifier(&buffer->cpu_notify);
+ cpu_notifier_register_done();
#endif
- put_online_cpus();
mutex_init(&buffer->mutex);
return buffer;
@@ -1341,7 +1354,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
fail_free_cpumask:
free_cpumask_var(buffer->cpumask);
- put_online_cpus();
+#ifdef CONFIG_HOTPLUG_CPU
+ cpu_notifier_register_done();
+#endif
fail_free_buffer:
kfree(buffer);
@@ -1358,16 +1373,17 @@ ring_buffer_free(struct ring_buffer *buffer)
{
int cpu;
- get_online_cpus();
-
#ifdef CONFIG_HOTPLUG_CPU
- unregister_cpu_notifier(&buffer->cpu_notify);
+ cpu_notifier_register_begin();
+ __unregister_cpu_notifier(&buffer->cpu_notify);
#endif
for_each_buffer_cpu(buffer, cpu)
rb_free_cpu_buffer(buffer->buffers[cpu]);
- put_online_cpus();
+#ifdef CONFIG_HOTPLUG_CPU
+ cpu_notifier_register_done();
+#endif
kfree(buffer->buffers);
free_cpumask_var(buffer->cpumask);
@@ -1687,22 +1703,14 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
if (!cpu_buffer->nr_pages_to_update)
continue;
- /* The update must run on the CPU that is being updated. */
- preempt_disable();
- if (cpu == smp_processor_id() || !cpu_online(cpu)) {
+ /* Can't run something on an offline CPU. */
+ if (!cpu_online(cpu)) {
rb_update_pages(cpu_buffer);
cpu_buffer->nr_pages_to_update = 0;
} else {
- /*
- * Can not disable preemption for schedule_work_on()
- * on PREEMPT_RT.
- */
- preempt_enable();
schedule_work_on(cpu,
&cpu_buffer->update_pages_work);
- preempt_disable();
}
- preempt_enable();
}
/* wait for all the updates to complete */
@@ -1740,22 +1748,14 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
get_online_cpus();
- preempt_disable();
- /* The update must run on the CPU that is being updated. */
- if (cpu_id == smp_processor_id() || !cpu_online(cpu_id))
+ /* Can't run something on an offline CPU. */
+ if (!cpu_online(cpu_id))
rb_update_pages(cpu_buffer);
else {
- /*
- * Can not disable preemption for schedule_work_on()
- * on PREEMPT_RT.
- */
- preempt_enable();
schedule_work_on(cpu_id,
&cpu_buffer->update_pages_work);
wait_for_completion(&cpu_buffer->update_done);
- preempt_disable();
}
- preempt_enable();
cpu_buffer->nr_pages_to_update = 0;
put_online_cpus();
@@ -1982,7 +1982,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
/**
* rb_update_event - update event type and data
- * @event: the even to update
+ * @event: the event to update
* @type: the type of event
* @length: the size of the event field in the ring buffer
*
@@ -3355,21 +3355,16 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
/* Iterator usage is expected to have record disabled */
- if (list_empty(&cpu_buffer->reader_page->list)) {
- iter->head_page = rb_set_head_page(cpu_buffer);
- if (unlikely(!iter->head_page))
- return;
- iter->head = iter->head_page->read;
- } else {
- iter->head_page = cpu_buffer->reader_page;
- iter->head = cpu_buffer->reader_page->read;
- }
+ iter->head_page = cpu_buffer->reader_page;
+ iter->head = cpu_buffer->reader_page->read;
+
+ iter->cache_reader_page = iter->head_page;
+ iter->cache_read = cpu_buffer->read;
+
if (iter->head)
iter->read_stamp = cpu_buffer->read_stamp;
else
iter->read_stamp = iter->head_page->page->time_stamp;
- iter->cache_reader_page = cpu_buffer->reader_page;
- iter->cache_read = cpu_buffer->read;
}
/**
@@ -3762,18 +3757,20 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
return NULL;
/*
- * We repeat when a time extend is encountered.
- * Since the time extend is always attached to a data event,
- * we should never loop more than once.
- * (We never hit the following condition more than twice).
+ * We repeat when a time extend is encountered or we hit
+ * the end of the page. Since the time extend is always attached
+ * to a data event, we should never loop more than three times.
+ * Once for going to next page, once on time extend, and
+ * finally once to get the event.
+ * (We never hit the following condition more than thrice).
*/
- if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3))
return NULL;
if (rb_per_cpu_empty(cpu_buffer))
return NULL;
- if (iter->head >= local_read(&iter->head_page->page->commit)) {
+ if (iter->head >= rb_page_size(iter->head_page)) {
rb_inc_iter(iter);
goto again;
}
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index a5457d577b98..0434ff1b808e 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -40,8 +40,8 @@ static int write_iteration = 50;
module_param(write_iteration, uint, 0644);
MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
-static int producer_nice = 19;
-static int consumer_nice = 19;
+static int producer_nice = MAX_NICE;
+static int consumer_nice = MAX_NICE;
static int producer_fifo = -1;
static int consumer_fifo = -1;
@@ -308,7 +308,7 @@ static void ring_buffer_producer(void)
/* Let the user know that the test is running at low priority */
if (producer_fifo < 0 && consumer_fifo < 0 &&
- producer_nice == 19 && consumer_nice == 19)
+ producer_nice == MAX_NICE && consumer_nice == MAX_NICE)
trace_printk("WARNING!!! This test is running at lowest priority.\n");
trace_printk("Time: %lld (usecs)\n", time);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 815c878f409b..8a528392b1f4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -73,7 +73,8 @@ static struct tracer_flags dummy_tracer_flags = {
.opts = dummy_tracer_opt
};
-static int dummy_set_flag(u32 old_flags, u32 bit, int set)
+static int
+dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
return 0;
}
@@ -118,7 +119,7 @@ enum ftrace_dump_mode ftrace_dump_on_oops;
/* When set, tracing will stop when a WARN*() is hit */
int __disable_trace_on_warning;
-static int tracing_set_tracer(const char *buf);
+static int tracing_set_tracer(struct trace_array *tr, const char *buf);
#define MAX_TRACER_SIZE 100
static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
@@ -180,6 +181,17 @@ static int __init set_trace_boot_options(char *str)
}
__setup("trace_options=", set_trace_boot_options);
+static char trace_boot_clock_buf[MAX_TRACER_SIZE] __initdata;
+static char *trace_boot_clock __initdata;
+
+static int __init set_trace_boot_clock(char *str)
+{
+ strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE);
+ trace_boot_clock = trace_boot_clock_buf;
+ return 0;
+}
+__setup("trace_clock=", set_trace_boot_clock);
+
unsigned long long ns2usecs(cycle_t nsec)
{
@@ -263,7 +275,7 @@ int call_filter_check_discard(struct ftrace_event_call *call, void *rec,
}
EXPORT_SYMBOL_GPL(call_filter_check_discard);
-cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
+static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
{
u64 ts;
@@ -454,6 +466,12 @@ int __trace_puts(unsigned long ip, const char *str, int size)
struct print_entry *entry;
unsigned long irq_flags;
int alloc;
+ int pc;
+
+ if (!(trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
+ pc = preempt_count();
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
@@ -463,7 +481,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
local_save_flags(irq_flags);
buffer = global_trace.trace_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
- irq_flags, preempt_count());
+ irq_flags, pc);
if (!event)
return 0;
@@ -480,6 +498,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
entry->buf[size] = '\0';
__buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(buffer, irq_flags, 4, pc);
return size;
}
@@ -497,6 +516,12 @@ int __trace_bputs(unsigned long ip, const char *str)
struct bputs_entry *entry;
unsigned long irq_flags;
int size = sizeof(struct bputs_entry);
+ int pc;
+
+ if (!(trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
+ pc = preempt_count();
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
@@ -504,7 +529,7 @@ int __trace_bputs(unsigned long ip, const char *str)
local_save_flags(irq_flags);
buffer = global_trace.trace_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
- irq_flags, preempt_count());
+ irq_flags, pc);
if (!event)
return 0;
@@ -513,6 +538,7 @@ int __trace_bputs(unsigned long ip, const char *str)
entry->str = str;
__buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(buffer, irq_flags, 4, pc);
return 1;
}
@@ -587,7 +613,7 @@ static int alloc_snapshot(struct trace_array *tr)
return 0;
}
-void free_snapshot(struct trace_array *tr)
+static void free_snapshot(struct trace_array *tr)
{
/*
* We don't free the ring buffer. instead, resize it because
@@ -794,11 +820,12 @@ static struct {
const char *name;
int in_ns; /* is this clock in nanoseconds? */
} trace_clocks[] = {
- { trace_clock_local, "local", 1 },
- { trace_clock_global, "global", 1 },
- { trace_clock_counter, "counter", 0 },
- { trace_clock_jiffies, "uptime", 1 },
- { trace_clock, "perf", 1 },
+ { trace_clock_local, "local", 1 },
+ { trace_clock_global, "global", 1 },
+ { trace_clock_counter, "counter", 0 },
+ { trace_clock_jiffies, "uptime", 0 },
+ { trace_clock, "perf", 1 },
+ { ktime_get_mono_fast_ns, "mono", 1 },
ARCH_TRACE_CLOCKS
};
@@ -911,30 +938,6 @@ out:
return ret;
}
-ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
-{
- int len;
- int ret;
-
- if (!cnt)
- return 0;
-
- if (s->len <= s->readpos)
- return -EBUSY;
-
- len = s->len - s->readpos;
- if (cnt > len)
- cnt = len;
- ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
- if (ret == cnt)
- return -EFAULT;
-
- cnt -= ret;
-
- s->readpos += cnt;
- return cnt;
-}
-
static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
{
int len;
@@ -951,27 +954,9 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
return cnt;
}
-/*
- * ftrace_max_lock is used to protect the swapping of buffers
- * when taking a max snapshot. The buffers themselves are
- * protected by per_cpu spinlocks. But the action of the swap
- * needs its own lock.
- *
- * This is defined as a arch_spinlock_t in order to help
- * with performance when lockdep debugging is enabled.
- *
- * It is also used in other places outside the update_max_tr
- * so it needs to be defined outside of the
- * CONFIG_TRACER_MAX_TRACE.
- */
-static arch_spinlock_t ftrace_max_lock =
- (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
-
unsigned long __read_mostly tracing_thresh;
#ifdef CONFIG_TRACER_MAX_TRACE
-unsigned long __read_mostly tracing_max_latency;
-
/*
* Copy the new maximum trace into the separate maximum-trace
* structure. (this way the maximum trace is permanently saved,
@@ -988,7 +973,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
max_buf->cpu = cpu;
max_buf->time_start = data->preempt_timestamp;
- max_data->saved_latency = tracing_max_latency;
+ max_data->saved_latency = tr->max_latency;
max_data->critical_start = data->critical_start;
max_data->critical_end = data->critical_end;
@@ -1036,14 +1021,14 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
return;
}
- arch_spin_lock(&ftrace_max_lock);
+ arch_spin_lock(&tr->max_lock);
buf = tr->trace_buffer.buffer;
tr->trace_buffer.buffer = tr->max_buffer.buffer;
tr->max_buffer.buffer = buf;
__update_max_tr(tr, tsk, cpu);
- arch_spin_unlock(&ftrace_max_lock);
+ arch_spin_unlock(&tr->max_lock);
}
/**
@@ -1069,7 +1054,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
return;
}
- arch_spin_lock(&ftrace_max_lock);
+ arch_spin_lock(&tr->max_lock);
ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu);
@@ -1087,17 +1072,17 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
__update_max_tr(tr, tsk, cpu);
- arch_spin_unlock(&ftrace_max_lock);
+ arch_spin_unlock(&tr->max_lock);
}
#endif /* CONFIG_TRACER_MAX_TRACE */
-static void default_wait_pipe(struct trace_iterator *iter)
+static int wait_on_pipe(struct trace_iterator *iter)
{
/* Iterators are static, they should be filled or empty */
if (trace_buffer_iter(iter, iter->cpu_file))
- return;
+ return 0;
- ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file);
+ return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file);
}
#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -1208,8 +1193,6 @@ int register_tracer(struct tracer *type)
else
if (!type->flags->opts)
type->flags->opts = dummy_tracer_opt;
- if (!type->wait_pipe)
- type->wait_pipe = default_wait_pipe;
ret = run_tracer_selftest(type);
if (ret < 0)
@@ -1230,7 +1213,7 @@ int register_tracer(struct tracer *type)
printk(KERN_INFO "Starting tracer '%s'\n", type->name);
/* Do we want this tracer to start on bootup? */
- tracing_set_tracer(type->name);
+ tracing_set_tracer(&global_trace, type->name);
default_bootup_tracer = NULL;
/* disable other selftests, since this will break it. */
tracing_selftest_disabled = true;
@@ -1293,22 +1276,71 @@ void tracing_reset_all_online_cpus(void)
}
}
-#define SAVED_CMDLINES 128
+#define SAVED_CMDLINES_DEFAULT 128
#define NO_CMDLINE_MAP UINT_MAX
-static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
-static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
-static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
-static int cmdline_idx;
static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
+struct saved_cmdlines_buffer {
+ unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
+ unsigned *map_cmdline_to_pid;
+ unsigned cmdline_num;
+ int cmdline_idx;
+ char *saved_cmdlines;
+};
+static struct saved_cmdlines_buffer *savedcmd;
/* temporary disable recording */
static atomic_t trace_record_cmdline_disabled __read_mostly;
-static void trace_init_cmdlines(void)
+static inline char *get_saved_cmdlines(int idx)
+{
+ return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN];
+}
+
+static inline void set_cmdline(int idx, const char *cmdline)
{
- memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline));
- memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid));
- cmdline_idx = 0;
+ memcpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
+}
+
+static int allocate_cmdlines_buffer(unsigned int val,
+ struct saved_cmdlines_buffer *s)
+{
+ s->map_cmdline_to_pid = kmalloc(val * sizeof(*s->map_cmdline_to_pid),
+ GFP_KERNEL);
+ if (!s->map_cmdline_to_pid)
+ return -ENOMEM;
+
+ s->saved_cmdlines = kmalloc(val * TASK_COMM_LEN, GFP_KERNEL);
+ if (!s->saved_cmdlines) {
+ kfree(s->map_cmdline_to_pid);
+ return -ENOMEM;
+ }
+
+ s->cmdline_idx = 0;
+ s->cmdline_num = val;
+ memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
+ sizeof(s->map_pid_to_cmdline));
+ memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
+ val * sizeof(*s->map_cmdline_to_pid));
+
+ return 0;
+}
+
+static int trace_create_savedcmd(void)
+{
+ int ret;
+
+ savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL);
+ if (!savedcmd)
+ return -ENOMEM;
+
+ ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd);
+ if (ret < 0) {
+ kfree(savedcmd);
+ savedcmd = NULL;
+ return -ENOMEM;
+ }
+
+ return 0;
}
int is_tracing_stopped(void)
@@ -1341,7 +1373,7 @@ void tracing_start(void)
}
/* Prevent the buffers from switching */
- arch_spin_lock(&ftrace_max_lock);
+ arch_spin_lock(&global_trace.max_lock);
buffer = global_trace.trace_buffer.buffer;
if (buffer)
@@ -1353,9 +1385,8 @@ void tracing_start(void)
ring_buffer_record_enable(buffer);
#endif
- arch_spin_unlock(&ftrace_max_lock);
+ arch_spin_unlock(&global_trace.max_lock);
- ftrace_start();
out:
raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
}
@@ -1402,13 +1433,12 @@ void tracing_stop(void)
struct ring_buffer *buffer;
unsigned long flags;
- ftrace_stop();
raw_spin_lock_irqsave(&global_trace.start_lock, flags);
if (global_trace.stop_count++)
goto out;
/* Prevent the buffers from switching */
- arch_spin_lock(&ftrace_max_lock);
+ arch_spin_lock(&global_trace.max_lock);
buffer = global_trace.trace_buffer.buffer;
if (buffer)
@@ -1420,7 +1450,7 @@ void tracing_stop(void)
ring_buffer_record_disable(buffer);
#endif
- arch_spin_unlock(&ftrace_max_lock);
+ arch_spin_unlock(&global_trace.max_lock);
out:
raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
@@ -1449,12 +1479,12 @@ static void tracing_stop_tr(struct trace_array *tr)
void trace_stop_cmdline_recording(void);
-static void trace_save_cmdline(struct task_struct *tsk)
+static int trace_save_cmdline(struct task_struct *tsk)
{
unsigned pid, idx;
if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
- return;
+ return 0;
/*
* It's not the end of the world if we don't get
@@ -1463,11 +1493,11 @@ static void trace_save_cmdline(struct task_struct *tsk)
* so if we miss here, then better luck next time.
*/
if (!arch_spin_trylock(&trace_cmdline_lock))
- return;
+ return 0;
- idx = map_pid_to_cmdline[tsk->pid];
+ idx = savedcmd->map_pid_to_cmdline[tsk->pid];
if (idx == NO_CMDLINE_MAP) {
- idx = (cmdline_idx + 1) % SAVED_CMDLINES;
+ idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;
/*
* Check whether the cmdline buffer at idx has a pid
@@ -1475,22 +1505,24 @@ static void trace_save_cmdline(struct task_struct *tsk)
* need to clear the map_pid_to_cmdline. Otherwise we
* would read the new comm for the old pid.
*/
- pid = map_cmdline_to_pid[idx];
+ pid = savedcmd->map_cmdline_to_pid[idx];
if (pid != NO_CMDLINE_MAP)
- map_pid_to_cmdline[pid] = NO_CMDLINE_MAP;
+ savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP;
- map_cmdline_to_pid[idx] = tsk->pid;
- map_pid_to_cmdline[tsk->pid] = idx;
+ savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
+ savedcmd->map_pid_to_cmdline[tsk->pid] = idx;
- cmdline_idx = idx;
+ savedcmd->cmdline_idx = idx;
}
- memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
+ set_cmdline(idx, tsk->comm);
arch_spin_unlock(&trace_cmdline_lock);
+
+ return 1;
}
-void trace_find_cmdline(int pid, char comm[])
+static void __trace_find_cmdline(int pid, char comm[])
{
unsigned map;
@@ -1509,13 +1541,19 @@ void trace_find_cmdline(int pid, char comm[])
return;
}
- preempt_disable();
- arch_spin_lock(&trace_cmdline_lock);
- map = map_pid_to_cmdline[pid];
+ map = savedcmd->map_pid_to_cmdline[pid];
if (map != NO_CMDLINE_MAP)
- strcpy(comm, saved_cmdlines[map]);
+ strcpy(comm, get_saved_cmdlines(map));
else
strcpy(comm, "<...>");
+}
+
+void trace_find_cmdline(int pid, char comm[])
+{
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+
+ __trace_find_cmdline(pid, comm);
arch_spin_unlock(&trace_cmdline_lock);
preempt_enable();
@@ -1529,9 +1567,8 @@ void tracing_record_cmdline(struct task_struct *tsk)
if (!__this_cpu_read(trace_cmdline_save))
return;
- __this_cpu_write(trace_cmdline_save, false);
-
- trace_save_cmdline(tsk);
+ if (trace_save_cmdline(tsk))
+ __this_cpu_write(trace_cmdline_save, false);
}
void
@@ -1600,15 +1637,31 @@ void trace_buffer_unlock_commit(struct ring_buffer *buffer,
}
EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
+static struct ring_buffer *temp_buffer;
+
struct ring_buffer_event *
trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
struct ftrace_event_file *ftrace_file,
int type, unsigned long len,
unsigned long flags, int pc)
{
+ struct ring_buffer_event *entry;
+
*current_rb = ftrace_file->tr->trace_buffer.buffer;
- return trace_buffer_lock_reserve(*current_rb,
+ entry = trace_buffer_lock_reserve(*current_rb,
type, len, flags, pc);
+ /*
+ * If tracing is off, but we have triggers enabled
+ * we still need to look at the event data. Use the temp_buffer
+ * to store the trace event for the tigger to use. It's recusive
+ * safe and will not be recorded anywhere.
+ */
+ if (!entry && ftrace_file->flags & FTRACE_EVENT_FL_TRIGGER_COND) {
+ *current_rb = temp_buffer;
+ entry = trace_buffer_lock_reserve(*current_rb,
+ type, len, flags, pc);
+ }
+ return entry;
}
EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);
@@ -1718,7 +1771,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
*/
barrier();
if (use_stack == 1) {
- trace.entries = &__get_cpu_var(ftrace_stack).calls[0];
+ trace.entries = this_cpu_ptr(ftrace_stack.calls);
trace.max_entries = FTRACE_STACK_MAX_ENTRIES;
if (regs)
@@ -1967,7 +2020,21 @@ void trace_printk_init_buffers(void)
if (alloc_percpu_trace_buffer())
return;
- pr_info("ftrace: Allocated trace_printk buffers\n");
+ /* trace_printk() is for debug use only. Don't use it in production. */
+
+ pr_warning("\n**********************************************************\n");
+ pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
+ pr_warning("** **\n");
+ pr_warning("** trace_printk() being used. Allocating extra memory. **\n");
+ pr_warning("** **\n");
+ pr_warning("** This means that this is a DEBUG kernel and it is **\n");
+ pr_warning("** unsafe for produciton use. **\n");
+ pr_warning("** **\n");
+ pr_warning("** If you see this message and you are not debugging **\n");
+ pr_warning("** the kernel, report this immediately to your vendor! **\n");
+ pr_warning("** **\n");
+ pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
+ pr_warning("**********************************************************\n");
/* Expand the buffers to set size */
tracing_update_buffers();
@@ -3121,27 +3188,52 @@ static int tracing_open(struct inode *inode, struct file *file)
return ret;
}
+/*
+ * Some tracers are not suitable for instance buffers.
+ * A tracer is always available for the global array (toplevel)
+ * or if it explicitly states that it is.
+ */
+static bool
+trace_ok_for_array(struct tracer *t, struct trace_array *tr)
+{
+ return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances;
+}
+
+/* Find the next tracer that this trace array may use */
+static struct tracer *
+get_tracer_for_array(struct trace_array *tr, struct tracer *t)
+{
+ while (t && !trace_ok_for_array(t, tr))
+ t = t->next;
+
+ return t;
+}
+
static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
+ struct trace_array *tr = m->private;
struct tracer *t = v;
(*pos)++;
if (t)
- t = t->next;
+ t = get_tracer_for_array(tr, t->next);
return t;
}
static void *t_start(struct seq_file *m, loff_t *pos)
{
+ struct trace_array *tr = m->private;
struct tracer *t;
loff_t l = 0;
mutex_lock(&trace_types_lock);
- for (t = trace_types; t && l < *pos; t = t_next(m, t, &l))
- ;
+
+ t = get_tracer_for_array(tr, trace_types);
+ for (; t && l < *pos; t = t_next(m, t, &l))
+ ;
return t;
}
@@ -3176,10 +3268,21 @@ static const struct seq_operations show_traces_seq_ops = {
static int show_traces_open(struct inode *inode, struct file *file)
{
+ struct trace_array *tr = inode->i_private;
+ struct seq_file *m;
+ int ret;
+
if (tracing_disabled)
return -ENODEV;
- return seq_open(file, &show_traces_seq_ops);
+ ret = seq_open(file, &show_traces_seq_ops);
+ if (ret)
+ return ret;
+
+ m = file->private_data;
+ m->private = tr;
+
+ return 0;
}
static ssize_t
@@ -3269,7 +3372,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
mutex_lock(&tracing_cpumask_update_lock);
local_irq_disable();
- arch_spin_lock(&ftrace_max_lock);
+ arch_spin_lock(&tr->max_lock);
for_each_tracing_cpu(cpu) {
/*
* Increase/decrease the disabled counter if we are
@@ -3286,7 +3389,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu);
}
}
- arch_spin_unlock(&ftrace_max_lock);
+ arch_spin_unlock(&tr->max_lock);
local_irq_enable();
cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);
@@ -3339,13 +3442,14 @@ static int tracing_trace_options_show(struct seq_file *m, void *v)
return 0;
}
-static int __set_tracer_option(struct tracer *trace,
+static int __set_tracer_option(struct trace_array *tr,
struct tracer_flags *tracer_flags,
struct tracer_opt *opts, int neg)
{
+ struct tracer *trace = tr->current_trace;
int ret;
- ret = trace->set_flag(tracer_flags->val, opts->bit, !neg);
+ ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg);
if (ret)
return ret;
@@ -3357,8 +3461,9 @@ static int __set_tracer_option(struct tracer *trace,
}
/* Try to assign a tracer specific option */
-static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
+static int set_tracer_option(struct trace_array *tr, char *cmp, int neg)
{
+ struct tracer *trace = tr->current_trace;
struct tracer_flags *tracer_flags = trace->flags;
struct tracer_opt *opts = NULL;
int i;
@@ -3367,8 +3472,7 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
opts = &tracer_flags->opts[i];
if (strcmp(cmp, opts->name) == 0)
- return __set_tracer_option(trace, trace->flags,
- opts, neg);
+ return __set_tracer_option(tr, trace->flags, opts, neg);
}
return -EINVAL;
@@ -3391,7 +3495,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
/* Give the tracer a chance to approve the change */
if (tr->current_trace->flag_changed)
- if (tr->current_trace->flag_changed(tr->current_trace, mask, !!enabled))
+ if (tr->current_trace->flag_changed(tr, mask, !!enabled))
return -EINVAL;
if (enabled)
@@ -3440,7 +3544,7 @@ static int trace_set_options(struct trace_array *tr, char *option)
/* If no option could be set, test the specific tracer options */
if (!trace_options[i])
- ret = set_tracer_option(tr->current_trace, cmp, neg);
+ ret = set_tracer_option(tr, cmp, neg);
mutex_unlock(&trace_types_lock);
@@ -3527,6 +3631,7 @@ static const char readme_msg[] =
" trace_options\t\t- Set format or modify how tracing happens\n"
"\t\t\t Disable an option by adding a suffix 'no' to the\n"
"\t\t\t option name\n"
+ " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n"
#ifdef CONFIG_DYNAMIC_FTRACE
"\n available_filter_functions - list of functions that can be filtered on\n"
" set_ftrace_filter\t- echo function name in here to only trace these\n"
@@ -3546,6 +3651,8 @@ static const char readme_msg[] =
#ifdef CONFIG_TRACER_SNAPSHOT
"\t\t snapshot\n"
#endif
+ "\t\t dump\n"
+ "\t\t cpudump\n"
"\t example: echo do_fault:traceoff > set_ftrace_filter\n"
"\t echo do_trap:traceoff:3 > set_ftrace_filter\n"
"\t The first one will disable tracing every time do_fault is hit\n"
@@ -3569,6 +3676,7 @@ static const char readme_msg[] =
#endif
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
" set_graph_function\t- Trace the nested calls of a function (function_graph)\n"
+ " set_graph_notrace\t- Do not trace the nested calls of a function (function_graph)\n"
" max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n"
#endif
#ifdef CONFIG_TRACER_SNAPSHOT
@@ -3638,55 +3746,153 @@ static const struct file_operations tracing_readme_fops = {
.llseek = generic_file_llseek,
};
+static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ unsigned int *ptr = v;
+
+ if (*pos || m->count)
+ ptr++;
+
+ (*pos)++;
+
+ for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num];
+ ptr++) {
+ if (*ptr == -1 || *ptr == NO_CMDLINE_MAP)
+ continue;
+
+ return ptr;
+ }
+
+ return NULL;
+}
+
+static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos)
+{
+ void *v;
+ loff_t l = 0;
+
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+
+ v = &savedcmd->map_cmdline_to_pid[0];
+ while (l <= *pos) {
+ v = saved_cmdlines_next(m, v, &l);
+ if (!v)
+ return NULL;
+ }
+
+ return v;
+}
+
+static void saved_cmdlines_stop(struct seq_file *m, void *v)
+{
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+}
+
+static int saved_cmdlines_show(struct seq_file *m, void *v)
+{
+ char buf[TASK_COMM_LEN];
+ unsigned int *pid = v;
+
+ __trace_find_cmdline(*pid, buf);
+ seq_printf(m, "%d %s\n", *pid, buf);
+ return 0;
+}
+
+static const struct seq_operations tracing_saved_cmdlines_seq_ops = {
+ .start = saved_cmdlines_start,
+ .next = saved_cmdlines_next,
+ .stop = saved_cmdlines_stop,
+ .show = saved_cmdlines_show,
+};
+
+static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp)
+{
+ if (tracing_disabled)
+ return -ENODEV;
+
+ return seq_open(filp, &tracing_saved_cmdlines_seq_ops);
+}
+
+static const struct file_operations tracing_saved_cmdlines_fops = {
+ .open = tracing_saved_cmdlines_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
static ssize_t
-tracing_saved_cmdlines_read(struct file *file, char __user *ubuf,
- size_t cnt, loff_t *ppos)
+tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
- char *buf_comm;
- char *file_buf;
- char *buf;
- int len = 0;
- int pid;
- int i;
+ char buf[64];
+ int r;
+
+ arch_spin_lock(&trace_cmdline_lock);
+ r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
+ arch_spin_unlock(&trace_cmdline_lock);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
+{
+ kfree(s->saved_cmdlines);
+ kfree(s->map_cmdline_to_pid);
+ kfree(s);
+}
+
+static int tracing_resize_saved_cmdlines(unsigned int val)
+{
+ struct saved_cmdlines_buffer *s, *savedcmd_temp;
- file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL);
- if (!file_buf)
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
return -ENOMEM;
- buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL);
- if (!buf_comm) {
- kfree(file_buf);
+ if (allocate_cmdlines_buffer(val, s) < 0) {
+ kfree(s);
return -ENOMEM;
}
- buf = file_buf;
+ arch_spin_lock(&trace_cmdline_lock);
+ savedcmd_temp = savedcmd;
+ savedcmd = s;
+ arch_spin_unlock(&trace_cmdline_lock);
+ free_saved_cmdlines_buffer(savedcmd_temp);
- for (i = 0; i < SAVED_CMDLINES; i++) {
- int r;
+ return 0;
+}
- pid = map_cmdline_to_pid[i];
- if (pid == -1 || pid == NO_CMDLINE_MAP)
- continue;
+static ssize_t
+tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ unsigned long val;
+ int ret;
- trace_find_cmdline(pid, buf_comm);
- r = sprintf(buf, "%d %s\n", pid, buf_comm);
- buf += r;
- len += r;
- }
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
- len = simple_read_from_buffer(ubuf, cnt, ppos,
- file_buf, len);
+ /* must have at least 1 entry or less than PID_MAX_DEFAULT */
+ if (!val || val > PID_MAX_DEFAULT)
+ return -EINVAL;
- kfree(file_buf);
- kfree(buf_comm);
+ ret = tracing_resize_saved_cmdlines((unsigned int)val);
+ if (ret < 0)
+ return ret;
- return len;
+ *ppos += cnt;
+
+ return cnt;
}
-static const struct file_operations tracing_saved_cmdlines_fops = {
- .open = tracing_open_generic,
- .read = tracing_saved_cmdlines_read,
- .llseek = generic_file_llseek,
+static const struct file_operations tracing_saved_cmdlines_size_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_saved_cmdlines_size_read,
+ .write = tracing_saved_cmdlines_size_write,
};
static ssize_t
@@ -3869,10 +4075,26 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer);
static void
destroy_trace_option_files(struct trace_option_dentry *topts);
-static int tracing_set_tracer(const char *buf)
+/*
+ * Used to clear out the tracer before deletion of an instance.
+ * Must have trace_types_lock held.
+ */
+static void tracing_set_nop(struct trace_array *tr)
+{
+ if (tr->current_trace == &nop_trace)
+ return;
+
+ tr->current_trace->enabled--;
+
+ if (tr->current_trace->reset)
+ tr->current_trace->reset(tr);
+
+ tr->current_trace = &nop_trace;
+}
+
+static int tracing_set_tracer(struct trace_array *tr, const char *buf)
{
static struct trace_option_dentry *topts;
- struct trace_array *tr = &global_trace;
struct tracer *t;
#ifdef CONFIG_TRACER_MAX_TRACE
bool had_max_tr;
@@ -3900,9 +4122,15 @@ static int tracing_set_tracer(const char *buf)
if (t == tr->current_trace)
goto out;
+ /* Some tracers are only allowed for the top level buffer */
+ if (!trace_ok_for_array(t, tr)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
trace_branch_disable();
- tr->current_trace->enabled = false;
+ tr->current_trace->enabled--;
if (tr->current_trace->reset)
tr->current_trace->reset(tr);
@@ -3925,9 +4153,11 @@ static int tracing_set_tracer(const char *buf)
free_snapshot(tr);
}
#endif
- destroy_trace_option_files(topts);
-
- topts = create_trace_option_files(tr, t);
+ /* Currently, only the top instance has options */
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
+ destroy_trace_option_files(topts);
+ topts = create_trace_option_files(tr, t);
+ }
#ifdef CONFIG_TRACER_MAX_TRACE
if (t->use_max_tr && !had_max_tr) {
@@ -3944,7 +4174,7 @@ static int tracing_set_tracer(const char *buf)
}
tr->current_trace = t;
- tr->current_trace->enabled = true;
+ tr->current_trace->enabled++;
trace_branch_enable(tr);
out:
mutex_unlock(&trace_types_lock);
@@ -3956,6 +4186,7 @@ static ssize_t
tracing_set_trace_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
+ struct trace_array *tr = filp->private_data;
char buf[MAX_TRACER_SIZE+1];
int i;
size_t ret;
@@ -3975,7 +4206,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
buf[i] = 0;
- err = tracing_set_tracer(buf);
+ err = tracing_set_tracer(tr, buf);
if (err)
return err;
@@ -3985,10 +4216,9 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
}
static ssize_t
-tracing_max_lat_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
+tracing_nsecs_read(unsigned long *ptr, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
- unsigned long *ptr = filp->private_data;
char buf[64];
int r;
@@ -4000,10 +4230,9 @@ tracing_max_lat_read(struct file *filp, char __user *ubuf,
}
static ssize_t
-tracing_max_lat_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
- unsigned long *ptr = filp->private_data;
unsigned long val;
int ret;
@@ -4016,6 +4245,52 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
return cnt;
}
+static ssize_t
+tracing_thresh_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return tracing_nsecs_read(&tracing_thresh, ubuf, cnt, ppos);
+}
+
+static ssize_t
+tracing_thresh_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ int ret;
+
+ mutex_lock(&trace_types_lock);
+ ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos);
+ if (ret < 0)
+ goto out;
+
+ if (tr->current_trace->update_thresh) {
+ ret = tr->current_trace->update_thresh(tr);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = cnt;
+out:
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+}
+
+static ssize_t
+tracing_max_lat_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return tracing_nsecs_read(filp->private_data, ubuf, cnt, ppos);
+}
+
+static ssize_t
+tracing_max_lat_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos);
+}
+
static int tracing_open_pipe(struct inode *inode, struct file *filp)
{
struct trace_array *tr = inode->i_private;
@@ -4133,29 +4408,11 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
return trace_poll(iter, filp, poll_table);
}
-/*
- * This is a make-shift waitqueue.
- * A tracer might use this callback on some rare cases:
- *
- * 1) the current tracer might hold the runqueue lock when it wakes up
- * a reader, hence a deadlock (sched, function, and function graph tracers)
- * 2) the function tracers, trace all functions, we don't want
- * the overhead of calling wake_up and friends
- * (and tracing them too)
- *
- * Anyway, this is really very primitive wakeup.
- */
-void poll_wait_pipe(struct trace_iterator *iter)
-{
- set_current_state(TASK_INTERRUPTIBLE);
- /* sleep for 100 msecs, and try again. */
- schedule_timeout(HZ / 10);
-}
-
/* Must be called with trace_types_lock mutex held. */
static int tracing_wait_pipe(struct file *filp)
{
struct trace_iterator *iter = filp->private_data;
+ int ret;
while (trace_empty(iter)) {
@@ -4163,15 +4420,6 @@ static int tracing_wait_pipe(struct file *filp)
return -EAGAIN;
}
- mutex_unlock(&iter->mutex);
-
- iter->trace->wait_pipe(iter);
-
- mutex_lock(&iter->mutex);
-
- if (signal_pending(current))
- return -EINTR;
-
/*
* We block until we read something and tracing is disabled.
* We still block if tracing is disabled, but we have never
@@ -4183,6 +4431,18 @@ static int tracing_wait_pipe(struct file *filp)
*/
if (!tracing_is_on() && iter->pos)
break;
+
+ mutex_unlock(&iter->mutex);
+
+ ret = wait_on_pipe(iter);
+
+ mutex_lock(&iter->mutex);
+
+ if (ret)
+ return ret;
+
+ if (signal_pending(current))
+ return -EINTR;
}
return 1;
@@ -4300,8 +4560,6 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
static const struct pipe_buf_operations tracing_pipe_buf_ops = {
.can_merge = 0,
- .map = generic_pipe_buf_map,
- .unmap = generic_pipe_buf_unmap,
.confirm = generic_pipe_buf_confirm,
.release = generic_pipe_buf_release,
.steal = generic_pipe_buf_steal,
@@ -4396,7 +4654,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
trace_access_lock(iter->cpu_file);
/* Fill as many pages as possible. */
- for (i = 0, rem = len; i < pipe->buffers && rem; i++) {
+ for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) {
spd.pages[i] = alloc_page(GFP_KERNEL);
if (!spd.pages[i])
break;
@@ -4683,25 +4941,10 @@ static int tracing_clock_show(struct seq_file *m, void *v)
return 0;
}
-static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *fpos)
+static int tracing_set_clock(struct trace_array *tr, const char *clockstr)
{
- struct seq_file *m = filp->private_data;
- struct trace_array *tr = m->private;
- char buf[64];
- const char *clockstr;
int i;
- if (cnt >= sizeof(buf))
- return -EINVAL;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt] = 0;
-
- clockstr = strstrip(buf);
-
for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) {
if (strcmp(trace_clocks[i].name, clockstr) == 0)
break;
@@ -4729,6 +4972,32 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
mutex_unlock(&trace_types_lock);
+ return 0;
+}
+
+static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *fpos)
+{
+ struct seq_file *m = filp->private_data;
+ struct trace_array *tr = m->private;
+ char buf[64];
+ const char *clockstr;
+ int ret;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ clockstr = strstrip(buf);
+
+ ret = tracing_set_clock(tr, clockstr);
+ if (ret)
+ return ret;
+
*fpos += cnt;
return cnt;
@@ -4923,6 +5192,13 @@ static int snapshot_raw_open(struct inode *inode, struct file *filp)
#endif /* CONFIG_TRACER_SNAPSHOT */
+static const struct file_operations tracing_thresh_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_thresh_read,
+ .write = tracing_thresh_write,
+ .llseek = generic_file_llseek,
+};
+
static const struct file_operations tracing_max_lat_fops = {
.open = tracing_open_generic,
.read = tracing_max_lat_read,
@@ -5096,8 +5372,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
goto out_unlock;
}
mutex_unlock(&trace_types_lock);
- iter->trace->wait_pipe(iter);
+ ret = wait_on_pipe(iter);
mutex_lock(&trace_types_lock);
+ if (ret) {
+ size = ret;
+ goto out_unlock;
+ }
if (signal_pending(current)) {
size = -EINTR;
goto out_unlock;
@@ -5178,8 +5458,6 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
/* Pipe buffer operations for a buffer. */
static const struct pipe_buf_operations buffer_pipe_buf_ops = {
.can_merge = 0,
- .map = generic_pipe_buf_map,
- .unmap = generic_pipe_buf_unmap,
.confirm = generic_pipe_buf_confirm,
.release = buffer_pipe_buf_release,
.steal = generic_pipe_buf_steal,
@@ -5255,7 +5533,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
trace_access_lock(iter->cpu_file);
entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
- for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {
+ for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) {
struct page *page;
int r;
@@ -5309,8 +5587,10 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
goto out;
}
mutex_unlock(&trace_types_lock);
- iter->trace->wait_pipe(iter);
+ ret = wait_on_pipe(iter);
mutex_lock(&trace_types_lock);
+ if (ret)
+ goto out;
if (signal_pending(current)) {
ret = -EINTR;
goto out;
@@ -5689,7 +5969,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (!!(topt->flags->val & topt->opt->bit) != val) {
mutex_lock(&trace_types_lock);
- ret = __set_tracer_option(topt->tr->current_trace, topt->flags,
+ ret = __set_tracer_option(topt->tr, topt->flags,
topt->opt, !val);
mutex_unlock(&trace_types_lock);
if (ret)
@@ -5856,10 +6136,8 @@ destroy_trace_option_files(struct trace_option_dentry *topts)
if (!topts)
return;
- for (cnt = 0; topts[cnt].opt; cnt++) {
- if (topts[cnt].entry)
- debugfs_remove(topts[cnt].entry);
- }
+ for (cnt = 0; topts[cnt].opt; cnt++)
+ debugfs_remove(topts[cnt].entry);
kfree(topts);
}
@@ -6003,6 +6281,28 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
return 0;
}
+static void free_trace_buffer(struct trace_buffer *buf)
+{
+ if (buf->buffer) {
+ ring_buffer_free(buf->buffer);
+ buf->buffer = NULL;
+ free_percpu(buf->data);
+ buf->data = NULL;
+ }
+}
+
+static void free_trace_buffers(struct trace_array *tr)
+{
+ if (!tr)
+ return;
+
+ free_trace_buffer(&tr->trace_buffer);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ free_trace_buffer(&tr->max_buffer);
+#endif
+}
+
static int new_instance_create(const char *name)
{
struct trace_array *tr;
@@ -6032,6 +6332,8 @@ static int new_instance_create(const char *name)
raw_spin_lock_init(&tr->start_lock);
+ tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+
tr->current_trace = &nop_trace;
INIT_LIST_HEAD(&tr->systems);
@@ -6059,8 +6361,7 @@ static int new_instance_create(const char *name)
return 0;
out_free_tr:
- if (tr->trace_buffer.buffer)
- ring_buffer_free(tr->trace_buffer.buffer);
+ free_trace_buffers(tr);
free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
kfree(tr);
@@ -6096,10 +6397,11 @@ static int instance_delete(const char *name)
list_del(&tr->list);
+ tracing_set_nop(tr);
event_trace_del_tracer(tr);
+ ftrace_destroy_function_files(tr);
debugfs_remove_recursive(tr->dir);
- free_percpu(tr->trace_buffer.data);
- ring_buffer_free(tr->trace_buffer.buffer);
+ free_trace_buffers(tr);
kfree(tr->name);
kfree(tr);
@@ -6191,6 +6493,12 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
{
int cpu;
+ trace_create_file("available_tracers", 0444, d_tracer,
+ tr, &show_traces_fops);
+
+ trace_create_file("current_tracer", 0644, d_tracer,
+ tr, &set_tracer_fops);
+
trace_create_file("tracing_cpumask", 0644, d_tracer,
tr, &tracing_cpumask_fops);
@@ -6221,6 +6529,14 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("tracing_on", 0644, d_tracer,
tr, &rb_simple_fops);
+#ifdef CONFIG_TRACER_MAX_TRACE
+ trace_create_file("tracing_max_latency", 0644, d_tracer,
+ &tr->max_latency, &tracing_max_lat_fops);
+#endif
+
+ if (ftrace_create_function_files(tr, d_tracer))
+ WARN(1, "Could not allocate function filter files");
+
#ifdef CONFIG_TRACER_SNAPSHOT
trace_create_file("snapshot", 0644, d_tracer,
tr, &snapshot_fops);
@@ -6243,19 +6559,8 @@ static __init int tracer_init_debugfs(void)
init_tracer_debugfs(&global_trace, d_tracer);
- trace_create_file("available_tracers", 0444, d_tracer,
- &global_trace, &show_traces_fops);
-
- trace_create_file("current_tracer", 0644, d_tracer,
- &global_trace, &set_tracer_fops);
-
-#ifdef CONFIG_TRACER_MAX_TRACE
- trace_create_file("tracing_max_latency", 0644, d_tracer,
- &tracing_max_latency, &tracing_max_lat_fops);
-#endif
-
trace_create_file("tracing_thresh", 0644, d_tracer,
- &tracing_thresh, &tracing_max_lat_fops);
+ &global_trace, &tracing_thresh_fops);
trace_create_file("README", 0444, d_tracer,
NULL, &tracing_readme_fops);
@@ -6263,6 +6568,9 @@ static __init int tracer_init_debugfs(void)
trace_create_file("saved_cmdlines", 0444, d_tracer,
NULL, &tracing_saved_cmdlines_fops);
+ trace_create_file("saved_cmdlines_size", 0644, d_tracer,
+ NULL, &tracing_saved_cmdlines_size_fops);
+
#ifdef CONFIG_DYNAMIC_FTRACE
trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
&ftrace_update_tot_cnt, &tracing_dyn_info_fops);
@@ -6494,17 +6802,30 @@ __init static int tracer_alloc_buffers(void)
raw_spin_lock_init(&global_trace.start_lock);
+ /* Used for event triggers */
+ temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE);
+ if (!temp_buffer)
+ goto out_free_cpumask;
+
+ if (trace_create_savedcmd() < 0)
+ goto out_free_temp_buffer;
+
/* TODO: make the number of buffers hot pluggable with CPUS */
if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
WARN_ON(1);
- goto out_free_cpumask;
+ goto out_free_savedcmd;
}
if (global_trace.buffer_disabled)
tracing_off();
- trace_init_cmdlines();
+ if (trace_boot_clock) {
+ ret = tracing_set_clock(&global_trace, trace_boot_clock);
+ if (ret < 0)
+ pr_warning("Trace clock %s not defined, going back to default\n",
+ trace_boot_clock);
+ }
/*
* register_tracer() might reference current_trace, so it
@@ -6513,6 +6834,10 @@ __init static int tracer_alloc_buffers(void)
*/
global_trace.current_trace = &nop_trace;
+ global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+
+ ftrace_init_global_array_ops(&global_trace);
+
register_tracer(&nop_trace);
/* All seems OK, enable tracing */
@@ -6540,11 +6865,11 @@ __init static int tracer_alloc_buffers(void)
return 0;
+out_free_savedcmd:
+ free_saved_cmdlines_buffer(savedcmd);
+out_free_temp_buffer:
+ ring_buffer_free(temp_buffer);
out_free_cpumask:
- free_percpu(global_trace.trace_buffer.data);
-#ifdef CONFIG_TRACER_MAX_TRACE
- free_percpu(global_trace.max_buffer.data);
-#endif
free_cpumask_var(global_trace.tracing_cpumask);
out_free_buffer_mask:
free_cpumask_var(tracing_buffer_mask);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 02b592f2d4b7..385391fb1d3b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -13,6 +13,7 @@
#include <linux/hw_breakpoint.h>
#include <linux/trace_seq.h>
#include <linux/ftrace_event.h>
+#include <linux/compiler.h>
#ifdef CONFIG_FTRACE_SYSCALLS
#include <asm/unistd.h> /* For NR_SYSCALLS */
@@ -189,7 +190,22 @@ struct trace_array {
*/
struct trace_buffer max_buffer;
bool allocated_snapshot;
+ unsigned long max_latency;
#endif
+ /*
+ * max_lock is used to protect the swapping of buffers
+ * when taking a max snapshot. The buffers themselves are
+ * protected by per_cpu spinlocks. But the action of the swap
+ * needs its own lock.
+ *
+ * This is defined as a arch_spinlock_t in order to help
+ * with performance when lockdep debugging is enabled.
+ *
+ * It is also used in other places outside the update_max_tr
+ * so it needs to be defined outside of the
+ * CONFIG_TRACER_MAX_TRACE.
+ */
+ arch_spinlock_t max_lock;
int buffer_disabled;
#ifdef CONFIG_FTRACE_SYSCALLS
int sys_refcount_enter;
@@ -210,6 +226,11 @@ struct trace_array {
struct list_head events;
cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
int ref;
+#ifdef CONFIG_FUNCTION_TRACER
+ struct ftrace_ops *ops;
+ /* function tracing enabled */
+ int function_enabled;
+#endif
};
enum {
@@ -231,6 +252,9 @@ static inline struct trace_array *top_trace_array(void)
{
struct trace_array *tr;
+ if (list_empty(&ftrace_trace_arrays))
+ return NULL;
+
tr = list_entry(ftrace_trace_arrays.prev,
typeof(*tr), list);
WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL));
@@ -315,9 +339,9 @@ struct tracer_flags {
* @reset: called when one switches to another tracer
* @start: called when tracing is unpaused (echo 1 > tracing_enabled)
* @stop: called when tracing is paused (echo 0 > tracing_enabled)
+ * @update_thresh: called when tracing_thresh is updated
* @open: called when the trace file is opened
* @pipe_open: called when the trace_pipe file is opened
- * @wait_pipe: override how the user waits for traces on trace_pipe
* @close: called when the trace file is released
* @pipe_close: called when the trace_pipe file is released
* @read: override the default read callback on trace_pipe
@@ -334,9 +358,9 @@ struct tracer {
void (*reset)(struct trace_array *tr);
void (*start)(struct trace_array *tr);
void (*stop)(struct trace_array *tr);
+ int (*update_thresh)(struct trace_array *tr);
void (*open)(struct trace_iterator *iter);
void (*pipe_open)(struct trace_iterator *iter);
- void (*wait_pipe)(struct trace_iterator *iter);
void (*close)(struct trace_iterator *iter);
void (*pipe_close)(struct trace_iterator *iter);
ssize_t (*read)(struct trace_iterator *iter,
@@ -355,14 +379,16 @@ struct tracer {
void (*print_header)(struct seq_file *m);
enum print_line_t (*print_line)(struct trace_iterator *iter);
/* If you handled the flag setting, return 0 */
- int (*set_flag)(u32 old_flags, u32 bit, int set);
+ int (*set_flag)(struct trace_array *tr,
+ u32 old_flags, u32 bit, int set);
/* Return 0 if OK with change, else return non-zero */
- int (*flag_changed)(struct tracer *tracer,
+ int (*flag_changed)(struct trace_array *tr,
u32 mask, int set);
struct tracer *next;
struct tracer_flags *flags;
+ int enabled;
bool print_max;
- bool enabled;
+ bool allow_instances;
#ifdef CONFIG_TRACER_MAX_TRACE
bool use_max_tr;
#endif
@@ -408,13 +434,7 @@ enum {
TRACE_FTRACE_IRQ_BIT,
TRACE_FTRACE_SIRQ_BIT,
- /* GLOBAL_BITs must be greater than FTRACE_BITs */
- TRACE_GLOBAL_BIT,
- TRACE_GLOBAL_NMI_BIT,
- TRACE_GLOBAL_IRQ_BIT,
- TRACE_GLOBAL_SIRQ_BIT,
-
- /* INTERNAL_BITs must be greater than GLOBAL_BITs */
+ /* INTERNAL_BITs must be greater than FTRACE_BITs */
TRACE_INTERNAL_BIT,
TRACE_INTERNAL_NMI_BIT,
TRACE_INTERNAL_IRQ_BIT,
@@ -441,9 +461,6 @@ enum {
#define TRACE_FTRACE_START TRACE_FTRACE_BIT
#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1)
-#define TRACE_GLOBAL_START TRACE_GLOBAL_BIT
-#define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1)
-
#define TRACE_LIST_START TRACE_INTERNAL_BIT
#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1)
@@ -552,8 +569,6 @@ void trace_init_global_iter(struct trace_iterator *iter);
void tracing_iter_reset(struct trace_iterator *iter, int cpu);
-void poll_wait_pipe(struct trace_iterator *iter);
-
void tracing_sched_switch_trace(struct trace_array *tr,
struct task_struct *prev,
struct task_struct *next,
@@ -600,8 +615,6 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs);
extern unsigned long tracing_thresh;
#ifdef CONFIG_TRACER_MAX_TRACE
-extern unsigned long tracing_max_latency;
-
void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
void update_max_tr_single(struct trace_array *tr,
struct task_struct *tsk, int cpu);
@@ -716,6 +729,8 @@ extern unsigned long trace_flags;
#define TRACE_GRAPH_PRINT_PROC 0x8
#define TRACE_GRAPH_PRINT_DURATION 0x10
#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
+#define TRACE_GRAPH_PRINT_IRQS 0x40
+#define TRACE_GRAPH_PRINT_TAIL 0x80
#define TRACE_GRAPH_PRINT_FILL_SHIFT 28
#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
@@ -812,13 +827,45 @@ static inline int ftrace_trace_task(struct task_struct *task)
return test_tsk_trace_trace(task);
}
extern int ftrace_is_dead(void);
+int ftrace_create_function_files(struct trace_array *tr,
+ struct dentry *parent);
+void ftrace_destroy_function_files(struct trace_array *tr);
+void ftrace_init_global_array_ops(struct trace_array *tr);
+void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);
+void ftrace_reset_array_ops(struct trace_array *tr);
+int using_ftrace_ops_list_func(void);
#else
static inline int ftrace_trace_task(struct task_struct *task)
{
return 1;
}
static inline int ftrace_is_dead(void) { return 0; }
-#endif
+static inline int
+ftrace_create_function_files(struct trace_array *tr,
+ struct dentry *parent)
+{
+ return 0;
+}
+static inline void ftrace_destroy_function_files(struct trace_array *tr) { }
+static inline __init void
+ftrace_init_global_array_ops(struct trace_array *tr) { }
+static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
+/* ftace_func_t type is not defined, use macro instead of static inline */
+#define ftrace_init_array_ops(tr, func) do { } while (0)
+#endif /* CONFIG_FUNCTION_TRACER */
+
+#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE)
+void ftrace_create_filter_files(struct ftrace_ops *ops,
+ struct dentry *parent);
+void ftrace_destroy_filter_files(struct ftrace_ops *ops);
+#else
+/*
+ * The ops parameter passed in is usually undefined.
+ * This must be a macro.
+ */
+#define ftrace_create_filter_files(ops, parent) do { } while (0)
+#define ftrace_destroy_filter_files(ops) do { } while (0)
+#endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */
int ftrace_event_is_function(struct ftrace_event_call *call);
@@ -1249,7 +1296,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
#undef FTRACE_ENTRY
#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
extern struct ftrace_event_call \
- __attribute__((__aligned__(4))) event_##call;
+ __aligned(4) event_##call;
#undef FTRACE_ENTRY_DUP
#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \
FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
new file mode 100644
index 000000000000..40a14cbcf8e0
--- /dev/null
+++ b/kernel/trace/trace_benchmark.c
@@ -0,0 +1,198 @@
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/trace_clock.h>
+
+#define CREATE_TRACE_POINTS
+#include "trace_benchmark.h"
+
+static struct task_struct *bm_event_thread;
+
+static char bm_str[BENCHMARK_EVENT_STRLEN] = "START";
+
+static u64 bm_total;
+static u64 bm_totalsq;
+static u64 bm_last;
+static u64 bm_max;
+static u64 bm_min;
+static u64 bm_first;
+static u64 bm_cnt;
+static u64 bm_stddev;
+static unsigned int bm_avg;
+static unsigned int bm_std;
+
+/*
+ * This gets called in a loop recording the time it took to write
+ * the tracepoint. What it writes is the time statistics of the last
+ * tracepoint write. As there is nothing to write the first time
+ * it simply writes "START". As the first write is cold cache and
+ * the rest is hot, we save off that time in bm_first and it is
+ * reported as "first", which is shown in the second write to the
+ * tracepoint. The "first" field is writen within the statics from
+ * then on but never changes.
+ */
+static void trace_do_benchmark(void)
+{
+ u64 start;
+ u64 stop;
+ u64 delta;
+ u64 stddev;
+ u64 seed;
+ u64 last_seed;
+ unsigned int avg;
+ unsigned int std = 0;
+
+ /* Only run if the tracepoint is actually active */
+ if (!trace_benchmark_event_enabled())
+ return;
+
+ local_irq_disable();
+ start = trace_clock_local();
+ trace_benchmark_event(bm_str);
+ stop = trace_clock_local();
+ local_irq_enable();
+
+ bm_cnt++;
+
+ delta = stop - start;
+
+ /*
+ * The first read is cold cached, keep it separate from the
+ * other calculations.
+ */
+ if (bm_cnt == 1) {
+ bm_first = delta;
+ scnprintf(bm_str, BENCHMARK_EVENT_STRLEN,
+ "first=%llu [COLD CACHED]", bm_first);
+ return;
+ }
+
+ bm_last = delta;
+
+ if (delta > bm_max)
+ bm_max = delta;
+ if (!bm_min || delta < bm_min)
+ bm_min = delta;
+
+ /*
+ * When bm_cnt is greater than UINT_MAX, it breaks the statistics
+ * accounting. Freeze the statistics when that happens.
+ * We should have enough data for the avg and stddev anyway.
+ */
+ if (bm_cnt > UINT_MAX) {
+ scnprintf(bm_str, BENCHMARK_EVENT_STRLEN,
+ "last=%llu first=%llu max=%llu min=%llu ** avg=%u std=%d std^2=%lld",
+ bm_last, bm_first, bm_max, bm_min, bm_avg, bm_std, bm_stddev);
+ return;
+ }
+
+ bm_total += delta;
+ bm_totalsq += delta * delta;
+
+
+ if (bm_cnt > 1) {
+ /*
+ * Apply Welford's method to calculate standard deviation:
+ * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2)
+ */
+ stddev = (u64)bm_cnt * bm_totalsq - bm_total * bm_total;
+ do_div(stddev, (u32)bm_cnt);
+ do_div(stddev, (u32)bm_cnt - 1);
+ } else
+ stddev = 0;
+
+ delta = bm_total;
+ do_div(delta, bm_cnt);
+ avg = delta;
+
+ if (stddev > 0) {
+ int i = 0;
+ /*
+ * stddev is the square of standard deviation but
+ * we want the actualy number. Use the average
+ * as our seed to find the std.
+ *
+ * The next try is:
+ * x = (x + N/x) / 2
+ *
+ * Where N is the squared number to find the square
+ * root of.
+ */
+ seed = avg;
+ do {
+ last_seed = seed;
+ seed = stddev;
+ if (!last_seed)
+ break;
+ do_div(seed, last_seed);
+ seed += last_seed;
+ do_div(seed, 2);
+ } while (i++ < 10 && last_seed != seed);
+
+ std = seed;
+ }
+
+ scnprintf(bm_str, BENCHMARK_EVENT_STRLEN,
+ "last=%llu first=%llu max=%llu min=%llu avg=%u std=%d std^2=%lld",
+ bm_last, bm_first, bm_max, bm_min, avg, std, stddev);
+
+ bm_std = std;
+ bm_avg = avg;
+ bm_stddev = stddev;
+}
+
+static int benchmark_event_kthread(void *arg)
+{
+ /* sleep a bit to make sure the tracepoint gets activated */
+ msleep(100);
+
+ while (!kthread_should_stop()) {
+
+ trace_do_benchmark();
+
+ /*
+ * We don't go to sleep, but let others
+ * run as well.
+ */
+ cond_resched();
+ }
+
+ return 0;
+}
+
+/*
+ * When the benchmark tracepoint is enabled, it calls this
+ * function and the thread that calls the tracepoint is created.
+ */
+void trace_benchmark_reg(void)
+{
+ bm_event_thread = kthread_run(benchmark_event_kthread,
+ NULL, "event_benchmark");
+ WARN_ON(!bm_event_thread);
+}
+
+/*
+ * When the benchmark tracepoint is disabled, it calls this
+ * function and the thread that calls the tracepoint is deleted
+ * and all the numbers are reset.
+ */
+void trace_benchmark_unreg(void)
+{
+ if (!bm_event_thread)
+ return;
+
+ kthread_stop(bm_event_thread);
+
+ strcpy(bm_str, "START");
+ bm_total = 0;
+ bm_totalsq = 0;
+ bm_last = 0;
+ bm_max = 0;
+ bm_min = 0;
+ bm_cnt = 0;
+ /* These don't need to be reset but reset them anyway */
+ bm_first = 0;
+ bm_std = 0;
+ bm_avg = 0;
+ bm_stddev = 0;
+}
diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h
new file mode 100644
index 000000000000..3c1df1df4e29
--- /dev/null
+++ b/kernel/trace/trace_benchmark.h
@@ -0,0 +1,41 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM benchmark
+
+#if !defined(_TRACE_BENCHMARK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_BENCHMARK_H
+
+#include <linux/tracepoint.h>
+
+extern void trace_benchmark_reg(void);
+extern void trace_benchmark_unreg(void);
+
+#define BENCHMARK_EVENT_STRLEN 128
+
+TRACE_EVENT_FN(benchmark_event,
+
+ TP_PROTO(const char *str),
+
+ TP_ARGS(str),
+
+ TP_STRUCT__entry(
+ __array( char, str, BENCHMARK_EVENT_STRLEN )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN);
+ ),
+
+ TP_printk("%s", __entry->str),
+
+ trace_benchmark_reg, trace_benchmark_unreg
+);
+
+#endif /* _TRACE_BENCHMARK_H */
+
+#undef TRACE_INCLUDE_FILE
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_benchmark
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 26dc348332b7..57b67b1f24d1 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -59,13 +59,14 @@ u64 notrace trace_clock(void)
/*
* trace_jiffy_clock(): Simply use jiffies as a clock counter.
+ * Note that this use of jiffies_64 is not completely safe on
+ * 32-bit systems. But the window is tiny, and the effect if
+ * we are affected is that we will have an obviously bogus
+ * timestamp on a trace event - i.e. not life threatening.
*/
u64 notrace trace_clock_jiffies(void)
{
- u64 jiffy = jiffies - INITIAL_JIFFIES;
-
- /* Return nsecs */
- return (u64)jiffies_to_usecs(jiffy) * 1000ULL;
+ return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES);
}
/*
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index e854f420e033..4b9c114ee9de 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -30,10 +30,38 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
return ret;
}
+ /*
+ * We checked and allowed to create parent,
+ * allow children without checking.
+ */
+ if (p_event->parent)
+ return 0;
+
+ /*
+ * It's ok to check current process (owner) permissions in here,
+ * because code below is called only via perf_event_open syscall.
+ */
+
/* The ftrace function trace is allowed only for root. */
- if (ftrace_event_is_function(tp_event) &&
- perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
- return -EPERM;
+ if (ftrace_event_is_function(tp_event)) {
+ if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /*
+ * We don't allow user space callchains for function trace
+ * event, due to issues with page faults while tracing page
+ * fault handler and its overall trickiness nature.
+ */
+ if (!p_event->attr.exclude_callchain_user)
+ return -EINVAL;
+
+ /*
+ * Same reason to disable user stack dump as for user space
+ * callchains above.
+ */
+ if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER)
+ return -EINVAL;
+ }
/* No tracing, just counting, so no obvious leak */
if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
@@ -232,8 +260,8 @@ void perf_trace_del(struct perf_event *p_event, int flags)
tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
}
-__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
- struct pt_regs *regs, int *rctxp)
+void *perf_trace_buf_prepare(int size, unsigned short type,
+ struct pt_regs *regs, int *rctxp)
{
struct trace_entry *entry;
unsigned long flags;
@@ -265,6 +293,7 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
return raw_data;
}
EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
+NOKPROBE_SYMBOL(perf_trace_buf_prepare);
#ifdef CONFIG_FUNCTION_TRACER
static void
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f3989ceb5cd5..ef06ce7e9cf8 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -8,6 +8,8 @@
*
*/
+#define pr_fmt(fmt) fmt
+
#include <linux/workqueue.h>
#include <linux/spinlock.h>
#include <linux/kthread.h>
@@ -27,12 +29,6 @@
DEFINE_MUTEX(event_mutex);
-DEFINE_MUTEX(event_storage_mutex);
-EXPORT_SYMBOL_GPL(event_storage_mutex);
-
-char event_storage[EVENT_STORAGE_SIZE];
-EXPORT_SYMBOL_GPL(event_storage);
-
LIST_HEAD(ftrace_events);
static LIST_HEAD(ftrace_common_fields);
@@ -194,29 +190,60 @@ int trace_event_raw_init(struct ftrace_event_call *call)
}
EXPORT_SYMBOL_GPL(trace_event_raw_init);
+void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
+ struct ftrace_event_file *ftrace_file,
+ unsigned long len)
+{
+ struct ftrace_event_call *event_call = ftrace_file->event_call;
+
+ local_save_flags(fbuffer->flags);
+ fbuffer->pc = preempt_count();
+ fbuffer->ftrace_file = ftrace_file;
+
+ fbuffer->event =
+ trace_event_buffer_lock_reserve(&fbuffer->buffer, ftrace_file,
+ event_call->event.type, len,
+ fbuffer->flags, fbuffer->pc);
+ if (!fbuffer->event)
+ return NULL;
+
+ fbuffer->entry = ring_buffer_event_data(fbuffer->event);
+ return fbuffer->entry;
+}
+EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve);
+
+void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer)
+{
+ event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer,
+ fbuffer->event, fbuffer->entry,
+ fbuffer->flags, fbuffer->pc);
+}
+EXPORT_SYMBOL_GPL(ftrace_event_buffer_commit);
+
int ftrace_event_reg(struct ftrace_event_call *call,
enum trace_reg type, void *data)
{
struct ftrace_event_file *file = data;
+ WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT));
switch (type) {
case TRACE_REG_REGISTER:
- return tracepoint_probe_register(call->name,
+ return tracepoint_probe_register(call->tp,
call->class->probe,
file);
case TRACE_REG_UNREGISTER:
- tracepoint_probe_unregister(call->name,
+ tracepoint_probe_unregister(call->tp,
call->class->probe,
file);
return 0;
#ifdef CONFIG_PERF_EVENTS
case TRACE_REG_PERF_REGISTER:
- return tracepoint_probe_register(call->name,
+ return tracepoint_probe_register(call->tp,
call->class->perf_probe,
call);
case TRACE_REG_PERF_UNREGISTER:
- tracepoint_probe_unregister(call->name,
+ tracepoint_probe_unregister(call->tp,
call->class->perf_probe,
call);
return 0;
@@ -328,7 +355,7 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
if (ret) {
tracing_stop_cmdline_record();
pr_info("event trace: Could not enable event "
- "%s\n", call->name);
+ "%s\n", ftrace_event_name(call));
break;
}
set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
@@ -445,6 +472,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file)
list_del(&file->list);
remove_subsystem(file->system);
+ free_event_filter(file->filter);
kmem_cache_free(file_cachep, file);
}
@@ -457,27 +485,29 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
{
struct ftrace_event_file *file;
struct ftrace_event_call *call;
+ const char *name;
int ret = -EINVAL;
list_for_each_entry(file, &tr->events, list) {
call = file->event_call;
+ name = ftrace_event_name(call);
- if (!call->name || !call->class || !call->class->reg)
+ if (!name || !call->class || !call->class->reg)
continue;
if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
continue;
if (match &&
- strcmp(match, call->name) != 0 &&
+ strcmp(match, name) != 0 &&
strcmp(match, call->class->system) != 0)
continue;
if (sub && strcmp(sub, call->class->system) != 0)
continue;
- if (event && strcmp(event, call->name) != 0)
+ if (event && strcmp(event, name) != 0)
continue;
ftrace_event_enable_disable(file, set);
@@ -547,6 +577,9 @@ int trace_set_clr_event(const char *system, const char *event, int set)
{
struct trace_array *tr = top_trace_array();
+ if (!tr)
+ return -ENODEV;
+
return __ftrace_set_clr_event(tr, NULL, system, event, set);
}
EXPORT_SYMBOL_GPL(trace_set_clr_event);
@@ -675,7 +708,7 @@ static int t_show(struct seq_file *m, void *v)
if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
seq_printf(m, "%s:", call->class->system);
- seq_printf(m, "%s\n", call->name);
+ seq_printf(m, "%s\n", ftrace_event_name(call));
return 0;
}
@@ -768,7 +801,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
mutex_lock(&event_mutex);
list_for_each_entry(file, &tr->events, list) {
call = file->event_call;
- if (!call->name || !call->class || !call->class->reg)
+ if (!ftrace_event_name(call) || !call->class || !call->class->reg)
continue;
if (system && strcmp(call->class->system, system->name) != 0)
@@ -883,7 +916,7 @@ static int f_show(struct seq_file *m, void *v)
switch ((unsigned long)v) {
case FORMAT_HEADER:
- seq_printf(m, "name: %s\n", call->name);
+ seq_printf(m, "name: %s\n", ftrace_event_name(call));
seq_printf(m, "ID: %d\n", call->event.type);
seq_printf(m, "format:\n");
return 0;
@@ -1460,7 +1493,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
dir->entry = debugfs_create_dir(name, parent);
if (!dir->entry) {
- pr_warning("Failed to create system directory %s\n", name);
+ pr_warn("Failed to create system directory %s\n", name);
__put_system(system);
goto out_free;
}
@@ -1476,7 +1509,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
if (!entry) {
kfree(system->filter);
system->filter = NULL;
- pr_warning("Could not create debugfs '%s/filter' entry\n", name);
+ pr_warn("Could not create debugfs '%s/filter' entry\n", name);
}
trace_create_file("enable", 0644, dir->entry, dir,
@@ -1491,8 +1524,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
out_fail:
/* Only print this message if failed on memory allocation */
if (!dir || !system)
- pr_warning("No memory to create event subsystem %s\n",
- name);
+ pr_warn("No memory to create event subsystem %s\n", name);
return NULL;
}
@@ -1503,6 +1535,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
struct trace_array *tr = file->tr;
struct list_head *head;
struct dentry *d_events;
+ const char *name;
int ret;
/*
@@ -1516,10 +1549,10 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
} else
d_events = parent;
- file->dir = debugfs_create_dir(call->name, d_events);
+ name = ftrace_event_name(call);
+ file->dir = debugfs_create_dir(name, d_events);
if (!file->dir) {
- pr_warning("Could not create debugfs '%s' directory\n",
- call->name);
+ pr_warn("Could not create debugfs '%s' directory\n", name);
return -1;
}
@@ -1542,8 +1575,8 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
if (list_empty(head)) {
ret = call->class->define_fields(call);
if (ret < 0) {
- pr_warning("Could not initialize trace point"
- " events/%s\n", call->name);
+ pr_warn("Could not initialize trace point events/%s\n",
+ name);
return -1;
}
}
@@ -1588,7 +1621,6 @@ static void event_remove(struct ftrace_event_call *call)
if (file->event_call != call)
continue;
ftrace_event_enable_disable(file, 0);
- destroy_preds(file);
/*
* The do_for_each_event_file() is
* a double loop. After finding the call for this
@@ -1607,15 +1639,16 @@ static void event_remove(struct ftrace_event_call *call)
static int event_init(struct ftrace_event_call *call)
{
int ret = 0;
+ const char *name;
- if (WARN_ON(!call->name))
+ name = ftrace_event_name(call);
+ if (WARN_ON(!name))
return -EINVAL;
if (call->class->raw_init) {
ret = call->class->raw_init(call);
if (ret < 0 && ret != -ENOSYS)
- pr_warn("Could not initialize trace events/%s\n",
- call->name);
+ pr_warn("Could not initialize trace events/%s\n", name);
}
return ret;
@@ -1714,7 +1747,8 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)
{
event_remove(call);
trace_destroy_fields(call);
- destroy_call_preds(call);
+ free_event_filter(call->filter);
+ call->filter = NULL;
}
static int probe_remove_event_call(struct ftrace_event_call *call)
@@ -1860,8 +1894,8 @@ __trace_add_event_dirs(struct trace_array *tr)
list_for_each_entry(call, &ftrace_events, list) {
ret = __trace_add_new_event(call, tr);
if (ret < 0)
- pr_warning("Could not create directory for event %s\n",
- call->name);
+ pr_warn("Could not create directory for event %s\n",
+ ftrace_event_name(call));
}
}
@@ -1870,18 +1904,20 @@ find_event_file(struct trace_array *tr, const char *system, const char *event)
{
struct ftrace_event_file *file;
struct ftrace_event_call *call;
+ const char *name;
list_for_each_entry(file, &tr->events, list) {
call = file->event_call;
+ name = ftrace_event_name(call);
- if (!call->name || !call->class || !call->class->reg)
+ if (!name || !call->class || !call->class->reg)
continue;
if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
continue;
- if (strcmp(event, call->name) == 0 &&
+ if (strcmp(event, name) == 0 &&
strcmp(system, call->class->system) == 0)
return file;
}
@@ -1949,7 +1985,7 @@ event_enable_print(struct seq_file *m, unsigned long ip,
seq_printf(m, "%s:%s:%s",
data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
data->file->event_call->class->system,
- data->file->event_call->name);
+ ftrace_event_name(data->file->event_call));
if (data->count == -1)
seq_printf(m, ":unlimited\n");
@@ -2032,6 +2068,9 @@ event_enable_func(struct ftrace_hash *hash,
bool enable;
int ret;
+ if (!tr)
+ return -ENODEV;
+
/* hash funcs only work with set_ftrace_filter */
if (!enabled || !param)
return -EINVAL;
@@ -2168,8 +2207,8 @@ __trace_early_add_event_dirs(struct trace_array *tr)
list_for_each_entry(file, &tr->events, list) {
ret = event_create_dir(tr->event_dir, file);
if (ret < 0)
- pr_warning("Could not create directory for event %s\n",
- file->event_call->name);
+ pr_warn("Could not create directory for event %s\n",
+ ftrace_event_name(file->event_call));
}
}
@@ -2192,8 +2231,8 @@ __trace_early_add_events(struct trace_array *tr)
ret = __trace_early_add_new_event(call, tr);
if (ret < 0)
- pr_warning("Could not create early event %s\n",
- call->name);
+ pr_warn("Could not create early event %s\n",
+ ftrace_event_name(call));
}
}
@@ -2240,13 +2279,13 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
entry = debugfs_create_file("set_event", 0644, parent,
tr, &ftrace_set_event_fops);
if (!entry) {
- pr_warning("Could not create debugfs 'set_event' entry\n");
+ pr_warn("Could not create debugfs 'set_event' entry\n");
return -ENOMEM;
}
d_events = debugfs_create_dir("events", parent);
if (!d_events) {
- pr_warning("Could not create debugfs 'events' directory\n");
+ pr_warn("Could not create debugfs 'events' directory\n");
return -ENOMEM;
}
@@ -2363,6 +2402,9 @@ static __init int event_trace_enable(void)
char *token;
int ret;
+ if (!tr)
+ return -ENODEV;
+
for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) {
call = *iter;
@@ -2409,6 +2451,8 @@ static __init int event_trace_init(void)
int ret;
tr = top_trace_array();
+ if (!tr)
+ return -ENODEV;
d_tracer = tracing_init_dentry();
if (!d_tracer)
@@ -2417,11 +2461,10 @@ static __init int event_trace_init(void)
entry = debugfs_create_file("available_events", 0444, d_tracer,
tr, &ftrace_avail_fops);
if (!entry)
- pr_warning("Could not create debugfs "
- "'available_events' entry\n");
+ pr_warn("Could not create debugfs 'available_events' entry\n");
if (trace_define_common_fields())
- pr_warning("tracing: Failed to allocate common fields");
+ pr_warn("tracing: Failed to allocate common fields");
ret = early_event_add_tracer(d_tracer, tr);
if (ret)
@@ -2430,7 +2473,7 @@ static __init int event_trace_init(void)
#ifdef CONFIG_MODULES
ret = register_module_notifier(&trace_module_nb);
if (ret)
- pr_warning("Failed to register trace events module notifier\n");
+ pr_warn("Failed to register trace events module notifier\n");
#endif
return 0;
}
@@ -2502,6 +2545,8 @@ static __init void event_trace_self_tests(void)
int ret;
tr = top_trace_array();
+ if (!tr)
+ return;
pr_info("Running tests on trace events:\n");
@@ -2525,14 +2570,14 @@ static __init void event_trace_self_tests(void)
continue;
#endif
- pr_info("Testing event %s: ", call->name);
+ pr_info("Testing event %s: ", ftrace_event_name(call));
/*
* If an event is already enabled, someone is using
* it and the self test should not be on.
*/
if (file->flags & FTRACE_EVENT_FL_ENABLED) {
- pr_warning("Enabled event during self test!\n");
+ pr_warn("Enabled event during self test!\n");
WARN_ON_ONCE(1);
continue;
}
@@ -2560,8 +2605,8 @@ static __init void event_trace_self_tests(void)
ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);
if (WARN_ON_ONCE(ret)) {
- pr_warning("error enabling system %s\n",
- system->name);
+ pr_warn("error enabling system %s\n",
+ system->name);
continue;
}
@@ -2569,8 +2614,8 @@ static __init void event_trace_self_tests(void)
ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0);
if (WARN_ON_ONCE(ret)) {
- pr_warning("error disabling system %s\n",
- system->name);
+ pr_warn("error disabling system %s\n",
+ system->name);
continue;
}
@@ -2584,7 +2629,7 @@ static __init void event_trace_self_tests(void)
ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);
if (WARN_ON_ONCE(ret)) {
- pr_warning("error enabling all events\n");
+ pr_warn("error enabling all events\n");
return;
}
@@ -2593,7 +2638,7 @@ static __init void event_trace_self_tests(void)
/* reset sysname */
ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
if (WARN_ON_ONCE(ret)) {
- pr_warning("error disabling all events\n");
+ pr_warn("error disabling all events\n");
return;
}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 8a8631926a07..7a8c1528e141 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -774,17 +774,12 @@ static void __free_preds(struct event_filter *filter)
filter->n_preds = 0;
}
-static void call_filter_disable(struct ftrace_event_call *call)
-{
- call->flags &= ~TRACE_EVENT_FL_FILTERED;
-}
-
static void filter_disable(struct ftrace_event_file *file)
{
struct ftrace_event_call *call = file->event_call;
if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
- call_filter_disable(call);
+ call->flags &= ~TRACE_EVENT_FL_FILTERED;
else
file->flags &= ~FTRACE_EVENT_FL_FILTERED;
}
@@ -804,32 +799,6 @@ void free_event_filter(struct event_filter *filter)
__free_filter(filter);
}
-void destroy_call_preds(struct ftrace_event_call *call)
-{
- __free_filter(call->filter);
- call->filter = NULL;
-}
-
-static void destroy_file_preds(struct ftrace_event_file *file)
-{
- __free_filter(file->filter);
- file->filter = NULL;
-}
-
-/*
- * Called when destroying the ftrace_event_file.
- * The file is being freed, so we do not need to worry about
- * the file being currently used. This is for module code removing
- * the tracepoints from within it.
- */
-void destroy_preds(struct ftrace_event_file *file)
-{
- if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
- destroy_call_preds(file->event_call);
- else
- destroy_file_preds(file);
-}
-
static struct event_filter *__alloc_filter(void)
{
struct event_filter *filter;
@@ -873,17 +842,14 @@ static inline void __remove_filter(struct ftrace_event_file *file)
remove_filter_string(file->filter);
}
-static void filter_free_subsystem_preds(struct event_subsystem *system,
+static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir,
struct trace_array *tr)
{
struct ftrace_event_file *file;
- struct ftrace_event_call *call;
list_for_each_entry(file, &tr->events, list) {
- call = file->event_call;
- if (strcmp(call->class->system, system->name) != 0)
+ if (file->system != dir)
continue;
-
__remove_filter(file);
}
}
@@ -901,15 +867,13 @@ static inline void __free_subsystem_filter(struct ftrace_event_file *file)
}
}
-static void filter_free_subsystem_filters(struct event_subsystem *system,
+static void filter_free_subsystem_filters(struct ftrace_subsystem_dir *dir,
struct trace_array *tr)
{
struct ftrace_event_file *file;
- struct ftrace_event_call *call;
list_for_each_entry(file, &tr->events, list) {
- call = file->event_call;
- if (strcmp(call->class->system, system->name) != 0)
+ if (file->system != dir)
continue;
__free_subsystem_filter(file);
}
@@ -1582,7 +1546,6 @@ static int fold_pred_tree(struct event_filter *filter,
static int replace_preds(struct ftrace_event_call *call,
struct event_filter *filter,
struct filter_parse_state *ps,
- char *filter_string,
bool dry_run)
{
char *operand1 = NULL, *operand2 = NULL;
@@ -1755,13 +1718,12 @@ struct filter_list {
struct event_filter *filter;
};
-static int replace_system_preds(struct event_subsystem *system,
+static int replace_system_preds(struct ftrace_subsystem_dir *dir,
struct trace_array *tr,
struct filter_parse_state *ps,
char *filter_string)
{
struct ftrace_event_file *file;
- struct ftrace_event_call *call;
struct filter_list *filter_item;
struct filter_list *tmp;
LIST_HEAD(filter_list);
@@ -1769,15 +1731,14 @@ static int replace_system_preds(struct event_subsystem *system,
int err;
list_for_each_entry(file, &tr->events, list) {
- call = file->event_call;
- if (strcmp(call->class->system, system->name) != 0)
+ if (file->system != dir)
continue;
/*
* Try to see if the filter can be applied
* (filter arg is ignored on dry_run)
*/
- err = replace_preds(call, NULL, ps, filter_string, true);
+ err = replace_preds(file->event_call, NULL, ps, true);
if (err)
event_set_no_set_filter_flag(file);
else
@@ -1787,9 +1748,7 @@ static int replace_system_preds(struct event_subsystem *system,
list_for_each_entry(file, &tr->events, list) {
struct event_filter *filter;
- call = file->event_call;
-
- if (strcmp(call->class->system, system->name) != 0)
+ if (file->system != dir)
continue;
if (event_no_set_filter_flag(file))
@@ -1811,7 +1770,7 @@ static int replace_system_preds(struct event_subsystem *system,
if (err)
goto fail_mem;
- err = replace_preds(call, filter, ps, filter_string, false);
+ err = replace_preds(file->event_call, filter, ps, false);
if (err) {
filter_disable(file);
parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
@@ -1933,7 +1892,7 @@ static int create_filter(struct ftrace_event_call *call,
err = create_filter_start(filter_str, set_str, &ps, &filter);
if (!err) {
- err = replace_preds(call, filter, ps, filter_str, false);
+ err = replace_preds(call, filter, ps, false);
if (err && set_str)
append_filter_err(ps, filter);
}
@@ -1959,7 +1918,7 @@ int create_event_filter(struct ftrace_event_call *call,
* Identical to create_filter() except that it creates a subsystem filter
* and always remembers @filter_str.
*/
-static int create_system_filter(struct event_subsystem *system,
+static int create_system_filter(struct ftrace_subsystem_dir *dir,
struct trace_array *tr,
char *filter_str, struct event_filter **filterp)
{
@@ -1969,7 +1928,7 @@ static int create_system_filter(struct event_subsystem *system,
err = create_filter_start(filter_str, true, &ps, &filter);
if (!err) {
- err = replace_system_preds(system, tr, ps, filter_str);
+ err = replace_system_preds(dir, tr, ps, filter_str);
if (!err) {
/* System filters just show a default message */
kfree(filter->filter_string);
@@ -2053,18 +2012,18 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
}
if (!strcmp(strstrip(filter_string), "0")) {
- filter_free_subsystem_preds(system, tr);
+ filter_free_subsystem_preds(dir, tr);
remove_filter_string(system->filter);
filter = system->filter;
system->filter = NULL;
/* Ensure all filters are no longer used */
synchronize_sched();
- filter_free_subsystem_filters(system, tr);
+ filter_free_subsystem_filters(dir, tr);
__free_filter(filter);
goto out_unlock;
}
- err = create_system_filter(system, tr, filter_string, &filter);
+ err = create_system_filter(dir, tr, filter_string, &filter);
if (filter) {
/*
* No event actually uses the system filter
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 8efbb69b04f0..4747b476a030 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -77,7 +77,7 @@ event_triggers_call(struct ftrace_event_file *file, void *rec)
data->ops->func(data);
continue;
}
- filter = rcu_dereference(data->filter);
+ filter = rcu_dereference_sched(data->filter);
if (filter && !filter_match_preds(filter, rec))
continue;
if (data->cmd_ops->post_trigger) {
@@ -1095,7 +1095,7 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
seq_printf(m, "%s:%s:%s",
enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
enable_data->file->event_call->class->system,
- enable_data->file->event_call->name);
+ ftrace_event_name(enable_data->file->event_call));
if (data->count == -1)
seq_puts(m, ":unlimited");
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 7c3e3e72e2b6..d4ddde28a81a 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -95,15 +95,12 @@ static void __always_unused ____ftrace_check_##name(void) \
#undef __array
#define __array(type, item, len) \
do { \
+ char *type_str = #type"["__stringify(len)"]"; \
BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
- mutex_lock(&event_storage_mutex); \
- snprintf(event_storage, sizeof(event_storage), \
- "%s[%d]", #type, len); \
- ret = trace_define_field(event_call, event_storage, #item, \
+ ret = trace_define_field(event_call, type_str, #item, \
offsetof(typeof(field), item), \
sizeof(field.item), \
is_signed_type(type), filter_type); \
- mutex_unlock(&event_storage_mutex); \
if (ret) \
return ret; \
} while (0);
@@ -176,9 +173,11 @@ struct ftrace_event_class __refdata event_class_ftrace_##call = { \
}; \
\
struct ftrace_event_call __used event_##call = { \
- .name = #call, \
- .event.type = etype, \
.class = &event_class_ftrace_##call, \
+ { \
+ .name = #call, \
+ }, \
+ .event.type = etype, \
.print_fmt = print, \
.flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \
}; \
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 38fe1483c508..57f0ec962d2c 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -13,33 +13,106 @@
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
+#include <linux/slab.h>
#include <linux/fs.h>
#include "trace.h"
-/* function tracing enabled */
-static int ftrace_function_enabled;
+static void tracing_start_function_trace(struct trace_array *tr);
+static void tracing_stop_function_trace(struct trace_array *tr);
+static void
+function_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs);
+static void
+function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs);
+static struct tracer_flags func_flags;
+
+/* Our option */
+enum {
+ TRACE_FUNC_OPT_STACK = 0x1,
+};
+
+static int allocate_ftrace_ops(struct trace_array *tr)
+{
+ struct ftrace_ops *ops;
-static struct trace_array *func_trace;
+ ops = kzalloc(sizeof(*ops), GFP_KERNEL);
+ if (!ops)
+ return -ENOMEM;
+
+ /* Currently only the non stack verision is supported */
+ ops->func = function_trace_call;
+ ops->flags = FTRACE_OPS_FL_RECURSION_SAFE;
+
+ tr->ops = ops;
+ ops->private = tr;
+ return 0;
+}
-static void tracing_start_function_trace(void);
-static void tracing_stop_function_trace(void);
+
+int ftrace_create_function_files(struct trace_array *tr,
+ struct dentry *parent)
+{
+ int ret;
+
+ /*
+ * The top level array uses the "global_ops", and the files are
+ * created on boot up.
+ */
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
+ return 0;
+
+ ret = allocate_ftrace_ops(tr);
+ if (ret)
+ return ret;
+
+ ftrace_create_filter_files(tr->ops, parent);
+
+ return 0;
+}
+
+void ftrace_destroy_function_files(struct trace_array *tr)
+{
+ ftrace_destroy_filter_files(tr->ops);
+ kfree(tr->ops);
+ tr->ops = NULL;
+}
static int function_trace_init(struct trace_array *tr)
{
- func_trace = tr;
+ ftrace_func_t func;
+
+ /*
+ * Instance trace_arrays get their ops allocated
+ * at instance creation. Unless it failed
+ * the allocation.
+ */
+ if (!tr->ops)
+ return -ENOMEM;
+
+ /* Currently only the global instance can do stack tracing */
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
+ func_flags.val & TRACE_FUNC_OPT_STACK)
+ func = function_stack_trace_call;
+ else
+ func = function_trace_call;
+
+ ftrace_init_array_ops(tr, func);
+
tr->trace_buffer.cpu = get_cpu();
put_cpu();
tracing_start_cmdline_record();
- tracing_start_function_trace();
+ tracing_start_function_trace(tr);
return 0;
}
static void function_trace_reset(struct trace_array *tr)
{
- tracing_stop_function_trace();
+ tracing_stop_function_trace(tr);
tracing_stop_cmdline_record();
+ ftrace_reset_array_ops(tr);
}
static void function_trace_start(struct trace_array *tr)
@@ -47,25 +120,18 @@ static void function_trace_start(struct trace_array *tr)
tracing_reset_online_cpus(&tr->trace_buffer);
}
-/* Our option */
-enum {
- TRACE_FUNC_OPT_STACK = 0x1,
-};
-
-static struct tracer_flags func_flags;
-
static void
function_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
{
- struct trace_array *tr = func_trace;
+ struct trace_array *tr = op->private;
struct trace_array_cpu *data;
unsigned long flags;
int bit;
int cpu;
int pc;
- if (unlikely(!ftrace_function_enabled))
+ if (unlikely(!tr->function_enabled))
return;
pc = preempt_count();
@@ -91,14 +157,14 @@ static void
function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
{
- struct trace_array *tr = func_trace;
+ struct trace_array *tr = op->private;
struct trace_array_cpu *data;
unsigned long flags;
long disabled;
int cpu;
int pc;
- if (unlikely(!ftrace_function_enabled))
+ if (unlikely(!tr->function_enabled))
return;
/*
@@ -128,19 +194,6 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
local_irq_restore(flags);
}
-
-static struct ftrace_ops trace_ops __read_mostly =
-{
- .func = function_trace_call,
- .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
-};
-
-static struct ftrace_ops trace_stack_ops __read_mostly =
-{
- .func = function_stack_trace_call,
- .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
-};
-
static struct tracer_opt func_opts[] = {
#ifdef CONFIG_STACKTRACE
{ TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
@@ -153,29 +206,21 @@ static struct tracer_flags func_flags = {
.opts = func_opts
};
-static void tracing_start_function_trace(void)
+static void tracing_start_function_trace(struct trace_array *tr)
{
- ftrace_function_enabled = 0;
-
- if (func_flags.val & TRACE_FUNC_OPT_STACK)
- register_ftrace_function(&trace_stack_ops);
- else
- register_ftrace_function(&trace_ops);
-
- ftrace_function_enabled = 1;
+ tr->function_enabled = 0;
+ register_ftrace_function(tr->ops);
+ tr->function_enabled = 1;
}
-static void tracing_stop_function_trace(void)
+static void tracing_stop_function_trace(struct trace_array *tr)
{
- ftrace_function_enabled = 0;
-
- if (func_flags.val & TRACE_FUNC_OPT_STACK)
- unregister_ftrace_function(&trace_stack_ops);
- else
- unregister_ftrace_function(&trace_ops);
+ tr->function_enabled = 0;
+ unregister_ftrace_function(tr->ops);
}
-static int func_set_flag(u32 old_flags, u32 bit, int set)
+static int
+func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
switch (bit) {
case TRACE_FUNC_OPT_STACK:
@@ -183,12 +228,14 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
break;
+ unregister_ftrace_function(tr->ops);
+
if (set) {
- unregister_ftrace_function(&trace_ops);
- register_ftrace_function(&trace_stack_ops);
+ tr->ops->func = function_stack_trace_call;
+ register_ftrace_function(tr->ops);
} else {
- unregister_ftrace_function(&trace_stack_ops);
- register_ftrace_function(&trace_ops);
+ tr->ops->func = function_trace_call;
+ register_ftrace_function(tr->ops);
}
break;
@@ -205,9 +252,9 @@ static struct tracer function_trace __tracer_data =
.init = function_trace_init,
.reset = function_trace_reset,
.start = function_trace_start,
- .wait_pipe = poll_wait_pipe,
.flags = &func_flags,
.set_flag = func_set_flag,
+ .allow_instances = true,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_function,
#endif
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 0b99120d395c..f0a0c982cde3 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -15,6 +15,33 @@
#include "trace.h"
#include "trace_output.h"
+static bool kill_ftrace_graph;
+
+/**
+ * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called
+ *
+ * ftrace_graph_stop() is called when a severe error is detected in
+ * the function graph tracing. This function is called by the critical
+ * paths of function graph to keep those paths from doing any more harm.
+ */
+bool ftrace_graph_is_dead(void)
+{
+ return kill_ftrace_graph;
+}
+
+/**
+ * ftrace_graph_stop - set to permanently disable function graph tracincg
+ *
+ * In case of an error int function graph tracing, this is called
+ * to try to keep function graph tracing from causing any more harm.
+ * Usually this is pretty severe and this is called to try to at least
+ * get a warning out to the user.
+ */
+void ftrace_graph_stop(void)
+{
+ kill_ftrace_graph = true;
+}
+
/* When set, irq functions will be ignored */
static int ftrace_graph_skip_irqs;
@@ -38,15 +65,6 @@ struct fgraph_data {
#define TRACE_GRAPH_INDENT 2
-/* Flag options */
-#define TRACE_GRAPH_PRINT_OVERRUN 0x1
-#define TRACE_GRAPH_PRINT_CPU 0x2
-#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
-#define TRACE_GRAPH_PRINT_PROC 0x8
-#define TRACE_GRAPH_PRINT_DURATION 0x10
-#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
-#define TRACE_GRAPH_PRINT_IRQS 0x40
-
static unsigned int max_depth;
static struct tracer_opt trace_opts[] = {
@@ -64,11 +82,13 @@ static struct tracer_opt trace_opts[] = {
{ TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
/* Display interrupts */
{ TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
+ /* Display function name after trailing } */
+ { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) },
{ } /* Empty entry */
};
static struct tracer_flags tracer_flags = {
- /* Don't display overruns and proc by default */
+ /* Don't display overruns, proc, or tail by default */
.val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,
.opts = trace_opts
@@ -99,6 +119,9 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
unsigned long long calltime;
int index;
+ if (unlikely(ftrace_graph_is_dead()))
+ return -EBUSY;
+
if (!current->ret_stack)
return -EBUSY;
@@ -330,7 +353,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
return ret;
}
-int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
+static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
{
if (tracing_thresh)
return 1;
@@ -419,7 +442,7 @@ void set_graph_array(struct trace_array *tr)
smp_mb();
}
-void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
+static void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
{
if (tracing_thresh &&
(trace->rettime - trace->calltime < tracing_thresh))
@@ -452,6 +475,12 @@ static void graph_trace_reset(struct trace_array *tr)
unregister_ftrace_graph();
}
+static int graph_trace_update_thresh(struct trace_array *tr)
+{
+ graph_trace_reset(tr);
+ return graph_trace_init(tr);
+}
+
static int max_bytes_for_cpu;
static enum print_line_t
@@ -1176,9 +1205,10 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
* If the return function does not have a matching entry,
* then the entry was lost. Instead of just printing
* the '}' and letting the user guess what function this
- * belongs to, write out the function name.
+ * belongs to, write out the function name. Always do
+ * that if the funcgraph-tail option is enabled.
*/
- if (func_match) {
+ if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) {
ret = trace_seq_puts(s, "}\n");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -1405,7 +1435,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
seq_printf(s, " | | | |\n");
}
-void print_graph_headers(struct seq_file *s)
+static void print_graph_headers(struct seq_file *s)
{
print_graph_headers_flags(s, tracer_flags.val);
}
@@ -1476,7 +1506,8 @@ void graph_trace_close(struct trace_iterator *iter)
}
}
-static int func_graph_set_flag(u32 old_flags, u32 bit, int set)
+static int
+func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
if (bit == TRACE_GRAPH_PRINT_IRQS)
ftrace_graph_skip_irqs = !set;
@@ -1500,11 +1531,11 @@ static struct trace_event graph_trace_ret_event = {
static struct tracer graph_trace __tracer_data = {
.name = "function_graph",
+ .update_thresh = graph_trace_update_thresh,
.open = graph_trace_open,
.pipe_open = graph_trace_open,
.close = graph_trace_close,
.pipe_close = graph_trace_close,
- .wait_pipe = poll_wait_pipe,
.init = graph_trace_init,
.reset = graph_trace_reset,
.print_line = print_graph_function,
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 2aefbee93a6d..9bb104f748d0 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -151,16 +151,11 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,
atomic_dec(&data->disabled);
}
-
-static struct ftrace_ops trace_ops __read_mostly =
-{
- .func = irqsoff_tracer_call,
- .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
-};
#endif /* CONFIG_FUNCTION_TRACER */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
+static int
+irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
int cpu;
@@ -175,7 +170,7 @@ static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
for_each_possible_cpu(cpu)
per_cpu(tracing_cpu, cpu) = 0;
- tracing_max_latency = 0;
+ tr->max_latency = 0;
tracing_reset_online_cpus(&irqsoff_trace->trace_buffer);
return start_irqsoff_tracer(irqsoff_trace, set);
@@ -266,7 +261,8 @@ __trace_function(struct trace_array *tr,
#else
#define __trace_function trace_function
-static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
+static int
+irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
return -EINVAL;
}
@@ -301,13 +297,13 @@ static void irqsoff_print_header(struct seq_file *s)
/*
* Should this new latency be reported/recorded?
*/
-static int report_latency(cycle_t delta)
+static int report_latency(struct trace_array *tr, cycle_t delta)
{
if (tracing_thresh) {
if (delta < tracing_thresh)
return 0;
} else {
- if (delta <= tracing_max_latency)
+ if (delta <= tr->max_latency)
return 0;
}
return 1;
@@ -331,13 +327,13 @@ check_critical_timing(struct trace_array *tr,
pc = preempt_count();
- if (!report_latency(delta))
+ if (!report_latency(tr, delta))
goto out;
raw_spin_lock_irqsave(&max_trace_lock, flags);
/* check if we are still the max latency */
- if (!report_latency(delta))
+ if (!report_latency(tr, delta))
goto out_unlock;
__trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
@@ -350,7 +346,7 @@ check_critical_timing(struct trace_array *tr,
data->critical_end = parent_ip;
if (likely(!is_tracing_stopped())) {
- tracing_max_latency = delta;
+ tr->max_latency = delta;
update_max_tr_single(tr, current, cpu);
}
@@ -498,14 +494,14 @@ void trace_hardirqs_off(void)
}
EXPORT_SYMBOL(trace_hardirqs_off);
-void trace_hardirqs_on_caller(unsigned long caller_addr)
+__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
{
if (!preempt_trace() && irq_trace())
stop_critical_timing(CALLER_ADDR0, caller_addr);
}
EXPORT_SYMBOL(trace_hardirqs_on_caller);
-void trace_hardirqs_off_caller(unsigned long caller_addr)
+__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
{
if (!preempt_trace() && irq_trace())
start_critical_timing(CALLER_ADDR0, caller_addr);
@@ -529,7 +525,7 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
}
#endif /* CONFIG_PREEMPT_TRACER */
-static int register_irqsoff_function(int graph, int set)
+static int register_irqsoff_function(struct trace_array *tr, int graph, int set)
{
int ret;
@@ -541,7 +537,7 @@ static int register_irqsoff_function(int graph, int set)
ret = register_ftrace_graph(&irqsoff_graph_return,
&irqsoff_graph_entry);
else
- ret = register_ftrace_function(&trace_ops);
+ ret = register_ftrace_function(tr->ops);
if (!ret)
function_enabled = true;
@@ -549,7 +545,7 @@ static int register_irqsoff_function(int graph, int set)
return ret;
}
-static void unregister_irqsoff_function(int graph)
+static void unregister_irqsoff_function(struct trace_array *tr, int graph)
{
if (!function_enabled)
return;
@@ -557,23 +553,25 @@ static void unregister_irqsoff_function(int graph)
if (graph)
unregister_ftrace_graph();
else
- unregister_ftrace_function(&trace_ops);
+ unregister_ftrace_function(tr->ops);
function_enabled = false;
}
-static void irqsoff_function_set(int set)
+static void irqsoff_function_set(struct trace_array *tr, int set)
{
if (set)
- register_irqsoff_function(is_graph(), 1);
+ register_irqsoff_function(tr, is_graph(), 1);
else
- unregister_irqsoff_function(is_graph());
+ unregister_irqsoff_function(tr, is_graph());
}
-static int irqsoff_flag_changed(struct tracer *tracer, u32 mask, int set)
+static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set)
{
+ struct tracer *tracer = tr->current_trace;
+
if (mask & TRACE_ITER_FUNCTION)
- irqsoff_function_set(set);
+ irqsoff_function_set(tr, set);
return trace_keep_overwrite(tracer, mask, set);
}
@@ -582,7 +580,7 @@ static int start_irqsoff_tracer(struct trace_array *tr, int graph)
{
int ret;
- ret = register_irqsoff_function(graph, 0);
+ ret = register_irqsoff_function(tr, graph, 0);
if (!ret && tracing_is_enabled())
tracer_enabled = 1;
@@ -596,25 +594,37 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
{
tracer_enabled = 0;
- unregister_irqsoff_function(graph);
+ unregister_irqsoff_function(tr, graph);
}
-static void __irqsoff_tracer_init(struct trace_array *tr)
+static bool irqsoff_busy;
+
+static int __irqsoff_tracer_init(struct trace_array *tr)
{
+ if (irqsoff_busy)
+ return -EBUSY;
+
save_flags = trace_flags;
/* non overwrite screws up the latency tracers */
set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
- tracing_max_latency = 0;
+ tr->max_latency = 0;
irqsoff_trace = tr;
/* make sure that the tracer is visible */
smp_wmb();
tracing_reset_online_cpus(&tr->trace_buffer);
- if (start_irqsoff_tracer(tr, is_graph()))
+ ftrace_init_array_ops(tr, irqsoff_tracer_call);
+
+ /* Only toplevel instance supports graph tracing */
+ if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
+ is_graph())))
printk(KERN_ERR "failed to start irqsoff tracer\n");
+
+ irqsoff_busy = true;
+ return 0;
}
static void irqsoff_tracer_reset(struct trace_array *tr)
@@ -626,6 +636,9 @@ static void irqsoff_tracer_reset(struct trace_array *tr)
set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
+ ftrace_reset_array_ops(tr);
+
+ irqsoff_busy = false;
}
static void irqsoff_tracer_start(struct trace_array *tr)
@@ -643,8 +656,7 @@ static int irqsoff_tracer_init(struct trace_array *tr)
{
trace_type = TRACER_IRQS_OFF;
- __irqsoff_tracer_init(tr);
- return 0;
+ return __irqsoff_tracer_init(tr);
}
static struct tracer irqsoff_tracer __read_mostly =
{
@@ -664,6 +676,7 @@ static struct tracer irqsoff_tracer __read_mostly =
#endif
.open = irqsoff_trace_open,
.close = irqsoff_trace_close,
+ .allow_instances = true,
.use_max_tr = true,
};
# define register_irqsoff(trace) register_tracer(&trace)
@@ -676,8 +689,7 @@ static int preemptoff_tracer_init(struct trace_array *tr)
{
trace_type = TRACER_PREEMPT_OFF;
- __irqsoff_tracer_init(tr);
- return 0;
+ return __irqsoff_tracer_init(tr);
}
static struct tracer preemptoff_tracer __read_mostly =
@@ -698,6 +710,7 @@ static struct tracer preemptoff_tracer __read_mostly =
#endif
.open = irqsoff_trace_open,
.close = irqsoff_trace_close,
+ .allow_instances = true,
.use_max_tr = true,
};
# define register_preemptoff(trace) register_tracer(&trace)
@@ -712,8 +725,7 @@ static int preemptirqsoff_tracer_init(struct trace_array *tr)
{
trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
- __irqsoff_tracer_init(tr);
- return 0;
+ return __irqsoff_tracer_init(tr);
}
static struct tracer preemptirqsoff_tracer __read_mostly =
@@ -734,6 +746,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
#endif
.open = irqsoff_trace_open,
.close = irqsoff_trace_close,
+ .allow_instances = true,
.use_max_tr = true,
};
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index bdbae450c13e..282f6e4e5539 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -35,37 +35,32 @@ struct trace_kprobe {
struct trace_probe tp;
};
-struct event_file_link {
- struct ftrace_event_file *file;
- struct list_head list;
-};
-
#define SIZEOF_TRACE_KPROBE(n) \
(offsetof(struct trace_kprobe, tp.args) + \
(sizeof(struct probe_arg) * (n)))
-static __kprobes bool trace_kprobe_is_return(struct trace_kprobe *tk)
+static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
{
return tk->rp.handler != NULL;
}
-static __kprobes const char *trace_kprobe_symbol(struct trace_kprobe *tk)
+static nokprobe_inline const char *trace_kprobe_symbol(struct trace_kprobe *tk)
{
return tk->symbol ? tk->symbol : "unknown";
}
-static __kprobes unsigned long trace_kprobe_offset(struct trace_kprobe *tk)
+static nokprobe_inline unsigned long trace_kprobe_offset(struct trace_kprobe *tk)
{
return tk->rp.kp.offset;
}
-static __kprobes bool trace_kprobe_has_gone(struct trace_kprobe *tk)
+static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk)
{
return !!(kprobe_gone(&tk->rp.kp));
}
-static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk,
+static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
struct module *mod)
{
int len = strlen(mod->name);
@@ -73,7 +68,7 @@ static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk,
return strncmp(mod->name, name, len) == 0 && name[len] == ':';
}
-static __kprobes bool trace_kprobe_is_on_module(struct trace_kprobe *tk)
+static nokprobe_inline bool trace_kprobe_is_on_module(struct trace_kprobe *tk)
{
return !!strchr(trace_kprobe_symbol(tk), ':');
}
@@ -137,19 +132,21 @@ struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
* Kprobes-specific fetch functions
*/
#define DEFINE_FETCH_stack(type) \
-static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
+static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \
void *offset, void *dest) \
{ \
*(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
(unsigned int)((unsigned long)offset)); \
-}
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(stack, type));
+
DEFINE_BASIC_FETCH_FUNCS(stack)
/* No string on the stack entry */
#define fetch_stack_string NULL
#define fetch_stack_string_size NULL
#define DEFINE_FETCH_memory(type) \
-static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
+static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \
void *addr, void *dest) \
{ \
type retval; \
@@ -157,14 +154,16 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
*(type *)dest = 0; \
else \
*(type *)dest = retval; \
-}
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, type));
+
DEFINE_BASIC_FETCH_FUNCS(memory)
/*
* Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
* length and relative data location.
*/
-static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
- void *addr, void *dest)
+static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
+ void *addr, void *dest)
{
long ret;
int maxlen = get_rloc_len(*(u32 *)dest);
@@ -198,10 +197,11 @@ static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
get_rloc_offs(*(u32 *)dest));
}
}
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string));
/* Return the length of string -- including null terminal byte */
-static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
- void *addr, void *dest)
+static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
+ void *addr, void *dest)
{
mm_segment_t old_fs;
int ret, len = 0;
@@ -224,17 +224,19 @@ static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
else
*(u32 *)dest = len;
}
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string_size));
#define DEFINE_FETCH_symbol(type) \
-__kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, \
- void *data, void *dest) \
+void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, void *data, void *dest)\
{ \
struct symbol_cache *sc = data; \
if (sc->addr) \
fetch_memory_##type(regs, (void *)sc->addr, dest); \
else \
*(type *)dest = 0; \
-}
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(symbol, type));
+
DEFINE_BASIC_FETCH_FUNCS(symbol)
DEFINE_FETCH_symbol(string)
DEFINE_FETCH_symbol(string_size)
@@ -346,7 +348,7 @@ static struct trace_kprobe *find_trace_kprobe(const char *event,
struct trace_kprobe *tk;
list_for_each_entry(tk, &probe_list, list)
- if (strcmp(tk->tp.call.name, event) == 0 &&
+ if (strcmp(ftrace_event_name(&tk->tp.call), event) == 0 &&
strcmp(tk->tp.call.class->system, group) == 0)
return tk;
return NULL;
@@ -387,18 +389,6 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
return ret;
}
-static struct event_file_link *
-find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
-{
- struct event_file_link *link;
-
- list_for_each_entry(link, &tp->files, list)
- if (link->file == file)
- return link;
-
- return NULL;
-}
-
/*
* Disable trace_probe
* if the file is NULL, disable "perf" handler, or disable "trace" handler.
@@ -533,7 +523,8 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
mutex_lock(&probe_lock);
/* Delete old (same name) event if exist */
- old_tk = find_trace_kprobe(tk->tp.call.name, tk->tp.call.class->system);
+ old_tk = find_trace_kprobe(ftrace_event_name(&tk->tp.call),
+ tk->tp.call.class->system);
if (old_tk) {
ret = unregister_trace_kprobe(old_tk);
if (ret < 0)
@@ -581,7 +572,8 @@ static int trace_kprobe_module_callback(struct notifier_block *nb,
if (ret)
pr_warning("Failed to re-register probe %s on"
"%s: %d\n",
- tk->tp.call.name, mod->name, ret);
+ ftrace_event_name(&tk->tp.call),
+ mod->name, ret);
}
}
mutex_unlock(&probe_lock);
@@ -835,7 +827,8 @@ static int probes_seq_show(struct seq_file *m, void *v)
int i;
seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p');
- seq_printf(m, ":%s/%s", tk->tp.call.class->system, tk->tp.call.name);
+ seq_printf(m, ":%s/%s", tk->tp.call.class->system,
+ ftrace_event_name(&tk->tp.call));
if (!tk->symbol)
seq_printf(m, " 0x%p", tk->rp.kp.addr);
@@ -893,7 +886,8 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
{
struct trace_kprobe *tk = v;
- seq_printf(m, " %-44s %15lu %15lu\n", tk->tp.call.name, tk->nhit,
+ seq_printf(m, " %-44s %15lu %15lu\n",
+ ftrace_event_name(&tk->tp.call), tk->nhit,
tk->rp.kp.nmissed);
return 0;
@@ -920,7 +914,7 @@ static const struct file_operations kprobe_profile_ops = {
};
/* Kprobe handler */
-static __kprobes void
+static nokprobe_inline void
__kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
struct ftrace_event_file *ftrace_file)
{
@@ -956,7 +950,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
entry, irq_flags, pc, regs);
}
-static __kprobes void
+static void
kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs)
{
struct event_file_link *link;
@@ -964,9 +958,10 @@ kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs)
list_for_each_entry_rcu(link, &tk->tp.files, list)
__kprobe_trace_func(tk, regs, link->file);
}
+NOKPROBE_SYMBOL(kprobe_trace_func);
/* Kretprobe handler */
-static __kprobes void
+static nokprobe_inline void
__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct pt_regs *regs,
struct ftrace_event_file *ftrace_file)
@@ -1004,7 +999,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
entry, irq_flags, pc, regs);
}
-static __kprobes void
+static void
kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct pt_regs *regs)
{
@@ -1013,6 +1008,7 @@ kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
list_for_each_entry_rcu(link, &tk->tp.files, list)
__kretprobe_trace_func(tk, ri, regs, link->file);
}
+NOKPROBE_SYMBOL(kretprobe_trace_func);
/* Event entry printers */
static enum print_line_t
@@ -1028,7 +1024,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
field = (struct kprobe_trace_entry_head *)iter->ent;
tp = container_of(event, struct trace_probe, call.event);
- if (!trace_seq_printf(s, "%s: (", tp->call.name))
+ if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)))
goto partial;
if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
@@ -1064,7 +1060,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
field = (struct kretprobe_trace_entry_head *)iter->ent;
tp = container_of(event, struct trace_probe, call.event);
- if (!trace_seq_printf(s, "%s: (", tp->call.name))
+ if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)))
goto partial;
if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
@@ -1144,7 +1140,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
#ifdef CONFIG_PERF_EVENTS
/* Kprobe profile handler */
-static __kprobes void
+static void
kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
{
struct ftrace_event_call *call = &tk->tp.call;
@@ -1171,9 +1167,10 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
}
+NOKPROBE_SYMBOL(kprobe_perf_func);
/* Kretprobe profile handler */
-static __kprobes void
+static void
kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct pt_regs *regs)
{
@@ -1201,6 +1198,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
}
+NOKPROBE_SYMBOL(kretprobe_perf_func);
#endif /* CONFIG_PERF_EVENTS */
/*
@@ -1209,9 +1207,8 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
* kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe
* lockless, but we can't race with this __init function.
*/
-static __kprobes
-int kprobe_register(struct ftrace_event_call *event,
- enum trace_reg type, void *data)
+static int kprobe_register(struct ftrace_event_call *event,
+ enum trace_reg type, void *data)
{
struct trace_kprobe *tk = (struct trace_kprobe *)event->data;
struct ftrace_event_file *file = data;
@@ -1237,8 +1234,7 @@ int kprobe_register(struct ftrace_event_call *event,
return 0;
}
-static __kprobes
-int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
+static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
{
struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
@@ -1252,9 +1248,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
#endif
return 0; /* We don't tweek kernel, so just return 0 */
}
+NOKPROBE_SYMBOL(kprobe_dispatcher);
-static __kprobes
-int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
+static int
+kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
{
struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp);
@@ -1268,6 +1265,7 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
#endif
return 0; /* We don't tweek kernel, so just return 0 */
}
+NOKPROBE_SYMBOL(kretprobe_dispatcher);
static struct trace_event_functions kretprobe_funcs = {
.trace = print_kretprobe_event
@@ -1303,7 +1301,8 @@ static int register_kprobe_event(struct trace_kprobe *tk)
call->data = tk;
ret = trace_add_event_call(call);
if (ret) {
- pr_info("Failed to register kprobe event: %s\n", call->name);
+ pr_info("Failed to register kprobe event: %s\n",
+ ftrace_event_name(call));
kfree(call->print_fmt);
unregister_ftrace_event(&call->event);
}
@@ -1389,6 +1388,9 @@ static __init int kprobe_trace_self_tests_init(void)
struct trace_kprobe *tk;
struct ftrace_event_file *file;
+ if (tracing_is_disabled())
+ return -ENODEV;
+
target = kprobe_trace_selftest_target;
pr_info("Testing kprobe tracing: ");
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index 394f94417e2f..fcf0a9e48916 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -62,7 +62,7 @@ static void nop_trace_reset(struct trace_array *tr)
* If you don't implement it, then the flag setting will be
* automatically accepted.
*/
-static int nop_set_flag(u32 old_flags, u32 bit, int set)
+static int nop_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
/*
* Note that you don't need to update nop_flags.val yourself.
@@ -91,11 +91,11 @@ struct tracer nop_trace __read_mostly =
.name = "nop",
.init = nop_trace_init,
.reset = nop_trace_reset,
- .wait_pipe = poll_wait_pipe,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_nop,
#endif
.flags = &nop_flags,
- .set_flag = nop_set_flag
+ .set_flag = nop_set_flag,
+ .allow_instances = true,
};
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index ed32284fbe32..c6977d5a9b12 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -20,23 +20,6 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
static int next_event_type = __TRACE_LAST_TYPE + 1;
-int trace_print_seq(struct seq_file *m, struct trace_seq *s)
-{
- int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
- int ret;
-
- ret = seq_write(m, s->buffer, len);
-
- /*
- * Only reset this buffer if we successfully wrote to the
- * seq_file buffer.
- */
- if (!ret)
- trace_seq_init(s);
-
- return ret;
-}
-
enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
@@ -85,229 +68,6 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
return TRACE_TYPE_HANDLED;
}
-/**
- * trace_seq_printf - sequence printing of trace information
- * @s: trace sequence descriptor
- * @fmt: printf format string
- *
- * It returns 0 if the trace oversizes the buffer's free
- * space, 1 otherwise.
- *
- * The tracer may use either sequence operations or its own
- * copy to user routines. To simplify formating of a trace
- * trace_seq_printf is used to store strings into a special
- * buffer (@s). Then the output may be either used by
- * the sequencer or pulled into another buffer.
- */
-int
-trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
-{
- int len = (PAGE_SIZE - 1) - s->len;
- va_list ap;
- int ret;
-
- if (s->full || !len)
- return 0;
-
- va_start(ap, fmt);
- ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
- va_end(ap);
-
- /* If we can't write it all, don't bother writing anything */
- if (ret >= len) {
- s->full = 1;
- return 0;
- }
-
- s->len += ret;
-
- return 1;
-}
-EXPORT_SYMBOL_GPL(trace_seq_printf);
-
-/**
- * trace_seq_vprintf - sequence printing of trace information
- * @s: trace sequence descriptor
- * @fmt: printf format string
- *
- * The tracer may use either sequence operations or its own
- * copy to user routines. To simplify formating of a trace
- * trace_seq_printf is used to store strings into a special
- * buffer (@s). Then the output may be either used by
- * the sequencer or pulled into another buffer.
- */
-int
-trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
-{
- int len = (PAGE_SIZE - 1) - s->len;
- int ret;
-
- if (s->full || !len)
- return 0;
-
- ret = vsnprintf(s->buffer + s->len, len, fmt, args);
-
- /* If we can't write it all, don't bother writing anything */
- if (ret >= len) {
- s->full = 1;
- return 0;
- }
-
- s->len += ret;
-
- return len;
-}
-EXPORT_SYMBOL_GPL(trace_seq_vprintf);
-
-int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
-{
- int len = (PAGE_SIZE - 1) - s->len;
- int ret;
-
- if (s->full || !len)
- return 0;
-
- ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
-
- /* If we can't write it all, don't bother writing anything */
- if (ret >= len) {
- s->full = 1;
- return 0;
- }
-
- s->len += ret;
-
- return len;
-}
-
-/**
- * trace_seq_puts - trace sequence printing of simple string
- * @s: trace sequence descriptor
- * @str: simple string to record
- *
- * The tracer may use either the sequence operations or its own
- * copy to user routines. This function records a simple string
- * into a special buffer (@s) for later retrieval by a sequencer
- * or other mechanism.
- */
-int trace_seq_puts(struct trace_seq *s, const char *str)
-{
- int len = strlen(str);
-
- if (s->full)
- return 0;
-
- if (len > ((PAGE_SIZE - 1) - s->len)) {
- s->full = 1;
- return 0;
- }
-
- memcpy(s->buffer + s->len, str, len);
- s->len += len;
-
- return len;
-}
-
-int trace_seq_putc(struct trace_seq *s, unsigned char c)
-{
- if (s->full)
- return 0;
-
- if (s->len >= (PAGE_SIZE - 1)) {
- s->full = 1;
- return 0;
- }
-
- s->buffer[s->len++] = c;
-
- return 1;
-}
-EXPORT_SYMBOL(trace_seq_putc);
-
-int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
-{
- if (s->full)
- return 0;
-
- if (len > ((PAGE_SIZE - 1) - s->len)) {
- s->full = 1;
- return 0;
- }
-
- memcpy(s->buffer + s->len, mem, len);
- s->len += len;
-
- return len;
-}
-
-int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len)
-{
- unsigned char hex[HEX_CHARS];
- const unsigned char *data = mem;
- int i, j;
-
- if (s->full)
- return 0;
-
-#ifdef __BIG_ENDIAN
- for (i = 0, j = 0; i < len; i++) {
-#else
- for (i = len-1, j = 0; i >= 0; i--) {
-#endif
- hex[j++] = hex_asc_hi(data[i]);
- hex[j++] = hex_asc_lo(data[i]);
- }
- hex[j++] = ' ';
-
- return trace_seq_putmem(s, hex, j);
-}
-
-void *trace_seq_reserve(struct trace_seq *s, size_t len)
-{
- void *ret;
-
- if (s->full)
- return NULL;
-
- if (len > ((PAGE_SIZE - 1) - s->len)) {
- s->full = 1;
- return NULL;
- }
-
- ret = s->buffer + s->len;
- s->len += len;
-
- return ret;
-}
-
-int trace_seq_path(struct trace_seq *s, const struct path *path)
-{
- unsigned char *p;
-
- if (s->full)
- return 0;
-
- if (s->len >= (PAGE_SIZE - 1)) {
- s->full = 1;
- return 0;
- }
-
- p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
- if (!IS_ERR(p)) {
- p = mangle_path(s->buffer + s->len, p, "\n");
- if (p) {
- s->len = p - s->buffer;
- return 1;
- }
- } else {
- s->buffer[s->len++] = '?';
- return 1;
- }
-
- s->full = 1;
- return 0;
-}
-
const char *
ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
unsigned long flags,
@@ -315,7 +75,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
{
unsigned long mask;
const char *str;
- const char *ret = p->buffer + p->len;
+ const char *ret = trace_seq_buffer_ptr(p);
int i, first = 1;
for (i = 0; flag_array[i].name && flags; i++) {
@@ -351,7 +111,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
const struct trace_print_flags *symbol_array)
{
int i;
- const char *ret = p->buffer + p->len;
+ const char *ret = trace_seq_buffer_ptr(p);
for (i = 0; symbol_array[i].name; i++) {
@@ -362,7 +122,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
break;
}
- if (ret == (const char *)(p->buffer + p->len))
+ if (ret == (const char *)(trace_seq_buffer_ptr(p)))
trace_seq_printf(p, "0x%lx", val);
trace_seq_putc(p, 0);
@@ -377,7 +137,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
const struct trace_print_flags_u64 *symbol_array)
{
int i;
- const char *ret = p->buffer + p->len;
+ const char *ret = trace_seq_buffer_ptr(p);
for (i = 0; symbol_array[i].name; i++) {
@@ -388,7 +148,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
break;
}
- if (ret == (const char *)(p->buffer + p->len))
+ if (ret == (const char *)(trace_seq_buffer_ptr(p)))
trace_seq_printf(p, "0x%llx", val);
trace_seq_putc(p, 0);
@@ -399,10 +159,23 @@ EXPORT_SYMBOL(ftrace_print_symbols_seq_u64);
#endif
const char *
+ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
+ unsigned int bitmask_size)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+
+ trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ftrace_print_bitmask_seq);
+
+const char *
ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
{
int i;
- const char *ret = p->buffer + p->len;
+ const char *ret = trace_seq_buffer_ptr(p);
for (i = 0; i < buf_len; i++)
trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]);
@@ -431,7 +204,7 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
}
trace_seq_init(p);
- ret = trace_seq_printf(s, "%s: ", event->name);
+ ret = trace_seq_printf(s, "%s: ", ftrace_event_name(event));
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -439,6 +212,37 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
}
EXPORT_SYMBOL(ftrace_raw_output_prep);
+static int ftrace_output_raw(struct trace_iterator *iter, char *name,
+ char *fmt, va_list ap)
+{
+ struct trace_seq *s = &iter->seq;
+ int ret;
+
+ ret = trace_seq_printf(s, "%s: ", name);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ ret = trace_seq_vprintf(s, fmt, ap);
+
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt);
+ ret = ftrace_output_raw(iter, name, fmt, ap);
+ va_end(ap);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ftrace_output_call);
+
#ifdef CONFIG_KRETPROBES
static inline const char *kretprobed(const char *name)
{
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 127a9d8c8357..80b25b585a70 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -35,9 +35,6 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
extern int __unregister_ftrace_event(struct trace_event *event);
extern struct rw_semaphore trace_event_sem;
-#define MAX_MEMHEX_BYTES 8
-#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
-
#define SEQ_PUT_FIELD_RET(s, x) \
do { \
if (!trace_seq_putmem(s, &(x), sizeof(x))) \
@@ -46,7 +43,6 @@ do { \
#define SEQ_PUT_HEX_FIELD_RET(s, x) \
do { \
- BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \
if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
return TRACE_TYPE_PARTIAL_LINE; \
} while (0)
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 8364a421b4df..d4b9fc22cd27 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -37,13 +37,13 @@ const char *reserved_field_names[] = {
/* Printing in basic type function template */
#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \
-__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
- const char *name, \
- void *data, void *ent) \
+int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
+ void *data, void *ent) \
{ \
return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
} \
-const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
+const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \
+NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type));
DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x")
DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x")
@@ -55,9 +55,8 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d")
DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld")
/* Print type function for string type */
-__kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
- const char *name,
- void *data, void *ent)
+int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name,
+ void *data, void *ent)
{
int len = *(u32 *)data >> 16;
@@ -67,6 +66,7 @@ __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
return trace_seq_printf(s, " %s=\"%s\"", name,
(const char *)get_loc_data(data, ent));
}
+NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string));
const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
@@ -81,23 +81,24 @@ const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
/* Data fetch function templates */
#define DEFINE_FETCH_reg(type) \
-__kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
- void *offset, void *dest) \
+void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, void *offset, void *dest) \
{ \
*(type *)dest = (type)regs_get_register(regs, \
(unsigned int)((unsigned long)offset)); \
-}
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(reg, type));
DEFINE_BASIC_FETCH_FUNCS(reg)
/* No string on the register */
#define fetch_reg_string NULL
#define fetch_reg_string_size NULL
#define DEFINE_FETCH_retval(type) \
-__kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \
- void *dummy, void *dest) \
+void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \
+ void *dummy, void *dest) \
{ \
*(type *)dest = (type)regs_return_value(regs); \
-}
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(retval, type));
DEFINE_BASIC_FETCH_FUNCS(retval)
/* No string on the retval */
#define fetch_retval_string NULL
@@ -112,8 +113,8 @@ struct deref_fetch_param {
};
#define DEFINE_FETCH_deref(type) \
-__kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \
- void *data, void *dest) \
+void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \
+ void *data, void *dest) \
{ \
struct deref_fetch_param *dprm = data; \
unsigned long addr; \
@@ -123,12 +124,13 @@ __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \
dprm->fetch(regs, (void *)addr, dest); \
} else \
*(type *)dest = 0; \
-}
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, type));
DEFINE_BASIC_FETCH_FUNCS(deref)
DEFINE_FETCH_deref(string)
-__kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs,
- void *data, void *dest)
+void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs,
+ void *data, void *dest)
{
struct deref_fetch_param *dprm = data;
unsigned long addr;
@@ -140,16 +142,18 @@ __kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs,
} else
*(string_size *)dest = 0;
}
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, string_size));
-static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data)
+static void update_deref_fetch_param(struct deref_fetch_param *data)
{
if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
update_deref_fetch_param(data->orig.data);
else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
update_symbol_cache(data->orig.data);
}
+NOKPROBE_SYMBOL(update_deref_fetch_param);
-static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
+static void free_deref_fetch_param(struct deref_fetch_param *data)
{
if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
free_deref_fetch_param(data->orig.data);
@@ -157,6 +161,7 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
free_symbol_cache(data->orig.data);
kfree(data);
}
+NOKPROBE_SYMBOL(free_deref_fetch_param);
/* Bitfield fetch function */
struct bitfield_fetch_param {
@@ -166,8 +171,8 @@ struct bitfield_fetch_param {
};
#define DEFINE_FETCH_bitfield(type) \
-__kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \
- void *data, void *dest) \
+void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \
+ void *data, void *dest) \
{ \
struct bitfield_fetch_param *bprm = data; \
type buf = 0; \
@@ -177,13 +182,13 @@ __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \
buf >>= bprm->low_shift; \
} \
*(type *)dest = buf; \
-}
-
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(bitfield, type));
DEFINE_BASIC_FETCH_FUNCS(bitfield)
#define fetch_bitfield_string NULL
#define fetch_bitfield_string_size NULL
-static __kprobes void
+static void
update_bitfield_fetch_param(struct bitfield_fetch_param *data)
{
/*
@@ -196,7 +201,7 @@ update_bitfield_fetch_param(struct bitfield_fetch_param *data)
update_symbol_cache(data->orig.data);
}
-static __kprobes void
+static void
free_bitfield_fetch_param(struct bitfield_fetch_param *data)
{
/*
@@ -255,17 +260,17 @@ fail:
}
/* Special function : only accept unsigned long */
-static __kprobes void fetch_kernel_stack_address(struct pt_regs *regs,
- void *dummy, void *dest)
+static void fetch_kernel_stack_address(struct pt_regs *regs, void *dummy, void *dest)
{
*(unsigned long *)dest = kernel_stack_pointer(regs);
}
+NOKPROBE_SYMBOL(fetch_kernel_stack_address);
-static __kprobes void fetch_user_stack_address(struct pt_regs *regs,
- void *dummy, void *dest)
+static void fetch_user_stack_address(struct pt_regs *regs, void *dummy, void *dest)
{
*(unsigned long *)dest = user_stack_pointer(regs);
}
+NOKPROBE_SYMBOL(fetch_user_stack_address);
static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
fetch_func_t orig_fn,
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index b73574a5f429..4f815fbce16d 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -81,13 +81,13 @@
*/
#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
-static inline void *get_rloc_data(u32 *dl)
+static nokprobe_inline void *get_rloc_data(u32 *dl)
{
return (u8 *)dl + get_rloc_offs(*dl);
}
/* For data_loc conversion */
-static inline void *get_loc_data(u32 *dl, void *ent)
+static nokprobe_inline void *get_loc_data(u32 *dl, void *ent)
{
return (u8 *)ent + get_rloc_offs(*dl);
}
@@ -136,9 +136,8 @@ typedef u32 string_size;
/* Printing in basic type function template */
#define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \
-__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
- const char *name, \
- void *data, void *ent); \
+int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
+ void *data, void *ent); \
extern const char PRINT_TYPE_FMT_NAME(type)[]
DECLARE_BASIC_PRINT_TYPE_FUNC(u8);
@@ -288,6 +287,11 @@ struct trace_probe {
struct probe_arg args[];
};
+struct event_file_link {
+ struct ftrace_event_file *file;
+ struct list_head list;
+};
+
static inline bool trace_probe_is_enabled(struct trace_probe *tp)
{
return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE));
@@ -298,7 +302,7 @@ static inline bool trace_probe_is_registered(struct trace_probe *tp)
return !!(tp->flags & TP_FLAG_REGISTERED);
}
-static inline __kprobes void call_fetch(struct fetch_param *fprm,
+static nokprobe_inline void call_fetch(struct fetch_param *fprm,
struct pt_regs *regs, void *dest)
{
return fprm->fn(regs, fprm->data, dest);
@@ -316,6 +320,18 @@ static inline int is_good_name(const char *name)
return 1;
}
+static inline struct event_file_link *
+find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
+{
+ struct event_file_link *link;
+
+ list_for_each_entry(link, &tp->files, list)
+ if (link->file == file)
+ return link;
+
+ return NULL;
+}
+
extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
struct probe_arg *parg, bool is_return, bool is_kprobe);
@@ -334,7 +350,7 @@ extern ssize_t traceprobe_probes_write(struct file *file,
extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
/* Sum up total data length for dynamic arraies (strings) */
-static inline __kprobes int
+static nokprobe_inline int
__get_data_size(struct trace_probe *tp, struct pt_regs *regs)
{
int i, ret = 0;
@@ -350,7 +366,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
}
/* Store the value of each argument */
-static inline __kprobes void
+static nokprobe_inline void
store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs,
u8 *data, int maxlen)
{
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 6e32635e5e57..19bd8928ce94 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -130,15 +130,9 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
atomic_dec(&data->disabled);
preempt_enable_notrace();
}
-
-static struct ftrace_ops trace_ops __read_mostly =
-{
- .func = wakeup_tracer_call,
- .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
-};
#endif /* CONFIG_FUNCTION_TRACER */
-static int register_wakeup_function(int graph, int set)
+static int register_wakeup_function(struct trace_array *tr, int graph, int set)
{
int ret;
@@ -150,7 +144,7 @@ static int register_wakeup_function(int graph, int set)
ret = register_ftrace_graph(&wakeup_graph_return,
&wakeup_graph_entry);
else
- ret = register_ftrace_function(&trace_ops);
+ ret = register_ftrace_function(tr->ops);
if (!ret)
function_enabled = true;
@@ -158,7 +152,7 @@ static int register_wakeup_function(int graph, int set)
return ret;
}
-static void unregister_wakeup_function(int graph)
+static void unregister_wakeup_function(struct trace_array *tr, int graph)
{
if (!function_enabled)
return;
@@ -166,32 +160,34 @@ static void unregister_wakeup_function(int graph)
if (graph)
unregister_ftrace_graph();
else
- unregister_ftrace_function(&trace_ops);
+ unregister_ftrace_function(tr->ops);
function_enabled = false;
}
-static void wakeup_function_set(int set)
+static void wakeup_function_set(struct trace_array *tr, int set)
{
if (set)
- register_wakeup_function(is_graph(), 1);
+ register_wakeup_function(tr, is_graph(), 1);
else
- unregister_wakeup_function(is_graph());
+ unregister_wakeup_function(tr, is_graph());
}
-static int wakeup_flag_changed(struct tracer *tracer, u32 mask, int set)
+static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
{
+ struct tracer *tracer = tr->current_trace;
+
if (mask & TRACE_ITER_FUNCTION)
- wakeup_function_set(set);
+ wakeup_function_set(tr, set);
return trace_keep_overwrite(tracer, mask, set);
}
-static int start_func_tracer(int graph)
+static int start_func_tracer(struct trace_array *tr, int graph)
{
int ret;
- ret = register_wakeup_function(graph, 0);
+ ret = register_wakeup_function(tr, graph, 0);
if (!ret && tracing_is_enabled())
tracer_enabled = 1;
@@ -201,15 +197,16 @@ static int start_func_tracer(int graph)
return ret;
}
-static void stop_func_tracer(int graph)
+static void stop_func_tracer(struct trace_array *tr, int graph)
{
tracer_enabled = 0;
- unregister_wakeup_function(graph);
+ unregister_wakeup_function(tr, graph);
}
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
+static int
+wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
if (!(bit & TRACE_DISPLAY_GRAPH))
@@ -218,12 +215,12 @@ static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
if (!(is_graph() ^ set))
return 0;
- stop_func_tracer(!set);
+ stop_func_tracer(tr, !set);
wakeup_reset(wakeup_trace);
- tracing_max_latency = 0;
+ tr->max_latency = 0;
- return start_func_tracer(set);
+ return start_func_tracer(tr, set);
}
static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
@@ -311,7 +308,8 @@ __trace_function(struct trace_array *tr,
#else
#define __trace_function trace_function
-static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
+static int
+wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
return -EINVAL;
}
@@ -346,13 +344,13 @@ static void wakeup_print_header(struct seq_file *s)
/*
* Should this new latency be reported/recorded?
*/
-static int report_latency(cycle_t delta)
+static int report_latency(struct trace_array *tr, cycle_t delta)
{
if (tracing_thresh) {
if (delta < tracing_thresh)
return 0;
} else {
- if (delta <= tracing_max_latency)
+ if (delta <= tr->max_latency)
return 0;
}
return 1;
@@ -420,11 +418,11 @@ probe_wakeup_sched_switch(void *ignore,
T1 = ftrace_now(cpu);
delta = T1-T0;
- if (!report_latency(delta))
+ if (!report_latency(wakeup_trace, delta))
goto out_unlock;
if (likely(!is_tracing_stopped())) {
- tracing_max_latency = delta;
+ wakeup_trace->max_latency = delta;
update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
}
@@ -583,7 +581,7 @@ static void start_wakeup_tracer(struct trace_array *tr)
*/
smp_wmb();
- if (start_func_tracer(is_graph()))
+ if (start_func_tracer(tr, is_graph()))
printk(KERN_ERR "failed to start wakeup tracer\n");
return;
@@ -596,13 +594,15 @@ fail_deprobe:
static void stop_wakeup_tracer(struct trace_array *tr)
{
tracer_enabled = 0;
- stop_func_tracer(is_graph());
+ stop_func_tracer(tr, is_graph());
unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
unregister_trace_sched_wakeup(probe_wakeup, NULL);
unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
}
+static bool wakeup_busy;
+
static int __wakeup_tracer_init(struct trace_array *tr)
{
save_flags = trace_flags;
@@ -611,14 +611,20 @@ static int __wakeup_tracer_init(struct trace_array *tr)
set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
- tracing_max_latency = 0;
+ tr->max_latency = 0;
wakeup_trace = tr;
+ ftrace_init_array_ops(tr, wakeup_tracer_call);
start_wakeup_tracer(tr);
+
+ wakeup_busy = true;
return 0;
}
static int wakeup_tracer_init(struct trace_array *tr)
{
+ if (wakeup_busy)
+ return -EBUSY;
+
wakeup_dl = 0;
wakeup_rt = 0;
return __wakeup_tracer_init(tr);
@@ -626,6 +632,9 @@ static int wakeup_tracer_init(struct trace_array *tr)
static int wakeup_rt_tracer_init(struct trace_array *tr)
{
+ if (wakeup_busy)
+ return -EBUSY;
+
wakeup_dl = 0;
wakeup_rt = 1;
return __wakeup_tracer_init(tr);
@@ -633,6 +642,9 @@ static int wakeup_rt_tracer_init(struct trace_array *tr)
static int wakeup_dl_tracer_init(struct trace_array *tr)
{
+ if (wakeup_busy)
+ return -EBUSY;
+
wakeup_dl = 1;
wakeup_rt = 0;
return __wakeup_tracer_init(tr);
@@ -649,6 +661,8 @@ static void wakeup_tracer_reset(struct trace_array *tr)
set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
+ ftrace_reset_array_ops(tr);
+ wakeup_busy = false;
}
static void wakeup_tracer_start(struct trace_array *tr)
@@ -680,6 +694,7 @@ static struct tracer wakeup_tracer __read_mostly =
#endif
.open = wakeup_trace_open,
.close = wakeup_trace_close,
+ .allow_instances = true,
.use_max_tr = true,
};
@@ -690,7 +705,6 @@ static struct tracer wakeup_rt_tracer __read_mostly =
.reset = wakeup_tracer_reset,
.start = wakeup_tracer_start,
.stop = wakeup_tracer_stop,
- .wait_pipe = poll_wait_pipe,
.print_max = true,
.print_header = wakeup_print_header,
.print_line = wakeup_print_line,
@@ -702,6 +716,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
#endif
.open = wakeup_trace_open,
.close = wakeup_trace_close,
+ .allow_instances = true,
.use_max_tr = true,
};
@@ -712,7 +727,6 @@ static struct tracer wakeup_dl_tracer __read_mostly =
.reset = wakeup_tracer_reset,
.start = wakeup_tracer_start,
.stop = wakeup_tracer_stop,
- .wait_pipe = poll_wait_pipe,
.print_max = true,
.print_header = wakeup_print_header,
.print_line = wakeup_print_line,
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index e98fca60974f..5ef60499dc8e 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -65,7 +65,7 @@ static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
/* Don't allow flipping of max traces now */
local_irq_save(flags);
- arch_spin_lock(&ftrace_max_lock);
+ arch_spin_lock(&buf->tr->max_lock);
cnt = ring_buffer_entries(buf->buffer);
@@ -83,7 +83,7 @@ static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
break;
}
tracing_on();
- arch_spin_unlock(&ftrace_max_lock);
+ arch_spin_unlock(&buf->tr->max_lock);
local_irq_restore(flags);
if (count)
@@ -161,11 +161,6 @@ static struct ftrace_ops test_probe3 = {
.flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
-static struct ftrace_ops test_global = {
- .func = trace_selftest_test_global_func,
- .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
-};
-
static void print_counts(void)
{
printk("(%d %d %d %d %d) ",
@@ -185,7 +180,7 @@ static void reset_counts(void)
trace_selftest_test_dyn_cnt = 0;
}
-static int trace_selftest_ops(int cnt)
+static int trace_selftest_ops(struct trace_array *tr, int cnt)
{
int save_ftrace_enabled = ftrace_enabled;
struct ftrace_ops *dyn_ops;
@@ -220,7 +215,11 @@ static int trace_selftest_ops(int cnt)
register_ftrace_function(&test_probe1);
register_ftrace_function(&test_probe2);
register_ftrace_function(&test_probe3);
- register_ftrace_function(&test_global);
+ /* First time we are running with main function */
+ if (cnt > 1) {
+ ftrace_init_array_ops(tr, trace_selftest_test_global_func);
+ register_ftrace_function(tr->ops);
+ }
DYN_FTRACE_TEST_NAME();
@@ -232,8 +231,10 @@ static int trace_selftest_ops(int cnt)
goto out;
if (trace_selftest_test_probe3_cnt != 1)
goto out;
- if (trace_selftest_test_global_cnt == 0)
- goto out;
+ if (cnt > 1) {
+ if (trace_selftest_test_global_cnt == 0)
+ goto out;
+ }
DYN_FTRACE_TEST_NAME2();
@@ -269,8 +270,10 @@ static int trace_selftest_ops(int cnt)
goto out_free;
if (trace_selftest_test_probe3_cnt != 3)
goto out_free;
- if (trace_selftest_test_global_cnt == 0)
- goto out;
+ if (cnt > 1) {
+ if (trace_selftest_test_global_cnt == 0)
+ goto out;
+ }
if (trace_selftest_test_dyn_cnt == 0)
goto out_free;
@@ -295,7 +298,9 @@ static int trace_selftest_ops(int cnt)
unregister_ftrace_function(&test_probe1);
unregister_ftrace_function(&test_probe2);
unregister_ftrace_function(&test_probe3);
- unregister_ftrace_function(&test_global);
+ if (cnt > 1)
+ unregister_ftrace_function(tr->ops);
+ ftrace_reset_array_ops(tr);
/* Make sure everything is off */
reset_counts();
@@ -315,9 +320,9 @@ static int trace_selftest_ops(int cnt)
}
/* Test dynamic code modification and ftrace filters */
-int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
- struct trace_array *tr,
- int (*func)(void))
+static int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
+ struct trace_array *tr,
+ int (*func)(void))
{
int save_ftrace_enabled = ftrace_enabled;
unsigned long count;
@@ -388,7 +393,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
}
/* Test the ops with global tracing running */
- ret = trace_selftest_ops(1);
+ ret = trace_selftest_ops(tr, 1);
trace->reset(tr);
out:
@@ -399,7 +404,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
/* Test the ops with global tracing off */
if (!ret)
- ret = trace_selftest_ops(2);
+ ret = trace_selftest_ops(tr, 2);
return ret;
}
@@ -802,7 +807,7 @@ out:
int
trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
{
- unsigned long save_max = tracing_max_latency;
+ unsigned long save_max = tr->max_latency;
unsigned long count;
int ret;
@@ -814,7 +819,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
}
/* reset the max latency */
- tracing_max_latency = 0;
+ tr->max_latency = 0;
/* disable interrupts for a bit */
local_irq_disable();
udelay(100);
@@ -841,7 +846,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
ret = -1;
}
- tracing_max_latency = save_max;
+ tr->max_latency = save_max;
return ret;
}
@@ -851,7 +856,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
int
trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
{
- unsigned long save_max = tracing_max_latency;
+ unsigned long save_max = tr->max_latency;
unsigned long count;
int ret;
@@ -876,7 +881,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
}
/* reset the max latency */
- tracing_max_latency = 0;
+ tr->max_latency = 0;
/* disable preemption for a bit */
preempt_disable();
udelay(100);
@@ -903,7 +908,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
ret = -1;
}
- tracing_max_latency = save_max;
+ tr->max_latency = save_max;
return ret;
}
@@ -913,7 +918,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
int
trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr)
{
- unsigned long save_max = tracing_max_latency;
+ unsigned long save_max = tr->max_latency;
unsigned long count;
int ret;
@@ -938,7 +943,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
}
/* reset the max latency */
- tracing_max_latency = 0;
+ tr->max_latency = 0;
/* disable preemption and interrupts for a bit */
preempt_disable();
@@ -973,7 +978,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
}
/* do the test by disabling interrupts first this time */
- tracing_max_latency = 0;
+ tr->max_latency = 0;
tracing_start();
trace->start(tr);
@@ -1004,7 +1009,7 @@ out:
tracing_start();
out_no_start:
trace->reset(tr);
- tracing_max_latency = save_max;
+ tr->max_latency = save_max;
return ret;
}
@@ -1057,7 +1062,7 @@ static int trace_wakeup_test_thread(void *data)
int
trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
{
- unsigned long save_max = tracing_max_latency;
+ unsigned long save_max = tr->max_latency;
struct task_struct *p;
struct completion is_ready;
unsigned long count;
@@ -1083,7 +1088,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
}
/* reset the max latency */
- tracing_max_latency = 0;
+ tr->max_latency = 0;
while (p->on_rq) {
/*
@@ -1113,7 +1118,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
trace->reset(tr);
tracing_start();
- tracing_max_latency = save_max;
+ tr->max_latency = save_max;
/* kill the thread */
kthread_stop(p);
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
new file mode 100644
index 000000000000..1f24ed99dca2
--- /dev/null
+++ b/kernel/trace/trace_seq.c
@@ -0,0 +1,428 @@
+/*
+ * trace_seq.c
+ *
+ * Copyright (C) 2008-2014 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ * The trace_seq is a handy tool that allows you to pass a descriptor around
+ * to a buffer that other functions can write to. It is similar to the
+ * seq_file functionality but has some differences.
+ *
+ * To use it, the trace_seq must be initialized with trace_seq_init().
+ * This will set up the counters within the descriptor. You can call
+ * trace_seq_init() more than once to reset the trace_seq to start
+ * from scratch.
+ *
+ * The buffer size is currently PAGE_SIZE, although it may become dynamic
+ * in the future.
+ *
+ * A write to the buffer will either succed or fail. That is, unlike
+ * sprintf() there will not be a partial write (well it may write into
+ * the buffer but it wont update the pointers). This allows users to
+ * try to write something into the trace_seq buffer and if it fails
+ * they can flush it and try again.
+ *
+ */
+#include <linux/uaccess.h>
+#include <linux/seq_file.h>
+#include <linux/trace_seq.h>
+
+/* How much buffer is left on the trace_seq? */
+#define TRACE_SEQ_BUF_LEFT(s) ((PAGE_SIZE - 1) - (s)->len)
+
+/* How much buffer is written? */
+#define TRACE_SEQ_BUF_USED(s) min((s)->len, (unsigned int)(PAGE_SIZE - 1))
+
+/**
+ * trace_print_seq - move the contents of trace_seq into a seq_file
+ * @m: the seq_file descriptor that is the destination
+ * @s: the trace_seq descriptor that is the source.
+ *
+ * Returns 0 on success and non zero on error. If it succeeds to
+ * write to the seq_file it will reset the trace_seq, otherwise
+ * it does not modify the trace_seq to let the caller try again.
+ */
+int trace_print_seq(struct seq_file *m, struct trace_seq *s)
+{
+ unsigned int len = TRACE_SEQ_BUF_USED(s);
+ int ret;
+
+ ret = seq_write(m, s->buffer, len);
+
+ /*
+ * Only reset this buffer if we successfully wrote to the
+ * seq_file buffer. This lets the caller try again or
+ * do something else with the contents.
+ */
+ if (!ret)
+ trace_seq_init(s);
+
+ return ret;
+}
+
+/**
+ * trace_seq_printf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formating of a trace
+ * trace_seq_printf() is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ *
+ * Returns 1 if we successfully written all the contents to
+ * the buffer.
+ * Returns 0 if we the length to write is bigger than the
+ * reserved buffer space. In this case, nothing gets written.
+ */
+int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+{
+ unsigned int len = TRACE_SEQ_BUF_LEFT(s);
+ va_list ap;
+ int ret;
+
+ if (s->full || !len)
+ return 0;
+
+ va_start(ap, fmt);
+ ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
+ va_end(ap);
+
+ /* If we can't write it all, don't bother writing anything */
+ if (ret >= len) {
+ s->full = 1;
+ return 0;
+ }
+
+ s->len += ret;
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(trace_seq_printf);
+
+/**
+ * trace_seq_bitmask - write a bitmask array in its ASCII representation
+ * @s: trace sequence descriptor
+ * @maskp: points to an array of unsigned longs that represent a bitmask
+ * @nmaskbits: The number of bits that are valid in @maskp
+ *
+ * Writes a ASCII representation of a bitmask string into @s.
+ *
+ * Returns 1 if we successfully written all the contents to
+ * the buffer.
+ * Returns 0 if we the length to write is bigger than the
+ * reserved buffer space. In this case, nothing gets written.
+ */
+int trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
+ int nmaskbits)
+{
+ unsigned int len = TRACE_SEQ_BUF_LEFT(s);
+ int ret;
+
+ if (s->full || !len)
+ return 0;
+
+ ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits);
+ s->len += ret;
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(trace_seq_bitmask);
+
+/**
+ * trace_seq_vprintf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formating of a trace
+ * trace_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ *
+ * Returns how much it wrote to the buffer.
+ */
+int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
+{
+ unsigned int len = TRACE_SEQ_BUF_LEFT(s);
+ int ret;
+
+ if (s->full || !len)
+ return 0;
+
+ ret = vsnprintf(s->buffer + s->len, len, fmt, args);
+
+ /* If we can't write it all, don't bother writing anything */
+ if (ret >= len) {
+ s->full = 1;
+ return 0;
+ }
+
+ s->len += ret;
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(trace_seq_vprintf);
+
+/**
+ * trace_seq_bprintf - Write the printf string from binary arguments
+ * @s: trace sequence descriptor
+ * @fmt: The format string for the @binary arguments
+ * @binary: The binary arguments for @fmt.
+ *
+ * When recording in a fast path, a printf may be recorded with just
+ * saving the format and the arguments as they were passed to the
+ * function, instead of wasting cycles converting the arguments into
+ * ASCII characters. Instead, the arguments are saved in a 32 bit
+ * word array that is defined by the format string constraints.
+ *
+ * This function will take the format and the binary array and finish
+ * the conversion into the ASCII string within the buffer.
+ *
+ * Returns how much it wrote to the buffer.
+ */
+int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
+{
+ unsigned int len = TRACE_SEQ_BUF_LEFT(s);
+ int ret;
+
+ if (s->full || !len)
+ return 0;
+
+ ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
+
+ /* If we can't write it all, don't bother writing anything */
+ if (ret >= len) {
+ s->full = 1;
+ return 0;
+ }
+
+ s->len += ret;
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(trace_seq_bprintf);
+
+/**
+ * trace_seq_puts - trace sequence printing of simple string
+ * @s: trace sequence descriptor
+ * @str: simple string to record
+ *
+ * The tracer may use either the sequence operations or its own
+ * copy to user routines. This function records a simple string
+ * into a special buffer (@s) for later retrieval by a sequencer
+ * or other mechanism.
+ *
+ * Returns how much it wrote to the buffer.
+ */
+int trace_seq_puts(struct trace_seq *s, const char *str)
+{
+ unsigned int len = strlen(str);
+
+ if (s->full)
+ return 0;
+
+ if (len > TRACE_SEQ_BUF_LEFT(s)) {
+ s->full = 1;
+ return 0;
+ }
+
+ memcpy(s->buffer + s->len, str, len);
+ s->len += len;
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(trace_seq_puts);
+
+/**
+ * trace_seq_putc - trace sequence printing of simple character
+ * @s: trace sequence descriptor
+ * @c: simple character to record
+ *
+ * The tracer may use either the sequence operations or its own
+ * copy to user routines. This function records a simple charater
+ * into a special buffer (@s) for later retrieval by a sequencer
+ * or other mechanism.
+ *
+ * Returns how much it wrote to the buffer.
+ */
+int trace_seq_putc(struct trace_seq *s, unsigned char c)
+{
+ if (s->full)
+ return 0;
+
+ if (TRACE_SEQ_BUF_LEFT(s) < 1) {
+ s->full = 1;
+ return 0;
+ }
+
+ s->buffer[s->len++] = c;
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(trace_seq_putc);
+
+/**
+ * trace_seq_putmem - write raw data into the trace_seq buffer
+ * @s: trace sequence descriptor
+ * @mem: The raw memory to copy into the buffer
+ * @len: The length of the raw memory to copy (in bytes)
+ *
+ * There may be cases where raw memory needs to be written into the
+ * buffer and a strcpy() would not work. Using this function allows
+ * for such cases.
+ *
+ * Returns how much it wrote to the buffer.
+ */
+int trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len)
+{
+ if (s->full)
+ return 0;
+
+ if (len > TRACE_SEQ_BUF_LEFT(s)) {
+ s->full = 1;
+ return 0;
+ }
+
+ memcpy(s->buffer + s->len, mem, len);
+ s->len += len;
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(trace_seq_putmem);
+
+#define MAX_MEMHEX_BYTES 8U
+#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
+
+/**
+ * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex
+ * @s: trace sequence descriptor
+ * @mem: The raw memory to write its hex ASCII representation of
+ * @len: The length of the raw memory to copy (in bytes)
+ *
+ * This is similar to trace_seq_putmem() except instead of just copying the
+ * raw memory into the buffer it writes its ASCII representation of it
+ * in hex characters.
+ *
+ * Returns how much it wrote to the buffer.
+ */
+int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
+ unsigned int len)
+{
+ unsigned char hex[HEX_CHARS];
+ const unsigned char *data = mem;
+ unsigned int start_len;
+ int i, j;
+ int cnt = 0;
+
+ if (s->full)
+ return 0;
+
+ while (len) {
+ start_len = min(len, HEX_CHARS - 1);
+#ifdef __BIG_ENDIAN
+ for (i = 0, j = 0; i < start_len; i++) {
+#else
+ for (i = start_len-1, j = 0; i >= 0; i--) {
+#endif
+ hex[j++] = hex_asc_hi(data[i]);
+ hex[j++] = hex_asc_lo(data[i]);
+ }
+ if (WARN_ON_ONCE(j == 0 || j/2 > len))
+ break;
+
+ /* j increments twice per loop */
+ len -= j / 2;
+ hex[j++] = ' ';
+
+ cnt += trace_seq_putmem(s, hex, j);
+ }
+ return cnt;
+}
+EXPORT_SYMBOL_GPL(trace_seq_putmem_hex);
+
+/**
+ * trace_seq_path - copy a path into the sequence buffer
+ * @s: trace sequence descriptor
+ * @path: path to write into the sequence buffer.
+ *
+ * Write a path name into the sequence buffer.
+ *
+ * Returns 1 if we successfully written all the contents to
+ * the buffer.
+ * Returns 0 if we the length to write is bigger than the
+ * reserved buffer space. In this case, nothing gets written.
+ */
+int trace_seq_path(struct trace_seq *s, const struct path *path)
+{
+ unsigned char *p;
+
+ if (s->full)
+ return 0;
+
+ if (TRACE_SEQ_BUF_LEFT(s) < 1) {
+ s->full = 1;
+ return 0;
+ }
+
+ p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
+ if (!IS_ERR(p)) {
+ p = mangle_path(s->buffer + s->len, p, "\n");
+ if (p) {
+ s->len = p - s->buffer;
+ return 1;
+ }
+ } else {
+ s->buffer[s->len++] = '?';
+ return 1;
+ }
+
+ s->full = 1;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(trace_seq_path);
+
+/**
+ * trace_seq_to_user - copy the squence buffer to user space
+ * @s: trace sequence descriptor
+ * @ubuf: The userspace memory location to copy to
+ * @cnt: The amount to copy
+ *
+ * Copies the sequence buffer into the userspace memory pointed to
+ * by @ubuf. It starts from the last read position (@s->readpos)
+ * and writes up to @cnt characters or till it reaches the end of
+ * the content in the buffer (@s->len), which ever comes first.
+ *
+ * On success, it returns a positive number of the number of bytes
+ * it copied.
+ *
+ * On failure it returns -EBUSY if all of the content in the
+ * sequence has been already read, which includes nothing in the
+ * sequenc (@s->len == @s->readpos).
+ *
+ * Returns -EFAULT if the copy to userspace fails.
+ */
+int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt)
+{
+ int len;
+ int ret;
+
+ if (!cnt)
+ return 0;
+
+ if (s->len <= s->readpos)
+ return -EBUSY;
+
+ len = s->len - s->readpos;
+ if (cnt > len)
+ cnt = len;
+ ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
+ if (ret == cnt)
+ return -EFAULT;
+
+ cnt -= ret;
+
+ s->readpos += cnt;
+ return cnt;
+}
+EXPORT_SYMBOL_GPL(trace_seq_to_user);
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index e6be585cf06a..8a4e5cb66a4c 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -13,6 +13,7 @@
#include <linux/sysctl.h>
#include <linux/init.h>
#include <linux/fs.h>
+#include <linux/magic.h>
#include <asm/setup.h>
@@ -50,11 +51,33 @@ static DEFINE_MUTEX(stack_sysctl_mutex);
int stack_tracer_enabled;
static int last_stack_tracer_enabled;
+static inline void print_max_stack(void)
+{
+ long i;
+ int size;
+
+ pr_emerg(" Depth Size Location (%d entries)\n"
+ " ----- ---- --------\n",
+ max_stack_trace.nr_entries - 1);
+
+ for (i = 0; i < max_stack_trace.nr_entries; i++) {
+ if (stack_dump_trace[i] == ULONG_MAX)
+ break;
+ if (i+1 == max_stack_trace.nr_entries ||
+ stack_dump_trace[i+1] == ULONG_MAX)
+ size = stack_dump_index[i];
+ else
+ size = stack_dump_index[i] - stack_dump_index[i+1];
+
+ pr_emerg("%3ld) %8d %5d %pS\n", i, stack_dump_index[i],
+ size, (void *)stack_dump_trace[i]);
+ }
+}
+
static inline void
check_stack(unsigned long ip, unsigned long *stack)
{
- unsigned long this_size, flags;
- unsigned long *p, *top, *start;
+ unsigned long this_size, flags; unsigned long *p, *top, *start;
static int tracer_frame;
int frame_size = ACCESS_ONCE(tracer_frame);
int i;
@@ -84,8 +107,12 @@ check_stack(unsigned long ip, unsigned long *stack)
max_stack_size = this_size;
- max_stack_trace.nr_entries = 0;
- max_stack_trace.skip = 3;
+ max_stack_trace.nr_entries = 0;
+
+ if (using_ftrace_ops_list_func())
+ max_stack_trace.skip = 4;
+ else
+ max_stack_trace.skip = 3;
save_stack_trace(&max_stack_trace);
@@ -144,6 +171,12 @@ check_stack(unsigned long ip, unsigned long *stack)
i++;
}
+ if ((current != &init_task &&
+ *(end_of_stack(current)) != STACK_END_MAGIC)) {
+ print_max_stack();
+ BUG();
+ }
+
out:
arch_spin_unlock(&max_stack_lock);
local_irq_restore(flags);
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 79e52d93860b..33ff6a24b802 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -108,8 +108,8 @@ static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n)
* Uprobes-specific fetch functions
*/
#define DEFINE_FETCH_stack(type) \
-static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
- void *offset, void *dest) \
+static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \
+ void *offset, void *dest) \
{ \
*(type *)dest = (type)get_user_stack_nth(regs, \
((unsigned long)offset)); \
@@ -120,8 +120,8 @@ DEFINE_BASIC_FETCH_FUNCS(stack)
#define fetch_stack_string_size NULL
#define DEFINE_FETCH_memory(type) \
-static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
- void *addr, void *dest) \
+static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \
+ void *addr, void *dest) \
{ \
type retval; \
void __user *vaddr = (void __force __user *) addr; \
@@ -136,8 +136,8 @@ DEFINE_BASIC_FETCH_FUNCS(memory)
* Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
* length and relative data location.
*/
-static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
- void *addr, void *dest)
+static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
+ void *addr, void *dest)
{
long ret;
u32 rloc = *(u32 *)dest;
@@ -158,8 +158,8 @@ static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
}
}
-static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
- void *addr, void *dest)
+static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
+ void *addr, void *dest)
{
int len;
void __user *vaddr = (void __force __user *) addr;
@@ -184,8 +184,8 @@ static unsigned long translate_user_vaddr(void *file_offset)
}
#define DEFINE_FETCH_file_offset(type) \
-static __kprobes void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs,\
- void *offset, void *dest) \
+static void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs, \
+ void *offset, void *dest)\
{ \
void *vaddr = (void *)translate_user_vaddr(offset); \
\
@@ -260,11 +260,11 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
goto error;
INIT_LIST_HEAD(&tu->list);
+ INIT_LIST_HEAD(&tu->tp.files);
tu->consumer.handler = uprobe_dispatcher;
if (is_ret)
tu->consumer.ret_handler = uretprobe_dispatcher;
init_trace_uprobe_filter(&tu->filter);
- tu->tp.call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER;
return tu;
error:
@@ -293,7 +293,7 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou
struct trace_uprobe *tu;
list_for_each_entry(tu, &uprobe_list, list)
- if (strcmp(tu->tp.call.name, event) == 0 &&
+ if (strcmp(ftrace_event_name(&tu->tp.call), event) == 0 &&
strcmp(tu->tp.call.class->system, group) == 0)
return tu;
@@ -323,7 +323,8 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
mutex_lock(&uprobe_lock);
/* register as an event */
- old_tu = find_probe_event(tu->tp.call.name, tu->tp.call.class->system);
+ old_tu = find_probe_event(ftrace_event_name(&tu->tp.call),
+ tu->tp.call.class->system);
if (old_tu) {
/* delete old event */
ret = unregister_trace_uprobe(old_tu);
@@ -598,7 +599,8 @@ static int probes_seq_show(struct seq_file *m, void *v)
char c = is_ret_probe(tu) ? 'r' : 'p';
int i;
- seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, tu->tp.call.name);
+ seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system,
+ ftrace_event_name(&tu->tp.call));
seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
for (i = 0; i < tu->tp.nr_args; i++)
@@ -648,7 +650,8 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
{
struct trace_uprobe *tu = v;
- seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->tp.call.name, tu->nhit);
+ seq_printf(m, " %s %-44s %15lu\n", tu->filename,
+ ftrace_event_name(&tu->tp.call), tu->nhit);
return 0;
}
@@ -728,9 +731,15 @@ static int uprobe_buffer_enable(void)
static void uprobe_buffer_disable(void)
{
+ int cpu;
+
BUG_ON(!mutex_is_locked(&event_mutex));
if (--uprobe_buffer_refcnt == 0) {
+ for_each_possible_cpu(cpu)
+ free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer,
+ cpu)->buf);
+
free_percpu(uprobe_cpu_buffer);
uprobe_cpu_buffer = NULL;
}
@@ -758,31 +767,32 @@ static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb)
mutex_unlock(&ucb->mutex);
}
-static void uprobe_trace_print(struct trace_uprobe *tu,
- unsigned long func, struct pt_regs *regs)
+static void __uprobe_trace_func(struct trace_uprobe *tu,
+ unsigned long func, struct pt_regs *regs,
+ struct uprobe_cpu_buffer *ucb, int dsize,
+ struct ftrace_event_file *ftrace_file)
{
struct uprobe_trace_entry_head *entry;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
- struct uprobe_cpu_buffer *ucb;
void *data;
- int size, dsize, esize;
+ int size, esize;
struct ftrace_event_call *call = &tu->tp.call;
- dsize = __get_data_size(&tu->tp, regs);
- esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
+ WARN_ON(call != ftrace_file->event_call);
- if (WARN_ON_ONCE(!uprobe_cpu_buffer || tu->tp.size + dsize > PAGE_SIZE))
+ if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE))
return;
- ucb = uprobe_buffer_get();
- store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
+ if (ftrace_trigger_soft_disabled(ftrace_file))
+ return;
+ esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
size = esize + tu->tp.size + dsize;
- event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
- size, 0, 0);
+ event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
+ call->event.type, size, 0, 0);
if (!event)
- goto out;
+ return;
entry = ring_buffer_event_data(event);
if (is_ret_probe(tu)) {
@@ -796,25 +806,36 @@ static void uprobe_trace_print(struct trace_uprobe *tu,
memcpy(data, ucb->buf, tu->tp.size + dsize);
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(buffer, event, 0, 0);
-
-out:
- uprobe_buffer_put(ucb);
+ event_trigger_unlock_commit(ftrace_file, buffer, event, entry, 0, 0);
}
/* uprobe handler */
-static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
+static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,
+ struct uprobe_cpu_buffer *ucb, int dsize)
{
- if (!is_ret_probe(tu))
- uprobe_trace_print(tu, 0, regs);
+ struct event_file_link *link;
+
+ if (is_ret_probe(tu))
+ return 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(link, &tu->tp.files, list)
+ __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file);
+ rcu_read_unlock();
+
return 0;
}
static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func,
- struct pt_regs *regs)
+ struct pt_regs *regs,
+ struct uprobe_cpu_buffer *ucb, int dsize)
{
- uprobe_trace_print(tu, func, regs);
+ struct event_file_link *link;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(link, &tu->tp.files, list)
+ __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file);
+ rcu_read_unlock();
}
/* Event entry printers */
@@ -831,12 +852,14 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
tu = container_of(event, struct trace_uprobe, tp.call.event);
if (is_ret_probe(tu)) {
- if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->tp.call.name,
+ if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
+ ftrace_event_name(&tu->tp.call),
entry->vaddr[1], entry->vaddr[0]))
goto partial;
data = DATAOF_TRACE_ENTRY(entry, true);
} else {
- if (!trace_seq_printf(s, "%s: (0x%lx)", tu->tp.call.name,
+ if (!trace_seq_printf(s, "%s: (0x%lx)",
+ ftrace_event_name(&tu->tp.call),
entry->vaddr[0]))
goto partial;
data = DATAOF_TRACE_ENTRY(entry, false);
@@ -861,37 +884,88 @@ typedef bool (*filter_func_t)(struct uprobe_consumer *self,
struct mm_struct *mm);
static int
-probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter)
+probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
+ filter_func_t filter)
{
- int ret = 0;
+ bool enabled = trace_probe_is_enabled(&tu->tp);
+ struct event_file_link *link = NULL;
+ int ret;
- if (trace_probe_is_enabled(&tu->tp))
- return -EINTR;
+ if (file) {
+ if (tu->tp.flags & TP_FLAG_PROFILE)
+ return -EINTR;
- ret = uprobe_buffer_enable();
- if (ret < 0)
- return ret;
+ link = kmalloc(sizeof(*link), GFP_KERNEL);
+ if (!link)
+ return -ENOMEM;
+
+ link->file = file;
+ list_add_tail_rcu(&link->list, &tu->tp.files);
+
+ tu->tp.flags |= TP_FLAG_TRACE;
+ } else {
+ if (tu->tp.flags & TP_FLAG_TRACE)
+ return -EINTR;
+
+ tu->tp.flags |= TP_FLAG_PROFILE;
+ }
WARN_ON(!uprobe_filter_is_empty(&tu->filter));
- tu->tp.flags |= flag;
+ if (enabled)
+ return 0;
+
+ ret = uprobe_buffer_enable();
+ if (ret)
+ goto err_flags;
+
tu->consumer.filter = filter;
ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
if (ret)
- tu->tp.flags &= ~flag;
+ goto err_buffer;
+
+ return 0;
+ err_buffer:
+ uprobe_buffer_disable();
+
+ err_flags:
+ if (file) {
+ list_del(&link->list);
+ kfree(link);
+ tu->tp.flags &= ~TP_FLAG_TRACE;
+ } else {
+ tu->tp.flags &= ~TP_FLAG_PROFILE;
+ }
return ret;
}
-static void probe_event_disable(struct trace_uprobe *tu, int flag)
+static void
+probe_event_disable(struct trace_uprobe *tu, struct ftrace_event_file *file)
{
if (!trace_probe_is_enabled(&tu->tp))
return;
+ if (file) {
+ struct event_file_link *link;
+
+ link = find_event_file_link(&tu->tp, file);
+ if (!link)
+ return;
+
+ list_del_rcu(&link->list);
+ /* synchronize with u{,ret}probe_trace_func */
+ synchronize_sched();
+ kfree(link);
+
+ if (!list_empty(&tu->tp.files))
+ return;
+ }
+
WARN_ON(!uprobe_filter_is_empty(&tu->filter));
uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
- tu->tp.flags &= ~flag;
+ tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE;
uprobe_buffer_disable();
}
@@ -948,56 +1022,60 @@ uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);
}
-static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
+static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
{
bool done;
write_lock(&tu->filter.rwlock);
if (event->hw.tp_target) {
- /*
- * event->parent != NULL means copy_process(), we can avoid
- * uprobe_apply(). current->mm must be probed and we can rely
- * on dup_mmap() which preserves the already installed bp's.
- *
- * attr.enable_on_exec means that exec/mmap will install the
- * breakpoints we need.
- */
+ list_del(&event->hw.tp_list);
done = tu->filter.nr_systemwide ||
- event->parent || event->attr.enable_on_exec ||
+ (event->hw.tp_target->flags & PF_EXITING) ||
uprobe_filter_event(tu, event);
- list_add(&event->hw.tp_list, &tu->filter.perf_events);
} else {
+ tu->filter.nr_systemwide--;
done = tu->filter.nr_systemwide;
- tu->filter.nr_systemwide++;
}
write_unlock(&tu->filter.rwlock);
if (!done)
- uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
+ return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
return 0;
}
-static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
+static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
{
bool done;
+ int err;
write_lock(&tu->filter.rwlock);
if (event->hw.tp_target) {
- list_del(&event->hw.tp_list);
+ /*
+ * event->parent != NULL means copy_process(), we can avoid
+ * uprobe_apply(). current->mm must be probed and we can rely
+ * on dup_mmap() which preserves the already installed bp's.
+ *
+ * attr.enable_on_exec means that exec/mmap will install the
+ * breakpoints we need.
+ */
done = tu->filter.nr_systemwide ||
- (event->hw.tp_target->flags & PF_EXITING) ||
+ event->parent || event->attr.enable_on_exec ||
uprobe_filter_event(tu, event);
+ list_add(&event->hw.tp_list, &tu->filter.perf_events);
} else {
- tu->filter.nr_systemwide--;
done = tu->filter.nr_systemwide;
+ tu->filter.nr_systemwide++;
}
write_unlock(&tu->filter.rwlock);
- if (!done)
- uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
-
- return 0;
+ err = 0;
+ if (!done) {
+ err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
+ if (err)
+ uprobe_perf_close(tu, event);
+ }
+ return err;
}
static bool uprobe_perf_filter(struct uprobe_consumer *uc,
@@ -1014,31 +1092,24 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
return ret;
}
-static void uprobe_perf_print(struct trace_uprobe *tu,
- unsigned long func, struct pt_regs *regs)
+static void __uprobe_perf_func(struct trace_uprobe *tu,
+ unsigned long func, struct pt_regs *regs,
+ struct uprobe_cpu_buffer *ucb, int dsize)
{
struct ftrace_event_call *call = &tu->tp.call;
struct uprobe_trace_entry_head *entry;
struct hlist_head *head;
- struct uprobe_cpu_buffer *ucb;
void *data;
- int size, dsize, esize;
+ int size, esize;
int rctx;
- dsize = __get_data_size(&tu->tp, regs);
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
- if (WARN_ON_ONCE(!uprobe_cpu_buffer))
- return;
-
size = esize + tu->tp.size + dsize;
size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32);
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
return;
- ucb = uprobe_buffer_get();
- store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
-
preempt_disable();
head = this_cpu_ptr(call->perf_events);
if (hlist_empty(head))
@@ -1068,46 +1139,49 @@ static void uprobe_perf_print(struct trace_uprobe *tu,
perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
out:
preempt_enable();
- uprobe_buffer_put(ucb);
}
/* uprobe profile handler */
-static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
+static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs,
+ struct uprobe_cpu_buffer *ucb, int dsize)
{
if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
return UPROBE_HANDLER_REMOVE;
if (!is_ret_probe(tu))
- uprobe_perf_print(tu, 0, regs);
+ __uprobe_perf_func(tu, 0, regs, ucb, dsize);
return 0;
}
static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
- struct pt_regs *regs)
+ struct pt_regs *regs,
+ struct uprobe_cpu_buffer *ucb, int dsize)
{
- uprobe_perf_print(tu, func, regs);
+ __uprobe_perf_func(tu, func, regs, ucb, dsize);
}
#endif /* CONFIG_PERF_EVENTS */
-static
-int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data)
+static int
+trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
+ void *data)
{
struct trace_uprobe *tu = event->data;
+ struct ftrace_event_file *file = data;
switch (type) {
case TRACE_REG_REGISTER:
- return probe_event_enable(tu, TP_FLAG_TRACE, NULL);
+ return probe_event_enable(tu, file, NULL);
case TRACE_REG_UNREGISTER:
- probe_event_disable(tu, TP_FLAG_TRACE);
+ probe_event_disable(tu, file);
return 0;
#ifdef CONFIG_PERF_EVENTS
case TRACE_REG_PERF_REGISTER:
- return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter);
+ return probe_event_enable(tu, NULL, uprobe_perf_filter);
case TRACE_REG_PERF_UNREGISTER:
- probe_event_disable(tu, TP_FLAG_PROFILE);
+ probe_event_disable(tu, NULL);
return 0;
case TRACE_REG_PERF_OPEN:
@@ -1127,8 +1201,11 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
{
struct trace_uprobe *tu;
struct uprobe_dispatch_data udd;
+ struct uprobe_cpu_buffer *ucb;
+ int dsize, esize;
int ret = 0;
+
tu = container_of(con, struct trace_uprobe, consumer);
tu->nhit++;
@@ -1137,13 +1214,23 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
current->utask->vaddr = (unsigned long) &udd;
+ if (WARN_ON_ONCE(!uprobe_cpu_buffer))
+ return 0;
+
+ dsize = __get_data_size(&tu->tp, regs);
+ esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
+
+ ucb = uprobe_buffer_get();
+ store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
+
if (tu->tp.flags & TP_FLAG_TRACE)
- ret |= uprobe_trace_func(tu, regs);
+ ret |= uprobe_trace_func(tu, regs, ucb, dsize);
#ifdef CONFIG_PERF_EVENTS
if (tu->tp.flags & TP_FLAG_PROFILE)
- ret |= uprobe_perf_func(tu, regs);
+ ret |= uprobe_perf_func(tu, regs, ucb, dsize);
#endif
+ uprobe_buffer_put(ucb);
return ret;
}
@@ -1152,6 +1239,8 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
{
struct trace_uprobe *tu;
struct uprobe_dispatch_data udd;
+ struct uprobe_cpu_buffer *ucb;
+ int dsize, esize;
tu = container_of(con, struct trace_uprobe, consumer);
@@ -1160,13 +1249,23 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
current->utask->vaddr = (unsigned long) &udd;
+ if (WARN_ON_ONCE(!uprobe_cpu_buffer))
+ return 0;
+
+ dsize = __get_data_size(&tu->tp, regs);
+ esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
+
+ ucb = uprobe_buffer_get();
+ store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
+
if (tu->tp.flags & TP_FLAG_TRACE)
- uretprobe_trace_func(tu, func, regs);
+ uretprobe_trace_func(tu, func, regs, ucb, dsize);
#ifdef CONFIG_PERF_EVENTS
if (tu->tp.flags & TP_FLAG_PROFILE)
- uretprobe_perf_func(tu, func, regs);
+ uretprobe_perf_func(tu, func, regs, ucb, dsize);
#endif
+ uprobe_buffer_put(ucb);
return 0;
}
@@ -1192,13 +1291,14 @@ static int register_uprobe_event(struct trace_uprobe *tu)
kfree(call->print_fmt);
return -ENODEV;
}
- call->flags = 0;
+
call->class->reg = trace_uprobe_register;
call->data = tu;
ret = trace_add_event_call(call);
if (ret) {
- pr_info("Failed to register uprobe event: %s\n", call->name);
+ pr_info("Failed to register uprobe event: %s\n",
+ ftrace_event_name(call));
kfree(call->print_fmt);
unregister_ftrace_event(&call->event);
}
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 031cc5655a51..3490407dc7b7 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2008 Mathieu Desnoyers
+ * Copyright (C) 2008-2014 Mathieu Desnoyers
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -33,43 +33,29 @@ extern struct tracepoint * const __stop___tracepoints_ptrs[];
/* Set to 1 to enable tracepoint debug output */
static const int tracepoint_debug;
+#ifdef CONFIG_MODULES
/*
- * Tracepoints mutex protects the builtin and module tracepoints and the hash
- * table, as well as the local module list.
+ * Tracepoint module list mutex protects the local module list.
*/
-static DEFINE_MUTEX(tracepoints_mutex);
+static DEFINE_MUTEX(tracepoint_module_list_mutex);
-#ifdef CONFIG_MODULES
-/* Local list of struct module */
+/* Local list of struct tp_module */
static LIST_HEAD(tracepoint_module_list);
#endif /* CONFIG_MODULES */
/*
- * Tracepoint hash table, containing the active tracepoints.
- * Protected by tracepoints_mutex.
+ * tracepoints_mutex protects the builtin and module tracepoints.
+ * tracepoints_mutex nests inside tracepoint_module_list_mutex.
*/
-#define TRACEPOINT_HASH_BITS 6
-#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS)
-static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
+static DEFINE_MUTEX(tracepoints_mutex);
/*
* Note about RCU :
* It is used to delay the free of multiple probes array until a quiescent
* state is reached.
- * Tracepoint entries modifications are protected by the tracepoints_mutex.
*/
-struct tracepoint_entry {
- struct hlist_node hlist;
- struct tracepoint_func *funcs;
- int refcount; /* Number of times armed. 0 if disarmed. */
- char name[0];
-};
-
struct tp_probes {
- union {
- struct rcu_head rcu;
- struct list_head list;
- } u;
+ struct rcu_head rcu;
struct tracepoint_func probes[0];
};
@@ -82,7 +68,7 @@ static inline void *allocate_probes(int count)
static void rcu_free_old_probes(struct rcu_head *head)
{
- kfree(container_of(head, struct tp_probes, u.rcu));
+ kfree(container_of(head, struct tp_probes, rcu));
}
static inline void release_probes(struct tracepoint_func *old)
@@ -90,38 +76,37 @@ static inline void release_probes(struct tracepoint_func *old)
if (old) {
struct tp_probes *tp_probes = container_of(old,
struct tp_probes, probes[0]);
- call_rcu_sched(&tp_probes->u.rcu, rcu_free_old_probes);
+ call_rcu_sched(&tp_probes->rcu, rcu_free_old_probes);
}
}
-static void debug_print_probes(struct tracepoint_entry *entry)
+static void debug_print_probes(struct tracepoint_func *funcs)
{
int i;
- if (!tracepoint_debug || !entry->funcs)
+ if (!tracepoint_debug || !funcs)
return;
- for (i = 0; entry->funcs[i].func; i++)
- printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i].func);
+ for (i = 0; funcs[i].func; i++)
+ printk(KERN_DEBUG "Probe %d : %p\n", i, funcs[i].func);
}
-static struct tracepoint_func *
-tracepoint_entry_add_probe(struct tracepoint_entry *entry,
- void *probe, void *data)
+static struct tracepoint_func *func_add(struct tracepoint_func **funcs,
+ struct tracepoint_func *tp_func)
{
int nr_probes = 0;
struct tracepoint_func *old, *new;
- if (WARN_ON(!probe))
+ if (WARN_ON(!tp_func->func))
return ERR_PTR(-EINVAL);
- debug_print_probes(entry);
- old = entry->funcs;
+ debug_print_probes(*funcs);
+ old = *funcs;
if (old) {
/* (N -> N+1), (N != 0, 1) probes */
for (nr_probes = 0; old[nr_probes].func; nr_probes++)
- if (old[nr_probes].func == probe &&
- old[nr_probes].data == data)
+ if (old[nr_probes].func == tp_func->func &&
+ old[nr_probes].data == tp_func->data)
return ERR_PTR(-EEXIST);
}
/* + 2 : one for new probe, one for NULL func */
@@ -130,33 +115,30 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry,
return ERR_PTR(-ENOMEM);
if (old)
memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
- new[nr_probes].func = probe;
- new[nr_probes].data = data;
+ new[nr_probes] = *tp_func;
new[nr_probes + 1].func = NULL;
- entry->refcount = nr_probes + 1;
- entry->funcs = new;
- debug_print_probes(entry);
+ *funcs = new;
+ debug_print_probes(*funcs);
return old;
}
-static void *
-tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
- void *probe, void *data)
+static void *func_remove(struct tracepoint_func **funcs,
+ struct tracepoint_func *tp_func)
{
int nr_probes = 0, nr_del = 0, i;
struct tracepoint_func *old, *new;
- old = entry->funcs;
+ old = *funcs;
if (!old)
return ERR_PTR(-ENOENT);
- debug_print_probes(entry);
+ debug_print_probes(*funcs);
/* (N -> M), (N > 1, M >= 0) probes */
- if (probe) {
+ if (tp_func->func) {
for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
- if (old[nr_probes].func == probe &&
- old[nr_probes].data == data)
+ if (old[nr_probes].func == tp_func->func &&
+ old[nr_probes].data == tp_func->data)
nr_del++;
}
}
@@ -167,9 +149,8 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
*/
if (nr_probes - nr_del == 0) {
/* N -> 0, (N > 1) */
- entry->funcs = NULL;
- entry->refcount = 0;
- debug_print_probes(entry);
+ *funcs = NULL;
+ debug_print_probes(*funcs);
return old;
} else {
int j = 0;
@@ -179,90 +160,34 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
if (new == NULL)
return ERR_PTR(-ENOMEM);
for (i = 0; old[i].func; i++)
- if (old[i].func != probe || old[i].data != data)
+ if (old[i].func != tp_func->func
+ || old[i].data != tp_func->data)
new[j++] = old[i];
new[nr_probes - nr_del].func = NULL;
- entry->refcount = nr_probes - nr_del;
- entry->funcs = new;
+ *funcs = new;
}
- debug_print_probes(entry);
+ debug_print_probes(*funcs);
return old;
}
/*
- * Get tracepoint if the tracepoint is present in the tracepoint hash table.
- * Must be called with tracepoints_mutex held.
- * Returns NULL if not present.
- */
-static struct tracepoint_entry *get_tracepoint(const char *name)
-{
- struct hlist_head *head;
- struct tracepoint_entry *e;
- u32 hash = jhash(name, strlen(name), 0);
-
- head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
- hlist_for_each_entry(e, head, hlist) {
- if (!strcmp(name, e->name))
- return e;
- }
- return NULL;
-}
-
-/*
- * Add the tracepoint to the tracepoint hash table. Must be called with
- * tracepoints_mutex held.
- */
-static struct tracepoint_entry *add_tracepoint(const char *name)
-{
- struct hlist_head *head;
- struct tracepoint_entry *e;
- size_t name_len = strlen(name) + 1;
- u32 hash = jhash(name, name_len-1, 0);
-
- head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
- hlist_for_each_entry(e, head, hlist) {
- if (!strcmp(name, e->name)) {
- printk(KERN_NOTICE
- "tracepoint %s busy\n", name);
- return ERR_PTR(-EEXIST); /* Already there */
- }
- }
- /*
- * Using kmalloc here to allocate a variable length element. Could
- * cause some memory fragmentation if overused.
- */
- e = kmalloc(sizeof(struct tracepoint_entry) + name_len, GFP_KERNEL);
- if (!e)
- return ERR_PTR(-ENOMEM);
- memcpy(&e->name[0], name, name_len);
- e->funcs = NULL;
- e->refcount = 0;
- hlist_add_head(&e->hlist, head);
- return e;
-}
-
-/*
- * Remove the tracepoint from the tracepoint hash table. Must be called with
- * mutex_lock held.
+ * Add the probe function to a tracepoint.
*/
-static inline void remove_tracepoint(struct tracepoint_entry *e)
+static int tracepoint_add_func(struct tracepoint *tp,
+ struct tracepoint_func *func)
{
- hlist_del(&e->hlist);
- kfree(e);
-}
+ struct tracepoint_func *old, *tp_funcs;
-/*
- * Sets the probe callback corresponding to one tracepoint.
- */
-static void set_tracepoint(struct tracepoint_entry **entry,
- struct tracepoint *elem, int active)
-{
- WARN_ON(strcmp((*entry)->name, elem->name) != 0);
+ if (tp->regfunc && !static_key_enabled(&tp->key))
+ tp->regfunc();
- if (elem->regfunc && !static_key_enabled(&elem->key) && active)
- elem->regfunc();
- else if (elem->unregfunc && static_key_enabled(&elem->key) && !active)
- elem->unregfunc();
+ tp_funcs = rcu_dereference_protected(tp->funcs,
+ lockdep_is_held(&tracepoints_mutex));
+ old = func_add(&tp_funcs, func);
+ if (IS_ERR(old)) {
+ WARN_ON_ONCE(1);
+ return PTR_ERR(old);
+ }
/*
* rcu_assign_pointer has a smp_wmb() which makes sure that the new
@@ -271,426 +196,218 @@ static void set_tracepoint(struct tracepoint_entry **entry,
* include/linux/tracepoints.h. A matching smp_read_barrier_depends()
* is used.
*/
- rcu_assign_pointer(elem->funcs, (*entry)->funcs);
- if (active && !static_key_enabled(&elem->key))
- static_key_slow_inc(&elem->key);
- else if (!active && static_key_enabled(&elem->key))
- static_key_slow_dec(&elem->key);
+ rcu_assign_pointer(tp->funcs, tp_funcs);
+ if (!static_key_enabled(&tp->key))
+ static_key_slow_inc(&tp->key);
+ release_probes(old);
+ return 0;
}
/*
- * Disable a tracepoint and its probe callback.
+ * Remove a probe function from a tracepoint.
* Note: only waiting an RCU period after setting elem->call to the empty
* function insures that the original callback is not used anymore. This insured
* by preempt_disable around the call site.
*/
-static void disable_tracepoint(struct tracepoint *elem)
+static int tracepoint_remove_func(struct tracepoint *tp,
+ struct tracepoint_func *func)
{
- if (elem->unregfunc && static_key_enabled(&elem->key))
- elem->unregfunc();
-
- if (static_key_enabled(&elem->key))
- static_key_slow_dec(&elem->key);
- rcu_assign_pointer(elem->funcs, NULL);
-}
-
-/**
- * tracepoint_update_probe_range - Update a probe range
- * @begin: beginning of the range
- * @end: end of the range
- *
- * Updates the probe callback corresponding to a range of tracepoints.
- * Called with tracepoints_mutex held.
- */
-static void tracepoint_update_probe_range(struct tracepoint * const *begin,
- struct tracepoint * const *end)
-{
- struct tracepoint * const *iter;
- struct tracepoint_entry *mark_entry;
+ struct tracepoint_func *old, *tp_funcs;
- if (!begin)
- return;
-
- for (iter = begin; iter < end; iter++) {
- mark_entry = get_tracepoint((*iter)->name);
- if (mark_entry) {
- set_tracepoint(&mark_entry, *iter,
- !!mark_entry->refcount);
- } else {
- disable_tracepoint(*iter);
- }
+ tp_funcs = rcu_dereference_protected(tp->funcs,
+ lockdep_is_held(&tracepoints_mutex));
+ old = func_remove(&tp_funcs, func);
+ if (IS_ERR(old)) {
+ WARN_ON_ONCE(1);
+ return PTR_ERR(old);
}
-}
-#ifdef CONFIG_MODULES
-void module_update_tracepoints(void)
-{
- struct tp_module *tp_mod;
+ if (!tp_funcs) {
+ /* Removed last function */
+ if (tp->unregfunc && static_key_enabled(&tp->key))
+ tp->unregfunc();
- list_for_each_entry(tp_mod, &tracepoint_module_list, list)
- tracepoint_update_probe_range(tp_mod->tracepoints_ptrs,
- tp_mod->tracepoints_ptrs + tp_mod->num_tracepoints);
-}
-#else /* CONFIG_MODULES */
-void module_update_tracepoints(void)
-{
-}
-#endif /* CONFIG_MODULES */
-
-
-/*
- * Update probes, removing the faulty probes.
- * Called with tracepoints_mutex held.
- */
-static void tracepoint_update_probes(void)
-{
- /* Core kernel tracepoints */
- tracepoint_update_probe_range(__start___tracepoints_ptrs,
- __stop___tracepoints_ptrs);
- /* tracepoints in modules. */
- module_update_tracepoints();
-}
-
-static struct tracepoint_func *
-tracepoint_add_probe(const char *name, void *probe, void *data)
-{
- struct tracepoint_entry *entry;
- struct tracepoint_func *old;
-
- entry = get_tracepoint(name);
- if (!entry) {
- entry = add_tracepoint(name);
- if (IS_ERR(entry))
- return (struct tracepoint_func *)entry;
+ if (static_key_enabled(&tp->key))
+ static_key_slow_dec(&tp->key);
}
- old = tracepoint_entry_add_probe(entry, probe, data);
- if (IS_ERR(old) && !entry->refcount)
- remove_tracepoint(entry);
- return old;
+ rcu_assign_pointer(tp->funcs, tp_funcs);
+ release_probes(old);
+ return 0;
}
/**
* tracepoint_probe_register - Connect a probe to a tracepoint
- * @name: tracepoint name
+ * @tp: tracepoint
* @probe: probe handler
+ * @data: tracepoint data
*
* Returns 0 if ok, error value on error.
- * The probe address must at least be aligned on the architecture pointer size.
+ * Note: if @tp is within a module, the caller is responsible for
+ * unregistering the probe before the module is gone. This can be
+ * performed either with a tracepoint module going notifier, or from
+ * within module exit functions.
*/
-int tracepoint_probe_register(const char *name, void *probe, void *data)
+int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data)
{
- struct tracepoint_func *old;
+ struct tracepoint_func tp_func;
+ int ret;
mutex_lock(&tracepoints_mutex);
- old = tracepoint_add_probe(name, probe, data);
- if (IS_ERR(old)) {
- mutex_unlock(&tracepoints_mutex);
- return PTR_ERR(old);
- }
- tracepoint_update_probes(); /* may update entry */
+ tp_func.func = probe;
+ tp_func.data = data;
+ ret = tracepoint_add_func(tp, &tp_func);
mutex_unlock(&tracepoints_mutex);
- release_probes(old);
- return 0;
+ return ret;
}
EXPORT_SYMBOL_GPL(tracepoint_probe_register);
-static struct tracepoint_func *
-tracepoint_remove_probe(const char *name, void *probe, void *data)
-{
- struct tracepoint_entry *entry;
- struct tracepoint_func *old;
-
- entry = get_tracepoint(name);
- if (!entry)
- return ERR_PTR(-ENOENT);
- old = tracepoint_entry_remove_probe(entry, probe, data);
- if (IS_ERR(old))
- return old;
- if (!entry->refcount)
- remove_tracepoint(entry);
- return old;
-}
-
/**
* tracepoint_probe_unregister - Disconnect a probe from a tracepoint
- * @name: tracepoint name
+ * @tp: tracepoint
* @probe: probe function pointer
+ * @data: tracepoint data
*
- * We do not need to call a synchronize_sched to make sure the probes have
- * finished running before doing a module unload, because the module unload
- * itself uses stop_machine(), which insures that every preempt disabled section
- * have finished.
+ * Returns 0 if ok, error value on error.
*/
-int tracepoint_probe_unregister(const char *name, void *probe, void *data)
+int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data)
{
- struct tracepoint_func *old;
+ struct tracepoint_func tp_func;
+ int ret;
mutex_lock(&tracepoints_mutex);
- old = tracepoint_remove_probe(name, probe, data);
- if (IS_ERR(old)) {
- mutex_unlock(&tracepoints_mutex);
- return PTR_ERR(old);
- }
- tracepoint_update_probes(); /* may update entry */
+ tp_func.func = probe;
+ tp_func.data = data;
+ ret = tracepoint_remove_func(tp, &tp_func);
mutex_unlock(&tracepoints_mutex);
- release_probes(old);
- return 0;
+ return ret;
}
EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
-static LIST_HEAD(old_probes);
-static int need_update;
-
-static void tracepoint_add_old_probes(void *old)
+#ifdef CONFIG_MODULES
+bool trace_module_has_bad_taint(struct module *mod)
{
- need_update = 1;
- if (old) {
- struct tp_probes *tp_probes = container_of(old,
- struct tp_probes, probes[0]);
- list_add(&tp_probes->u.list, &old_probes);
- }
+ return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP) |
+ (1 << TAINT_UNSIGNED_MODULE));
}
-/**
- * tracepoint_probe_register_noupdate - register a probe but not connect
- * @name: tracepoint name
- * @probe: probe handler
- *
- * caller must call tracepoint_probe_update_all()
- */
-int tracepoint_probe_register_noupdate(const char *name, void *probe,
- void *data)
-{
- struct tracepoint_func *old;
-
- mutex_lock(&tracepoints_mutex);
- old = tracepoint_add_probe(name, probe, data);
- if (IS_ERR(old)) {
- mutex_unlock(&tracepoints_mutex);
- return PTR_ERR(old);
- }
- tracepoint_add_old_probes(old);
- mutex_unlock(&tracepoints_mutex);
- return 0;
-}
-EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
+static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list);
/**
- * tracepoint_probe_unregister_noupdate - remove a probe but not disconnect
- * @name: tracepoint name
- * @probe: probe function pointer
+ * register_tracepoint_notifier - register tracepoint coming/going notifier
+ * @nb: notifier block
*
- * caller must call tracepoint_probe_update_all()
+ * Notifiers registered with this function are called on module
+ * coming/going with the tracepoint_module_list_mutex held.
+ * The notifier block callback should expect a "struct tp_module" data
+ * pointer.
*/
-int tracepoint_probe_unregister_noupdate(const char *name, void *probe,
- void *data)
+int register_tracepoint_module_notifier(struct notifier_block *nb)
{
- struct tracepoint_func *old;
-
- mutex_lock(&tracepoints_mutex);
- old = tracepoint_remove_probe(name, probe, data);
- if (IS_ERR(old)) {
- mutex_unlock(&tracepoints_mutex);
- return PTR_ERR(old);
- }
- tracepoint_add_old_probes(old);
- mutex_unlock(&tracepoints_mutex);
- return 0;
-}
-EXPORT_SYMBOL_GPL(tracepoint_probe_unregister_noupdate);
-
-/**
- * tracepoint_probe_update_all - update tracepoints
- */
-void tracepoint_probe_update_all(void)
-{
- LIST_HEAD(release_probes);
- struct tp_probes *pos, *next;
+ struct tp_module *tp_mod;
+ int ret;
- mutex_lock(&tracepoints_mutex);
- if (!need_update) {
- mutex_unlock(&tracepoints_mutex);
- return;
- }
- if (!list_empty(&old_probes))
- list_replace_init(&old_probes, &release_probes);
- need_update = 0;
- tracepoint_update_probes();
- mutex_unlock(&tracepoints_mutex);
- list_for_each_entry_safe(pos, next, &release_probes, u.list) {
- list_del(&pos->u.list);
- call_rcu_sched(&pos->u.rcu, rcu_free_old_probes);
- }
+ mutex_lock(&tracepoint_module_list_mutex);
+ ret = blocking_notifier_chain_register(&tracepoint_notify_list, nb);
+ if (ret)
+ goto end;
+ list_for_each_entry(tp_mod, &tracepoint_module_list, list)
+ (void) nb->notifier_call(nb, MODULE_STATE_COMING, tp_mod);
+end:
+ mutex_unlock(&tracepoint_module_list_mutex);
+ return ret;
}
-EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
+EXPORT_SYMBOL_GPL(register_tracepoint_module_notifier);
/**
- * tracepoint_get_iter_range - Get a next tracepoint iterator given a range.
- * @tracepoint: current tracepoints (in), next tracepoint (out)
- * @begin: beginning of the range
- * @end: end of the range
+ * unregister_tracepoint_notifier - unregister tracepoint coming/going notifier
+ * @nb: notifier block
*
- * Returns whether a next tracepoint has been found (1) or not (0).
- * Will return the first tracepoint in the range if the input tracepoint is
- * NULL.
+ * The notifier block callback should expect a "struct tp_module" data
+ * pointer.
*/
-static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
- struct tracepoint * const *begin, struct tracepoint * const *end)
+int unregister_tracepoint_module_notifier(struct notifier_block *nb)
{
- if (!*tracepoint && begin != end) {
- *tracepoint = begin;
- return 1;
- }
- if (*tracepoint >= begin && *tracepoint < end)
- return 1;
- return 0;
-}
+ struct tp_module *tp_mod;
+ int ret;
-#ifdef CONFIG_MODULES
-static void tracepoint_get_iter(struct tracepoint_iter *iter)
-{
- int found = 0;
- struct tp_module *iter_mod;
-
- /* Core kernel tracepoints */
- if (!iter->module) {
- found = tracepoint_get_iter_range(&iter->tracepoint,
- __start___tracepoints_ptrs,
- __stop___tracepoints_ptrs);
- if (found)
- goto end;
- }
- /* Tracepoints in modules */
- mutex_lock(&tracepoints_mutex);
- list_for_each_entry(iter_mod, &tracepoint_module_list, list) {
- /*
- * Sorted module list
- */
- if (iter_mod < iter->module)
- continue;
- else if (iter_mod > iter->module)
- iter->tracepoint = NULL;
- found = tracepoint_get_iter_range(&iter->tracepoint,
- iter_mod->tracepoints_ptrs,
- iter_mod->tracepoints_ptrs
- + iter_mod->num_tracepoints);
- if (found) {
- iter->module = iter_mod;
- break;
- }
- }
- mutex_unlock(&tracepoints_mutex);
+ mutex_lock(&tracepoint_module_list_mutex);
+ ret = blocking_notifier_chain_unregister(&tracepoint_notify_list, nb);
+ if (ret)
+ goto end;
+ list_for_each_entry(tp_mod, &tracepoint_module_list, list)
+ (void) nb->notifier_call(nb, MODULE_STATE_GOING, tp_mod);
end:
- if (!found)
- tracepoint_iter_reset(iter);
-}
-#else /* CONFIG_MODULES */
-static void tracepoint_get_iter(struct tracepoint_iter *iter)
-{
- int found = 0;
-
- /* Core kernel tracepoints */
- found = tracepoint_get_iter_range(&iter->tracepoint,
- __start___tracepoints_ptrs,
- __stop___tracepoints_ptrs);
- if (!found)
- tracepoint_iter_reset(iter);
-}
-#endif /* CONFIG_MODULES */
-
-void tracepoint_iter_start(struct tracepoint_iter *iter)
-{
- tracepoint_get_iter(iter);
-}
-EXPORT_SYMBOL_GPL(tracepoint_iter_start);
-
-void tracepoint_iter_next(struct tracepoint_iter *iter)
-{
- iter->tracepoint++;
- /*
- * iter->tracepoint may be invalid because we blindly incremented it.
- * Make sure it is valid by marshalling on the tracepoints, getting the
- * tracepoints from following modules if necessary.
- */
- tracepoint_get_iter(iter);
-}
-EXPORT_SYMBOL_GPL(tracepoint_iter_next);
+ mutex_unlock(&tracepoint_module_list_mutex);
+ return ret;
-void tracepoint_iter_stop(struct tracepoint_iter *iter)
-{
}
-EXPORT_SYMBOL_GPL(tracepoint_iter_stop);
+EXPORT_SYMBOL_GPL(unregister_tracepoint_module_notifier);
-void tracepoint_iter_reset(struct tracepoint_iter *iter)
+/*
+ * Ensure the tracer unregistered the module's probes before the module
+ * teardown is performed. Prevents leaks of probe and data pointers.
+ */
+static void tp_module_going_check_quiescent(struct tracepoint * const *begin,
+ struct tracepoint * const *end)
{
-#ifdef CONFIG_MODULES
- iter->module = NULL;
-#endif /* CONFIG_MODULES */
- iter->tracepoint = NULL;
-}
-EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
+ struct tracepoint * const *iter;
-#ifdef CONFIG_MODULES
-bool trace_module_has_bad_taint(struct module *mod)
-{
- return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP));
+ if (!begin)
+ return;
+ for (iter = begin; iter < end; iter++)
+ WARN_ON_ONCE((*iter)->funcs);
}
static int tracepoint_module_coming(struct module *mod)
{
- struct tp_module *tp_mod, *iter;
+ struct tp_module *tp_mod;
int ret = 0;
+ if (!mod->num_tracepoints)
+ return 0;
+
/*
* We skip modules that taint the kernel, especially those with different
* module headers (for forced load), to make sure we don't cause a crash.
- * Staging and out-of-tree GPL modules are fine.
+ * Staging, out-of-tree, and unsigned GPL modules are fine.
*/
if (trace_module_has_bad_taint(mod))
return 0;
- mutex_lock(&tracepoints_mutex);
+ mutex_lock(&tracepoint_module_list_mutex);
tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
if (!tp_mod) {
ret = -ENOMEM;
goto end;
}
- tp_mod->num_tracepoints = mod->num_tracepoints;
- tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs;
-
- /*
- * tracepoint_module_list is kept sorted by struct module pointer
- * address for iteration on tracepoints from a seq_file that can release
- * the mutex between calls.
- */
- list_for_each_entry_reverse(iter, &tracepoint_module_list, list) {
- BUG_ON(iter == tp_mod); /* Should never be in the list twice */
- if (iter < tp_mod) {
- /* We belong to the location right after iter. */
- list_add(&tp_mod->list, &iter->list);
- goto module_added;
- }
- }
- /* We belong to the beginning of the list */
- list_add(&tp_mod->list, &tracepoint_module_list);
-module_added:
- tracepoint_update_probe_range(mod->tracepoints_ptrs,
- mod->tracepoints_ptrs + mod->num_tracepoints);
+ tp_mod->mod = mod;
+ list_add_tail(&tp_mod->list, &tracepoint_module_list);
+ blocking_notifier_call_chain(&tracepoint_notify_list,
+ MODULE_STATE_COMING, tp_mod);
end:
- mutex_unlock(&tracepoints_mutex);
+ mutex_unlock(&tracepoint_module_list_mutex);
return ret;
}
-static int tracepoint_module_going(struct module *mod)
+static void tracepoint_module_going(struct module *mod)
{
- struct tp_module *pos;
+ struct tp_module *tp_mod;
- mutex_lock(&tracepoints_mutex);
- tracepoint_update_probe_range(mod->tracepoints_ptrs,
- mod->tracepoints_ptrs + mod->num_tracepoints);
- list_for_each_entry(pos, &tracepoint_module_list, list) {
- if (pos->tracepoints_ptrs == mod->tracepoints_ptrs) {
- list_del(&pos->list);
- kfree(pos);
+ if (!mod->num_tracepoints)
+ return;
+
+ mutex_lock(&tracepoint_module_list_mutex);
+ list_for_each_entry(tp_mod, &tracepoint_module_list, list) {
+ if (tp_mod->mod == mod) {
+ blocking_notifier_call_chain(&tracepoint_notify_list,
+ MODULE_STATE_GOING, tp_mod);
+ list_del(&tp_mod->list);
+ kfree(tp_mod);
+ /*
+ * Called the going notifier before checking for
+ * quiescence.
+ */
+ tp_module_going_check_quiescent(mod->tracepoints_ptrs,
+ mod->tracepoints_ptrs + mod->num_tracepoints);
break;
}
}
@@ -700,12 +417,11 @@ static int tracepoint_module_going(struct module *mod)
* flag on "going", in case a module taints the kernel only after being
* loaded.
*/
- mutex_unlock(&tracepoints_mutex);
- return 0;
+ mutex_unlock(&tracepoint_module_list_mutex);
}
-int tracepoint_module_notify(struct notifier_block *self,
- unsigned long val, void *data)
+static int tracepoint_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
{
struct module *mod = data;
int ret = 0;
@@ -717,24 +433,58 @@ int tracepoint_module_notify(struct notifier_block *self,
case MODULE_STATE_LIVE:
break;
case MODULE_STATE_GOING:
- ret = tracepoint_module_going(mod);
+ tracepoint_module_going(mod);
+ break;
+ case MODULE_STATE_UNFORMED:
break;
}
return ret;
}
-struct notifier_block tracepoint_module_nb = {
+static struct notifier_block tracepoint_module_nb = {
.notifier_call = tracepoint_module_notify,
.priority = 0,
};
-static int init_tracepoints(void)
+static __init int init_tracepoints(void)
{
- return register_module_notifier(&tracepoint_module_nb);
+ int ret;
+
+ ret = register_module_notifier(&tracepoint_module_nb);
+ if (ret)
+ pr_warning("Failed to register tracepoint module enter notifier\n");
+
+ return ret;
}
__initcall(init_tracepoints);
#endif /* CONFIG_MODULES */
+static void for_each_tracepoint_range(struct tracepoint * const *begin,
+ struct tracepoint * const *end,
+ void (*fct)(struct tracepoint *tp, void *priv),
+ void *priv)
+{
+ struct tracepoint * const *iter;
+
+ if (!begin)
+ return;
+ for (iter = begin; iter < end; iter++)
+ fct(*iter, priv);
+}
+
+/**
+ * for_each_kernel_tracepoint - iteration on all kernel tracepoints
+ * @fct: callback
+ * @priv: private data
+ */
+void for_each_kernel_tracepoint(void (*fct)(struct tracepoint *tp, void *priv),
+ void *priv)
+{
+ for_each_tracepoint_range(__start___tracepoints_ptrs,
+ __stop___tracepoints_ptrs, fct, priv);
+}
+EXPORT_SYMBOL_GPL(for_each_kernel_tracepoint);
+
#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
@@ -742,33 +492,29 @@ static int sys_tracepoint_refcount;
void syscall_regfunc(void)
{
- unsigned long flags;
- struct task_struct *g, *t;
+ struct task_struct *p, *t;
if (!sys_tracepoint_refcount) {
- read_lock_irqsave(&tasklist_lock, flags);
- do_each_thread(g, t) {
- /* Skip kernel threads. */
- if (t->mm)
- set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
- } while_each_thread(g, t);
- read_unlock_irqrestore(&tasklist_lock, flags);
+ read_lock(&tasklist_lock);
+ for_each_process_thread(p, t) {
+ set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
+ }
+ read_unlock(&tasklist_lock);
}
sys_tracepoint_refcount++;
}
void syscall_unregfunc(void)
{
- unsigned long flags;
- struct task_struct *g, *t;
+ struct task_struct *p, *t;
sys_tracepoint_refcount--;
if (!sys_tracepoint_refcount) {
- read_lock_irqsave(&tasklist_lock, flags);
- do_each_thread(g, t) {
+ read_lock(&tasklist_lock);
+ for_each_process_thread(p, t) {
clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
- } while_each_thread(g, t);
- read_unlock_irqrestore(&tasklist_lock, flags);
+ }
+ read_unlock(&tasklist_lock);
}
}
#endif
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index a1dd9a1b1327..975cb49e32bf 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -31,20 +31,19 @@ void bacct_add_tsk(struct user_namespace *user_ns,
struct taskstats *stats, struct task_struct *tsk)
{
const struct cred *tcred;
- struct timespec uptime, ts;
cputime_t utime, stime, utimescaled, stimescaled;
- u64 ac_etime;
+ u64 delta;
BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
- /* calculate task elapsed time in timespec */
- do_posix_clock_monotonic_gettime(&uptime);
- ts = timespec_sub(uptime, tsk->start_time);
- /* rebase elapsed time to usec (should never be negative) */
- ac_etime = timespec_to_ns(&ts);
- do_div(ac_etime, NSEC_PER_USEC);
- stats->ac_etime = ac_etime;
- stats->ac_btime = get_seconds() - ts.tv_sec;
+ /* calculate task elapsed time in nsec */
+ delta = ktime_get_ns() - tsk->start_time;
+ /* Convert to micro seconds */
+ do_div(delta, NSEC_PER_USEC);
+ stats->ac_etime = delta;
+ /* Convert to seconds for btime */
+ do_div(delta, USEC_PER_SEC);
+ stats->ac_btime = get_seconds() - delta;
if (thread_group_leader(tsk)) {
stats->ac_exitcode = tsk->exit_code;
if (tsk->flags & PF_FORKNOEXEC)
diff --git a/kernel/up.c b/kernel/up.c
index 509403e3fbc6..1760bf3d1463 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -22,16 +22,16 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
}
EXPORT_SYMBOL(smp_call_function_single);
-void __smp_call_function_single(int cpu, struct call_single_data *csd,
- int wait)
+int smp_call_function_single_async(int cpu, struct call_single_data *csd)
{
unsigned long flags;
local_irq_save(flags);
csd->func(csd->info);
local_irq_restore(flags);
+ return 0;
}
-EXPORT_SYMBOL(__smp_call_function_single);
+EXPORT_SYMBOL(smp_call_function_single_async);
int on_each_cpu(smp_call_func_t func, void *info, int wait)
{
diff --git a/kernel/user.c b/kernel/user.c
index c006131beb77..4efa39350e44 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -87,7 +87,6 @@ static DEFINE_SPINLOCK(uidhash_lock);
struct user_struct root_user = {
.__count = ATOMIC_INIT(1),
.processes = ATOMIC_INIT(1),
- .files = ATOMIC_INIT(0),
.sigpending = ATOMIC_INIT(0),
.locked_shm = 0,
.uid = GLOBAL_ROOT_UID,
@@ -222,5 +221,4 @@ static int __init uid_cache_init(void)
return 0;
}
-
-module_init(uid_cache_init);
+subsys_initcall(uid_cache_init);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index dd06439b9c84..aa312b0dc3ec 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -152,7 +152,7 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
/* Find the matching extent */
extents = map->nr_extents;
- smp_read_barrier_depends();
+ smp_rmb();
for (idx = 0; idx < extents; idx++) {
first = map->extent[idx].first;
last = first + map->extent[idx].count - 1;
@@ -176,7 +176,7 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id)
/* Find the matching extent */
extents = map->nr_extents;
- smp_read_barrier_depends();
+ smp_rmb();
for (idx = 0; idx < extents; idx++) {
first = map->extent[idx].first;
last = first + map->extent[idx].count - 1;
@@ -199,7 +199,7 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id)
/* Find the matching extent */
extents = map->nr_extents;
- smp_read_barrier_depends();
+ smp_rmb();
for (idx = 0; idx < extents; idx++) {
first = map->extent[idx].lower_first;
last = first + map->extent[idx].count - 1;
@@ -286,7 +286,7 @@ EXPORT_SYMBOL(from_kuid_munged);
/**
* make_kgid - Map a user-namespace gid pair into a kgid.
* @ns: User namespace that the gid is in
- * @uid: group identifier
+ * @gid: group identifier
*
* Maps a user-namespace gid pair into a kernel internal kgid,
* and returns that kgid.
@@ -482,7 +482,8 @@ static int projid_m_show(struct seq_file *seq, void *v)
return 0;
}
-static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map)
+static void *m_start(struct seq_file *seq, loff_t *ppos,
+ struct uid_gid_map *map)
{
struct uid_gid_extent *extent = NULL;
loff_t pos = *ppos;
@@ -525,28 +526,29 @@ static void m_stop(struct seq_file *seq, void *v)
return;
}
-struct seq_operations proc_uid_seq_operations = {
+const struct seq_operations proc_uid_seq_operations = {
.start = uid_m_start,
.stop = m_stop,
.next = m_next,
.show = uid_m_show,
};
-struct seq_operations proc_gid_seq_operations = {
+const struct seq_operations proc_gid_seq_operations = {
.start = gid_m_start,
.stop = m_stop,
.next = m_next,
.show = gid_m_show,
};
-struct seq_operations proc_projid_seq_operations = {
+const struct seq_operations proc_projid_seq_operations = {
.start = projid_m_start,
.stop = m_stop,
.next = m_next,
.show = projid_m_show,
};
-static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent)
+static bool mappings_overlap(struct uid_gid_map *new_map,
+ struct uid_gid_extent *extent)
{
u32 upper_first, lower_first, upper_last, lower_last;
unsigned idx;
@@ -615,9 +617,8 @@ static ssize_t map_write(struct file *file, const char __user *buf,
* were written before the count of the extents.
*
* To achieve this smp_wmb() is used on guarantee the write
- * order and smp_read_barrier_depends() is guaranteed that we
- * don't have crazy architectures returning stale data.
- *
+ * order and smp_rmb() is guaranteed that we don't have crazy
+ * architectures returning stale data.
*/
mutex_lock(&id_map_mutex);
@@ -654,7 +655,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
ret = -EINVAL;
pos = kbuf;
new_map.nr_extents = 0;
- for (;pos; pos = next_line) {
+ for (; pos; pos = next_line) {
extent = &new_map.extent[new_map.nr_extents];
/* Find the end of line and ensure I don't look past it */
@@ -688,13 +689,16 @@ static ssize_t map_write(struct file *file, const char __user *buf,
/* Verify we have been given valid starting values */
if ((extent->first == (u32) -1) ||
- (extent->lower_first == (u32) -1 ))
+ (extent->lower_first == (u32) -1))
goto out;
- /* Verify count is not zero and does not cause the extent to wrap */
+ /* Verify count is not zero and does not cause the
+ * extent to wrap
+ */
if ((extent->first + extent->count) <= extent->first)
goto out;
- if ((extent->lower_first + extent->count) <= extent->lower_first)
+ if ((extent->lower_first + extent->count) <=
+ extent->lower_first)
goto out;
/* Do the ranges in extent overlap any previous extents? */
@@ -752,7 +756,8 @@ out:
return ret;
}
-ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos)
+ssize_t proc_uid_map_write(struct file *file, const char __user *buf,
+ size_t size, loff_t *ppos)
{
struct seq_file *seq = file->private_data;
struct user_namespace *ns = seq->private;
@@ -768,7 +773,8 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz
&ns->uid_map, &ns->parent->uid_map);
}
-ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos)
+ssize_t proc_gid_map_write(struct file *file, const char __user *buf,
+ size_t size, loff_t *ppos)
{
struct seq_file *seq = file->private_data;
struct user_namespace *ns = seq->private;
@@ -784,7 +790,8 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz
&ns->gid_map, &ns->parent->gid_map);
}
-ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos)
+ssize_t proc_projid_map_write(struct file *file, const char __user *buf,
+ size_t size, loff_t *ppos)
{
struct seq_file *seq = file->private_data;
struct user_namespace *ns = seq->private;
@@ -801,7 +808,7 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t
&ns->projid_map, &ns->parent->projid_map);
}
-static bool new_idmap_permitted(const struct file *file,
+static bool new_idmap_permitted(const struct file *file,
struct user_namespace *ns, int cap_setid,
struct uid_gid_map *new_map)
{
@@ -812,8 +819,7 @@ static bool new_idmap_permitted(const struct file *file,
kuid_t uid = make_kuid(ns->parent, id);
if (uid_eq(uid, file->f_cred->fsuid))
return true;
- }
- else if (cap_setid == CAP_SETGID) {
+ } else if (cap_setid == CAP_SETGID) {
kgid_t gid = make_kgid(ns->parent, id);
if (gid_eq(gid, file->f_cred->fsgid))
return true;
@@ -902,4 +908,4 @@ static __init int user_namespaces_init(void)
user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
return 0;
}
-module_init(user_namespaces_init);
+subsys_initcall(user_namespaces_init);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index fd393124e507..883aaaa7de8a 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -93,13 +93,13 @@ static void *utsns_get(struct task_struct *task)
struct uts_namespace *ns = NULL;
struct nsproxy *nsproxy;
- rcu_read_lock();
- nsproxy = task_nsproxy(task);
+ task_lock(task);
+ nsproxy = task->nsproxy;
if (nsproxy) {
ns = nsproxy->uts_ns;
get_uts_ns(ns);
}
- rcu_read_unlock();
+ task_unlock(task);
return ns;
}
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 4f69f9a5e221..c8eac43267e9 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -17,7 +17,7 @@
#ifdef CONFIG_PROC_SYSCTL
-static void *get_uts(ctl_table *table, int write)
+static void *get_uts(struct ctl_table *table, int write)
{
char *which = table->data;
struct uts_namespace *uts_ns;
@@ -32,7 +32,7 @@ static void *get_uts(ctl_table *table, int write)
return which;
}
-static void put_uts(ctl_table *table, int write, void *which)
+static void put_uts(struct ctl_table *table, int write, void *which)
{
if (!write)
up_read(&uts_sem);
@@ -44,14 +44,14 @@ static void put_uts(ctl_table *table, int write, void *which)
* Special case of dostring for the UTS structure. This has locks
* to observe. Should this be in kernel/sys.c ????
*/
-static int proc_do_uts_string(ctl_table *table, int write,
+static int proc_do_uts_string(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table uts_table;
int r;
memcpy(&uts_table, table, sizeof(uts_table));
uts_table.data = get_uts(table, write);
- r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
+ r = proc_dostring(&uts_table, write, buffer, lenp, ppos);
put_uts(table, write, uts_table.data);
if (write)
@@ -135,4 +135,4 @@ static int __init utsname_sysctl_init(void)
return 0;
}
-__initcall(utsname_sysctl_init);
+device_initcall(utsname_sysctl_init);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 4431610f049a..a8d6914030fe 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -31,6 +31,12 @@
int watchdog_user_enabled = 1;
int __read_mostly watchdog_thresh = 10;
+#ifdef CONFIG_SMP
+int __read_mostly sysctl_softlockup_all_cpu_backtrace;
+#else
+#define sysctl_softlockup_all_cpu_backtrace 0
+#endif
+
static int __read_mostly watchdog_running;
static u64 __read_mostly sample_period;
@@ -47,6 +53,7 @@ static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
#endif
+static unsigned long soft_lockup_nmi_warn;
/* boot commands */
/*
@@ -95,6 +102,15 @@ static int __init nosoftlockup_setup(char *str)
}
__setup("nosoftlockup", nosoftlockup_setup);
/* */
+#ifdef CONFIG_SMP
+static int __init softlockup_all_cpu_backtrace_setup(char *str)
+{
+ sysctl_softlockup_all_cpu_backtrace =
+ !!simple_strtol(str, NULL, 0);
+ return 1;
+}
+__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
+#endif
/*
* Hard-lockup warnings should be triggered after just a few seconds. Soft-
@@ -138,7 +154,11 @@ static void __touch_watchdog(void)
void touch_softlockup_watchdog(void)
{
- __this_cpu_write(watchdog_touch_ts, 0);
+ /*
+ * Preemption can be enabled. It doesn't matter which CPU's timestamp
+ * gets zeroed here, so use the raw_ operation.
+ */
+ raw_cpu_write(watchdog_touch_ts, 0);
}
EXPORT_SYMBOL(touch_softlockup_watchdog);
@@ -158,14 +178,14 @@ void touch_all_softlockup_watchdogs(void)
#ifdef CONFIG_HARDLOCKUP_DETECTOR
void touch_nmi_watchdog(void)
{
- if (watchdog_user_enabled) {
- unsigned cpu;
-
- for_each_present_cpu(cpu) {
- if (per_cpu(watchdog_nmi_touch, cpu) != true)
- per_cpu(watchdog_nmi_touch, cpu) = true;
- }
- }
+ /*
+ * Using __raw here because some code paths have
+ * preemption enabled. If preemption is enabled
+ * then interrupts should be enabled too, in which
+ * case we shouldn't have to worry about the watchdog
+ * going off.
+ */
+ __raw_get_cpu_var(watchdog_nmi_touch) = true;
touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);
@@ -240,9 +260,11 @@ static void watchdog_overflow_callback(struct perf_event *event,
return;
if (hardlockup_panic)
- panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+ panic("Watchdog detected hard LOCKUP on cpu %d",
+ this_cpu);
else
- WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+ WARN(1, "Watchdog detected hard LOCKUP on cpu %d",
+ this_cpu);
__this_cpu_write(hard_watchdog_warn, true);
return;
@@ -267,6 +289,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
struct pt_regs *regs = get_irq_regs();
int duration;
+ int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
/* kick the hardlockup detector */
watchdog_interrupt_count();
@@ -313,7 +336,18 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
if (__this_cpu_read(soft_watchdog_warn) == true)
return HRTIMER_RESTART;
- printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
+ if (softlockup_all_cpu_backtrace) {
+ /* Prevent multiple soft-lockup reports if one cpu is already
+ * engaged in dumping cpu back traces
+ */
+ if (test_and_set_bit(0, &soft_lockup_nmi_warn)) {
+ /* Someone else will report us. Let's give up */
+ __this_cpu_write(soft_watchdog_warn, true);
+ return HRTIMER_RESTART;
+ }
+ }
+
+ pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
print_modules();
@@ -323,6 +357,18 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
else
dump_stack();
+ if (softlockup_all_cpu_backtrace) {
+ /* Avoid generating two back traces for current
+ * given that one is already made above
+ */
+ trigger_allbutself_cpu_backtrace();
+
+ clear_bit(0, &soft_lockup_nmi_warn);
+ /* Barrier to sync with other cpus */
+ smp_mb__after_atomic();
+ }
+
+ add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
if (softlockup_panic)
panic("softlockup: hung tasks");
__this_cpu_write(soft_watchdog_warn, true);
@@ -441,7 +487,7 @@ static int watchdog_nmi_enable(unsigned int cpu)
if (PTR_ERR(event) == -EOPNOTSUPP)
pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
else if (PTR_ERR(event) == -ENOENT)
- pr_warning("disabled (cpu%i): hardware events not enabled\n",
+ pr_warn("disabled (cpu%i): hardware events not enabled\n",
cpu);
else
pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
@@ -505,7 +551,6 @@ static void restart_watchdog_hrtimer(void *info)
static void update_timers(int cpu)
{
- struct call_single_data data = {.func = restart_watchdog_hrtimer};
/*
* Make sure that perf event counter will adopt to a new
* sampling period. Updating the sampling period directly would
@@ -515,7 +560,7 @@ static void update_timers(int cpu)
* might be late already so we have to restart the timer as well.
*/
watchdog_nmi_disable(cpu);
- __smp_call_function_single(cpu, &data, 1);
+ smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1);
watchdog_nmi_enable(cpu);
}
@@ -524,10 +569,8 @@ static void update_timers_all_cpus(void)
int cpu;
get_online_cpus();
- preempt_disable();
for_each_online_cpu(cpu)
update_timers(cpu);
- preempt_enable();
put_online_cpus();
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 193e977a10ea..5dbe22aa3efd 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -65,15 +65,12 @@ enum {
* be executing on any CPU. The pool behaves as an unbound one.
*
* Note that DISASSOCIATED should be flipped only while holding
- * manager_mutex to avoid changing binding state while
- * create_worker() is in progress.
+ * attach_mutex to avoid changing binding state while
+ * worker_attach_to_pool() is in progress.
*/
- POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
- POOL_FREEZING = 1 << 3, /* freeze in progress */
/* worker flags */
- WORKER_STARTED = 1 << 0, /* started */
WORKER_DIE = 1 << 1, /* die die die */
WORKER_IDLE = 1 << 2, /* is idle */
WORKER_PREP = 1 << 3, /* preparing to run works */
@@ -100,10 +97,10 @@ enum {
/*
* Rescue workers are used only on emergencies and shared by
- * all cpus. Give -20.
+ * all cpus. Give MIN_NICE.
*/
- RESCUER_NICE_LEVEL = -20,
- HIGHPRI_NICE_LEVEL = -20,
+ RESCUER_NICE_LEVEL = MIN_NICE,
+ HIGHPRI_NICE_LEVEL = MIN_NICE,
WQ_NAME_LEN = 24,
};
@@ -124,8 +121,7 @@ enum {
* cpu or grabbing pool->lock is enough for read access. If
* POOL_DISASSOCIATED is set, it's identical to L.
*
- * MG: pool->manager_mutex and pool->lock protected. Writes require both
- * locks. Reads can happen under either lock.
+ * A: pool->attach_mutex protected.
*
* PL: wq_pool_mutex protected.
*
@@ -163,8 +159,11 @@ struct worker_pool {
/* see manage_workers() for details on the two manager mutexes */
struct mutex manager_arb; /* manager arbitration */
- struct mutex manager_mutex; /* manager exclusion */
- struct idr worker_idr; /* MG: worker IDs and iteration */
+ struct mutex attach_mutex; /* attach/detach exclusion */
+ struct list_head workers; /* A: attached workers */
+ struct completion *detach_completion; /* all workers detached */
+
+ struct ida worker_ida; /* worker IDs for task name */
struct workqueue_attrs *attrs; /* I: worker attributes */
struct hlist_node hash_node; /* PL: unbound_pool_hash node */
@@ -266,7 +265,6 @@ struct workqueue_struct {
static struct kmem_cache *pwq_cache;
-static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */
static cpumask_var_t *wq_numa_possible_cpumask;
/* possible CPUs of each node */
@@ -340,16 +338,6 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
lockdep_is_held(&wq->mutex), \
"sched RCU or wq->mutex should be held")
-#ifdef CONFIG_LOCKDEP
-#define assert_manager_or_pool_lock(pool) \
- WARN_ONCE(debug_locks && \
- !lockdep_is_held(&(pool)->manager_mutex) && \
- !lockdep_is_held(&(pool)->lock), \
- "pool->manager_mutex or ->lock should be held")
-#else
-#define assert_manager_or_pool_lock(pool) do { } while (0)
-#endif
-
#define for_each_cpu_worker_pool(pool, cpu) \
for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
(pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
@@ -375,17 +363,16 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
/**
* for_each_pool_worker - iterate through all workers of a worker_pool
* @worker: iteration cursor
- * @wi: integer used for iteration
* @pool: worker_pool to iterate workers of
*
- * This must be called with either @pool->manager_mutex or ->lock held.
+ * This must be called with @pool->attach_mutex.
*
* The if/else clause exists only for the lockdep assertion and can be
* ignored.
*/
-#define for_each_pool_worker(worker, wi, pool) \
- idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \
- if (({ assert_manager_or_pool_lock((pool)); false; })) { } \
+#define for_each_pool_worker(worker, pool) \
+ list_for_each_entry((worker), &(pool)->workers, node) \
+ if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \
else
/**
@@ -516,6 +503,13 @@ void destroy_work_on_stack(struct work_struct *work)
}
EXPORT_SYMBOL_GPL(destroy_work_on_stack);
+void destroy_delayed_work_on_stack(struct delayed_work *work)
+{
+ destroy_timer_on_stack(&work->timer);
+ debug_object_free(&work->work, &work_debug_descr);
+}
+EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack);
+
#else
static inline void debug_work_activate(struct work_struct *work) { }
static inline void debug_work_deactivate(struct work_struct *work) { }
@@ -756,13 +750,6 @@ static bool need_to_create_worker(struct worker_pool *pool)
return need_more_worker(pool) && !may_start_working(pool);
}
-/* Do I need to be the manager? */
-static bool need_to_manage_workers(struct worker_pool *pool)
-{
- return need_to_create_worker(pool) ||
- (pool->flags & POOL_MANAGE_WORKERS);
-}
-
/* Do we have too many workers and should some go away? */
static bool too_many_workers(struct worker_pool *pool)
{
@@ -770,13 +757,6 @@ static bool too_many_workers(struct worker_pool *pool)
int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
int nr_busy = pool->nr_workers - nr_idle;
- /*
- * nr_idle and idle_list may disagree if idle rebinding is in
- * progress. Never return %true if idle_list is empty.
- */
- if (list_empty(&pool->idle_list))
- return false;
-
return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}
@@ -784,8 +764,8 @@ static bool too_many_workers(struct worker_pool *pool)
* Wake up functions.
*/
-/* Return the first worker. Safe with preemption disabled */
-static struct worker *first_worker(struct worker_pool *pool)
+/* Return the first idle worker. Safe with preemption disabled */
+static struct worker *first_idle_worker(struct worker_pool *pool)
{
if (unlikely(list_empty(&pool->idle_list)))
return NULL;
@@ -804,7 +784,7 @@ static struct worker *first_worker(struct worker_pool *pool)
*/
static void wake_up_worker(struct worker_pool *pool)
{
- struct worker *worker = first_worker(pool);
+ struct worker *worker = first_idle_worker(pool);
if (likely(worker))
wake_up_process(worker->task);
@@ -862,7 +842,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
pool = worker->pool;
/* this can only happen on the local cpu */
- if (WARN_ON_ONCE(cpu != raw_smp_processor_id()))
+ if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu))
return NULL;
/*
@@ -878,7 +858,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
*/
if (atomic_dec_and_test(&pool->nr_running) &&
!list_empty(&pool->worklist))
- to_wakeup = first_worker(pool);
+ to_wakeup = first_idle_worker(pool);
return to_wakeup ? to_wakeup->task : NULL;
}
@@ -886,35 +866,22 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
* worker_set_flags - set worker flags and adjust nr_running accordingly
* @worker: self
* @flags: flags to set
- * @wakeup: wakeup an idle worker if necessary
*
- * Set @flags in @worker->flags and adjust nr_running accordingly. If
- * nr_running becomes zero and @wakeup is %true, an idle worker is
- * woken up.
+ * Set @flags in @worker->flags and adjust nr_running accordingly.
*
* CONTEXT:
* spin_lock_irq(pool->lock)
*/
-static inline void worker_set_flags(struct worker *worker, unsigned int flags,
- bool wakeup)
+static inline void worker_set_flags(struct worker *worker, unsigned int flags)
{
struct worker_pool *pool = worker->pool;
WARN_ON_ONCE(worker->task != current);
- /*
- * If transitioning into NOT_RUNNING, adjust nr_running and
- * wake up an idle worker as necessary if requested by
- * @wakeup.
- */
+ /* If transitioning into NOT_RUNNING, adjust nr_running. */
if ((flags & WORKER_NOT_RUNNING) &&
!(worker->flags & WORKER_NOT_RUNNING)) {
- if (wakeup) {
- if (atomic_dec_and_test(&pool->nr_running) &&
- !list_empty(&pool->worklist))
- wake_up_worker(pool);
- } else
- atomic_dec(&pool->nr_running);
+ atomic_dec(&pool->nr_running);
}
worker->flags |= flags;
@@ -1244,7 +1211,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
pwq_activate_delayed_work(work);
list_del_init(&work->entry);
- pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work));
+ pwq_dec_nr_in_flight(pwq, get_work_color(work));
/* work->data points to pwq iff queued, point to pool */
set_work_pool_and_keep_pending(work, pool->id);
@@ -1572,7 +1539,7 @@ static void worker_enter_idle(struct worker *worker)
(worker->hentry.next || worker->hentry.pprev)))
return;
- /* can't use worker_set_flags(), also called from start_worker() */
+ /* can't use worker_set_flags(), also called from create_worker() */
worker->flags |= WORKER_IDLE;
pool->nr_idle++;
worker->last_active = jiffies;
@@ -1614,78 +1581,15 @@ static void worker_leave_idle(struct worker *worker)
list_del_init(&worker->entry);
}
-/**
- * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it
- * @pool: target worker_pool
- *
- * Bind %current to the cpu of @pool if it is associated and lock @pool.
- *
- * Works which are scheduled while the cpu is online must at least be
- * scheduled to a worker which is bound to the cpu so that if they are
- * flushed from cpu callbacks while cpu is going down, they are
- * guaranteed to execute on the cpu.
- *
- * This function is to be used by unbound workers and rescuers to bind
- * themselves to the target cpu and may race with cpu going down or
- * coming online. kthread_bind() can't be used because it may put the
- * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
- * verbatim as it's best effort and blocking and pool may be
- * [dis]associated in the meantime.
- *
- * This function tries set_cpus_allowed() and locks pool and verifies the
- * binding against %POOL_DISASSOCIATED which is set during
- * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
- * enters idle state or fetches works without dropping lock, it can
- * guarantee the scheduling requirement described in the first paragraph.
- *
- * CONTEXT:
- * Might sleep. Called without any lock but returns with pool->lock
- * held.
- *
- * Return:
- * %true if the associated pool is online (@worker is successfully
- * bound), %false if offline.
- */
-static bool worker_maybe_bind_and_lock(struct worker_pool *pool)
-__acquires(&pool->lock)
-{
- while (true) {
- /*
- * The following call may fail, succeed or succeed
- * without actually migrating the task to the cpu if
- * it races with cpu hotunplug operation. Verify
- * against POOL_DISASSOCIATED.
- */
- if (!(pool->flags & POOL_DISASSOCIATED))
- set_cpus_allowed_ptr(current, pool->attrs->cpumask);
-
- spin_lock_irq(&pool->lock);
- if (pool->flags & POOL_DISASSOCIATED)
- return false;
- if (task_cpu(current) == pool->cpu &&
- cpumask_equal(&current->cpus_allowed, pool->attrs->cpumask))
- return true;
- spin_unlock_irq(&pool->lock);
-
- /*
- * We've raced with CPU hot[un]plug. Give it a breather
- * and retry migration. cond_resched() is required here;
- * otherwise, we might deadlock against cpu_stop trying to
- * bring down the CPU on non-preemptive kernel.
- */
- cpu_relax();
- cond_resched();
- }
-}
-
-static struct worker *alloc_worker(void)
+static struct worker *alloc_worker(int node)
{
struct worker *worker;
- worker = kzalloc(sizeof(*worker), GFP_KERNEL);
+ worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node);
if (worker) {
INIT_LIST_HEAD(&worker->entry);
INIT_LIST_HEAD(&worker->scheduled);
+ INIT_LIST_HEAD(&worker->node);
/* on creation a worker is in !idle && prep state */
worker->flags = WORKER_PREP;
}
@@ -1693,12 +1597,70 @@ static struct worker *alloc_worker(void)
}
/**
+ * worker_attach_to_pool() - attach a worker to a pool
+ * @worker: worker to be attached
+ * @pool: the target pool
+ *
+ * Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and
+ * cpu-binding of @worker are kept coordinated with the pool across
+ * cpu-[un]hotplugs.
+ */
+static void worker_attach_to_pool(struct worker *worker,
+ struct worker_pool *pool)
+{
+ mutex_lock(&pool->attach_mutex);
+
+ /*
+ * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
+ * online CPUs. It'll be re-applied when any of the CPUs come up.
+ */
+ set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
+
+ /*
+ * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains
+ * stable across this function. See the comments above the
+ * flag definition for details.
+ */
+ if (pool->flags & POOL_DISASSOCIATED)
+ worker->flags |= WORKER_UNBOUND;
+
+ list_add_tail(&worker->node, &pool->workers);
+
+ mutex_unlock(&pool->attach_mutex);
+}
+
+/**
+ * worker_detach_from_pool() - detach a worker from its pool
+ * @worker: worker which is attached to its pool
+ * @pool: the pool @worker is attached to
+ *
+ * Undo the attaching which had been done in worker_attach_to_pool(). The
+ * caller worker shouldn't access to the pool after detached except it has
+ * other reference to the pool.
+ */
+static void worker_detach_from_pool(struct worker *worker,
+ struct worker_pool *pool)
+{
+ struct completion *detach_completion = NULL;
+
+ mutex_lock(&pool->attach_mutex);
+ list_del(&worker->node);
+ if (list_empty(&pool->workers))
+ detach_completion = pool->detach_completion;
+ mutex_unlock(&pool->attach_mutex);
+
+ /* clear leftover flags without pool->lock after it is detached */
+ worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
+
+ if (detach_completion)
+ complete(detach_completion);
+}
+
+/**
* create_worker - create a new workqueue worker
* @pool: pool the new worker will belong to
*
- * Create a new worker which is bound to @pool. The returned worker
- * can be started by calling start_worker() or destroyed using
- * destroy_worker().
+ * Create and start a new worker which is attached to @pool.
*
* CONTEXT:
* Might sleep. Does GFP_KERNEL allocations.
@@ -1712,23 +1674,12 @@ static struct worker *create_worker(struct worker_pool *pool)
int id = -1;
char id_buf[16];
- lockdep_assert_held(&pool->manager_mutex);
-
- /*
- * ID is needed to determine kthread name. Allocate ID first
- * without installing the pointer.
- */
- idr_preload(GFP_KERNEL);
- spin_lock_irq(&pool->lock);
-
- id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT);
-
- spin_unlock_irq(&pool->lock);
- idr_preload_end();
+ /* ID is needed to determine kthread name */
+ id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);
if (id < 0)
goto fail;
- worker = alloc_worker();
+ worker = alloc_worker(pool->node);
if (!worker)
goto fail;
@@ -1751,124 +1702,53 @@ static struct worker *create_worker(struct worker_pool *pool)
/* prevent userland from meddling with cpumask of workqueue workers */
worker->task->flags |= PF_NO_SETAFFINITY;
- /*
- * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
- * online CPUs. It'll be re-applied when any of the CPUs come up.
- */
- set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
+ /* successful, attach the worker to the pool */
+ worker_attach_to_pool(worker, pool);
- /*
- * The caller is responsible for ensuring %POOL_DISASSOCIATED
- * remains stable across this function. See the comments above the
- * flag definition for details.
- */
- if (pool->flags & POOL_DISASSOCIATED)
- worker->flags |= WORKER_UNBOUND;
-
- /* successful, commit the pointer to idr */
+ /* start the newly created worker */
spin_lock_irq(&pool->lock);
- idr_replace(&pool->worker_idr, worker, worker->id);
+ worker->pool->nr_workers++;
+ worker_enter_idle(worker);
+ wake_up_process(worker->task);
spin_unlock_irq(&pool->lock);
return worker;
fail:
- if (id >= 0) {
- spin_lock_irq(&pool->lock);
- idr_remove(&pool->worker_idr, id);
- spin_unlock_irq(&pool->lock);
- }
+ if (id >= 0)
+ ida_simple_remove(&pool->worker_ida, id);
kfree(worker);
return NULL;
}
/**
- * start_worker - start a newly created worker
- * @worker: worker to start
- *
- * Make the pool aware of @worker and start it.
- *
- * CONTEXT:
- * spin_lock_irq(pool->lock).
- */
-static void start_worker(struct worker *worker)
-{
- worker->flags |= WORKER_STARTED;
- worker->pool->nr_workers++;
- worker_enter_idle(worker);
- wake_up_process(worker->task);
-}
-
-/**
- * create_and_start_worker - create and start a worker for a pool
- * @pool: the target pool
- *
- * Grab the managership of @pool and create and start a new worker for it.
- *
- * Return: 0 on success. A negative error code otherwise.
- */
-static int create_and_start_worker(struct worker_pool *pool)
-{
- struct worker *worker;
-
- mutex_lock(&pool->manager_mutex);
-
- worker = create_worker(pool);
- if (worker) {
- spin_lock_irq(&pool->lock);
- start_worker(worker);
- spin_unlock_irq(&pool->lock);
- }
-
- mutex_unlock(&pool->manager_mutex);
-
- return worker ? 0 : -ENOMEM;
-}
-
-/**
* destroy_worker - destroy a workqueue worker
* @worker: worker to be destroyed
*
- * Destroy @worker and adjust @pool stats accordingly.
+ * Destroy @worker and adjust @pool stats accordingly. The worker should
+ * be idle.
*
* CONTEXT:
- * spin_lock_irq(pool->lock) which is released and regrabbed.
+ * spin_lock_irq(pool->lock).
*/
static void destroy_worker(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
- lockdep_assert_held(&pool->manager_mutex);
lockdep_assert_held(&pool->lock);
/* sanity check frenzy */
if (WARN_ON(worker->current_work) ||
- WARN_ON(!list_empty(&worker->scheduled)))
+ WARN_ON(!list_empty(&worker->scheduled)) ||
+ WARN_ON(!(worker->flags & WORKER_IDLE)))
return;
- if (worker->flags & WORKER_STARTED)
- pool->nr_workers--;
- if (worker->flags & WORKER_IDLE)
- pool->nr_idle--;
-
- /*
- * Once WORKER_DIE is set, the kworker may destroy itself at any
- * point. Pin to ensure the task stays until we're done with it.
- */
- get_task_struct(worker->task);
+ pool->nr_workers--;
+ pool->nr_idle--;
list_del_init(&worker->entry);
worker->flags |= WORKER_DIE;
-
- idr_remove(&pool->worker_idr, worker->id);
-
- spin_unlock_irq(&pool->lock);
-
- kthread_stop(worker->task);
- put_task_struct(worker->task);
- kfree(worker);
-
- spin_lock_irq(&pool->lock);
+ wake_up_process(worker->task);
}
static void idle_worker_timeout(unsigned long __pool)
@@ -1877,7 +1757,7 @@ static void idle_worker_timeout(unsigned long __pool)
spin_lock_irq(&pool->lock);
- if (too_many_workers(pool)) {
+ while (too_many_workers(pool)) {
struct worker *worker;
unsigned long expires;
@@ -1885,13 +1765,12 @@ static void idle_worker_timeout(unsigned long __pool)
worker = list_entry(pool->idle_list.prev, struct worker, entry);
expires = worker->last_active + IDLE_WORKER_TIMEOUT;
- if (time_before(jiffies, expires))
+ if (time_before(jiffies, expires)) {
mod_timer(&pool->idle_timer, expires);
- else {
- /* it's been idle for too long, wake up manager */
- pool->flags |= POOL_MANAGE_WORKERS;
- wake_up_worker(pool);
+ break;
}
+
+ destroy_worker(worker);
}
spin_unlock_irq(&pool->lock);
@@ -1909,6 +1788,12 @@ static void send_mayday(struct work_struct *work)
/* mayday mayday mayday */
if (list_empty(&pwq->mayday_node)) {
+ /*
+ * If @pwq is for an unbound wq, its base ref may be put at
+ * any time due to an attribute change. Pin @pwq until the
+ * rescuer is done with it.
+ */
+ get_pwq(pwq);
list_add_tail(&pwq->mayday_node, &wq->maydays);
wake_up_process(wq->rescuer->task);
}
@@ -1974,23 +1859,10 @@ restart:
mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
while (true) {
- struct worker *worker;
-
- worker = create_worker(pool);
- if (worker) {
- del_timer_sync(&pool->mayday_timer);
- spin_lock_irq(&pool->lock);
- start_worker(worker);
- if (WARN_ON_ONCE(need_to_create_worker(pool)))
- goto restart;
- return true;
- }
-
- if (!need_to_create_worker(pool))
+ if (create_worker(pool) || !need_to_create_worker(pool))
break;
- __set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(CREATE_COOLDOWN);
+ schedule_timeout_interruptible(CREATE_COOLDOWN);
if (!need_to_create_worker(pool))
break;
@@ -1998,50 +1870,17 @@ restart:
del_timer_sync(&pool->mayday_timer);
spin_lock_irq(&pool->lock);
+ /*
+ * This is necessary even after a new worker was just successfully
+ * created as @pool->lock was dropped and the new worker might have
+ * already become busy.
+ */
if (need_to_create_worker(pool))
goto restart;
return true;
}
/**
- * maybe_destroy_worker - destroy workers which have been idle for a while
- * @pool: pool to destroy workers for
- *
- * Destroy @pool workers which have been idle for longer than
- * IDLE_WORKER_TIMEOUT.
- *
- * LOCKING:
- * spin_lock_irq(pool->lock) which may be released and regrabbed
- * multiple times. Called only from manager.
- *
- * Return:
- * %false if no action was taken and pool->lock stayed locked, %true
- * otherwise.
- */
-static bool maybe_destroy_workers(struct worker_pool *pool)
-{
- bool ret = false;
-
- while (too_many_workers(pool)) {
- struct worker *worker;
- unsigned long expires;
-
- worker = list_entry(pool->idle_list.prev, struct worker, entry);
- expires = worker->last_active + IDLE_WORKER_TIMEOUT;
-
- if (time_before(jiffies, expires)) {
- mod_timer(&pool->idle_timer, expires);
- break;
- }
-
- destroy_worker(worker);
- ret = true;
- }
-
- return ret;
-}
-
-/**
* manage_workers - manage worker pool
* @worker: self
*
@@ -2070,8 +1909,6 @@ static bool manage_workers(struct worker *worker)
bool ret = false;
/*
- * Managership is governed by two mutexes - manager_arb and
- * manager_mutex. manager_arb handles arbitration of manager role.
* Anyone who successfully grabs manager_arb wins the arbitration
* and becomes the manager. mutex_trylock() on pool->manager_arb
* failure while holding pool->lock reliably indicates that someone
@@ -2080,40 +1917,12 @@ static bool manage_workers(struct worker *worker)
* grabbing manager_arb is responsible for actually performing
* manager duties. If manager_arb is grabbed and released without
* actual management, the pool may stall indefinitely.
- *
- * manager_mutex is used for exclusion of actual management
- * operations. The holder of manager_mutex can be sure that none
- * of management operations, including creation and destruction of
- * workers, won't take place until the mutex is released. Because
- * manager_mutex doesn't interfere with manager role arbitration,
- * it is guaranteed that the pool's management, while may be
- * delayed, won't be disturbed by someone else grabbing
- * manager_mutex.
*/
if (!mutex_trylock(&pool->manager_arb))
return ret;
- /*
- * With manager arbitration won, manager_mutex would be free in
- * most cases. trylock first without dropping @pool->lock.
- */
- if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
- spin_unlock_irq(&pool->lock);
- mutex_lock(&pool->manager_mutex);
- spin_lock_irq(&pool->lock);
- ret = true;
- }
-
- pool->flags &= ~POOL_MANAGE_WORKERS;
-
- /*
- * Destroy and then create so that may_start_working() is true
- * on return.
- */
- ret |= maybe_destroy_workers(pool);
ret |= maybe_create_worker(pool);
- mutex_unlock(&pool->manager_mutex);
mutex_unlock(&pool->manager_arb);
return ret;
}
@@ -2153,13 +1962,8 @@ __acquires(&pool->lock)
lockdep_copy_map(&lockdep_map, &work->lockdep_map);
#endif
- /*
- * Ensure we're on the correct CPU. DISASSOCIATED test is
- * necessary to avoid spurious warnings from rescuers servicing the
- * unbound or a disassociated pool.
- */
- WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
- !(pool->flags & POOL_DISASSOCIATED) &&
+ /* ensure we're on the correct CPU */
+ WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
raw_smp_processor_id() != pool->cpu);
/*
@@ -2185,17 +1989,22 @@ __acquires(&pool->lock)
list_del_init(&work->entry);
/*
- * CPU intensive works don't participate in concurrency
- * management. They're the scheduler's responsibility.
+ * CPU intensive works don't participate in concurrency management.
+ * They're the scheduler's responsibility. This takes @worker out
+ * of concurrency management and the next code block will chain
+ * execution of the pending work items.
*/
if (unlikely(cpu_intensive))
- worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
+ worker_set_flags(worker, WORKER_CPU_INTENSIVE);
/*
- * Unbound pool isn't concurrency managed and work items should be
- * executed ASAP. Wake up another worker if necessary.
+ * Wake up another worker if necessary. The condition is always
+ * false for normal per-cpu workers since nr_running would always
+ * be >= 1 at this point. This is used to chain execution of the
+ * pending work items for WORKER_NOT_RUNNING workers such as the
+ * UNBOUND and CPU_INTENSIVE ones.
*/
- if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
+ if (need_more_worker(pool))
wake_up_worker(pool);
/*
@@ -2301,6 +2110,11 @@ woke_up:
spin_unlock_irq(&pool->lock);
WARN_ON_ONCE(!list_empty(&worker->entry));
worker->task->flags &= ~PF_WQ_WORKER;
+
+ set_task_comm(worker->task, "kworker/dying");
+ ida_simple_remove(&pool->worker_ida, worker->id);
+ worker_detach_from_pool(worker, pool);
+ kfree(worker);
return 0;
}
@@ -2346,11 +2160,8 @@ recheck:
}
} while (keep_working(pool));
- worker_set_flags(worker, WORKER_PREP, false);
+ worker_set_flags(worker, WORKER_PREP);
sleep:
- if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker))
- goto recheck;
-
/*
* pool->lock is held and there's no work to process and no need to
* manage, sleep. Workers are woken up only while holding
@@ -2391,6 +2202,7 @@ static int rescuer_thread(void *__rescuer)
struct worker *rescuer = __rescuer;
struct workqueue_struct *wq = rescuer->rescue_wq;
struct list_head *scheduled = &rescuer->scheduled;
+ bool should_stop;
set_user_nice(current, RESCUER_NICE_LEVEL);
@@ -2402,11 +2214,15 @@ static int rescuer_thread(void *__rescuer)
repeat:
set_current_state(TASK_INTERRUPTIBLE);
- if (kthread_should_stop()) {
- __set_current_state(TASK_RUNNING);
- rescuer->task->flags &= ~PF_WQ_WORKER;
- return 0;
- }
+ /*
+ * By the time the rescuer is requested to stop, the workqueue
+ * shouldn't have any work pending, but @wq->maydays may still have
+ * pwq(s) queued. This can happen by non-rescuer workers consuming
+ * all the work items before the rescuer got to them. Go through
+ * @wq->maydays processing before acting on should_stop so that the
+ * list is always empty on exit.
+ */
+ should_stop = kthread_should_stop();
/* see whether any pwq is asking for help */
spin_lock_irq(&wq_mayday_lock);
@@ -2422,8 +2238,9 @@ repeat:
spin_unlock_irq(&wq_mayday_lock);
- /* migrate to the target cpu if possible */
- worker_maybe_bind_and_lock(pool);
+ worker_attach_to_pool(rescuer, pool);
+
+ spin_lock_irq(&pool->lock);
rescuer->pool = pool;
/*
@@ -2438,20 +2255,35 @@ repeat:
process_scheduled_works(rescuer);
/*
- * Leave this pool. If keep_working() is %true, notify a
+ * Put the reference grabbed by send_mayday(). @pool won't
+ * go away while we're still attached to it.
+ */
+ put_pwq(pwq);
+
+ /*
+ * Leave this pool. If need_more_worker() is %true, notify a
* regular worker; otherwise, we end up with 0 concurrency
* and stalling the execution.
*/
- if (keep_working(pool))
+ if (need_more_worker(pool))
wake_up_worker(pool);
rescuer->pool = NULL;
- spin_unlock(&pool->lock);
- spin_lock(&wq_mayday_lock);
+ spin_unlock_irq(&pool->lock);
+
+ worker_detach_from_pool(rescuer, pool);
+
+ spin_lock_irq(&wq_mayday_lock);
}
spin_unlock_irq(&wq_mayday_lock);
+ if (should_stop) {
+ __set_current_state(TASK_RUNNING);
+ rescuer->task->flags &= ~PF_WQ_WORKER;
+ return 0;
+ }
+
/* rescuers should never participate in concurrency management */
WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
schedule();
@@ -3225,7 +3057,7 @@ static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
return -ENOMEM;
if (sscanf(buf, "%d", &attrs->nice) == 1 &&
- attrs->nice >= -20 && attrs->nice <= 19)
+ attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
ret = apply_workqueue_attrs(wq, attrs);
else
ret = -EINVAL;
@@ -3392,6 +3224,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)
}
}
+ dev_set_uevent_suppress(&wq_dev->dev, false);
kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
return 0;
}
@@ -3520,9 +3353,10 @@ static int init_worker_pool(struct worker_pool *pool)
(unsigned long)pool);
mutex_init(&pool->manager_arb);
- mutex_init(&pool->manager_mutex);
- idr_init(&pool->worker_idr);
+ mutex_init(&pool->attach_mutex);
+ INIT_LIST_HEAD(&pool->workers);
+ ida_init(&pool->worker_ida);
INIT_HLIST_NODE(&pool->hash_node);
pool->refcnt = 1;
@@ -3537,7 +3371,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
{
struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
- idr_destroy(&pool->worker_idr);
+ ida_destroy(&pool->worker_ida);
free_workqueue_attrs(pool->attrs);
kfree(pool);
}
@@ -3555,6 +3389,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
*/
static void put_unbound_pool(struct worker_pool *pool)
{
+ DECLARE_COMPLETION_ONSTACK(detach_completion);
struct worker *worker;
lockdep_assert_held(&wq_pool_mutex);
@@ -3563,7 +3398,7 @@ static void put_unbound_pool(struct worker_pool *pool)
return;
/* sanity checks */
- if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) ||
+ if (WARN_ON(!(pool->cpu < 0)) ||
WARN_ON(!list_empty(&pool->worklist)))
return;
@@ -3575,18 +3410,24 @@ static void put_unbound_pool(struct worker_pool *pool)
/*
* Become the manager and destroy all workers. Grabbing
* manager_arb prevents @pool's workers from blocking on
- * manager_mutex.
+ * attach_mutex.
*/
mutex_lock(&pool->manager_arb);
- mutex_lock(&pool->manager_mutex);
- spin_lock_irq(&pool->lock);
- while ((worker = first_worker(pool)))
+ spin_lock_irq(&pool->lock);
+ while ((worker = first_idle_worker(pool)))
destroy_worker(worker);
WARN_ON(pool->nr_workers || pool->nr_idle);
-
spin_unlock_irq(&pool->lock);
- mutex_unlock(&pool->manager_mutex);
+
+ mutex_lock(&pool->attach_mutex);
+ if (!list_empty(&pool->workers))
+ pool->detach_completion = &detach_completion;
+ mutex_unlock(&pool->attach_mutex);
+
+ if (pool->detach_completion)
+ wait_for_completion(pool->detach_completion);
+
mutex_unlock(&pool->manager_arb);
/* shut down the timers */
@@ -3623,7 +3464,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
if (wqattrs_equal(pool->attrs, attrs)) {
pool->refcnt++;
- goto out_unlock;
+ return pool;
}
}
@@ -3632,9 +3473,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
if (!pool || init_worker_pool(pool) < 0)
goto fail;
- if (workqueue_freezing)
- pool->flags |= POOL_FREEZING;
-
lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
copy_workqueue_attrs(pool->attrs, attrs);
@@ -3659,12 +3497,12 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
goto fail;
/* create and start the initial worker */
- if (create_and_start_worker(pool) < 0)
+ if (!create_worker(pool))
goto fail;
/* install */
hash_add(unbound_pool_hash, &pool->hash_node, hash);
-out_unlock:
+
return pool;
fail:
if (pool)
@@ -3693,11 +3531,6 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
return;
- /*
- * Unlink @pwq. Synchronization against wq->mutex isn't strictly
- * necessary on release but do it anyway. It's easier to verify
- * and consistent with the linking path.
- */
mutex_lock(&wq->mutex);
list_del_rcu(&pwq->pwqs_node);
is_last = list_empty(&wq->pwqs);
@@ -3741,7 +3574,12 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq)
spin_lock_irq(&pwq->pool->lock);
- if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) {
+ /*
+ * During [un]freezing, the caller is responsible for ensuring that
+ * this function is called at least once after @workqueue_freezing
+ * is updated and visible.
+ */
+ if (!freezable || !workqueue_freezing) {
pwq->max_active = wq->saved_max_active;
while (!list_empty(&pwq->delayed_works) &&
@@ -3789,10 +3627,7 @@ static void link_pwq(struct pool_workqueue *pwq)
if (!list_empty(&pwq->pwqs_node))
return;
- /*
- * Set the matching work_color. This is synchronized with
- * wq->mutex to avoid confusing flush_workqueue().
- */
+ /* set the matching work_color */
pwq->work_color = wq->work_color;
/* sync max_active to the current setting */
@@ -3929,7 +3764,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
return -EINVAL;
- pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL);
+ pwq_tbl = kzalloc(nr_node_ids * sizeof(pwq_tbl[0]), GFP_KERNEL);
new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
if (!pwq_tbl || !new_attrs || !tmp_attrs)
@@ -4073,17 +3908,13 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
* Let's determine what needs to be done. If the target cpumask is
* different from wq's, we need to compare it to @pwq's and create
* a new one if they don't match. If the target cpumask equals
- * wq's, the default pwq should be used. If @pwq is already the
- * default one, nothing to do; otherwise, install the default one.
+ * wq's, the default pwq should be used.
*/
if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
goto out_unlock;
} else {
- if (pwq == wq->dfl_pwq)
- goto out_unlock;
- else
- goto use_dfl_pwq;
+ goto use_dfl_pwq;
}
mutex_unlock(&wq->mutex);
@@ -4091,9 +3922,10 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
/* create a new pwq */
pwq = alloc_unbound_pwq(wq, target_attrs);
if (!pwq) {
- pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
- wq->name);
- goto out_unlock;
+ pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
+ wq->name);
+ mutex_lock(&wq->mutex);
+ goto use_dfl_pwq;
}
/*
@@ -4180,7 +4012,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
/* allocate wq and format name */
if (flags & WQ_UNBOUND)
- tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]);
+ tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]);
wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
if (!wq)
@@ -4222,7 +4054,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
if (flags & WQ_MEM_RECLAIM) {
struct worker *rescuer;
- rescuer = alloc_worker();
+ rescuer = alloc_worker(NUMA_NO_NODE);
if (!rescuer)
goto err_destroy;
@@ -4568,28 +4400,25 @@ static void wq_unbind_fn(struct work_struct *work)
int cpu = smp_processor_id();
struct worker_pool *pool;
struct worker *worker;
- int wi;
for_each_cpu_worker_pool(pool, cpu) {
- WARN_ON_ONCE(cpu != smp_processor_id());
-
- mutex_lock(&pool->manager_mutex);
+ mutex_lock(&pool->attach_mutex);
spin_lock_irq(&pool->lock);
/*
- * We've blocked all manager operations. Make all workers
+ * We've blocked all attach/detach operations. Make all workers
* unbound and set DISASSOCIATED. Before this, all workers
* except for the ones which are still executing works from
* before the last CPU down must be on the cpu. After
* this, they may become diasporas.
*/
- for_each_pool_worker(worker, wi, pool)
+ for_each_pool_worker(worker, pool)
worker->flags |= WORKER_UNBOUND;
pool->flags |= POOL_DISASSOCIATED;
spin_unlock_irq(&pool->lock);
- mutex_unlock(&pool->manager_mutex);
+ mutex_unlock(&pool->attach_mutex);
/*
* Call schedule() so that we cross rq->lock and thus can
@@ -4629,9 +4458,8 @@ static void wq_unbind_fn(struct work_struct *work)
static void rebind_workers(struct worker_pool *pool)
{
struct worker *worker;
- int wi;
- lockdep_assert_held(&pool->manager_mutex);
+ lockdep_assert_held(&pool->attach_mutex);
/*
* Restore CPU affinity of all workers. As all idle workers should
@@ -4640,13 +4468,14 @@ static void rebind_workers(struct worker_pool *pool)
* of all workers first and then clear UNBOUND. As we're called
* from CPU_ONLINE, the following shouldn't fail.
*/
- for_each_pool_worker(worker, wi, pool)
+ for_each_pool_worker(worker, pool)
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
pool->attrs->cpumask) < 0);
spin_lock_irq(&pool->lock);
+ pool->flags &= ~POOL_DISASSOCIATED;
- for_each_pool_worker(worker, wi, pool) {
+ for_each_pool_worker(worker, pool) {
unsigned int worker_flags = worker->flags;
/*
@@ -4698,9 +4527,8 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
{
static cpumask_t cpumask;
struct worker *worker;
- int wi;
- lockdep_assert_held(&pool->manager_mutex);
+ lockdep_assert_held(&pool->attach_mutex);
/* is @cpu allowed for @pool? */
if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
@@ -4712,7 +4540,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
return;
/* as we're called from CPU_ONLINE, the following shouldn't fail */
- for_each_pool_worker(worker, wi, pool)
+ for_each_pool_worker(worker, pool)
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
pool->attrs->cpumask) < 0);
}
@@ -4735,7 +4563,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
for_each_cpu_worker_pool(pool, cpu) {
if (pool->nr_workers)
continue;
- if (create_and_start_worker(pool) < 0)
+ if (!create_worker(pool))
return NOTIFY_BAD;
}
break;
@@ -4745,19 +4573,14 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
mutex_lock(&wq_pool_mutex);
for_each_pool(pool, pi) {
- mutex_lock(&pool->manager_mutex);
-
- if (pool->cpu == cpu) {
- spin_lock_irq(&pool->lock);
- pool->flags &= ~POOL_DISASSOCIATED;
- spin_unlock_irq(&pool->lock);
+ mutex_lock(&pool->attach_mutex);
+ if (pool->cpu == cpu)
rebind_workers(pool);
- } else if (pool->cpu < 0) {
+ else if (pool->cpu < 0)
restore_unbound_workers_cpumask(pool, cpu);
- }
- mutex_unlock(&pool->manager_mutex);
+ mutex_unlock(&pool->attach_mutex);
}
/* update NUMA affinity of unbound workqueues */
@@ -4856,24 +4679,14 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
*/
void freeze_workqueues_begin(void)
{
- struct worker_pool *pool;
struct workqueue_struct *wq;
struct pool_workqueue *pwq;
- int pi;
mutex_lock(&wq_pool_mutex);
WARN_ON_ONCE(workqueue_freezing);
workqueue_freezing = true;
- /* set FREEZING */
- for_each_pool(pool, pi) {
- spin_lock_irq(&pool->lock);
- WARN_ON_ONCE(pool->flags & POOL_FREEZING);
- pool->flags |= POOL_FREEZING;
- spin_unlock_irq(&pool->lock);
- }
-
list_for_each_entry(wq, &workqueues, list) {
mutex_lock(&wq->mutex);
for_each_pwq(pwq, wq)
@@ -4943,21 +4756,13 @@ void thaw_workqueues(void)
{
struct workqueue_struct *wq;
struct pool_workqueue *pwq;
- struct worker_pool *pool;
- int pi;
mutex_lock(&wq_pool_mutex);
if (!workqueue_freezing)
goto out_unlock;
- /* clear FREEZING */
- for_each_pool(pool, pi) {
- spin_lock_irq(&pool->lock);
- WARN_ON_ONCE(!(pool->flags & POOL_FREEZING));
- pool->flags &= ~POOL_FREEZING;
- spin_unlock_irq(&pool->lock);
- }
+ workqueue_freezing = false;
/* restore max_active and repopulate worklist */
list_for_each_entry(wq, &workqueues, list) {
@@ -4967,7 +4772,6 @@ void thaw_workqueues(void)
mutex_unlock(&wq->mutex);
}
- workqueue_freezing = false;
out_unlock:
mutex_unlock(&wq_pool_mutex);
}
@@ -4978,10 +4782,6 @@ static void __init wq_numa_init(void)
cpumask_var_t *tbl;
int node, cpu;
- /* determine NUMA pwq table len - highest node id + 1 */
- for_each_node(node)
- wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1);
-
if (num_possible_nodes() <= 1)
return;
@@ -4998,11 +4798,11 @@ static void __init wq_numa_init(void)
* available. Build one from cpu_to_node() which should have been
* fully initialized by now.
*/
- tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL);
+ tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL);
BUG_ON(!tbl);
for_each_node(node)
- BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
+ BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
node_online(node) ? node : NUMA_NO_NODE));
for_each_possible_cpu(cpu) {
@@ -5058,7 +4858,7 @@ static int __init init_workqueues(void)
for_each_cpu_worker_pool(pool, cpu) {
pool->flags &= ~POOL_DISASSOCIATED;
- BUG_ON(create_and_start_worker(pool) < 0);
+ BUG_ON(!create_worker(pool));
}
}
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 7e2204db0b1a..45215870ac6c 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -37,6 +37,8 @@ struct worker {
struct task_struct *task; /* I: worker task */
struct worker_pool *pool; /* I: the associated pool */
/* L: for rescuers */
+ struct list_head node; /* A: anchored at pool->workers */
+ /* A: runs through worker->node */
unsigned long last_active; /* L: last active timestamp */
unsigned int flags; /* X: flags */
OpenPOWER on IntegriCloud